Introduce new experimental parser (#2885)

Co-authored-by: Raphael Darley <raphael@raphaeldarley.com>
Authored by Mees Delzenne on 2024-01-10 17:43:56 +01:00; committed by GitHub
parent f7e6e028a2
commit 2755f572fc
152 changed files with 14640 additions and 823 deletions

View file

@ -180,7 +180,7 @@ jobs:
- name: Run CLI integration tests
run: cargo make ci-cli-integration
- name: Debug info
if: always()
run: |
@ -189,7 +189,7 @@ jobs:
df -h
ps auxf
cat /tmp/surrealdb.log || true
http-server:
name: HTTP integration tests
@ -326,6 +326,45 @@ jobs:
path: target/llvm-cov/html/
retention-days: 5
test-parser:
name: Test workspace with experimental parser
runs-on: ubuntu-latest
steps:
- name: Install stable toolchain
uses: dtolnay/rust-toolchain@stable
with:
toolchain: 1.71.1
- name: Checkout sources
uses: actions/checkout@v4
- name: Setup cache
uses: Swatinem/rust-cache@v2
with:
save-if: ${{ github.ref == 'refs/heads/main' }}
- name: Install dependencies
run: |
sudo apt-get -y update
- name: Free up some disk space
run: |
(set -x; df -h)
# Free up some disk space by removing unused files
(set -x; sudo rm -rf /imagegeneration || true)
(set -x; sudo rm -rf /opt/az || true)
(set -x; sudo rm -rf /opt/hostedtoolcache || true)
(set -x; sudo rm -rf /opt/google || true)
(set -x; sudo rm -rf /opt/pipx || true)
(set -x; df -h)
- name: Install cargo-make
run: cargo install --debug --locked cargo-make
- name: Test workspace for experimental_parser
run: cargo make test-experimental-parser
ws-engine:
name: WebSocket engine
runs-on: ubuntu-latest

Cargo.lock (generated, 767 changes): file diff suppressed because it is too large.

View file

@ -8,7 +8,7 @@ authors = ["Tobie Morgan Hitchcock <tobie@surrealdb.com>"]
[features]
# Public features
-default = ["storage-mem", "storage-rocksdb", "scripting", "http", "jwks"]
+default = ["storage-mem", "storage-rocksdb", "scripting", "http"]
storage-mem = ["surrealdb/kv-mem"]
storage-rocksdb = ["surrealdb/kv-rocksdb"]
storage-speedb = ["surrealdb/kv-speedb"]
@ -18,6 +18,7 @@ scripting = ["surrealdb/scripting"]
http = ["surrealdb/http"]
http-compression = []
ml = ["surrealdb/ml", "surrealml-core"]
+experimental-parser = ["surrealdb/experimental-parser"]
jwks = ["surrealdb/jwks"]
[workspace]
@ -60,6 +61,7 @@ reqwest = { version = "0.11.22", default-features = false, features = ["blocking
rmpv = "1.0.1"
rustyline = { version = "12.0.0", features = ["derive"] }
serde = { version = "1.0.193", features = ["derive"] }
+serde_cbor = "0.11.2"
serde_json = "1.0.108"
serde_pack = { version = "1.1.2", package = "rmp-serde" }
surrealdb = { path = "lib", features = ["protocol-http", "protocol-ws", "rustls"] }

Cargo.toml.orig (new file, 126 lines)
View file

@ -0,0 +1,126 @@
[package]
name = "surreal"
publish = false
edition = "2021"
version = "1.1.0"
license-file = "LICENSE"
authors = ["Tobie Morgan Hitchcock <tobie@surrealdb.com>"]
[features]
# Public features
default = ["storage-mem", "storage-rocksdb", "scripting", "http", "jwks"]
storage-mem = ["surrealdb/kv-mem"]
storage-rocksdb = ["surrealdb/kv-rocksdb"]
storage-speedb = ["surrealdb/kv-speedb"]
storage-tikv = ["surrealdb/kv-tikv"]
storage-fdb = ["surrealdb/kv-fdb-7_1"]
scripting = ["surrealdb/scripting"]
http = ["surrealdb/http"]
http-compression = []
ml = ["surrealdb/ml", "surrealml-core"]
<<<<<<< HEAD
experimental-parser = ["surrealdb/experimental-parser"]
=======
jwks = ["surrealdb/jwks"]
>>>>>>> upstream/main
[workspace]
members = ["lib", "lib/examples/actix", "lib/examples/axum"]
[profile.release]
lto = true
strip = true
opt-level = 3
panic = 'abort'
codegen-units = 1
[profile.bench]
strip = false
[dependencies]
argon2 = "0.5.2"
axum = { version = "0.6.20", features = ["tracing", "ws", "headers"] }
axum-client-ip = "0.5.0"
axum-extra = { version = "0.7.7", features = ["query", "typed-routing"] }
axum-server = { version = "0.5.1", features = ["tls-rustls"] }
base64 = "0.21.5"
bytes = "1.5.0"
ciborium = "0.2.1"
clap = { version = "4.4.11", features = ["env", "derive", "wrap_help", "unicode"] }
futures = "0.3.29"
futures-util = "0.3.29"
glob = "0.3.1"
http = "0.2.11"
http-body = "0.4.5"
hyper = "0.14.27"
ipnet = "2.9.0"
ndarray = { version = "0.15.6", optional = true }
once_cell = "1.18.0"
opentelemetry = { version = "0.19", features = ["rt-tokio"] }
opentelemetry-otlp = { version = "0.12.0", features = ["metrics"] }
pin-project-lite = "0.2.13"
rand = "0.8.5"
reqwest = { version = "0.11.22", default-features = false, features = ["blocking", "gzip"] }
rmpv = "1.0.1"
rustyline = { version = "12.0.0", features = ["derive"] }
serde = { version = "1.0.193", features = ["derive"] }
serde_cbor = "0.11.2"
serde_json = "1.0.108"
serde_pack = { version = "1.1.2", package = "rmp-serde" }
surrealdb = { path = "lib", features = ["protocol-http", "protocol-ws", "rustls"] }
surrealml-core = { version = "0.0.3", optional = true}
tempfile = "3.8.1"
thiserror = "1.0.50"
tokio = { version = "1.34.0", features = ["macros", "signal"] }
tokio-util = { version = "0.7.10", features = ["io"] }
tower = "0.4.13"
tower-http = { version = "0.4.4", features = ["trace", "sensitive-headers", "auth", "request-id", "util", "catch-panic", "cors", "set-header", "limit", "add-extension", "compression-full"] }
tracing = "0.1"
tracing-opentelemetry = "0.19.0"
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
urlencoding = "2.1.3"
uuid = { version = "1.6.1", features = ["serde", "js", "v4", "v7"] }
[target.'cfg(unix)'.dependencies]
nix = { version = "0.27.1", features = ["user"] }
[target.'cfg(unix)'.dev-dependencies]
nix = { version = "0.27.1", features = ["signal", "user"] }
[target.'cfg(any(target_os = "linux", target_os = "macos", target_os = "ios"))'.dependencies]
mimalloc = { version = "0.1.39", default-features = false }
[target.'cfg(any(target_os = "android", target_os = "freebsd", target_os = "netbsd", target_os = "openbsd"))'.dependencies]
jemallocator = "0.5.4"
[dev-dependencies]
assert_fs = "1.0.13"
env_logger = "0.10.1"
opentelemetry-proto = { version = "0.2.0", features = ["gen-tonic", "traces", "metrics", "logs"] }
rcgen = "0.11.3"
serial_test = "2.0.0"
temp-env = { version = "0.3.6", features = ["async_closure"] }
test-log = { version = "0.2.13", features = ["trace"] }
tokio-stream = { version = "0.1", features = ["net"] }
tokio-tungstenite = { version = "0.20.1" }
tonic = "0.8.3"
ulid = "1.1.0"
wiremock = "0.5.22"
[build-dependencies]
semver = "1.0.20"
[package.metadata.deb]
maintainer-scripts = "pkg/deb/"
maintainer = "Tobie Morgan Hitchcock <tobie@surrealdb.com>"
copyright = "SurrealDB Ltd. 2022"
systemd-units = { enable = true }
depends = "$auto"
section = "utility"
priority = "optional"
assets = [
["target/release/surreal", "usr/share/surrealdb/surreal", "755"],
["pkg/deb/README", "usr/share/surrealdb/README", "644"],
]
extended-description = "A scalable, distributed, collaborative, document-graph database, for the realtime web."
license-file = ["LICENSE", "4"]

View file

@ -56,6 +56,18 @@ args = [
"--skip", "ws_integration" "--skip", "ws_integration"
] ]
[tasks.test-experimental-parser]
category = "CI - INTEGRATION TESTS"
command = "cargo"
args = [
"test", "--locked", "--no-default-features", "--features", "storage-mem,scripting,http,experimental-parser", "--workspace", "--",
"--skip", "api_integration",
"--skip", "cli_integration",
"--skip", "http_integration",
"--skip", "ws_integration"
]
[tasks.test-workspace-coverage-complete]
category = "CI - INTEGRATION TESTS"
command = "cargo"

View file

@ -40,6 +40,7 @@ rustls = ["dep:rustls", "reqwest?/rustls-tls", "tokio-tungstenite?/rustls-tls-we
ml = ["surrealml-core", "ndarray"] ml = ["surrealml-core", "ndarray"]
jwks = ["dep:reqwest"] jwks = ["dep:reqwest"]
arbitrary = ["dep:arbitrary", "dep:regex-syntax", "rust_decimal/rust-fuzz", "geo-types/arbitrary", "uuid/arbitrary"] arbitrary = ["dep:arbitrary", "dep:regex-syntax", "rust_decimal/rust-fuzz", "geo-types/arbitrary", "uuid/arbitrary"]
experimental-parser = ["dep:phf", "dep:unicase"]
# Private features # Private features
kv-fdb = ["foundationdb", "tokio/time"] kv-fdb = ["foundationdb", "tokio/time"]
@ -123,6 +124,8 @@ tracing = "0.1.40"
trice = "0.4.0" trice = "0.4.0"
ulid = { version = "1.1.0", features = ["serde"] } ulid = { version = "1.1.0", features = ["serde"] }
url = "2.5.0" url = "2.5.0"
phf = { version = "0.11.2", features = ["macros", "unicase"], optional=true }
unicase = { version = "2.7.0", optional = true }
arbitrary = { version = "1.3.2", features = ["derive"], optional = true } arbitrary = { version = "1.3.2", features = ["derive"], optional = true }
regex-syntax = { version = "0.8.2", optional = true, features = ["arbitrary"] } regex-syntax = { version = "0.8.2", optional = true, features = ["arbitrary"] }
geo-types = { version = "0.7.12", features = ["arbitrary"] } geo-types = { version = "0.7.12", features = ["arbitrary"] }

View file

@ -59,6 +59,7 @@ fn bench_parser(c: &mut Criterion) {
&(1..=100).map(|n| format!("'{n}': {n}")).collect::<Vec<_>>().join(", ")
)
);
+parser!(c, full_test, surrealdb::sql::parse, include_str!("../test.surql"));
c.finish();
}

lib/fuzz/Cargo.lock (generated, 1 change)
View file

@ -2584,7 +2584,6 @@ dependencies = [
"futures-concurrency", "futures-concurrency",
"fuzzy-matcher", "fuzzy-matcher",
"geo 0.27.0", "geo 0.27.0",
"geo-types",
"hex", "hex",
"indexmap 2.1.0", "indexmap 2.1.0",
"ipnet", "ipnet",

View file

@ -255,7 +255,7 @@ mod tests {
use crate::dbs::Capabilities; use crate::dbs::Capabilities;
use crate::opt::auth::Root; use crate::opt::auth::Root;
use crate::sql::Value; use crate::sql::Value;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn local_engine_without_auth() { async fn local_engine_without_auth() {

View file

@ -153,6 +153,7 @@ struct IntervalStream {
} }
impl IntervalStream { impl IntervalStream {
#[allow(unused)]
fn new(interval: Interval) -> Self { fn new(interval: Interval) -> Self {
Self { Self {
inner: interval, inner: interval,

View file

@ -177,7 +177,8 @@ impl<T: Target + Hash + Eq + PartialEq + std::fmt::Display> std::fmt::Display fo
/// # Examples /// # Examples
/// ///
/// Create a new instance, and allow all capabilities /// Create a new instance, and allow all capabilities
/// ```no_run #[cfg_attr(feature = "kv-rocksdb", doc = "```no_run")]
#[cfg_attr(not(feature = "kv-rocksdb"), doc = "```ignore")]
/// # use surrealdb::opt::capabilities::Capabilities; /// # use surrealdb::opt::capabilities::Capabilities;
/// # use surrealdb::opt::Config; /// # use surrealdb::opt::Config;
/// # use surrealdb::Surreal; /// # use surrealdb::Surreal;
@ -192,7 +193,8 @@ impl<T: Target + Hash + Eq + PartialEq + std::fmt::Display> std::fmt::Display fo
/// ``` /// ```
/// ///
/// Create a new instance, and allow certain functions /// Create a new instance, and allow certain functions
/// ```no_run #[cfg_attr(feature = "kv-rocksdb", doc = "```no_run")]
#[cfg_attr(not(feature = "kv-rocksdb"), doc = "```ignore")]
/// # use std::str::FromStr; /// # use std::str::FromStr;
/// # use surrealdb::engine::local::File; /// # use surrealdb::engine::local::File;
/// # use surrealdb::opt::capabilities::Capabilities; /// # use surrealdb::opt::capabilities::Capabilities;

View file

@ -423,6 +423,7 @@ pub async fn asynchronous(
mod tests {
#[cfg(all(feature = "scripting", feature = "kv-mem"))]
use crate::dbs::Capabilities;
use crate::sql::{statements::OutputStatement, Function, Query, Statement, Value};
#[tokio::test] #[tokio::test]
async fn implementations_are_present() { async fn implementations_are_present() {
@ -442,8 +443,28 @@ mod tests {
let (quote, _) = line.split_once("=>").unwrap();
let name = quote.trim().trim_matches('"');
-let builtin_name = crate::syn::test::builtin_name(name);
-if builtin_name.is_err() {
+let res = crate::syn::parse(&format!("RETURN {}()", name));
+if let Ok(Query(mut x)) = res {
match x.0.pop() {
Some(Statement::Output(OutputStatement {
what: Value::Function(x),
..
})) => match *x {
Function::Normal(parsed_name, _) => {
if parsed_name != name {
problems
.push(format!("function `{name}` parsed as `{parsed_name}`"));
}
}
_ => {
problems.push(format!("couldn't parse {name} function"));
}
},
_ => {
problems.push(format!("couldn't parse {name} function"));
}
}
} else {
problems.push(format!("couldn't parse {name} function")); problems.push(format!("couldn't parse {name} function"));
} }

View file

@ -1 +1,2 @@
#[cfg(feature = "http")]
mod fetch;

View file

@ -14,10 +14,10 @@ use std::str::{self, FromStr};
use std::sync::Arc;
async fn config(
-kvs: &Datastore,
+_kvs: &Datastore,
de_kind: Algorithm,
de_code: String,
-token_header: Header,
+_token_header: Header,
) -> Result<(DecodingKey, Validation), Error> {
if de_kind == Algorithm::Jwks { if de_kind == Algorithm::Jwks {
#[cfg(not(feature = "jwks"))] #[cfg(not(feature = "jwks"))]
@ -27,8 +27,8 @@ async fn config(
}
#[cfg(feature = "jwks")]
// The key identifier header must be present
-if let Some(kid) = token_header.kid {
-jwks::config(kvs, &kid, &de_code).await
+if let Some(kid) = _token_header.kid {
+jwks::config(_kvs, &kid, &de_code).await
} else {
Err(Error::MissingTokenHeader("kid".to_string()))
}
@ -1125,7 +1125,7 @@ mod tests {
// Test with generic user identifier
//
{
-let resource_id = "user:2k9qnabxuxh8k4d5gfto".to_string();
+let resource_id = "user:`2k9qnabxuxh8k4d5gfto`".to_string();
// Prepare the claims object
let mut claims = claims.clone();
claims.id = Some(resource_id.clone());
@ -1254,6 +1254,7 @@ mod tests {
}
}
+#[cfg(feature = "jwks")]
#[tokio::test]
async fn test_token_scope_jwks() {
use crate::opt::capabilities::{Capabilities, NetTarget, Targets};

View file

@ -8,8 +8,7 @@ use crate::idx::ft::postings::TermFrequency;
use crate::idx::ft::terms::{TermId, Terms};
use crate::sql::statements::DefineAnalyzerStatement;
use crate::sql::tokenizer::Tokenizer as SqlTokenizer;
-use crate::sql::Value;
-use crate::syn::path_like;
+use crate::sql::{Function, Strand, Value};
use async_recursion::async_recursion;
use filter::Filter;
use std::collections::hash_map::Entry;
@ -194,26 +193,16 @@ impl Analyzer {
txn: &Transaction,
mut input: String,
) -> Result<Tokens, Error> {
-if let Some(function_name) = &self.function {
-let fns = format!("fn::{function_name}(\"{input}\")");
-match path_like(&fns) {
-Ok(func_value) => {
-let val = func_value.compute(ctx, opt, txn, None).await?;
-if let Value::Strand(val) = val {
-input = val.0;
-} else {
-return Err(Error::InvalidFunction {
-name: function_name.to_string(),
-message: "The function should return a string.".to_string(),
-});
-}
-}
-Err(e) => {
-return Err(Error::InvalidFunction {
-name: function_name.to_string(),
-message: e.to_string(),
-})
-}
-}
+if let Some(function_name) = self.function.clone() {
+let fns = Function::Custom(function_name.clone(), vec![Value::Strand(Strand(input))]);
+let val = fns.compute(ctx, opt, txn, None).await?;
+if let Value::Strand(val) = val {
+input = val.0;
+} else {
+return Err(Error::InvalidFunction {
+name: function_name,
+message: "The function should return a string.".to_string(),
+});
+}
}
if let Some(t) = &self.tokenizers {

View file

@ -308,7 +308,7 @@ impl RangeQueryBuilder {
mod tests { mod tests {
use crate::idx::planner::plan::{IndexOperator, IndexOption, RangeValue}; use crate::idx::planner::plan::{IndexOperator, IndexOption, RangeValue};
use crate::sql::{Array, Idiom, Value}; use crate::sql::{Array, Idiom, Value};
use crate::syn::test::Parse; use crate::syn::Parse;
use std::collections::HashSet; use std::collections::HashSet;
use std::sync::Arc; use std::sync::Arc;

View file

@ -193,7 +193,7 @@ mod tests {
#[test] #[test]
fn key() { fn key() {
use super::*; use super::*;
use crate::syn::test::Parse; use crate::syn::Parse;
let fk = Thing::parse("other:test"); let fk = Thing::parse("other:test");
#[rustfmt::skip] #[rustfmt::skip]
let val = Graph::new( let val = Graph::new(

View file

@ -90,8 +90,7 @@ mod tests {
let dec = Thing::decode(&enc).unwrap(); let dec = Thing::decode(&enc).unwrap();
assert_eq!(val, dec); assert_eq!(val, dec);
println!("---"); println!("---");
// let id2 = "foo:[u'f8e238f2-e734-47b8-9a16-476b291bd78a']";
let id2 = "foo:['f8e238f2-e734-47b8-9a16-476b291bd78a']";
let thing = syn::thing(id2).expect("Failed to parse the ID"); let thing = syn::thing(id2).expect("Failed to parse the ID");
let id2 = thing.id; let id2 = thing.id;
let val = Thing::new("testns", "testdb", "testtb", id2); let val = Thing::new("testns", "testdb", "testtb", id2);

View file

@ -219,6 +219,18 @@ impl Datastore {
#[allow(unused_variables)] #[allow(unused_variables)]
let default_clock: Arc<RwLock<SizedClock>> = let default_clock: Arc<RwLock<SizedClock>> =
Arc::new(RwLock::new(SizedClock::System(SystemClock::new()))); Arc::new(RwLock::new(SizedClock::System(SystemClock::new())));
// removes warning if no storage is enabled.
#[cfg(not(any(
feature = "kv-mem",
feature = "kv-rocksdb",
feature = "kv-speedb",
feature = "kv-indxdb",
feature = "kv-tikv",
feature = "kv-fdb"
)))]
let _ = (clock_override, default_clock);
// Initiate the desired datastore // Initiate the desired datastore
let (inner, clock): (Result<Inner, Error>, Arc<RwLock<SizedClock>>) = match path { let (inner, clock): (Result<Inner, Error>, Arc<RwLock<SizedClock>>) = match path {
"memory" => { "memory" => {
@ -340,7 +352,7 @@ impl Datastore {
// The datastore path is not valid
_ => {
// use clock_override and default_clock to remove warning when no kv is enabled.
-let _ = (clock_override, default_clock);
+let _ = default_clock;
info!("Unable to load the specified datastore {}", path);
Err(Error::Ds("Unable to load the specified datastore".into()))
}

View file

@ -25,6 +25,14 @@ mod tx;
mod clock; mod clock;
#[cfg(test)] #[cfg(test)]
#[cfg(any(
feature = "kv-mem",
feature = "kv-rocksdb",
feature = "kv-speedb",
feature = "kv-indxdb",
feature = "kv-tikv",
feature = "kv-fdb"
))]
mod tests; mod tests;
pub use self::ds::*; pub use self::ds::*;

View file

@ -2,7 +2,7 @@ use revision::revisioned;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::fmt; use std::fmt;
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)] #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[revisioned(revision = 1)] #[revisioned(revision = 1)]
pub enum Algorithm { pub enum Algorithm {

View file

@ -80,7 +80,13 @@ pub fn duration(i: &str) -> IResult<&str, ()> {
pub fn field(i: &str) -> IResult<&str, ()> {
peek(alt((
-value((), preceded(shouldbespace, tag_no_case("FROM"))),
+value(
+(),
+preceded(
+shouldbespace,
+alt((tag_no_case("FROM"), tag_no_case("TIMEOUT"), tag_no_case("PARALLEL"))),
+),
+),
value((), char(';')),
value((), eof),
)))(i)

View file

@ -1,4 +1,3 @@
-use nom::character::is_digit;
use std::borrow::Cow;
const SINGLE: char = '\'';
@ -54,9 +53,15 @@ pub fn quote_str(s: &str) -> String {
#[inline]
pub fn quote_plain_str(s: &str) -> String {
-let mut ret = quote_str(s);
-#[cfg(not(feature = "experimental_parser"))]
+#[cfg(not(feature = "experimental-parser"))]
{
+if crate::syn::thing(s).is_ok() {
+let mut ret = quote_str(s);
+ret.insert(0, 's');
+return ret;
+}
+let mut ret = quote_str(s);
// HACK: We need to prefix strands which look like records, uuids, or datetimes with an `s`
// otherwise the strands will parsed as a different type when parsed again.
// This is not required for the new parser.
@ -64,13 +69,14 @@ pub fn quote_plain_str(s: &str) -> String {
// directly to avoid having to create a common interface between the old and new parser.
if crate::syn::v1::literal::uuid(&ret).is_ok()
|| crate::syn::v1::literal::datetime(&ret).is_ok()
-|| crate::syn::thing(&ret).is_ok()
{
ret.insert(0, 's');
}
+ret
}
-ret
+#[cfg(feature = "experimental-parser")]
+quote_str(s)
}
#[inline] #[inline]
@ -106,24 +112,16 @@ pub fn escape_normal<'a>(s: &'a str, l: char, r: char, e: &str) -> Cow<'a, str>
#[inline]
pub fn escape_numeric<'a>(s: &'a str, l: char, r: char, e: &str) -> Cow<'a, str> {
-// Presume this is numeric
-let mut numeric = true;
// Loop over each character
-for x in s.bytes() {
+for (idx, x) in s.bytes().enumerate() {
+// the first character is not allowed to be a digit.
+if idx == 0 && x.is_ascii_digit() {
+return Cow::Owned(format!("{l}{}{r}", s.replace(r, e)));
+}
// Check if character is allowed
if !(x.is_ascii_alphanumeric() || x == b'_') {
return Cow::Owned(format!("{l}{}{r}", s.replace(r, e)));
}
-// Check if character is non-numeric
-if !is_digit(x) {
-numeric = false;
-}
}
-// Output the id value
-match numeric {
-// This is numeric so escape it
-true => Cow::Owned(format!("{l}{}{r}", s.replace(r, e))),
-// No need to escape the value
-_ => Cow::Borrowed(s),
-}
+Cow::Borrowed(s)
}
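A quick sketch of the behaviour these two escape.rs hunks encode, written as it might appear in a crate-internal test; the module path and the bracket/escape arguments passed to escape_numeric are illustrative assumptions, not values taken from this diff:

// Illustrative sketch only (assumed call sites and arguments).
use crate::sql::escape::{escape_numeric, quote_plain_str};

fn escape_behaviour_sketch() {
    // Old parser (default build): a strand that also parses as a record id is
    // prefixed with `s` so it round-trips as a string; the experimental parser
    // no longer needs the prefix and returns the plain quoted form.
    assert_eq!(quote_plain_str("some:thing"), "s'some:thing'");
    assert_eq!(quote_plain_str("plain text"), "'plain text'");
    // escape_numeric now escapes as soon as the first byte is a digit, instead
    // of only when the identifier is entirely numeric.
    assert_eq!(escape_numeric("test_1", '⟨', '⟩', "\\⟩"), "test_1");
    assert_eq!(escape_numeric("1test", '⟨', '⟩', "\\⟩"), "⟨1test⟩");
}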

View file

@ -48,29 +48,6 @@ impl Expression {
r, r,
} }
} }
/// Augment an existing expression
pub(crate) fn augment(mut self, l: Value, o: Operator) -> Self {
match &mut self {
Self::Binary {
l: left,
o: op,
..
} if o.precedence() >= op.precedence() => match left {
Value::Expression(x) => {
*x.as_mut() = std::mem::take(x).augment(l, o);
self
}
_ => {
*left = Self::new(l, o, std::mem::take(left)).into();
self
}
},
e => {
let r = Value::from(std::mem::take(e));
Self::new(l, o, r)
}
}
}
} }
impl Expression { impl Expression {
@ -132,6 +109,8 @@ impl Expression {
let operand = v.compute(ctx, opt, txn, doc).await?; let operand = v.compute(ctx, opt, txn, doc).await?;
return match o { return match o {
Operator::Neg => fnc::operate::neg(operand), Operator::Neg => fnc::operate::neg(operand),
// TODO: Check if it is a number?
Operator::Add => Ok(operand),
Operator::Not => fnc::operate::not(operand), Operator::Not => fnc::operate::not(operand),
op => unreachable!("{op:?} is not a unary op"), op => unreachable!("{op:?} is not a unary op"),
}; };

View file

@ -2,13 +2,13 @@ use crate::ctx::Context;
use crate::dbs::{Options, Transaction}; use crate::dbs::{Options, Transaction};
use crate::doc::CursorDoc; use crate::doc::CursorDoc;
use crate::err::Error; use crate::err::Error;
use crate::sql::fmt::{fmt_separated_by, Fmt}; use crate::sql::{
use crate::sql::part::Next; fmt::{fmt_separated_by, Fmt},
use crate::sql::part::Part; part::Next,
use crate::sql::paths::{ID, IN, META, OUT}; paths::{ID, IN, META, OUT},
use crate::sql::value::Value; Part, Value,
use md5::Digest; };
use md5::Md5; use md5::{Digest, Md5};
use revision::revisioned; use revision::revisioned;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::fmt::{self, Display, Formatter}; use std::fmt::{self, Display, Formatter};
@ -73,6 +73,11 @@ impl From<&[Part]> for Idiom {
Self(v.to_vec()) Self(v.to_vec())
} }
} }
impl From<Part> for Idiom {
fn from(v: Part) -> Self {
Self(vec![v])
}
}
impl Idiom { impl Idiom {
/// Appends a part to the end of this Idiom /// Appends a part to the end of this Idiom

View file

@ -1,5 +1,4 @@
use crate::sql::fmt::Fmt; use crate::sql::{fmt::Fmt, Table};
use crate::sql::table::Table;
use revision::revisioned; use revision::revisioned;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::fmt::{self, Display, Formatter}; use std::fmt::{self, Display, Formatter};

View file

@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use std::fmt; use std::fmt;
use std::fmt::Display; use std::fmt::Display;
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)] #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[revisioned(revision = 1)] #[revisioned(revision = 1)]
pub enum Language { pub enum Language {
@ -26,9 +26,9 @@ pub enum Language {
Turkish, Turkish,
} }
impl Display for Language { impl Language {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { pub fn as_str(&self) -> &'static str {
f.write_str(match self { match self {
Self::Arabic => "ARABIC", Self::Arabic => "ARABIC",
Self::Danish => "DANISH", Self::Danish => "DANISH",
Self::Dutch => "DUTCH", Self::Dutch => "DUTCH",
@ -46,6 +46,12 @@ impl Display for Language {
Self::Swedish => "SWEDISH", Self::Swedish => "SWEDISH",
Self::Tamil => "TAMIL", Self::Tamil => "TAMIL",
Self::Turkish => "TURKISH", Self::Turkish => "TURKISH",
}) }
}
}
impl Display for Language {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str(self.as_str())
} }
} }

View file

@ -149,4 +149,4 @@ mod parser {
pub use crate::syn::*; pub use crate::syn::*;
} }
pub use self::parser::{idiom, json, parse, subquery, thing, v1::ParseError, value}; pub use self::parser::{error::ParseError, idiom, json, parse, subquery, thing, value};

View file

@ -39,6 +39,7 @@ pub struct Order {
pub random: bool, pub random: bool,
pub collate: bool, pub collate: bool,
pub numeric: bool, pub numeric: bool,
/// true if the direction is ascending
pub direction: bool, pub direction: bool,
} }

View file

@ -1,6 +1,6 @@
use crate::sql::fmt::Pretty; use crate::sql::fmt::Pretty;
use crate::sql::statement::{Statement, Statements};
use crate::sql::statements::{DefineStatement, RemoveStatement}; use crate::sql::statements::{DefineStatement, RemoveStatement};
use crate::sql::{Statement, Statements};
use derive::Store; use derive::Store;
use revision::revisioned; use revision::revisioned;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};

View file

@ -23,6 +23,12 @@ pub struct DefineScopeStatement {
pub comment: Option<Strand>, pub comment: Option<Strand>,
} }
impl DefineScopeStatement {
pub(crate) fn random_code() -> String {
rand::thread_rng().sample_iter(&Alphanumeric).take(128).map(char::from).collect::<String>()
}
}
impl DefineScopeStatement { impl DefineScopeStatement {
/// Process this type returning a computed simple Value /// Process this type returning a computed simple Value
pub(crate) async fn compute( pub(crate) async fn compute(
@ -46,10 +52,6 @@ impl DefineScopeStatement {
// Ok all good // Ok all good
Ok(Value::None) Ok(Value::None)
} }
pub fn random_code() -> String {
rand::thread_rng().sample_iter(&Alphanumeric).take(128).map(char::from).collect::<String>()
}
} }
impl Display for DefineScopeStatement { impl Display for DefineScopeStatement {

View file

@ -47,6 +47,31 @@ impl From<(Base, &str, &str)> for DefineUserStatement {
} }
impl DefineUserStatement { impl DefineUserStatement {
pub(crate) fn from_parsed_values(name: Ident, base: Base, roles: Vec<Ident>) -> Self {
DefineUserStatement {
name,
base,
roles, // New users get the viewer role by default
code: rand::thread_rng()
.sample_iter(&Alphanumeric)
.take(128)
.map(char::from)
.collect::<String>(),
..Default::default()
}
}
pub(crate) fn set_password(&mut self, password: &str) {
self.hash = Argon2::default()
.hash_password(password.as_bytes(), &SaltString::generate(&mut OsRng))
.unwrap()
.to_string()
}
pub(crate) fn set_passhash(&mut self, passhash: String) {
self.hash = passhash;
}
/// Process this type returning a computed simple Value /// Process this type returning a computed simple Value
pub(crate) async fn compute( pub(crate) async fn compute(
&self, &self,

View file

@ -13,7 +13,9 @@ use std::fmt::{self, Display, Write};
#[revisioned(revision = 1)] #[revisioned(revision = 1)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct IfelseStatement { pub struct IfelseStatement {
/// The first if condition followed by a body, followed by any number of else if's
pub exprs: Vec<(Value, Value)>, pub exprs: Vec<(Value, Value)>,
/// the final else body, if there is one
pub close: Option<Value>, pub close: Option<Value>,
} }

View file

@ -3,6 +3,8 @@ use revision::revisioned;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::fmt; use std::fmt;
use crate::sql::escape::escape_ident;
#[derive(Clone, Debug, Default, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Store, Hash)] #[derive(Clone, Debug, Default, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Store, Hash)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[revisioned(revision = 1)] #[revisioned(revision = 1)]
@ -15,9 +17,11 @@ impl fmt::Display for UseStatement {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("USE")?; f.write_str("USE")?;
if let Some(ref ns) = self.ns { if let Some(ref ns) = self.ns {
let ns = escape_ident(ns);
write!(f, " NS {ns}")?; write!(f, " NS {ns}")?;
} }
if let Some(ref db) = self.db { if let Some(ref db) = self.db {
let db = escape_ident(db);
write!(f, " DB {db}")?; write!(f, " DB {db}")?;
} }
Ok(()) Ok(())

View file

@ -130,7 +130,7 @@ pub(crate) mod no_nul_bytes {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
-#[cfg(not(feature = "experimental_parser"))]
+#[cfg(not(feature = "experimental-parser"))]
#[test] #[test]
fn ensure_strands_are_prefixed() { fn ensure_strands_are_prefixed() {
use super::Strand; use super::Strand;

View file

@ -75,7 +75,7 @@ impl TryFrom<Strand> for Thing {
impl TryFrom<&str> for Thing {
type Error = ();
fn try_from(v: &str) -> Result<Self, Self::Error> {
-match syn::thing_raw(v) {
+match syn::thing(v) {
Ok(v) => Ok(v),
_ => Err(()),
}
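A minimal sketch of what switching this conversion from syn::thing_raw to syn::thing means for callers (the record id is an arbitrary example, and the helper name is hypothetical):

// Illustrative: TryFrom<&str> now routes through syn::thing, so a plain
// record id converts while arbitrary input is rejected with Err(()).
use crate::sql::Thing;

fn thing_try_from_sketch() {
    let thing = Thing::try_from("person:tobie").expect("a valid record id");
    assert_eq!(thing.tb, "person");
    assert!(Thing::try_from("not a record id").is_err());
}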

View file

@ -42,7 +42,7 @@ impl Value {
mod tests { mod tests {
use super::*; use super::*;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn changed_none() { fn changed_none() {

View file

@ -12,7 +12,7 @@ impl Value {
mod tests { mod tests {
use super::*; use super::*;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn clear_value() { async fn clear_value() {

View file

@ -92,7 +92,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn compare_none() { fn compare_none() {

View file

@ -97,7 +97,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn cut_none() { async fn cut_none() {

View file

@ -30,7 +30,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn decrement_none() { async fn decrement_none() {

View file

@ -41,7 +41,7 @@ mod tests {
use super::*; use super::*;
use crate::dbs::test::mock; use crate::dbs::test::mock;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn decrement_none() { async fn decrement_none() {

View file

@ -201,7 +201,7 @@ mod tests {
use super::*; use super::*;
use crate::dbs::test::mock; use crate::dbs::test::mock;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn del_none() { async fn del_none() {

View file

@ -78,7 +78,7 @@ impl Value {
mod tests { mod tests {
use super::*; use super::*;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn diff_none() { fn diff_none() {

View file

@ -59,7 +59,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn each_none() { fn each_none() {

View file

@ -53,7 +53,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn every_with_empty_objects_arrays() { fn every_with_empty_objects_arrays() {

View file

@ -34,7 +34,7 @@ mod tests {
use super::*; use super::*;
use crate::dbs::test::mock; use crate::dbs::test::mock;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn extend_array_value() { async fn extend_array_value() {

View file

@ -250,7 +250,7 @@ mod tests {
use crate::sql::id::Id; use crate::sql::id::Id;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::sql::thing::Thing; use crate::sql::thing::Thing;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn get_none() { async fn get_none() {

View file

@ -30,7 +30,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn increment_none() { async fn increment_none() {

View file

@ -42,7 +42,7 @@ mod tests {
use super::*; use super::*;
use crate::dbs::test::mock; use crate::dbs::test::mock;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn increment_none() { async fn increment_none() {

View file

@ -24,7 +24,7 @@ impl Value {
mod tests { mod tests {
use super::*; use super::*;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn merge_none() { async fn merge_none() {

View file

@ -86,7 +86,7 @@ impl Value {
mod tests { mod tests {
use super::*; use super::*;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn patch_add_simple() { async fn patch_add_simple() {

View file

@ -54,7 +54,7 @@ mod tests {
use crate::sql::id::Id; use crate::sql::id::Id;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::sql::thing::Thing; use crate::sql::thing::Thing;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn pick_none() { fn pick_none() {

View file

@ -87,7 +87,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn put_none() { async fn put_none() {

View file

@ -19,7 +19,7 @@ impl Value {
mod tests { mod tests {
use super::*; use super::*;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn replace() { async fn replace() {

View file

@ -13,7 +13,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::id::Id; use crate::sql::id::Id;
use crate::sql::thing::Thing; use crate::sql::thing::Thing;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn rid_none() { async fn rid_none() {

View file

@ -692,7 +692,7 @@ mod tests {
#[test] #[test]
fn duration() { fn duration() {
let duration = Duration::default(); let duration = Duration::default();
let value = to_value(&duration).unwrap(); let value = to_value(duration).unwrap();
let expected = Value::Duration(duration); let expected = Value::Duration(duration);
assert_eq!(value, expected); assert_eq!(value, expected);
assert_eq!(expected, to_value(&expected).unwrap()); assert_eq!(expected, to_value(&expected).unwrap());

View file

@ -159,7 +159,7 @@ mod tests {
use super::*; use super::*;
use crate::dbs::test::mock; use crate::dbs::test::mock;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[tokio::test] #[tokio::test]
async fn set_none() { async fn set_none() {

View file

@ -1087,7 +1087,8 @@ impl Value {
| Value::Array(_) | Value::Array(_)
| Value::Param(_) | Value::Param(_)
| Value::Edges(_) | Value::Edges(_)
| Value::Thing(_) => true, | Value::Thing(_)
| Value::Table(_) => true,
_ => false, _ => false,
} }
} }
@ -2774,7 +2775,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::uuid::Uuid; use crate::sql::uuid::Uuid;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn check_none() { fn check_none() {

View file

@ -62,7 +62,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::idiom::Idiom; use crate::sql::idiom::Idiom;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn walk_blank() { fn walk_blank() {

View file

@ -1,3 +1,8 @@
#[cfg(feature = "experimental-parser")]
use super::v2::token::Span;
#[cfg(feature = "experimental-parser")]
use std::ops::Range;
/// A human readable location inside a string. /// A human readable location inside a string.
/// ///
/// Locations are 1 indexed, the first character on the first line being on line 1 column 1. /// Locations are 1 indexed, the first character on the first line being on line 1 column 1.
@ -19,10 +24,9 @@ impl Location {
.expect("tried to find location of substring in unrelated string"); .expect("tried to find location of substring in unrelated string");
// Bytes of input prior to line being iteratated. // Bytes of input prior to line being iteratated.
let mut bytes_prior = 0; let mut bytes_prior = 0;
for (line_idx, line) in input.split('\n').enumerate() { for (line_idx, (line, seperator_offset)) in LineIterator::new(input).enumerate() {
// +1 for the '\n' let bytes_so_far = bytes_prior + line.len() + seperator_offset.unwrap_or(0) as usize;
let bytes_so_far = bytes_prior + line.len() + 1; if bytes_so_far >= offset {
if bytes_so_far > offset {
// found line. // found line.
let line_offset = offset - bytes_prior; let line_offset = offset - bytes_prior;
let column = line[..line_offset].chars().count(); let column = line[..line_offset].chars().count();
@ -37,16 +41,13 @@ impl Location {
unreachable!() unreachable!()
} }
#[cfg(feature = "experimental_parser")] #[cfg(feature = "experimental-parser")]
pub fn of_span_start(source: &str, span: Span) -> Self { pub fn of_offset(source: &str, offset: usize) -> Self {
// Bytes of input before substr.
let offset = span.offset as usize;
// Bytes of input prior to line being iteratated. // Bytes of input prior to line being iteratated.
let mut bytes_prior = 0; let mut bytes_prior = 0;
for (line_idx, line) in source.split('\n').enumerate() { for (line_idx, (line, seperator_offset)) in LineIterator::new(source).enumerate() {
// +1 for the '\n' let bytes_so_far = bytes_prior + line.len() + seperator_offset.unwrap_or(0) as usize;
let bytes_so_far = bytes_prior + line.len() + 1; if bytes_so_far >= offset {
if bytes_so_far > offset {
// found line. // found line.
let line_offset = offset - bytes_prior; let line_offset = offset - bytes_prior;
let column = line[..line_offset].chars().count(); let column = line[..line_offset].chars().count();
@ -61,31 +62,22 @@ impl Location {
unreachable!() unreachable!()
} }
#[cfg(feature = "experimental_parser")] #[cfg(feature = "experimental-parser")]
pub fn of_span_start(source: &str, span: Span) -> Self {
// Bytes of input before substr.
let offset = span.offset as usize;
Self::of_offset(source, offset)
}
#[cfg(feature = "experimental-parser")]
pub fn of_span_end(source: &str, span: Span) -> Self { pub fn of_span_end(source: &str, span: Span) -> Self {
// Bytes of input before substr. // Bytes of input before substr.
let offset = span.offset as usize + span.len as usize; let offset = span.offset as usize + span.len as usize;
// Bytes of input prior to line being iteratated. Self::of_offset(source, offset)
let mut bytes_prior = 0;
for (line_idx, line) in source.split('\n').enumerate() {
// +1 for the '\n'
let bytes_so_far = bytes_prior + line.len() + 1;
if bytes_so_far > offset {
// found line.
let line_offset = offset - bytes_prior;
let column = line[..line_offset].chars().count();
// +1 because line and column are 1 index.
return Self {
line: line_idx + 1,
column: column + 1,
};
}
bytes_prior = bytes_so_far;
}
unreachable!()
} }
#[cfg(feature = "experimental_parser")] #[cfg(feature = "experimental-parser")]
pub fn range_of_span(source: &str, span: Span) -> Range<Self> { pub fn range_of_span(source: &str, span: Span) -> Range<Self> {
// Bytes of input before substr. // Bytes of input before substr.
let offset = span.offset as usize; let offset = span.offset as usize;
@ -93,19 +85,18 @@ impl Location {
// Bytes of input prior to line being iteratated. // Bytes of input prior to line being iteratated.
let mut bytes_prior = 0; let mut bytes_prior = 0;
let mut iterator = source.split('\n').enumerate(); let mut iterator = LineIterator::new(source).enumerate();
let start = loop { let start = loop {
let Some((line_idx, line)) = iterator.next() else { let Some((line_idx, (line, seperator_offset))) = iterator.next() else {
panic!("tried to find location of span not belonging to string"); panic!("tried to find location of span not belonging to string");
}; };
// +1 for the '\n' let bytes_so_far = bytes_prior + line.len() + seperator_offset.unwrap_or(0) as usize;
let bytes_so_far = bytes_prior + line.len() + 1; if bytes_so_far >= offset {
if bytes_so_far > offset {
// found line. // found line.
let line_offset = offset - bytes_prior; let line_offset = offset - bytes_prior;
let column = line[..line_offset].chars().count(); let column = line[..line_offset].chars().count();
// +1 because line and column are 1 index. // +1 because line and column are 1 index.
if bytes_so_far > end { if bytes_so_far >= end {
// end is on the same line, finish immediatly. // end is on the same line, finish immediatly.
let line_offset = end - bytes_prior; let line_offset = end - bytes_prior;
let end_column = line[..line_offset].chars().count(); let end_column = line[..line_offset].chars().count();
@ -127,12 +118,11 @@ impl Location {
}; };
loop { loop {
let Some((line_idx, line)) = iterator.next() else { let Some((line_idx, (line, seperator_offset))) = iterator.next() else {
panic!("tried to find location of span not belonging to string"); panic!("tried to find location of span not belonging to string");
}; };
// +1 for the '\n' let bytes_so_far = bytes_prior + line.len() + seperator_offset.unwrap_or(0) as usize;
let bytes_so_far = bytes_prior + line.len() + 1; if bytes_so_far >= end {
if bytes_so_far > end {
let line_offset = end - bytes_prior; let line_offset = end - bytes_prior;
let column = line[..line_offset].chars().count(); let column = line[..line_offset].chars().count();
return start..Self { return start..Self {
@ -143,3 +133,93 @@ impl Location {
} }
} }
} }
struct LineIterator<'a> {
current: &'a str,
}
impl<'a> LineIterator<'a> {
pub fn new(s: &'a str) -> Self {
LineIterator {
current: s,
}
}
}
impl<'a> Iterator for LineIterator<'a> {
type Item = (&'a str, Option<u8>);
fn next(&mut self) -> Option<Self::Item> {
if self.current.is_empty() {
return None;
}
let bytes = self.current.as_bytes();
for i in 0..bytes.len() {
match bytes[i] {
b'\r' => {
if let Some(b'\n') = bytes.get(i + 1) {
let res = &self.current[..i];
self.current = &self.current[i + 2..];
return Some((res, Some(2)));
}
let res = &self.current[..i];
self.current = &self.current[i + 1..];
return Some((res, Some(1)));
}
0xb | 0xC | b'\n' => {
// vertical tab VT and form feed FF.
let res = &self.current[..i];
self.current = &self.current[i + 1..];
return Some((res, Some(1)));
}
0xc2 => {
// next line NEL
if bytes.get(i + 1).copied() != Some(0x85) {
continue;
}
let res = &self.current[..i];
self.current = &self.current[i + 2..];
return Some((res, Some(2)));
}
0xe2 => {
// line separator and paragraph seperator.
if bytes.get(i + 1).copied() != Some(0x80) {
continue;
}
let next_byte = bytes.get(i + 2).copied();
if next_byte != Some(0xA8) && next_byte != Some(0xA9) {
continue;
}
// vertical tab VT, next line NEL and form feed FF.
let res = &self.current[..i];
self.current = &self.current[i + 3..];
return Some((res, Some(3)));
}
_ => {}
}
}
Some((std::mem::take(&mut self.current), None))
}
}
#[cfg(test)]
mod test {
use super::LineIterator;
#[test]
fn test_line_iterator() {
let lines = "foo\nbar\r\nfoo\rbar\u{000B}foo\u{000C}bar\u{0085}foo\u{2028}bar\u{2029}\n";
let mut iterator = LineIterator::new(lines);
assert_eq!(iterator.next(), Some(("foo", Some(1))));
assert_eq!(iterator.next(), Some(("bar", Some(2))));
assert_eq!(iterator.next(), Some(("foo", Some(1))));
assert_eq!(iterator.next(), Some(("bar", Some(1))));
assert_eq!(iterator.next(), Some(("foo", Some(1))));
assert_eq!(iterator.next(), Some(("bar", Some(2))));
assert_eq!(iterator.next(), Some(("foo", Some(3))));
assert_eq!(iterator.next(), Some(("bar", Some(3))));
assert_eq!(iterator.next(), Some(("", Some(1))));
assert_eq!(iterator.next(), None);
}
}

View file

@ -2,6 +2,9 @@ use std::{fmt, ops::Range};
use super::common::Location; use super::common::Location;
mod nom_error;
pub use nom_error::ParseError;
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct RenderedError { pub struct RenderedError {
pub text: String, pub text: String,

View file

@ -5,19 +5,12 @@ use crate::syn::{
use nom::error::ErrorKind; use nom::error::ErrorKind;
use nom::error::FromExternalError; use nom::error::FromExternalError;
use nom::error::ParseError as NomParseError; use nom::error::ParseError as NomParseError;
use nom::Err;
use std::fmt::Write; use std::fmt::Write;
use std::num::ParseFloatError; use std::num::ParseFloatError;
use std::num::ParseIntError; use std::num::ParseIntError;
use std::ops::Bound; use std::ops::Bound;
use thiserror::Error; use thiserror::Error;
mod utils;
pub use utils::*;
mod render;
pub type IResult<I, O, E = ParseError<I>> = Result<(I, O), Err<E>>;
#[derive(Error, Debug, Clone)] #[derive(Error, Debug, Clone)]
pub enum ParseError<I> { pub enum ParseError<I> {
Base(I), Base(I),

View file

@ -3,11 +3,20 @@
pub mod common;
pub mod error;
+#[cfg(not(feature = "experimental-parser"))]
pub mod v1;
-pub use v1::{
-datetime, datetime_raw, duration, idiom, json, parse, path_like, range, subquery, thing,
-thing_raw, value,
-};
+#[cfg(not(feature = "experimental-parser"))]
+pub use v1::{datetime_raw, duration, idiom, json, parse, range, subquery, thing, value};
+#[cfg(feature = "experimental-parser")]
+pub mod v2;
+#[cfg(feature = "experimental-parser")]
+pub use v2::{
+datetime_raw, duration, idiom, json, json_legacy_strand, parse, range, subquery, thing, value,
+value_legacy_strand,
+};
#[cfg(test)]
-pub mod test;
+pub trait Parse<T> {
+fn parse(val: &str) -> T;
+}
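The Parse helper trait now lives directly under crate::syn instead of a test-only module, which is what the many `use crate::syn::test::Parse;` to `use crate::syn::Parse;` changes in the test modules above and below reflect. A minimal sketch of how a test consumes it, assuming both parser versions keep providing the usual impls for the SQL types; the parsed value and function name are arbitrary examples:

// Illustrative test-side usage; panics on invalid input like the old helpers.
use crate::sql::Value;
use crate::syn::Parse;

fn parse_trait_sketch() {
    let val = Value::parse("{ hello: 'world' }");
    assert!(matches!(val, Value::Object(_)));
}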

View file

@ -1,50 +0,0 @@
pub(crate) use super::v1::builtin::builtin_name;
use crate::sql::{Array, Expression, Idiom, Param, Script, Thing, Value};
use super::v1::test::*;
pub trait Parse<T> {
fn parse(val: &str) -> T;
}
impl Parse<Self> for Value {
fn parse(val: &str) -> Self {
value(val).unwrap().1
}
}
impl Parse<Self> for Array {
fn parse(val: &str) -> Self {
array(val).unwrap().1
}
}
impl Parse<Self> for Param {
fn parse(val: &str) -> Self {
param(val).unwrap().1
}
}
impl Parse<Self> for Idiom {
fn parse(val: &str) -> Self {
idiom(val).unwrap().1
}
}
impl Parse<Self> for Script {
fn parse(val: &str) -> Self {
script(val).unwrap().1
}
}
impl Parse<Self> for Thing {
fn parse(val: &str) -> Self {
thing(val).unwrap().1
}
}
impl Parse<Self> for Expression {
fn parse(val: &str) -> Self {
expression(val).unwrap().1
}
}

View file

@ -1,7 +1,7 @@
-use super::{IResult, ParseError};
-use nom::bytes::complete::tag_no_case;
-use nom::Err;
-use nom::Parser;
+pub use crate::syn::error::ParseError;
+use nom::{bytes::complete::tag_no_case, Err, Parser};
+pub type IResult<I, O, E = ParseError<I>> = Result<(I, O), Err<E>>;
pub fn expected<I, O, P>(expect: &'static str, mut parser: P) -> impl FnMut(I) -> IResult<I, O> pub fn expected<I, O, P>(expect: &'static str, mut parser: P) -> impl FnMut(I) -> IResult<I, O>
where where

View file

@ -1 +0,0 @@

View file

@ -7,7 +7,7 @@ use super::{
value::single, value::single,
IResult, IResult,
}; };
-use crate::sql::{Cast, Expression, Future};
+use crate::sql::{Cast, Expression, Future, Operator, Value};
use nom::{bytes::complete::tag, character::complete::char, combinator::cut, sequence::delimited}; use nom::{bytes::complete::tag, character::complete::char, combinator::cut, sequence::delimited};
pub fn cast(i: &str) -> IResult<&str, Cast> { pub fn cast(i: &str) -> IResult<&str, Cast> {
@ -30,10 +30,32 @@ pub fn unary(i: &str) -> IResult<&str, Expression> {
)) ))
} }
/// Augment an existing expression
pub(crate) fn augment(mut this: Expression, l: Value, o: Operator) -> Expression {
match &mut this {
Expression::Binary {
l: left,
o: op,
..
} if o.precedence() >= op.precedence() => match left {
Value::Expression(x) => {
*x.as_mut() = augment(std::mem::take(x), l, o);
this
}
_ => {
*left = Expression::new(l, o, std::mem::take(left)).into();
this
}
},
e => {
let r = Value::from(std::mem::take(e));
Expression::new(l, o, r)
}
}
}
#[cfg(test)] #[cfg(test)]
pub fn binary(i: &str) -> IResult<&str, Expression> { pub fn binary(i: &str) -> IResult<&str, Expression> {
use crate::sql::Value;
use super::depth; use super::depth;
use super::value; use super::value;
@ -43,7 +65,7 @@ pub fn binary(i: &str) -> IResult<&str, Expression> {
let _diving = depth::dive(i)?; let _diving = depth::dive(i)?;
let (i, r) = value::value(i)?; let (i, r) = value::value(i)?;
let v = match r { let v = match r {
-Value::Expression(r) => r.augment(l, o),
+Value::Expression(r) => augment(*r, l, o),
_ => Expression::new(l, o, r),
};
Ok((i, v))
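augment is a straight move of the method removed from sql::Expression earlier in this diff into the v1 parser, since only the old parser re-balances binary expressions by operator precedence while parsing. A small sketch of the effect, assuming the usual From impls for integers and the Add/Mul operator variants; the values are arbitrary:

// Illustrative: parsing `1 * 2 + 3` first returns the right-hand expression
// `2 + 3`; augment then pushes the pending higher-precedence `1 *` down onto
// the left leaf, so the root of the tree becomes `+` and the result reads as
// `(1 * 2) + 3` rather than `1 * (2 + 3)`.
use crate::sql::{Expression, Operator, Value};

fn augment_sketch() {
    let rhs = Expression::new(Value::from(2), Operator::Add, Value::from(3));
    let rebalanced = augment(rhs, Value::from(1), Operator::Mul);
    match rebalanced {
        Expression::Binary {
            o: Operator::Add,
            ..
        } => {}
        _ => unreachable!("the multiplication should sit below the addition"),
    }
}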

View file

@ -189,7 +189,7 @@ mod tests {
use super::super::builtin::{builtin_name, BuiltinName}; use super::super::builtin::{builtin_name, BuiltinName};
use super::*; use super::*;
use crate::sql::Value; use crate::sql::Value;
use crate::syn::{self, test::Parse}; use crate::syn::{self, Parse};
fn function(i: &str) -> IResult<&str, Function> { fn function(i: &str) -> IResult<&str, Function> {
alt((defined_function, |i| { alt((defined_function, |i| {

View file

@ -275,9 +275,8 @@ pub fn bracketed_value(i: &str) -> IResult<&str, Part> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::sql::{Dir, Expression, Id, Number, Param, Strand, Table, Thing}; use crate::sql::{Dir, Expression, Id, Number, Param, Strand, Table, Thing};
use crate::syn::test::Parse; use crate::syn::Parse;
use super::*; use super::*;

View file

@ -29,8 +29,8 @@ fn datetime_single(i: &str) -> IResult<&str, Datetime> {
fn datetime_double(i: &str) -> IResult<&str, Datetime> { fn datetime_double(i: &str) -> IResult<&str, Datetime> {
alt(( alt((
delimited(tag("d\""), cut(datetime_raw), cut(char('\"'))), delimited(tag("d\""), cut(datetime_raw), cut(char('"'))),
delimited(char('\"'), datetime_raw, char('\"')), delimited(char('"'), datetime_raw, char('"')),
))(i) ))(i)
} }
@ -194,7 +194,7 @@ mod tests {
// use chrono::Date; // use chrono::Date;
use crate::{sql::Value, syn::test::Parse}; use crate::{sql::Value, syn::Parse};
use super::*; use super::*;

View file

@ -108,7 +108,7 @@ pub fn tables(i: &str) -> IResult<&str, Tables> {
mod tests { mod tests {
use super::*; use super::*;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn ident_normal() { fn ident_normal() {

View file

@ -163,7 +163,7 @@ fn char_unicode_bracketed(i: &str) -> IResult<&str, char> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::{sql::Value, syn::test::Parse}; use crate::{sql::Value, syn::Parse};
use super::*; use super::*;

View file

@ -54,7 +54,7 @@ fn uuid_raw(i: &str) -> IResult<&str, Uuid> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::{sql::Value, syn::test::Parse}; use crate::{sql::Value, syn::Parse};
use super::*; use super::*;

View file

@ -7,7 +7,7 @@ mod part;
mod stmt; mod stmt;
mod block; mod block;
pub(crate) mod builtin; mod builtin;
mod comment; mod comment;
mod common; mod common;
mod depth; mod depth;
@ -79,10 +79,6 @@ pub fn idiom(input: &str) -> Result<Idiom, Error> {
parse_impl(input, idiom::plain) parse_impl(input, idiom::plain)
} }
pub fn datetime(input: &str) -> Result<Datetime, Error> {
parse_impl(input, literal::datetime)
}
pub fn datetime_raw(input: &str) -> Result<Datetime, Error> { pub fn datetime_raw(input: &str) -> Result<Datetime, Error> {
parse_impl(input, literal::datetime_all_raw) parse_impl(input, literal::datetime_all_raw)
} }
@ -91,20 +87,12 @@ pub fn duration(input: &str) -> Result<Duration, Error> {
parse_impl(input, literal::duration) parse_impl(input, literal::duration)
} }
pub fn path_like(input: &str) -> Result<Value, Error> {
parse_impl(input, value::path_like)
}
pub fn range(input: &str) -> Result<Range, Error> { pub fn range(input: &str) -> Result<Range, Error> {
parse_impl(input, literal::range) parse_impl(input, literal::range)
} }
/// Parses a SurrealQL [`Thing`] /// Parses a SurrealQL [`Thing`]
pub fn thing(input: &str) -> Result<Thing, Error> { pub fn thing(input: &str) -> Result<Thing, Error> {
parse_impl(input, thing::thing)
}
pub fn thing_raw(input: &str) -> Result<Thing, Error> {
parse_impl(input, thing::thing_raw) parse_impl(input, thing::thing_raw)
} }

View file

@ -149,6 +149,7 @@ pub fn knn_distance(i: &str) -> IResult<&str, Distance> {
} }
pub fn knn(i: &str) -> IResult<&str, Operator> { pub fn knn(i: &str) -> IResult<&str, Operator> {
let (i, _) = opt(tag_no_case("knn"))(i)?;
let (i, _) = char('<')(i)?; let (i, _) = char('<')(i)?;
let (i, k) = u32(i)?; let (i, k) = u32(i)?;
let (i, dist) = opt(knn_distance)(i)?; let (i, dist) = opt(knn_distance)(i)?;
@ -228,4 +229,13 @@ mod tests {
assert_eq!("<3,EUCLIDEAN>", format!("{}", out)); assert_eq!("<3,EUCLIDEAN>", format!("{}", out));
assert_eq!(out, Operator::Knn(3, Some(Distance::Euclidean))); assert_eq!(out, Operator::Knn(3, Some(Distance::Euclidean)));
} }
#[test]
fn test_knn_with_prefix() {
let res = knn("knn<5>");
assert!(res.is_ok());
let out = res.unwrap().1;
assert_eq!("<5>", format!("{}", out));
assert_eq!(out, Operator::Knn(5, None));
}
} }
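
The hunk above makes the `knn` prefix optional in front of the `<k[,distance]>` operator, with the new test covering the prefixed form. A minimal standalone sketch of the same optional-prefix pattern with nom (assuming nom 7.x; the names below are illustrative, not the SurrealDB parser itself):

use nom::{
	bytes::complete::tag_no_case,
	character::complete::{char, u32},
	combinator::opt,
	IResult,
};

// Accepts both `<3>` and `knn<3>` and returns the k value.
fn knn_k(i: &str) -> IResult<&str, u32> {
	let (i, _) = opt(tag_no_case("knn"))(i)?; // the prefix is optional
	let (i, _) = char('<')(i)?;
	let (i, k) = u32(i)?;
	let (i, _) = char('>')(i)?;
	Ok((i, k))
}

fn main() {
	assert_eq!(knn_k("<3>").unwrap().1, 3);
	assert_eq!(knn_k("knn<5>").unwrap().1, 5);
}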

View file

@ -72,7 +72,6 @@ pub fn single(i: &str) -> IResult<&str, Data> {
pub fn values(i: &str) -> IResult<&str, Data> { pub fn values(i: &str) -> IResult<&str, Data> {
let (i, _) = tag_no_case("(")(i)?; let (i, _) = tag_no_case("(")(i)?;
// TODO: look at call tree here.
let (i, fields) = separated_list1(commas, plain)(i)?; let (i, fields) = separated_list1(commas, plain)(i)?;
let (i, _) = tag_no_case(")")(i)?; let (i, _) = tag_no_case(")")(i)?;
let (i, _) = shouldbespace(i)?; let (i, _) = shouldbespace(i)?;

View file

@ -6,7 +6,6 @@ use super::{
literal::{datetime, duration, ident, table, tables}, literal::{datetime, duration, ident, table, tables},
operator::dir, operator::dir,
thing::thing, thing::thing,
// TODO: go through and check every import for alias.
value::value, value::value,
IResult, IResult,
}; };
@ -238,7 +237,7 @@ mod tests {
use super::*; use super::*;
use crate::sql::{Datetime, Idiom, Value}; use crate::sql::{Datetime, Idiom, Value};
use crate::syn::test::Parse; use crate::syn::Parse;
use std::time; use std::time;
#[test] #[test]

View file

@ -121,7 +121,7 @@ fn rule(i: &str) -> IResult<&str, Vec<(PermissionKind, Permission)>> {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use crate::sql::{Expression, Value}; use crate::sql::{Expression, Value};
use crate::syn::test::Parse; use crate::syn::Parse;
use super::*; use super::*;

View file

@ -24,7 +24,7 @@ fn split_raw(i: &str) -> IResult<&str, Split> {
mod tests { mod tests {
use super::*; use super::*;
use crate::{sql::Idiom, syn::test::Parse}; use crate::{sql::Idiom, syn::Parse};
#[test] #[test]
fn split_statement() { fn split_statement() {

View file

@ -11,7 +11,6 @@ use crate::{
iam::Role, iam::Role,
sql::{statements::DefineUserStatement, Ident, Strand}, sql::{statements::DefineUserStatement, Ident, Strand},
}; };
use argon2::{password_hash::SaltString, Argon2, PasswordHasher};
use nom::{ use nom::{
branch::alt, branch::alt,
bytes::complete::tag_no_case, bytes::complete::tag_no_case,
@ -19,7 +18,6 @@ use nom::{
multi::{many0, separated_list1}, multi::{many0, separated_list1},
Err, Err,
}; };
use rand::{distributions::Alphanumeric, rngs::OsRng, Rng};
pub fn user(i: &str) -> IResult<&str, DefineUserStatement> { pub fn user(i: &str) -> IResult<&str, DefineUserStatement> {
let (i, _) = tag_no_case("USER")(i)?; let (i, _) = tag_no_case("USER")(i)?;
@ -35,28 +33,19 @@ pub fn user(i: &str) -> IResult<&str, DefineUserStatement> {
Ok((i, (name, base, opts))) Ok((i, (name, base, opts)))
})(i)?; })(i)?;
// Create the base statement // Create the base statement
let mut res = DefineUserStatement { let mut res = DefineUserStatement::from_parsed_values(
name, name,
base, base,
roles: vec!["Viewer".into()], // New users get the viewer role by default vec!["Viewer".into()], // New users get the viewer role by default
code: rand::thread_rng() );
.sample_iter(&Alphanumeric)
.take(128)
.map(char::from)
.collect::<String>(),
..Default::default()
};
// Assign any defined options // Assign any defined options
for opt in opts { for opt in opts {
match opt { match opt {
DefineUserOption::Password(v) => { DefineUserOption::Password(v) => {
res.hash = Argon2::default() res.set_password(&v);
.hash_password(v.as_ref(), &SaltString::generate(&mut OsRng))
.unwrap()
.to_string()
} }
DefineUserOption::Passhash(v) => { DefineUserOption::Passhash(v) => {
res.hash = v; res.set_passhash(v);
} }
DefineUserOption::Roles(v) => { DefineUserOption::Roles(v) => {
res.roles = v; res.roles = v;

View file

@ -9,7 +9,7 @@ use nom::{
branch::alt, branch::alt,
bytes::complete::tag_no_case, bytes::complete::tag_no_case,
character::complete::char, character::complete::char,
combinator::{cut, opt, value}, combinator::{opt, value},
sequence::tuple, sequence::tuple,
}; };
@ -19,10 +19,10 @@ pub fn option(i: &str) -> IResult<&str, OptionStatement> {
let (i, n) = ident(i)?; let (i, n) = ident(i)?;
let (i, v) = expected( let (i, v) = expected(
"'=' followed by a value for the option", "'=' followed by a value for the option",
cut(opt(alt(( opt(alt((
value(true, tuple((mightbespace, char('='), mightbespace, tag_no_case("TRUE")))), value(true, tuple((mightbespace, char('='), mightbespace, tag_no_case("TRUE")))),
value(false, tuple((mightbespace, char('='), mightbespace, tag_no_case("FALSE")))), value(false, tuple((mightbespace, char('='), mightbespace, tag_no_case("FALSE")))),
)))), ))),
)(i)?; )(i)?;
Ok(( Ok((
i, i,

View file

@ -87,7 +87,6 @@ fn disallowed_subquery_statements(i: &str) -> IResult<&str, ()> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
#[test] #[test]

View file

@ -1,4 +1,5 @@
pub use super::{ use super::{
super::Parse,
expression::binary as expression, expression::binary as expression,
function::script_body as script, function::script_body as script,
idiom::plain as idiom, idiom::plain as idiom,
@ -6,3 +7,48 @@ pub use super::{
thing::thing, thing::thing,
value::{array, value}, value::{array, value},
}; };
use nom::Finish;
use crate::sql::{Array, Expression, Idiom, Param, Script, Thing, Value};
impl Parse<Self> for Value {
fn parse(val: &str) -> Self {
value(val).finish().unwrap().1
}
}
impl Parse<Self> for Array {
fn parse(val: &str) -> Self {
array(val).finish().unwrap().1
}
}
impl Parse<Self> for Param {
fn parse(val: &str) -> Self {
param(val).finish().unwrap().1
}
}
impl Parse<Self> for Idiom {
fn parse(val: &str) -> Self {
idiom(val).finish().unwrap().1
}
}
impl Parse<Self> for Script {
fn parse(val: &str) -> Self {
script(val).finish().unwrap().1
}
}
impl Parse<Self> for Thing {
fn parse(val: &str) -> Self {
thing(val).finish().unwrap().1
}
}
impl Parse<Self> for Expression {
fn parse(val: &str) -> Self {
expression(val).finish().unwrap().1
}
}
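
The block above wires the test-only `Parse` trait directly to the nom entry points, so unit tests can build AST nodes from SurrealQL text in one call. A hedged usage sketch (the test below is illustrative and not part of this commit):

#[cfg(test)]
mod example {
	use crate::sql::Value;
	use crate::syn::Parse;

	#[test]
	fn parse_helper() {
		// `Parse::parse` unwraps internally, so invalid input panics,
		// which is acceptable inside unit tests.
		let v = Value::parse("{ a: 1, b: 'two' }");
		assert!(matches!(v, Value::Object(_)));
	}
}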

View file

@ -81,7 +81,7 @@ mod tests {
use crate::sql::object::Object; use crate::sql::object::Object;
use crate::sql::value::Value; use crate::sql::value::Value;
use crate::sql::Strand; use crate::sql::Strand;
use crate::syn::test::Parse; use crate::syn::Parse;
#[test] #[test]
fn thing_normal() { fn thing_normal() {
@ -249,7 +249,7 @@ mod tests {
let res = id(sql); let res = id(sql);
let out = res.unwrap().1; let out = res.unwrap().1;
assert_eq!(Id::from("100test"), out); assert_eq!(Id::from("100test"), out);
assert_eq!("100test", format!("{}", out)); assert_eq!("100test", format!("{}", out));
} }
#[test] #[test]

View file

@ -9,7 +9,7 @@ use super::{
depth, depth,
ending::keyword, ending::keyword,
error::expected, error::expected,
expression::{cast, future, unary}, expression::{augment, cast, future, unary},
function::{builtin_function, defined_function, model}, function::{builtin_function, defined_function, model},
idiom::{self, reparse_idiom_start}, idiom::{self, reparse_idiom_start},
literal::{ literal::{
@ -62,7 +62,7 @@ pub fn value(i: &str) -> IResult<&str, Value> {
let _diving = depth::dive(i)?; let _diving = depth::dive(i)?;
let (i, r) = cut(value)(i)?; let (i, r) = cut(value)(i)?;
let expr = match r { let expr = match r {
Value::Expression(r) => r.augment(start, o), Value::Expression(r) => augment(*r, start, o),
_ => Expression::new(start, o, r), _ => Expression::new(start, o, r),
}; };
let v = Value::from(expr); let v = Value::from(expr);
@ -179,7 +179,7 @@ pub fn select(i: &str) -> IResult<&str, Value> {
}; };
let (i, r) = cut(value)(i)?; let (i, r) = cut(value)(i)?;
let expr = match r { let expr = match r {
Value::Expression(r) => r.augment(start, op), Value::Expression(r) => augment(*r, start, op),
_ => Expression::new(start, op, r), _ => Expression::new(start, op, r),
}; };
let v = Value::from(expr); let v = Value::from(expr);

View file

@ -0,0 +1,387 @@
use crate::syn::v2::{
lexer::{
unicode::{byte, chars},
Error, Lexer,
},
token::{t, Token, TokenKind},
};
impl<'a> Lexer<'a> {
/// Eats a single line comment.
pub fn eat_single_line_comment(&mut self) {
loop {
let Some(byte) = self.reader.next() else {
break;
};
match byte {
byte::CR => {
self.eat(byte::LF);
break;
}
byte::LF => {
break;
}
x if !x.is_ascii() => {
// -1 because we already ate the byte.
let backup = self.reader.offset() - 1;
let char = match self.reader.complete_char(x) {
Ok(x) => x,
Err(_) => {
// let the next token handle the error.
self.reader.backup(backup);
break;
}
};
match char {
chars::LS | chars::PS | chars::NEL => break,
_ => {}
}
}
_ => {}
}
}
self.set_whitespace_span(self.current_span());
self.skip_offset();
}
/// Eats a multi line comment and returns an error if the closing `*/` is missing.
pub fn eat_multi_line_comment(&mut self) -> Result<(), Error> {
loop {
let Some(byte) = self.reader.next() else {
return Err(Error::UnexpectedEof);
};
if let b'*' = byte {
let Some(byte) = self.reader.next() else {
return Err(Error::UnexpectedEof);
};
if b'/' == byte {
self.set_whitespace_span(self.current_span());
self.skip_offset();
return Ok(());
}
}
}
}
/// Eat whitespace such as spaces, tabs and new-lines.
pub fn eat_whitespace(&mut self) {
loop {
let Some(byte) = self.reader.peek() else {
return;
};
match byte {
byte::CR | byte::FF | byte::LF | byte::SP | byte::VT | byte::TAB => {
self.reader.next();
}
x if !x.is_ascii() => {
let backup = self.reader.offset();
self.reader.next();
let char = match self.reader.complete_char(x) {
Ok(x) => x,
Err(_) => {
self.reader.backup(backup);
break;
}
};
match char {
'\u{00A0}' | '\u{1680}' | '\u{2000}' | '\u{2001}' | '\u{2002}'
| '\u{2003}' | '\u{2004}' | '\u{2005}' | '\u{2006}' | '\u{2007}'
| '\u{2008}' | '\u{2009}' | '\u{200A}' | '\u{202F}' | '\u{205F}'
| '\u{3000}' => {}
_ => {
self.reader.backup(backup);
break;
}
}
}
_ => break,
}
}
self.set_whitespace_span(self.current_span());
self.skip_offset();
}
// re-lexes a `/` token to a regex token.
pub fn relex_regex(&mut self, token: Token) -> Token {
debug_assert_eq!(token.kind, t!("/"));
debug_assert_eq!(token.span.offset + 1, self.last_offset);
debug_assert_eq!(token.span.len, 1);
debug_assert_eq!(self.scratch, "");
self.last_offset = token.span.offset;
loop {
match self.reader.next() {
Some(b'\\') => {
if let Some(b'/') = self.reader.peek() {
self.reader.next();
self.scratch.push('/')
} else {
self.scratch.push('\\')
}
}
Some(b'/') => break,
Some(x) => {
if x.is_ascii() {
self.scratch.push(x as char);
} else {
match self.reader.complete_char(x) {
Ok(x) => {
self.scratch.push(x);
}
Err(e) => return self.invalid_token(e.into()),
}
}
}
None => return self.invalid_token(Error::UnexpectedEof),
}
}
match self.scratch.parse() {
Ok(x) => {
self.scratch.clear();
self.regex = Some(x);
self.finish_token(TokenKind::Regex)
}
Err(e) => self.invalid_token(Error::Regex(e)),
}
}
/// Lex the next token, starting from the given byte.
pub fn lex_ascii(&mut self, byte: u8) -> Token {
let kind = match byte {
b'{' => t!("{"),
b'}' => t!("}"),
b'[' => t!("["),
b']' => t!("]"),
b')' => t!(")"),
b'(' => t!("("),
b';' => t!(";"),
b',' => t!(","),
b'@' => t!("@"),
byte::CR | byte::FF | byte::LF | byte::SP | byte::VT | byte::TAB => {
self.eat_whitespace();
return self.next_token_inner();
}
b'|' => match self.reader.peek() {
Some(b'|') => {
self.reader.next();
t!("||")
}
_ => t!("|"),
},
b'&' => match self.reader.peek() {
Some(b'&') => {
self.reader.next();
t!("&&")
}
_ => return self.invalid_token(Error::ExpectedEnd('&')),
},
b'.' => match self.reader.peek() {
Some(b'.') => {
self.reader.next();
match self.reader.peek() {
Some(b'.') => {
self.reader.next();
t!("...")
}
_ => t!(".."),
}
}
_ => t!("."),
},
b'!' => match self.reader.peek() {
Some(b'=') => {
self.reader.next();
t!("!=")
}
Some(b'~') => {
self.reader.next();
t!("!~")
}
_ => t!("!"),
},
b'?' => match self.reader.peek() {
Some(b'?') => {
self.reader.next();
t!("??")
}
Some(b':') => {
self.reader.next();
t!("?:")
}
Some(b'~') => {
self.reader.next();
t!("?~")
}
Some(b'=') => {
self.reader.next();
t!("?=")
}
_ => t!("?"),
},
b'<' => match self.reader.peek() {
Some(b'=') => {
self.reader.next();
t!("<=")
}
Some(b'-') => {
self.reader.next();
match self.reader.peek() {
Some(b'>') => {
self.reader.next();
t!("<->")
}
_ => t!("<-"),
}
}
_ => t!("<"),
},
b'>' => match self.reader.peek() {
Some(b'=') => {
self.reader.next();
t!(">=")
}
_ => t!(">"),
},
b'-' => match self.reader.peek() {
Some(b'>') => {
self.reader.next();
t!("->")
}
Some(b'-') => {
self.reader.next();
self.eat_single_line_comment();
return self.next_token_inner();
}
Some(b'=') => {
self.reader.next();
t!("-=")
}
_ => t!("-"),
},
b'+' => match self.reader.peek() {
Some(b'=') => {
self.reader.next();
t!("+=")
}
Some(b'?') => {
self.reader.next();
match self.reader.peek() {
Some(b'=') => {
self.reader.next();
t!("+?=")
}
_ => return self.invalid_token(Error::ExpectedEnd('=')),
}
}
_ => t!("+"),
},
b'/' => match self.reader.peek() {
Some(b'*') => {
self.reader.next();
// A `*/` could be missing which would be invalid.
if let Err(e) = self.eat_multi_line_comment() {
return self.invalid_token(e);
}
return self.next_token_inner();
}
Some(b'/') => {
self.reader.next();
self.eat_single_line_comment();
return self.next_token_inner();
}
_ => t!("/"),
},
b'*' => match self.reader.peek() {
Some(b'*') => {
self.reader.next();
t!("**")
}
Some(b'=') => {
self.reader.next();
t!("*=")
}
Some(b'~') => {
self.reader.next();
t!("*~")
}
_ => t!("*"),
},
b'=' => match self.reader.peek() {
Some(b'=') => {
self.reader.next();
t!("==")
}
_ => t!("="),
},
b':' => match self.reader.peek() {
Some(b':') => {
self.reader.next();
t!("::")
}
_ => t!(":"),
},
b'$' => {
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.lex_param();
}
t!("$")
}
b'#' => {
self.eat_single_line_comment();
return self.next_token_inner();
}
b'`' => return self.lex_surrounded_ident(true),
b'"' => return self.lex_strand(true),
b'\'' => return self.lex_strand(false),
b'd' => {
match self.reader.peek() {
Some(b'"') => {
self.reader.next();
return self.lex_datetime(true);
}
Some(b'\'') => {
self.reader.next();
return self.lex_datetime(false);
}
_ => {}
}
return self.lex_ident_from_next_byte(b'd');
}
b'u' => {
match self.reader.peek() {
Some(b'"') => {
self.reader.next();
return self.lex_uuid(true);
}
Some(b'\'') => {
self.reader.next();
return self.lex_uuid(false);
}
_ => {}
}
return self.lex_ident_from_next_byte(b'u');
}
b'r' => match self.reader.peek() {
Some(b'\"') => {
self.reader.next();
t!("r\"")
}
Some(b'\'') => {
self.reader.next();
t!("r'")
}
_ => return self.lex_ident_from_next_byte(byte),
},
b'a'..=b'z' | b'A'..=b'Z' | b'_' => {
return self.lex_ident_from_next_byte(byte);
}
b'0'..=b'9' => return self.lex_number(byte),
x => return self.invalid_token(Error::UnexpectedCharacter(x as char)),
};
self.finish_token(kind)
}
}
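
The lexer above dispatches on a single leading byte and then peeks ahead to extend it into the longest matching operator (`!` becomes `!=`, `?` becomes `??`, and so on). A standalone sketch of that peek-then-consume pattern (assumed names, not the SurrealDB lexer):

#[derive(Debug, PartialEq)]
enum Tok {
	Not,
	NotEq,
	Question,
	NullCoalesce,
}

// Returns the token starting at `bytes[0]` and how many bytes it consumed.
fn lex_op(bytes: &[u8]) -> Option<(Tok, usize)> {
	match bytes.first()? {
		b'!' => match bytes.get(1) {
			Some(b'=') => Some((Tok::NotEq, 2)), // the peeked byte extends the token
			_ => Some((Tok::Not, 1)),
		},
		b'?' => match bytes.get(1) {
			Some(b'?') => Some((Tok::NullCoalesce, 2)),
			_ => Some((Tok::Question, 1)),
		},
		_ => None,
	}
}

fn main() {
	assert_eq!(lex_op(b"!= 1"), Some((Tok::NotEq, 2)));
	assert_eq!(lex_op(b"? 1"), Some((Tok::Question, 1)));
}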

View file

@ -0,0 +1,37 @@
use crate::syn::v2::{
lexer::{CharError, Lexer},
token::{t, Token},
};
use super::Error;
impl<'a> Lexer<'a> {
/// lex non-ascii characters.
///
/// Should only be called after determining that the byte is not a valid ascii character.
pub fn lex_char(&mut self, byte: u8) -> Token {
let c = match self.reader.complete_char(byte) {
Ok(x) => x,
Err(CharError::Eof) => return self.invalid_token(Error::InvalidUtf8),
Err(CharError::Unicode) => return self.invalid_token(Error::InvalidUtf8),
};
let kind = match c {
'⟨' => return self.lex_surrounded_ident(false),
'…' => t!("..."),
'∋' => t!("∋"),
'∌' => t!("∌"),
'∈' => t!("∈"),
'∉' => t!("∉"),
'⊇' => t!("⊇"),
'⊃' => t!("⊃"),
'⊅' => t!("⊅"),
'⊆' => t!("⊆"),
'⊂' => t!("⊂"),
'⊄' => t!("⊄"),
'×' => t!("×"),
'÷' => t!("÷"),
x => return self.invalid_token(Error::UnexpectedCharacter(x)),
};
self.finish_token(kind)
}
}

View file

@ -0,0 +1,267 @@
use std::ops::RangeInclusive;
use chrono::{FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, Offset, TimeZone, Utc};
use thiserror::Error;
use crate::{
sql::Datetime,
syn::v2::token::{Token, TokenKind},
};
use super::{Error as LexError, Lexer};
#[derive(Error, Debug)]
pub enum PartError {
#[error("value outside of allowed range")]
OutsideRange,
#[error("missing digit(s)")]
MissingDigits,
#[error("too many digits")]
TooManyDigits,
}
#[derive(Error, Debug)]
pub enum Error {
#[error("invalid year, {0}")]
Year(PartError),
#[error("invalid month, {0}")]
Month(PartError),
#[error("invalid day, {0}")]
Day(PartError),
#[error("invalid hour, {0}")]
Hour(PartError),
#[error("invalid time minute, {0}")]
Minute(PartError),
#[error("invalid second, {0}")]
Second(PartError),
#[error("invalid nano_seconds, {0}")]
NanoSeconds(PartError),
#[error("invalid time-zone hour, {0}")]
TimeZoneHour(PartError),
#[error("invalid time-zone minute, {0}")]
TimeZoneMinute(PartError),
#[error("missing seperator `{}`",*(.0) as char)]
MissingSeparator(u8),
#[error("expected date-time strand to end")]
ExpectedEnd,
#[error("missing time-zone")]
MissingTimeZone,
#[error("date does not exist")]
NonExistantDate,
#[error("time does not exist")]
NonExistantTime,
#[error("time-zone offset too big")]
TimeZoneOutOfRange,
}
impl<'a> Lexer<'a> {
/// Lex a date-time strand.
pub fn lex_datetime(&mut self, double: bool) -> Token {
match self.lex_datetime_err(double) {
Ok(x) => {
self.datetime = Some(x);
self.finish_token(TokenKind::DateTime)
}
Err(e) => self.invalid_token(LexError::DateTime(e)),
}
}
/// Lex a datetime without the enclosing `"` or `'`, returning a result instead of a token.
pub fn lex_datetime_raw_err(&mut self) -> Result<Datetime, Error> {
let negative = match self.reader.peek() {
Some(b'+') => {
self.reader.next();
false
}
Some(b'-') => {
self.reader.next();
true
}
_ => false,
};
let mut year = self.lex_datetime_part(4, 0..=9999).map_err(Error::Year)? as i16;
if negative {
year = -year;
}
if !self.eat(b'-') {
return Err(Error::MissingSeparator(b'-'));
}
let month = self.lex_datetime_part(2, 1..=12).map_err(Error::Month)?;
if !self.eat(b'-') {
return Err(Error::MissingSeparator(b'-'));
}
let day = self.lex_datetime_part(2, 1..=31).map_err(Error::Day)?;
if !self.eat(b'T') {
let Some(date) = NaiveDate::from_ymd_opt(year as i32, month as u32, day as u32) else {
return Err(Error::NonExistantDate);
};
let time = NaiveTime::default();
let date_time = NaiveDateTime::new(date, time);
let datetime = Utc
.fix()
.from_local_datetime(&date_time)
.earliest()
// this should never panic with a fixed offset.
.unwrap()
.with_timezone(&Utc);
return Ok(Datetime(datetime));
}
let hour = self.lex_datetime_part(2, 0..=24).map_err(Error::Hour)?;
if !self.eat(b':') {
return Err(Error::MissingSeparator(b':'));
}
let minutes = self.lex_datetime_part(2, 0..=59).map_err(Error::Minute)?;
if !self.eat(b':') {
return Err(Error::MissingSeparator(b':'));
}
let seconds = self.lex_datetime_part(2, 0..=59).map_err(Error::Second)?;
// nano seconds
let nano = if let Some(b'.') = self.reader.peek() {
self.reader.next();
// check if there is at least one digit.
if !matches!(self.reader.peek(), Some(b'0'..=b'9')) {
return Err(Error::NanoSeconds(PartError::MissingDigits));
}
let mut number = 0u32;
for i in 0..9 {
let Some(c) = self.reader.peek() else {
// always invalid token, just let the next section handle the error.
break;
};
if !c.is_ascii_digit() {
// If digits are missing they are counted as 0's
for _ in i..9 {
number *= 10;
}
break;
}
self.reader.next();
number *= 10;
number += (c - b'0') as u32;
}
// ensure nano_seconds are at most 9 digits.
if matches!(self.reader.peek(), Some(b'0'..=b'9')) {
return Err(Error::NanoSeconds(PartError::TooManyDigits));
}
number
} else {
0
};
// time zone
let time_zone = match self.reader.peek() {
Some(b'Z') => {
self.reader.next();
None
}
Some(x @ (b'-' | b'+')) => {
self.reader.next();
let negative = x == b'-';
let hour = self.lex_datetime_part(2, 0..=24).map_err(Error::TimeZoneHour)? as i32;
let Some(b':') = self.reader.next() else {
return Err(Error::MissingSeparator(b':'));
};
let minute =
self.lex_datetime_part(2, 0..=59).map_err(Error::TimeZoneMinute)? as i32;
let time = hour * 3600 + minute * 60;
if negative {
Some(-time)
} else {
Some(time)
}
}
_ => return Err(Error::MissingTimeZone),
};
// calculate the given datetime from individual parts.
let Some(date) = NaiveDate::from_ymd_opt(year as i32, month as u32, day as u32) else {
return Err(Error::NonExistantDate);
};
let Some(time) =
NaiveTime::from_hms_nano_opt(hour as u32, minutes as u32, seconds as u32, nano)
else {
return Err(Error::NonExistantTime);
};
let date_time = NaiveDateTime::new(date, time);
let zone = match time_zone {
None => Utc.fix(),
Some(offset) => if offset < 0 {
FixedOffset::west_opt(-offset)
} else {
FixedOffset::east_opt(offset)
}
.ok_or(Error::TimeZoneOutOfRange)?,
};
let datetime = zone
.from_local_datetime(&date_time)
.earliest()
// this should never panic with a fixed offset.
.unwrap()
.with_timezone(&Utc);
Ok(Datetime(datetime))
}
/// Lex a full datetime but return a result instead of a token.
pub fn lex_datetime_err(&mut self, double: bool) -> Result<Datetime, Error> {
let datetime = self.lex_datetime_raw_err()?;
let end_char = if double {
b'"'
} else {
b'\''
};
if !self.eat(end_char) {
return Err(Error::ExpectedEnd);
}
Ok(datetime)
}
/// Lexes a digit part of date time.
///
/// This function eats a fixed amount of digits and then checks that the value those digits
/// represent is within the given range.
pub fn lex_datetime_part(
&mut self,
mut amount: u8,
range: RangeInclusive<u16>,
) -> Result<u16, PartError> {
let mut value = 0u16;
while amount != 0 {
value *= 10;
let Some(char) = self.reader.peek() else {
return Err(PartError::MissingDigits);
};
if !char.is_ascii_digit() {
return Err(PartError::MissingDigits);
}
self.reader.next();
value += (char - b'0') as u16;
amount -= 1;
}
if matches!(self.reader.peek(), Some(b'0'..=b'9')) {
return Err(PartError::TooManyDigits);
}
if !range.contains(&value) {
return Err(PartError::OutsideRange);
}
Ok(value)
}
}
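
The datetime lexer above assembles the timestamp from fixed-width digit groups, each validated against an allowed range before the next separator is consumed. A standalone sketch of that `lex_datetime_part` style of parsing over a byte slice (assumed names; the real lexer reads from its BytesReader instead):

use std::ops::RangeInclusive;

// Read exactly `amount` ASCII digits, reject trailing digits, and range-check the value.
fn digits(input: &[u8], amount: usize, range: RangeInclusive<u16>) -> Result<(u16, &[u8]), &'static str> {
	if input.len() < amount || !input[..amount].iter().all(u8::is_ascii_digit) {
		return Err("missing digit(s)");
	}
	if input.get(amount).map(u8::is_ascii_digit).unwrap_or(false) {
		return Err("too many digits");
	}
	let value = input[..amount].iter().fold(0u16, |acc, b| acc * 10 + (b - b'0') as u16);
	if !range.contains(&value) {
		return Err("value outside of allowed range");
	}
	Ok((value, &input[amount..]))
}

fn main() {
	// "2024-01-10": a four digit year, then a two digit month after the '-' separator.
	let (year, rest) = digits(b"2024-01-10", 4, 0..=9999).unwrap();
	assert_eq!(year, 2024);
	let (month, _) = digits(&rest[1..], 2, 1..=12).unwrap();
	assert_eq!(month, 1);
}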

View file

@ -0,0 +1,170 @@
use std::time::Duration as StdDuration;
use thiserror::Error;
use crate::{
sql::duration::{
Duration, SECONDS_PER_DAY, SECONDS_PER_HOUR, SECONDS_PER_MINUTE, SECONDS_PER_WEEK,
SECONDS_PER_YEAR,
},
syn::v2::token::{Token, TokenKind},
};
use super::{Error as LexError, Lexer};
#[derive(Error, Debug)]
pub enum Error {
#[error("invalid duration suffix")]
InvalidSuffix,
#[error("duration value overflowed")]
Overflow,
}
impl<'a> Lexer<'a> {
/// Lex a duration.
///
/// Expect the lexer to have already eaten the digits starting the duration.
pub fn lex_duration(&mut self) -> Token {
match self.lex_duration_err() {
Ok(x) => {
self.duration = Some(x);
self.finish_token(TokenKind::Duration)
}
Err(e) => self.invalid_token(LexError::Duration(e)),
}
}
fn invalid_suffix_duration(&mut self) -> Error {
// eat the whole suffix.
while let Some(x) = self.reader.peek() {
if !x.is_ascii_alphanumeric() {
break;
}
self.reader.next();
}
Error::InvalidSuffix
}
/// Lex a duration,
///
/// Should only be called from lexing a number.
///
/// Expects any number of, but at least one, numeric characters to already have been pushed into scratch.
pub fn lex_duration_err(&mut self) -> Result<Duration, Error> {
let mut duration = StdDuration::ZERO;
let mut current_value = 0u64;
// use the existing eat span to generate the current value.
// span already contains
let mut span = self.current_span();
span.len -= 1;
for b in self.scratch.as_bytes() {
debug_assert!(b.is_ascii_digit(), "`{}` is not a digit", b);
current_value = current_value.checked_mul(10).ok_or(Error::Overflow)?;
current_value = current_value.checked_add((b - b'0') as u64).ok_or(Error::Overflow)?;
}
self.scratch.clear();
loop {
let Some(next) = self.reader.peek() else {
return Err(Error::InvalidSuffix);
};
// Match the suffix.
let new_duration = match next {
x @ (b'n' | b'u') => {
// Nano or micro suffix
self.reader.next();
if !self.eat(b's') {
return Err(Error::InvalidSuffix);
};
if x == b'n' {
StdDuration::from_nanos(current_value)
} else {
StdDuration::from_micros(current_value)
}
}
// Starting byte of 'µ'
0xc2 => {
self.reader.next();
// Second byte of 'µ'.
// Always consume as the next byte will always be part of a two byte character.
if !self.eat(0xb5) {
return Err(self.invalid_suffix_duration());
}
if !self.eat(b's') {
return Err(self.invalid_suffix_duration());
}
StdDuration::from_micros(current_value)
}
b'm' => {
self.reader.next();
// Either milli or minute
let is_milli = self.eat(b's');
if is_milli {
StdDuration::from_millis(current_value)
} else {
let Some(number) = current_value.checked_mul(SECONDS_PER_MINUTE) else {
return Err(Error::Overflow);
};
StdDuration::from_secs(number)
}
}
x @ (b's' | b'h' | b'd' | b'w' | b'y') => {
self.reader.next();
// second, hour, day, week or year.
let new_duration = match x {
b's' => Some(StdDuration::from_secs(current_value)),
b'h' => {
current_value.checked_mul(SECONDS_PER_HOUR).map(StdDuration::from_secs)
}
b'd' => {
current_value.checked_mul(SECONDS_PER_DAY).map(StdDuration::from_secs)
}
b'w' => {
current_value.checked_mul(SECONDS_PER_WEEK).map(StdDuration::from_secs)
}
b'y' => {
current_value.checked_mul(SECONDS_PER_YEAR).map(StdDuration::from_secs)
}
_ => unreachable!(),
};
let Some(new_duration) = new_duration else {
return Err(Error::Overflow);
};
new_duration
}
_ => {
return Err(self.invalid_suffix_duration());
}
};
duration = duration.checked_add(new_duration).ok_or(Error::Overflow)?;
let next = self.reader.peek();
match next {
// there was some remaining alphabetic characters after the valid suffix, so the
// suffix is invalid.
Some(b'a'..=b'z' | b'A'..=b'Z' | b'_') => {
return Err(self.invalid_suffix_duration())
}
Some(b'0'..=b'9') => {} // Duration continues.
_ => return Ok(Duration(duration)),
}
current_value = 0;
// Eat all the next numbers
while let Some(b @ b'0'..=b'9') = self.reader.peek() {
self.reader.next();
current_value = current_value.checked_mul(10).ok_or(Error::Overflow)?;
current_value =
current_value.checked_add((b - b'0') as u64).ok_or(Error::Overflow)?;
}
}
}
}
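
The duration lexer above repeatedly reads a digit run, maps the following suffix to a std duration, and sums the parts with overflow checks. A standalone sketch of the same accumulation over a string slice (assumed names and a reduced suffix set; the real lexer also handles `ns`, `us`/`µs`, `w` and `y`):

use std::time::Duration;

const SECONDS_PER_MINUTE: u64 = 60;
const SECONDS_PER_HOUR: u64 = 60 * 60;
const SECONDS_PER_DAY: u64 = 24 * 60 * 60;

fn parse_duration(mut s: &str) -> Result<Duration, &'static str> {
	let mut total = Duration::ZERO;
	while !s.is_empty() {
		// Read the digit run for this component.
		let digits = s.len() - s.trim_start_matches(|c: char| c.is_ascii_digit()).len();
		if digits == 0 {
			return Err("expected digits");
		}
		let value: u64 = s[..digits].parse().map_err(|_| "duration value overflowed")?;
		s = &s[digits..];
		// Map the suffix to a duration, checking `ms` before the single-letter suffixes.
		let (part, rest) = if let Some(rest) = s.strip_prefix("ms") {
			(Duration::from_millis(value), rest)
		} else if let Some(rest) = s.strip_prefix('s') {
			(Duration::from_secs(value), rest)
		} else if let Some(rest) = s.strip_prefix('m') {
			(Duration::from_secs(value.checked_mul(SECONDS_PER_MINUTE).ok_or("duration value overflowed")?), rest)
		} else if let Some(rest) = s.strip_prefix('h') {
			(Duration::from_secs(value.checked_mul(SECONDS_PER_HOUR).ok_or("duration value overflowed")?), rest)
		} else if let Some(rest) = s.strip_prefix('d') {
			(Duration::from_secs(value.checked_mul(SECONDS_PER_DAY).ok_or("duration value overflowed")?), rest)
		} else {
			return Err("invalid duration suffix");
		};
		total = total.checked_add(part).ok_or("duration value overflowed")?;
		s = rest;
	}
	Ok(total)
}

fn main() {
	assert_eq!(parse_duration("1h30m").unwrap(), Duration::from_secs(5400));
	assert_eq!(parse_duration("500ms").unwrap(), Duration::from_millis(500));
}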

View file

@ -0,0 +1,164 @@
use std::mem;
use unicase::UniCase;
use crate::syn::v2::lexer::{keywords::KEYWORDS, Error, Lexer};
use crate::syn::v2::token::{NumberKind, Token, TokenKind};
use super::unicode::{chars, U8Ext};
impl<'a> Lexer<'a> {
/// Lex a parameter in the form of `$[a-zA-Z0-9_]*`
///
/// # Lexer State
/// Expected the lexer to have already eaten the param starting `$`
pub fn lex_param(&mut self) -> Token {
debug_assert_eq!(self.scratch, "");
loop {
if let Some(x) = self.reader.peek() {
if x.is_ascii_alphanumeric() || x == b'_' {
self.scratch.push(x as char);
self.reader.next();
continue;
}
}
self.string = Some(mem::take(&mut self.scratch));
return self.finish_token(TokenKind::Parameter);
}
}
/// Lex a non-surrounded identifier in the form of `[a-zA-Z0-9_]*`
///
/// The start byte should already be a valid byte of the identifier.
///
/// When calling this, the caller should already know that the token can't be any other token covered
/// by `[a-zA-Z0-9_]*`.
pub fn lex_ident_from_next_byte(&mut self, start: u8) -> Token {
debug_assert!(matches!(start, b'a'..=b'z' | b'A'..=b'Z' | b'_'));
debug_assert_eq!(self.scratch, "");
self.scratch.push(start as char);
self.lex_ident()
}
/// Lex a non-surrounded identifier.
///
/// The scratch should contain only valid identifier chars.
pub fn lex_ident(&mut self) -> Token {
loop {
if let Some(x) = self.reader.peek() {
if x.is_identifier_continue() {
self.scratch.push(x as char);
self.reader.next();
continue;
}
}
// When finished parsing the identifier, try to match it to a keyword.
// If there is one, return it as the keyword. Original identifier can be reconstructed
// from the token.
if let Some(x) = KEYWORDS.get(&UniCase::ascii(&self.scratch)).copied() {
self.scratch.clear();
return self.finish_token(x);
}
if self.scratch == "NaN" {
self.scratch.clear();
return self.finish_token(TokenKind::Number(NumberKind::NaN));
} else {
self.string = Some(mem::take(&mut self.scratch));
return self.finish_token(TokenKind::Identifier);
}
}
}
/// Lex an ident which is surround by delimiters.
pub fn lex_surrounded_ident(&mut self, is_backtick: bool) -> Token {
match self.lex_surrounded_ident_err(is_backtick) {
Ok(x) => x,
Err(e) => {
self.scratch.clear();
self.invalid_token(e)
}
}
}
/// Lex an ident surrounded either by `⟨⟩` or `\`\``
pub fn lex_surrounded_ident_err(&mut self, is_backtick: bool) -> Result<Token, Error> {
loop {
let Some(x) = self.reader.next() else {
let end_char = if is_backtick {
'`'
} else {
'⟩'
};
return Err(Error::ExpectedEnd(end_char));
};
if x.is_ascii() {
match x {
b'`' if is_backtick => {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Identifier));
}
b'\0' => {
// null bytes not allowed
return Err(Error::UnexpectedCharacter('\0'));
}
b'\\' if is_backtick => {
// handle escape sequences.
// This is compliant with the original parser which didn't permit
// escape sequences in `⟨⟩` surrounded idents.
let Some(next) = self.reader.next() else {
let end_char = if is_backtick {
'`'
} else {
'⟩'
};
return Err(Error::ExpectedEnd(end_char));
};
match next {
b'\\' => {
self.scratch.push('\\');
}
b'`' => {
self.scratch.push('`');
}
b'/' => {
self.scratch.push('/');
}
b'b' => {
self.scratch.push(chars::BS);
}
b'f' => {
self.scratch.push(chars::FF);
}
b'n' => {
self.scratch.push(chars::LF);
}
b'r' => {
self.scratch.push(chars::CR);
}
b't' => {
self.scratch.push(chars::TAB);
}
_ => {
let char = if x.is_ascii() {
x as char
} else {
self.reader.complete_char(x)?
};
return Err(Error::InvalidEscapeCharacter(char));
}
}
}
x => self.scratch.push(x as char),
}
} else {
let c = self.reader.complete_char(x)?;
if !is_backtick && c == '⟩' {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Identifier));
}
self.scratch.push(c);
}
}
}
}
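
A standalone sketch of the surrounded-identifier lexing above, restricted to the backtick form and the escape sequences it accepts (assumed names; the real lexer works on bytes, reuses its scratch buffer and also handles the `⟨⟩` form):

// Lexes the body of a backtick identifier; the opening ` has already been consumed.
fn lex_backtick_ident(src: &str) -> Result<(String, &str), &'static str> {
	let mut out = String::new();
	let mut chars = src.chars();
	while let Some(c) = chars.next() {
		match c {
			'`' => return Ok((out, chars.as_str())), // the closing backtick ends the ident
			'\0' => return Err("null bytes not allowed"),
			'\\' => match chars.next().ok_or("expected closing `")? {
				'\\' => out.push('\\'),
				'`' => out.push('`'),
				'/' => out.push('/'),
				'b' => out.push('\u{0008}'),
				'f' => out.push('\u{000C}'),
				'n' => out.push('\n'),
				'r' => out.push('\r'),
				't' => out.push('\t'),
				_ => return Err("invalid escape character"),
			},
			x => out.push(x),
		}
	}
	Err("expected closing `")
}

fn main() {
	let (ident, rest) = lex_backtick_ident("weird\\`name` + 1").unwrap();
	assert_eq!(ident, "weird`name");
	assert_eq!(rest, " + 1");
}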

View file

@ -0,0 +1,97 @@
use crate::syn::v2::token::Span;
use super::{unicode::chars::JS_LINE_TERIMATORS, Error, Lexer};
impl Lexer<'_> {
/// Lex the body of a js function.
///
/// This function will never be called while lexing normally.
pub fn lex_js_function_body(&mut self) -> Result<String, (Error, Span)> {
self.lex_js_function_body_inner().map_err(|e| (e, self.current_span()))
}
/// Lex the body of a js function.
fn lex_js_function_body_inner(&mut self) -> Result<String, Error> {
let mut block_depth = 1;
loop {
let byte = self.reader.next().ok_or(Error::UnexpectedEof)?;
match byte {
b'`' => self.lex_js_string(b'`')?,
b'\'' => self.lex_js_string(b'\'')?,
b'\"' => self.lex_js_string(b'\"')?,
b'/' => match self.reader.peek() {
Some(b'/') => {
self.reader.next();
self.lex_js_single_comment()?;
}
Some(b'*') => {
self.reader.next();
self.lex_js_multi_comment()?
}
_ => {}
},
b'{' => {
block_depth += 1;
}
b'}' => {
block_depth -= 1;
if block_depth == 0 {
break;
}
}
x if !x.is_ascii() => {
// check for invalid characters.
self.reader.complete_char(x)?;
}
_ => {}
}
}
let mut span = self.current_span();
// remove the `}` from the source text;
span.len -= 1;
// lexer ensures that it is valid utf8
let source = String::from_utf8(self.reader.span(span).to_vec()).unwrap();
Ok(source)
}
/// lex a js string with the given delimiter.
fn lex_js_string(&mut self, enclosing_byte: u8) -> Result<(), Error> {
loop {
let byte = self.reader.next().ok_or(Error::UnexpectedEof)?;
if byte == enclosing_byte {
return Ok(());
}
if byte == b'\\' {
self.reader.next();
}
// check for invalid characters.
self.reader.convert_to_char(byte)?;
}
}
/// lex a single line js comment.
fn lex_js_single_comment(&mut self) -> Result<(), Error> {
loop {
let Some(byte) = self.reader.next() else {
return Ok(());
};
let char = self.reader.convert_to_char(byte)?;
if JS_LINE_TERIMATORS.contains(&char) {
return Ok(());
}
}
}
/// lex a multi line js comment.
fn lex_js_multi_comment(&mut self) -> Result<(), Error> {
loop {
let byte = self.reader.next().ok_or(Error::UnexpectedEof)?;
if byte == b'*' && self.reader.peek() == Some(b'/') {
self.reader.next();
return Ok(());
}
// check for invalid characters.
self.reader.convert_to_char(byte)?;
}
}
}
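
The js lexer above finds the end of a function body by tracking `{`/`}` depth while skipping string literals and comments, so braces inside those can never terminate the body early. A standalone sketch of the same idea over a string slice (assumed names):

// Returns the index of the `}` closing the body; the opening `{` is assumed consumed.
fn js_body_end(src: &str) -> Option<usize> {
	let bytes = src.as_bytes();
	let mut depth = 1;
	let mut i = 0;
	while i < bytes.len() {
		match bytes[i] {
			quote @ (b'"' | b'\'' | b'`') => {
				// Skip the string body, honouring backslash escapes.
				i += 1;
				while i < bytes.len() && bytes[i] != quote {
					if bytes[i] == b'\\' {
						i += 1;
					}
					i += 1;
				}
			}
			b'/' if bytes.get(i + 1) == Some(&b'/') => {
				// Line comment: skip to the end of the line.
				while i < bytes.len() && bytes[i] != b'\n' {
					i += 1;
				}
			}
			b'/' if bytes.get(i + 1) == Some(&b'*') => {
				// Block comment: skip until `*/`.
				i += 2;
				while i + 1 < bytes.len() && !(bytes[i] == b'*' && bytes[i + 1] == b'/') {
					i += 1;
				}
				i += 1;
			}
			b'{' => depth += 1,
			b'}' => {
				depth -= 1;
				if depth == 0 {
					return Some(i);
				}
			}
			_ => {}
		}
		i += 1;
	}
	None
}

fn main() {
	let src = "return \"}\"; /* } */ }";
	assert_eq!(js_body_end(src), Some(src.len() - 1));
}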

View file

@ -0,0 +1,285 @@
use crate::{
sql::{language::Language, Algorithm},
syn::v2::token::{DistanceKind, Keyword, TokenKind},
};
use phf::phf_map;
use unicase::UniCase;
/// A map for mapping keyword strings to a tokenkind,
pub(crate) static KEYWORDS: phf::Map<UniCase<&'static str>, TokenKind> = phf_map! {
// Keywords
UniCase::ascii("AFTER") => TokenKind::Keyword(Keyword::After),
UniCase::ascii("ALL") => TokenKind::Keyword(Keyword::All),
UniCase::ascii("ANALYZE") => TokenKind::Keyword(Keyword::Analyze),
UniCase::ascii("ANALYZER") => TokenKind::Keyword(Keyword::Analyzer),
UniCase::ascii("AS") => TokenKind::Keyword(Keyword::As),
UniCase::ascii("ASCENDING") => TokenKind::Keyword(Keyword::Ascending),
UniCase::ascii("ASC") => TokenKind::Keyword(Keyword::Ascending),
UniCase::ascii("ASCII") => TokenKind::Keyword(Keyword::Ascii),
UniCase::ascii("ASSERT") => TokenKind::Keyword(Keyword::Assert),
UniCase::ascii("AT") => TokenKind::Keyword(Keyword::At),
UniCase::ascii("BEFORE") => TokenKind::Keyword(Keyword::Before),
UniCase::ascii("BEGIN") => TokenKind::Keyword(Keyword::Begin),
UniCase::ascii("BLANK") => TokenKind::Keyword(Keyword::Blank),
UniCase::ascii("BM25") => TokenKind::Keyword(Keyword::Bm25),
UniCase::ascii("BREAK") => TokenKind::Keyword(Keyword::Break),
UniCase::ascii("BY") => TokenKind::Keyword(Keyword::By),
UniCase::ascii("CAMEL") => TokenKind::Keyword(Keyword::Camel),
UniCase::ascii("CANCEL") => TokenKind::Keyword(Keyword::Cancel),
UniCase::ascii("CHANGEFEED") => TokenKind::Keyword(Keyword::ChangeFeed),
UniCase::ascii("CHANGES") => TokenKind::Keyword(Keyword::Changes),
UniCase::ascii("CAPACITY") => TokenKind::Keyword(Keyword::Capacity),
UniCase::ascii("CLASS") => TokenKind::Keyword(Keyword::Class),
UniCase::ascii("COMMENT") => TokenKind::Keyword(Keyword::Comment),
UniCase::ascii("COMMIT") => TokenKind::Keyword(Keyword::Commit),
UniCase::ascii("CONTENT") => TokenKind::Keyword(Keyword::Content),
UniCase::ascii("CONTINUE") => TokenKind::Keyword(Keyword::Continue),
UniCase::ascii("CREATE") => TokenKind::Keyword(Keyword::Create),
UniCase::ascii("DATABASE") => TokenKind::Keyword(Keyword::Database),
UniCase::ascii("DB") => TokenKind::Keyword(Keyword::Database),
UniCase::ascii("DEFAULT") => TokenKind::Keyword(Keyword::Default),
UniCase::ascii("DEFINE") => TokenKind::Keyword(Keyword::Define),
UniCase::ascii("DELETE") => TokenKind::Keyword(Keyword::Delete),
UniCase::ascii("DESCENDING") => TokenKind::Keyword(Keyword::Descending),
UniCase::ascii("DESC") => TokenKind::Keyword(Keyword::Descending),
UniCase::ascii("DIFF") => TokenKind::Keyword(Keyword::Diff),
UniCase::ascii("DIMENSION") => TokenKind::Keyword(Keyword::Dimension),
UniCase::ascii("DISTANCE") => TokenKind::Keyword(Keyword::Distance),
UniCase::ascii("DIST") => TokenKind::Keyword(Keyword::Distance),
UniCase::ascii("DOC_IDS_CACHE") => TokenKind::Keyword(Keyword::DocIdsCache),
UniCase::ascii("DOC_IDS_ORDER") => TokenKind::Keyword(Keyword::DocIdsOrder),
UniCase::ascii("DOC_LENGTHS_CACHE") => TokenKind::Keyword(Keyword::DocLengthsCache),
UniCase::ascii("DOC_LENGTHS_ORDER") => TokenKind::Keyword(Keyword::DocLengthsOrder),
UniCase::ascii("DROP") => TokenKind::Keyword(Keyword::Drop),
UniCase::ascii("DUPLICATE") => TokenKind::Keyword(Keyword::Duplicate),
UniCase::ascii("EDGENGRAM") => TokenKind::Keyword(Keyword::Edgengram),
UniCase::ascii("EVENT") => TokenKind::Keyword(Keyword::Event),
UniCase::ascii("ELSE") => TokenKind::Keyword(Keyword::Else),
UniCase::ascii("END") => TokenKind::Keyword(Keyword::End),
UniCase::ascii("EXPLAIN") => TokenKind::Keyword(Keyword::Explain),
UniCase::ascii("false") => TokenKind::Keyword(Keyword::False),
UniCase::ascii("FETCH") => TokenKind::Keyword(Keyword::Fetch),
UniCase::ascii("FIELD") => TokenKind::Keyword(Keyword::Field),
UniCase::ascii("FIELDS") => TokenKind::Keyword(Keyword::Fields),
UniCase::ascii("COLUMNS") => TokenKind::Keyword(Keyword::Fields),
UniCase::ascii("FILTERS") => TokenKind::Keyword(Keyword::Filters),
UniCase::ascii("FLEXIBLE") => TokenKind::Keyword(Keyword::Flexible),
UniCase::ascii("FLEXI") => TokenKind::Keyword(Keyword::Flexible),
UniCase::ascii("FLEX") => TokenKind::Keyword(Keyword::Flexible),
UniCase::ascii("FOR") => TokenKind::Keyword(Keyword::For),
UniCase::ascii("FROM") => TokenKind::Keyword(Keyword::From),
UniCase::ascii("FULL") => TokenKind::Keyword(Keyword::Full),
UniCase::ascii("FUNCTION") => TokenKind::Keyword(Keyword::Function),
UniCase::ascii("GROUP") => TokenKind::Keyword(Keyword::Group),
UniCase::ascii("HIGHLIGHTS") => TokenKind::Keyword(Keyword::Highlights),
UniCase::ascii("IGNORE") => TokenKind::Keyword(Keyword::Ignore),
UniCase::ascii("INDEX") => TokenKind::Keyword(Keyword::Index),
UniCase::ascii("INFO") => TokenKind::Keyword(Keyword::Info),
UniCase::ascii("INSERT") => TokenKind::Keyword(Keyword::Insert),
UniCase::ascii("INTO") => TokenKind::Keyword(Keyword::Into),
UniCase::ascii("IF") => TokenKind::Keyword(Keyword::If),
UniCase::ascii("IS") => TokenKind::Keyword(Keyword::Is),
UniCase::ascii("KEY") => TokenKind::Keyword(Keyword::Key),
UniCase::ascii("KILL") => TokenKind::Keyword(Keyword::Kill),
UniCase::ascii("KNN") => TokenKind::Keyword(Keyword::Knn),
UniCase::ascii("LET") => TokenKind::Keyword(Keyword::Let),
UniCase::ascii("LIMIT") => TokenKind::Keyword(Keyword::Limit),
UniCase::ascii("LIVE") => TokenKind::Keyword(Keyword::Live),
UniCase::ascii("LOWERCASE") => TokenKind::Keyword(Keyword::Lowercase),
UniCase::ascii("MERGE") => TokenKind::Keyword(Keyword::Merge),
UniCase::ascii("MODEL") => TokenKind::Keyword(Keyword::Model),
UniCase::ascii("MTREE") => TokenKind::Keyword(Keyword::MTree),
UniCase::ascii("MTREE_CACHE") => TokenKind::Keyword(Keyword::MTreeCache),
UniCase::ascii("NAMESPACE") => TokenKind::Keyword(Keyword::Namespace),
UniCase::ascii("NS") => TokenKind::Keyword(Keyword::Namespace),
UniCase::ascii("NGRAM") => TokenKind::Keyword(Keyword::Ngram),
UniCase::ascii("NO") => TokenKind::Keyword(Keyword::No),
UniCase::ascii("NOINDEX") => TokenKind::Keyword(Keyword::NoIndex),
UniCase::ascii("NONE") => TokenKind::Keyword(Keyword::None),
UniCase::ascii("NULL") => TokenKind::Keyword(Keyword::Null),
UniCase::ascii("NUMERIC") => TokenKind::Keyword(Keyword::Numeric),
UniCase::ascii("OMIT") => TokenKind::Keyword(Keyword::Omit),
UniCase::ascii("ON") => TokenKind::Keyword(Keyword::On),
UniCase::ascii("ONLY") => TokenKind::Keyword(Keyword::Only),
UniCase::ascii("OPTION") => TokenKind::Keyword(Keyword::Option),
UniCase::ascii("ORDER") => TokenKind::Keyword(Keyword::Order),
UniCase::ascii("PARALLEL") => TokenKind::Keyword(Keyword::Parallel),
UniCase::ascii("PARAM") => TokenKind::Keyword(Keyword::Param),
UniCase::ascii("PASSHASH") => TokenKind::Keyword(Keyword::Passhash),
UniCase::ascii("PASSWORD") => TokenKind::Keyword(Keyword::Password),
UniCase::ascii("PATCH") => TokenKind::Keyword(Keyword::Patch),
UniCase::ascii("PERMISSIONS") => TokenKind::Keyword(Keyword::Permissions),
UniCase::ascii("POSTINGS_CACHE") => TokenKind::Keyword(Keyword::PostingsCache),
UniCase::ascii("POSTINGS_ORDER") => TokenKind::Keyword(Keyword::PostingsOrder),
UniCase::ascii("PUNCT") => TokenKind::Keyword(Keyword::Punct),
UniCase::ascii("RELATE") => TokenKind::Keyword(Keyword::Relate),
UniCase::ascii("REMOVE") => TokenKind::Keyword(Keyword::Remove),
UniCase::ascii("REPLACE") => TokenKind::Keyword(Keyword::Replace),
UniCase::ascii("RETURN") => TokenKind::Keyword(Keyword::Return),
UniCase::ascii("ROLES") => TokenKind::Keyword(Keyword::Roles),
UniCase::ascii("ROOT") => TokenKind::Keyword(Keyword::Root),
UniCase::ascii("KV") => TokenKind::Keyword(Keyword::Root),
UniCase::ascii("SCHEMAFULL") => TokenKind::Keyword(Keyword::Schemafull),
UniCase::ascii("SCHEMAFUL") => TokenKind::Keyword(Keyword::Schemafull),
UniCase::ascii("SCHEMALESS") => TokenKind::Keyword(Keyword::Schemaless),
UniCase::ascii("SCOPE") => TokenKind::Keyword(Keyword::Scope),
UniCase::ascii("SC") => TokenKind::Keyword(Keyword::Scope),
UniCase::ascii("SEARCH") => TokenKind::Keyword(Keyword::Search),
UniCase::ascii("SELECT") => TokenKind::Keyword(Keyword::Select),
UniCase::ascii("SESSION") => TokenKind::Keyword(Keyword::Session),
UniCase::ascii("SET") => TokenKind::Keyword(Keyword::Set),
UniCase::ascii("SHOW") => TokenKind::Keyword(Keyword::Show),
UniCase::ascii("SIGNIN") => TokenKind::Keyword(Keyword::Signin),
UniCase::ascii("SIGNUP") => TokenKind::Keyword(Keyword::Signup),
UniCase::ascii("SINCE") => TokenKind::Keyword(Keyword::Since),
UniCase::ascii("SLEEP") => TokenKind::Keyword(Keyword::Sleep),
UniCase::ascii("SNOWBALL") => TokenKind::Keyword(Keyword::Snowball),
UniCase::ascii("SPLIT") => TokenKind::Keyword(Keyword::Split),
UniCase::ascii("START") => TokenKind::Keyword(Keyword::Start),
UniCase::ascii("TABLE") => TokenKind::Keyword(Keyword::Table),
UniCase::ascii("TB") => TokenKind::Keyword(Keyword::Table),
UniCase::ascii("TERMS_CACHE") => TokenKind::Keyword(Keyword::TermsCache),
UniCase::ascii("TERMS_ORDER") => TokenKind::Keyword(Keyword::TermsOrder),
UniCase::ascii("THEN") => TokenKind::Keyword(Keyword::Then),
UniCase::ascii("THROW") => TokenKind::Keyword(Keyword::Throw),
UniCase::ascii("TIMEOUT") => TokenKind::Keyword(Keyword::Timeout),
UniCase::ascii("TOKENIZERS") => TokenKind::Keyword(Keyword::Tokenizers),
UniCase::ascii("TOKEN") => TokenKind::Keyword(Keyword::Token),
UniCase::ascii("TRANSACTION") => TokenKind::Keyword(Keyword::Transaction),
UniCase::ascii("true") => TokenKind::Keyword(Keyword::True),
UniCase::ascii("TYPE") => TokenKind::Keyword(Keyword::Type),
UniCase::ascii("UNIQUE") => TokenKind::Keyword(Keyword::Unique),
UniCase::ascii("UNSET") => TokenKind::Keyword(Keyword::Unset),
UniCase::ascii("UPDATE") => TokenKind::Keyword(Keyword::Update),
UniCase::ascii("UPPERCASE") => TokenKind::Keyword(Keyword::Uppercase),
UniCase::ascii("USE") => TokenKind::Keyword(Keyword::Use),
UniCase::ascii("USER") => TokenKind::Keyword(Keyword::User),
UniCase::ascii("VALUE") => TokenKind::Keyword(Keyword::Value),
UniCase::ascii("VALUES") => TokenKind::Keyword(Keyword::Values),
UniCase::ascii("VERSION") => TokenKind::Keyword(Keyword::Version),
UniCase::ascii("VS") => TokenKind::Keyword(Keyword::Vs),
UniCase::ascii("WHEN") => TokenKind::Keyword(Keyword::When),
UniCase::ascii("WHERE") => TokenKind::Keyword(Keyword::Where),
UniCase::ascii("WITH") => TokenKind::Keyword(Keyword::With),
UniCase::ascii("ALLINSIDE") => TokenKind::Keyword(Keyword::AllInside),
UniCase::ascii("ANDKW") => TokenKind::Keyword(Keyword::AndKw),
UniCase::ascii("ANYINSIDE") => TokenKind::Keyword(Keyword::AnyInside),
UniCase::ascii("INSIDE") => TokenKind::Keyword(Keyword::Inside),
UniCase::ascii("INTERSECTS") => TokenKind::Keyword(Keyword::Intersects),
UniCase::ascii("NONEINSIDE") => TokenKind::Keyword(Keyword::NoneInside),
UniCase::ascii("NOTINSIDE") => TokenKind::Keyword(Keyword::NotInside),
UniCase::ascii("OR") => TokenKind::Keyword(Keyword::OrKw),
UniCase::ascii("OUTSIDE") => TokenKind::Keyword(Keyword::Outside),
UniCase::ascii("NOT") => TokenKind::Keyword(Keyword::Not),
UniCase::ascii("AND") => TokenKind::Keyword(Keyword::And),
UniCase::ascii("COLLATE") => TokenKind::Keyword(Keyword::Collate),
UniCase::ascii("CONTAINSALL") => TokenKind::Keyword(Keyword::ContainsAll),
UniCase::ascii("CONTAINSANY") => TokenKind::Keyword(Keyword::ContainsAny),
UniCase::ascii("CONTAINSNONE") => TokenKind::Keyword(Keyword::ContainsNone),
UniCase::ascii("CONTAINSNOT") => TokenKind::Keyword(Keyword::ContainsNot),
UniCase::ascii("CONTAINS") => TokenKind::Keyword(Keyword::Contains),
UniCase::ascii("IN") => TokenKind::Keyword(Keyword::In),
UniCase::ascii("ANY") => TokenKind::Keyword(Keyword::Any),
UniCase::ascii("ARRAY") => TokenKind::Keyword(Keyword::Array),
UniCase::ascii("GEOMETRY") => TokenKind::Keyword(Keyword::Geometry),
UniCase::ascii("RECORD") => TokenKind::Keyword(Keyword::Record),
UniCase::ascii("FUTURE") => TokenKind::Keyword(Keyword::Future),
UniCase::ascii("BOOL") => TokenKind::Keyword(Keyword::Bool),
UniCase::ascii("BYTES") => TokenKind::Keyword(Keyword::Bytes),
UniCase::ascii("DATETIME") => TokenKind::Keyword(Keyword::Datetime),
UniCase::ascii("DECIMAL") => TokenKind::Keyword(Keyword::Decimal),
UniCase::ascii("DURATION") => TokenKind::Keyword(Keyword::Duration),
UniCase::ascii("FLOAT") => TokenKind::Keyword(Keyword::Float),
UniCase::ascii("fn") => TokenKind::Keyword(Keyword::Fn),
UniCase::ascii("ml") => TokenKind::Keyword(Keyword::ML),
UniCase::ascii("INT") => TokenKind::Keyword(Keyword::Int),
UniCase::ascii("NUMBER") => TokenKind::Keyword(Keyword::Number),
UniCase::ascii("OBJECT") => TokenKind::Keyword(Keyword::Object),
UniCase::ascii("STRING") => TokenKind::Keyword(Keyword::String),
UniCase::ascii("UUID") => TokenKind::Keyword(Keyword::Uuid),
UniCase::ascii("ULID") => TokenKind::Keyword(Keyword::Ulid),
UniCase::ascii("RAND") => TokenKind::Keyword(Keyword::Rand),
UniCase::ascii("FEATURE") => TokenKind::Keyword(Keyword::Feature),
UniCase::ascii("LINE") => TokenKind::Keyword(Keyword::Line),
UniCase::ascii("POINT") => TokenKind::Keyword(Keyword::Point),
UniCase::ascii("POLYGON") => TokenKind::Keyword(Keyword::Polygon),
UniCase::ascii("MULTIPOINT") => TokenKind::Keyword(Keyword::MultiPoint),
UniCase::ascii("MULTILINE") => TokenKind::Keyword(Keyword::MultiLine),
UniCase::ascii("MULTIPOLYGON") => TokenKind::Keyword(Keyword::MultiPolygon),
UniCase::ascii("COLLECTION") => TokenKind::Keyword(Keyword::Collection),
// Languages
UniCase::ascii("ARABIC") => TokenKind::Language(Language::Arabic),
UniCase::ascii("ARA") => TokenKind::Language(Language::Arabic),
UniCase::ascii("AR") => TokenKind::Language(Language::Arabic),
UniCase::ascii("DANISH") => TokenKind::Language(Language::Danish),
UniCase::ascii("DAN") => TokenKind::Language(Language::Danish),
UniCase::ascii("DA") => TokenKind::Language(Language::Danish),
UniCase::ascii("DUTCH") => TokenKind::Language(Language::Dutch),
UniCase::ascii("NLD") => TokenKind::Language(Language::Dutch),
UniCase::ascii("NL") => TokenKind::Language(Language::Dutch),
UniCase::ascii("ENGLISH") => TokenKind::Language(Language::English),
UniCase::ascii("ENG") => TokenKind::Language(Language::English),
UniCase::ascii("EN") => TokenKind::Language(Language::English),
UniCase::ascii("FRENCH") => TokenKind::Language(Language::French),
UniCase::ascii("FRA") => TokenKind::Language(Language::French),
UniCase::ascii("FR") => TokenKind::Language(Language::French),
UniCase::ascii("GERMAN") => TokenKind::Language(Language::German),
UniCase::ascii("DEU") => TokenKind::Language(Language::German),
UniCase::ascii("DE") => TokenKind::Language(Language::German),
UniCase::ascii("GREEK") => TokenKind::Language(Language::Greek),
UniCase::ascii("ELL") => TokenKind::Language(Language::Greek),
UniCase::ascii("EL") => TokenKind::Language(Language::Greek),
UniCase::ascii("HUNGARIAN") => TokenKind::Language(Language::Hungarian),
UniCase::ascii("HUN") => TokenKind::Language(Language::Hungarian),
UniCase::ascii("HU") => TokenKind::Language(Language::Hungarian),
UniCase::ascii("ITALIAN") => TokenKind::Language(Language::Italian),
UniCase::ascii("ITA") => TokenKind::Language(Language::Italian),
UniCase::ascii("IT") => TokenKind::Language(Language::Italian),
UniCase::ascii("NORWEGIAN") => TokenKind::Language(Language::Norwegian),
UniCase::ascii("NOR") => TokenKind::Language(Language::Norwegian),
UniCase::ascii("PORTUGUESE") => TokenKind::Language(Language::Portuguese),
UniCase::ascii("POR") => TokenKind::Language(Language::Portuguese),
UniCase::ascii("PT") => TokenKind::Language(Language::Portuguese),
UniCase::ascii("ROMANIAN") => TokenKind::Language(Language::Romanian),
UniCase::ascii("RON") => TokenKind::Language(Language::Romanian),
UniCase::ascii("RO") => TokenKind::Language(Language::Romanian),
UniCase::ascii("RUSSIAN") => TokenKind::Language(Language::Russian),
UniCase::ascii("RUS") => TokenKind::Language(Language::Russian),
UniCase::ascii("RU") => TokenKind::Language(Language::Russian),
UniCase::ascii("SPANISH") => TokenKind::Language(Language::Spanish),
UniCase::ascii("SPA") => TokenKind::Language(Language::Spanish),
UniCase::ascii("ES") => TokenKind::Language(Language::Spanish),
UniCase::ascii("SWEDISH") => TokenKind::Language(Language::Swedish),
UniCase::ascii("SWE") => TokenKind::Language(Language::Swedish),
UniCase::ascii("SV") => TokenKind::Language(Language::Swedish),
UniCase::ascii("TAMIL") => TokenKind::Language(Language::Tamil),
UniCase::ascii("TAM") => TokenKind::Language(Language::Tamil),
UniCase::ascii("TA") => TokenKind::Language(Language::Tamil),
UniCase::ascii("TURKISH") => TokenKind::Language(Language::Turkish),
UniCase::ascii("TUR") => TokenKind::Language(Language::Turkish),
UniCase::ascii("TR") => TokenKind::Language(Language::Turkish),
// Algorithms
UniCase::ascii("EDDSA") => TokenKind::Algorithm(Algorithm::EdDSA),
UniCase::ascii("ES256") => TokenKind::Algorithm(Algorithm::Es256),
UniCase::ascii("ES384") => TokenKind::Algorithm(Algorithm::Es384),
UniCase::ascii("ES512") => TokenKind::Algorithm(Algorithm::Es512),
UniCase::ascii("HS256") => TokenKind::Algorithm(Algorithm::Hs256),
UniCase::ascii("HS384") => TokenKind::Algorithm(Algorithm::Hs384),
UniCase::ascii("HS512") => TokenKind::Algorithm(Algorithm::Hs512),
UniCase::ascii("PS256") => TokenKind::Algorithm(Algorithm::Ps256),
UniCase::ascii("PS384") => TokenKind::Algorithm(Algorithm::Ps384),
UniCase::ascii("PS512") => TokenKind::Algorithm(Algorithm::Ps512),
UniCase::ascii("RS256") => TokenKind::Algorithm(Algorithm::Rs256),
UniCase::ascii("RS384") => TokenKind::Algorithm(Algorithm::Rs384),
UniCase::ascii("RS512") => TokenKind::Algorithm(Algorithm::Rs512),
UniCase::ascii("JWKS") => TokenKind::Algorithm(Algorithm::Jwks),
// Distance
UniCase::ascii("EUCLIDEAN") => TokenKind::Distance(DistanceKind::Euclidean),
UniCase::ascii("MANHATTAN") => TokenKind::Distance(DistanceKind::Manhattan),
UniCase::ascii("HAMMING") => TokenKind::Distance(DistanceKind::Hamming),
UniCase::ascii("MINKOWSKI") => TokenKind::Distance(DistanceKind::Minkowski),
};
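
The identifier lexer above matches its finished scratch buffer against this map to decide between a keyword token and a plain identifier. A hedged sketch of the same case-insensitive lookup, using a runtime HashMap in place of the compile-time phf map (requires the unicase crate; the token kinds below are illustrative):

use std::collections::HashMap;
use unicase::UniCase;

#[derive(Debug, Clone, Copy, PartialEq)]
enum TokenKind {
	Select,
	From,
	Identifier,
}

fn main() {
	let mut keywords: HashMap<UniCase<&str>, TokenKind> = HashMap::new();
	keywords.insert(UniCase::ascii("SELECT"), TokenKind::Select);
	keywords.insert(UniCase::ascii("FROM"), TokenKind::From);

	// The lexed identifier text is looked up ASCII case-insensitively; anything
	// not in the map stays a plain Identifier token.
	let scratch = "select";
	let kind = keywords.get(&UniCase::ascii(scratch)).copied().unwrap_or(TokenKind::Identifier);
	assert_eq!(kind, TokenKind::Select);

	let scratch = "person";
	let kind = keywords.get(&UniCase::ascii(scratch)).copied().unwrap_or(TokenKind::Identifier);
	assert_eq!(kind, TokenKind::Identifier);
}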

lib/src/syn/v2/lexer/mod.rs Normal file
View file

@ -0,0 +1,417 @@
use crate::{
sql::{Datetime, Duration, Regex, Uuid},
syn::v2::token::{Span, Token, TokenKind},
};
use thiserror::Error;
mod byte;
mod char;
mod datetime;
mod duration;
mod ident;
mod js;
mod keywords;
mod number;
mod reader;
mod strand;
mod unicode;
mod uuid;
#[cfg(test)]
mod test;
pub use reader::{BytesReader, CharError};
/// An error returned by the lexer when an invalid token is encountered.
///
/// Can be retrieved from the `Lexer::error` field whenever it returned a [`TokenKind::Invalid`]
/// token.
#[derive(Error, Debug)]
pub enum Error {
#[error("Lexer encountered unexpected character {0:?}")]
UnexpectedCharacter(char),
#[error("invalid escape character {0:?}")]
InvalidEscapeCharacter(char),
#[error("Lexer encountered unexpected end of source characters")]
UnexpectedEof,
#[error("source was not valid utf-8")]
InvalidUtf8,
#[error("expected next character to be '{0}'")]
ExpectedEnd(char),
#[error("failed to lex date-time, {0}")]
DateTime(#[from] datetime::Error),
#[error("failed to lex uuid, {0}")]
Uuid(#[from] uuid::Error),
#[error("failed to lex duration, {0}")]
Duration(#[from] duration::Error),
#[error("failed to lex number, {0}")]
Number(#[from] number::Error),
#[error("failed to parse regex, {0}")]
Regex(regex::Error),
}
impl From<CharError> for Error {
fn from(value: CharError) -> Self {
match value {
CharError::Eof => Self::UnexpectedEof,
CharError::Unicode => Self::InvalidUtf8,
}
}
}
/// The SurrealQL lexer.
/// Takes a slice of bytes and turns it into tokens. The lexer is designed with possible invalid utf-8
/// in mind and will handle bytes which are invalid utf-8 with an error.
///
/// The lexer generates tokens lazily: whenever [`Lexer::next_token`] is called on the lexer it will
/// try to lex the next bytes in the given source as a token. The lexer always returns a token, even
/// if the source contains invalid tokens or is at the end of the source. In both cases a specific
/// type of token is returned.
///
/// Note that SurrealQL syntax cannot be lexed in advance. For example, record strings and regexes,
/// both cannot be parsed correctly without knowledge of previous tokens as they are both ambiguous
/// with other tokens.
pub struct Lexer<'a> {
/// The reader for reading the source bytes.
pub reader: BytesReader<'a>,
/// The one past the last character of the previous token.
last_offset: u32,
/// The span of whitespace if it was read between two tokens.
whitespace_span: Option<Span>,
/// A buffer used to build the value of tokens which can't be read straight from the source.
/// like for example strings with escape characters.
scratch: String,
// below are a collection of storage for values produced by tokens.
// For performance reasons we want to keep the tokens as small as possible.
// As only some tokens have an additional value associated with them we don't store that value
// in the token itself but, instead, in the lexer, ensuring a smaller size for each individual
// token.
//
// This does result in some additional state to keep track of as peeking a token while a token
// value is still in the variables below will overwrite the previous value.
//
// Both numbers and actual strings are stored as a string value.
// The parser can, depending on position in syntax, decide to parse a number in a variety of
// different precisions or formats. The only way to support all is to delay parsing the
// actual number value to when the parser can decide on a format.
pub string: Option<String>,
pub duration: Option<Duration>,
pub datetime: Option<Datetime>,
pub regex: Option<Regex>,
pub uuid: Option<Uuid>,
pub error: Option<Error>,
}
impl<'a> Lexer<'a> {
/// Create a new lexer.
/// # Panic
/// This function will panic if the source is longer than u32::MAX.
pub fn new(source: &'a [u8]) -> Lexer<'a> {
let reader = BytesReader::new(source);
assert!(reader.len() <= u32::MAX as usize, "source code exceeded maximum size");
Lexer {
reader,
last_offset: 0,
whitespace_span: None,
scratch: String::new(),
string: None,
datetime: None,
duration: None,
regex: None,
uuid: None,
error: None,
}
}
/// Reset the state of the lexer.
///
/// Doesn't change the state of the reader.
pub fn reset(&mut self) {
self.last_offset = 0;
self.scratch.clear();
self.whitespace_span = None;
self.string = None;
self.datetime = None;
self.duration = None;
self.regex = None;
self.uuid = None;
self.error = None;
}
/// Change the used source from the lexer to a new buffer.
///
/// Useful for reusing buffers.
///
/// # Panic
/// This function will panic if the source is longer than u32::MAX.
pub fn change_source<'b>(self, source: &'b [u8]) -> Lexer<'b> {
let reader = BytesReader::<'b>::new(source);
assert!(reader.len() <= u32::MAX as usize, "source code exceeded maximum size");
Lexer {
reader,
last_offset: 0,
whitespace_span: None,
scratch: self.scratch,
string: self.string,
datetime: self.datetime,
duration: self.duration,
regex: self.regex,
uuid: self.uuid,
error: self.error,
}
}
/// Return the whitespace span of the last token buffered, either peeked or popped.
pub fn whitespace_span(&self) -> Option<Span> {
self.whitespace_span
}
/// Used for setting the span of whitespace between tokens. Will extend the current whitespace
/// if there already is one.
fn set_whitespace_span(&mut self, span: Span) {
if let Some(existing) = self.whitespace_span.as_mut() {
*existing = existing.covers(span);
} else {
self.whitespace_span = Some(span);
}
}
/// Returns the next token, driving the lexer forward.
///
/// If the lexer is at the end of the source it will always return the Eof token.
pub fn next_token(&mut self) -> Token {
self.whitespace_span = None;
self.next_token_inner()
}
fn next_token_inner(&mut self) -> Token {
let Some(byte) = self.reader.next() else {
return self.eof_token();
};
if byte.is_ascii() {
self.lex_ascii(byte)
} else {
self.lex_char(byte)
}
}
/// Creates the eof token.
///
/// An eof token has tokenkind Eof and a span which points to the last character of the
/// source.
fn eof_token(&mut self) -> Token {
Token {
kind: TokenKind::Eof,
span: Span {
offset: self.last_offset.saturating_sub(1),
len: 1,
},
}
}
/// Skip the last consumed bytes in the reader.
///
/// The bytes consumed before this point won't be part of the span.
fn skip_offset(&mut self) {
self.last_offset = self.reader.offset() as u32;
}
/// Return an invalid token.
fn invalid_token(&mut self, error: Error) -> Token {
self.error = Some(error);
self.finish_token(TokenKind::Invalid)
}
// Returns the span for the current token being lexed.
pub fn current_span(&self) -> Span {
// We make sure that the source is no longer than u32::MAX so this can't overflow.
let new_offset = self.reader.offset() as u32;
let len = new_offset - self.last_offset;
Span {
offset: self.last_offset,
len,
}
}
/// Builds a token from a TokenKind.
///
/// Attaches a span to the token, returns it, and updates the last offset.
fn finish_token(&mut self, kind: TokenKind) -> Token {
let span = self.current_span();
// We make sure that the source is no longer than u32::MAX so this can't overflow.
self.last_offset = self.reader.offset() as u32;
Token {
kind,
span,
}
}
/// Moves the lexer state back to before the given span.
///
/// # Warning
/// Moving the lexer into a state where the next byte is within a multibyte character will
/// result in spurious errors.
pub fn backup_before(&mut self, span: Span) {
self.reader.backup(span.offset as usize);
self.last_offset = span.offset;
}
/// Moves the lexer state to after the given span.
///
/// # Warning
/// Moving the lexer into a state where the next byte is within a multibyte character will
/// result in spurious errors.
pub fn backup_after(&mut self, span: Span) {
let offset = span.offset + span.len;
self.reader.backup(offset as usize);
self.last_offset = offset;
}
/// Checks if the next byte is the given byte; if so, it consumes the byte and returns true.
/// Otherwise returns false.
///
/// Also returns false if there is no next character.
pub fn eat(&mut self, byte: u8) -> bool {
if self.reader.peek() == Some(byte) {
self.reader.next();
true
} else {
false
}
}
/// Checks if the closure returns true for the next byte; if so, it consumes the byte and
/// returns true. Otherwise returns false.
///
/// Also returns false if there is no next character.
pub fn eat_when<F: FnOnce(u8) -> bool>(&mut self, f: F) -> bool {
let Some(x) = self.reader.peek() else {
return false;
};
if f(x) {
self.reader.next();
true
} else {
false
}
}
/// Lex a single closing `"` or `'` character, skipping any leading whitespace or comments.
///
/// Used for parsing record strings.
pub fn lex_record_string_close(&mut self) -> Token {
loop {
let Some(byte) = self.reader.next() else {
return self.invalid_token(Error::UnexpectedEof);
};
match byte {
unicode::byte::CR
| unicode::byte::FF
| unicode::byte::LF
| unicode::byte::SP
| unicode::byte::VT
| unicode::byte::TAB => {
self.eat_whitespace();
continue;
}
b'"' => {
return self.finish_token(TokenKind::CloseRecordString {
double: true,
});
}
b'\'' => {
return self.finish_token(TokenKind::CloseRecordString {
double: false,
});
}
b'-' => match self.reader.next() {
Some(b'-') => {
self.eat_single_line_comment();
continue;
}
Some(x) => match self.reader.convert_to_char(x) {
Ok(c) => return self.invalid_token(Error::UnexpectedCharacter(c)),
Err(e) => return self.invalid_token(e.into()),
},
None => return self.invalid_token(Error::UnexpectedEof),
},
b'/' => match self.reader.next() {
Some(b'*') => {
if let Err(e) = self.eat_multi_line_comment() {
return self.invalid_token(e);
}
continue;
}
Some(b'/') => {
self.eat_single_line_comment();
continue;
}
Some(x) => match self.reader.convert_to_char(x) {
Ok(c) => return self.invalid_token(Error::UnexpectedCharacter(c)),
Err(e) => return self.invalid_token(e.into()),
},
None => return self.invalid_token(Error::UnexpectedEof),
},
b'#' => {
self.eat_single_line_comment();
continue;
}
x => match self.reader.convert_to_char(x) {
Ok(c) => return self.invalid_token(Error::UnexpectedCharacter(c)),
Err(e) => return self.invalid_token(e.into()),
},
}
}
}
/// Lex only a datetime without enclosing delimiters.
///
/// Used to reuse the lexer's datetime lexing code when parsing a datetime on its own. Should
/// not be called during normal parsing.
pub fn lex_only_datetime(&mut self) -> Result<Datetime, Error> {
self.lex_datetime_raw_err().map_err(Error::DateTime)
}
/// Lex only a duration.
///
/// Used to reuse the lexer's duration lexing code when parsing a duration on its own. Should
/// not be used during normal parsing.
pub fn lex_only_duration(&mut self) -> Result<Duration, Error> {
match self.reader.next() {
Some(x @ b'0'..=b'9') => {
self.scratch.push(x as char);
while let Some(x @ b'0'..=b'9') = self.reader.peek() {
self.reader.next();
self.scratch.push(x as char);
}
self.lex_duration_err().map_err(Error::Duration)
}
Some(x) => {
let char = self.reader.convert_to_char(x)?;
Err(Error::UnexpectedCharacter(char))
}
None => Err(Error::UnexpectedEof),
}
}
/// Lex only a UUID.
///
/// Used to reuse the lexer's UUID lexing code when parsing a UUID on its own. Should not be
/// used during normal parsing.
pub fn lex_only_uuid(&mut self) -> Result<Uuid, Error> {
Ok(self.lex_uuid_err_inner()?)
}
}
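// A minimal usage sketch for the standalone `lex_only_*` helpers above. The inputs are
// illustrative assumptions; the exact formats accepted are defined by the datetime, duration
// and uuid lexing modules.
#[cfg(test)]
mod lex_only_sketch {
	use crate::syn::v2::lexer::Lexer;

	#[test]
	fn standalone_helpers() {
		// A bare duration, without any surrounding statement.
		let mut lexer = Lexer::new(b"1h");
		assert!(lexer.lex_only_duration().is_ok());

		// A bare datetime, without the d"..." delimiters.
		let mut lexer = Lexer::new(b"2012-04-23T18:25:43Z");
		assert!(lexer.lex_only_datetime().is_ok());

		// A bare UUID, without the u"..." delimiters.
		let mut lexer = Lexer::new(b"e72bee20-f49b-11ec-b939-0242ac120002");
		assert!(lexer.lex_only_uuid().is_ok());
	}
}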
impl Iterator for Lexer<'_> {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
let token = self.next_token();
if token.is_eof() {
return None;
}
Some(token)
}
}
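// A minimal sketch of plain tokenization through the `Iterator` impl above. The statement and
// the expected first token are illustrative assumptions, not an exhaustive specification.
#[cfg(test)]
mod iterator_sketch {
	use crate::syn::v2::{lexer::Lexer, token::t};

	#[test]
	fn iterate_until_eof() {
		let mut lexer = Lexer::new("SELECT * FROM person".as_bytes());
		let tokens: Vec<_> = (&mut lexer).collect();
		// Whitespace is skipped; the first token is the SELECT keyword.
		assert_eq!(tokens[0].kind, t!("SELECT"));
		// Once the source is exhausted the lexer keeps handing out Eof tokens.
		assert!(lexer.next_token().is_eof());
	}
}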

View file

@@ -0,0 +1,257 @@
use crate::syn::v2::{
lexer::{unicode::U8Ext, Error as LexError, Lexer},
token::{NumberKind, Token, TokenKind},
};
use std::mem;
use thiserror::Error;
#[derive(Error, Debug)]
pub enum Error {
#[error("invalid number suffix")]
InvalidSuffix,
#[error("expected atleast a single digit in the exponent")]
DigitExpectedExponent,
}
impl Lexer<'_> {
/// Lex only an integer.
/// Use when a number can be followed immediately by a `.` like in a model version.
pub fn lex_only_integer(&mut self) -> Token {
match self.lex_only_integer_err() {
Ok(x) => x,
Err(e) => self.invalid_token(LexError::Number(e)),
}
}
fn lex_only_integer_err(&mut self) -> Result<Token, Error> {
let Some(next) = self.reader.peek() else {
return Ok(self.eof_token());
};
// not a number, return a different token kind, for error reporting.
if !next.is_ascii_digit() {
return Ok(self.next_token());
}
self.scratch.push(next as char);
self.reader.next();
// eat all the ascii digits
while let Some(x) = self.reader.peek() {
if x == b'_' {
self.reader.next();
} else if !x.is_ascii_digit() {
break;
} else {
self.scratch.push(x as char);
self.reader.next();
}
}
// test for a suffix.
match self.reader.peek() {
Some(b'd' | b'f') => {
// not an integer but parse anyway for error reporting.
return self.lex_suffix(true);
}
Some(x) if x.is_ascii_alphabetic() => return Err(self.invalid_suffix()),
_ => {}
}
self.string = Some(mem::take(&mut self.scratch));
Ok(self.finish_token(TokenKind::Number(NumberKind::Integer)))
}
pub fn lex_number(&mut self, start: u8) -> Token {
match self.lex_number_err(start) {
Ok(x) => x,
Err(e) => self.invalid_token(LexError::Number(e)),
}
}
/// Lex a number.
///
/// Expects the digit which started the number as the start argument.
pub fn lex_number_err(&mut self, start: u8) -> Result<Token, Error> {
debug_assert!(start.is_ascii_digit());
debug_assert_eq!(self.scratch, "");
self.scratch.push(start as char);
loop {
let Some(x) = self.reader.peek() else {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Number(NumberKind::Integer)));
};
match x {
b'0'..=b'9' => {
// next digits.
self.reader.next();
self.scratch.push(x as char);
}
b'.' => {
// mantissa
let backup = self.reader.offset();
self.reader.next();
let next = self.reader.peek();
if let Some(b'0'..=b'9') = next {
self.scratch.push('.');
return self.lex_mantissa();
} else {
// indexing a number
self.reader.backup(backup);
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Number(NumberKind::Integer)));
}
}
b'f' | b'd' => return self.lex_suffix(true),
// 0xc2 is the start byte of 'µ'
0xc2 | b'n' | b'u' | b'm' | b'h' | b'w' | b'y' | b's' => {
// duration suffix, switch to lexing duration.
return Ok(self.lex_duration());
}
b'_' => {
self.reader.next();
}
b'a'..=b'z' | b'A'..=b'Z' => {
// Invalid token: an unexpected identifier character immediately after the number.
// `invalid_suffix` eats all remaining identifier-like characters.
return Err(self.invalid_suffix());
}
_ => {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Number(NumberKind::Integer)));
}
}
}
}
fn invalid_suffix(&mut self) -> Error {
// eat the whole suffix.
while let Some(x) = self.reader.peek() {
if !x.is_ascii_alphanumeric() {
break;
}
self.reader.next();
}
self.scratch.clear();
Error::InvalidSuffix
}
/// Lex a number suffix, either 'f' or 'dec'.
fn lex_suffix(&mut self, can_be_duration: bool) -> Result<Token, Error> {
match self.reader.peek() {
Some(b'f') => {
// float suffix
self.reader.next();
if let Some(true) = self.reader.peek().map(|x| x.is_identifier_continue()) {
Err(self.invalid_suffix())
} else {
self.string = Some(mem::take(&mut self.scratch));
Ok(self.finish_token(TokenKind::Number(NumberKind::Float)))
}
}
Some(b'd') => {
// decimal suffix
self.reader.next();
let checkpoint = self.reader.offset();
if !self.eat(b'e') {
if can_be_duration {
self.reader.backup(checkpoint - 1);
return Ok(self.lex_duration());
} else {
return Err(self.invalid_suffix());
}
}
if !self.eat(b'c') {
return Err(self.invalid_suffix());
}
if let Some(true) = self.reader.peek().map(|x| x.is_identifier_continue()) {
Err(self.invalid_suffix())
} else {
self.string = Some(mem::take(&mut self.scratch));
Ok(self.finish_token(TokenKind::Number(NumberKind::Decimal)))
}
}
_ => unreachable!(),
}
}
/// Lexes the mantissa of a number, i.e. `.8` in `1.8`
pub fn lex_mantissa(&mut self) -> Result<Token, Error> {
loop {
// lex_number already checks if there exists a digit after the dot.
// So this will never fail the first iteration of the loop.
let Some(x) = self.reader.peek() else {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Number(NumberKind::Mantissa)));
};
match x {
b'0'..=b'9' => {
// next digit.
self.reader.next();
self.scratch.push(x as char);
}
b'e' | b'E' => {
// scientific notation
self.reader.next();
self.scratch.push('e');
return self.lex_exponent(true);
}
b'_' => {
self.reader.next();
}
b'f' | b'd' => return self.lex_suffix(false),
b'a'..=b'z' | b'A'..=b'Z' => {
// invalid token, random identifier characters immediately after number.
self.scratch.clear();
return Err(Error::InvalidSuffix);
}
_ => {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Number(NumberKind::Mantissa)));
}
}
}
}
/// Lexes the exponent of a number, i.e. `e10` in `1.1e10`.
fn lex_exponent(&mut self, had_mantissa: bool) -> Result<Token, Error> {
let mut atleast_one = false;
match self.reader.peek() {
Some(x @ (b'-' | b'+')) => {
// an explicit sign before the first digit.
self.reader.next();
self.scratch.push(x as char);
}
Some(x @ b'0'..=b'9') => {
atleast_one = true;
self.reader.next();
self.scratch.push(x as char);
}
_ => {
// some other character, expected at least one digit.
return Err(Error::DigitExpectedExponent);
}
}
loop {
match self.reader.peek() {
Some(x @ b'0'..=b'9') => {
atleast_one = true;
self.reader.next();
self.scratch.push(x as char);
}
Some(b'_') => {
self.reader.next();
}
Some(b'f' | b'd') => return self.lex_suffix(false),
_ => {
if atleast_one {
let kind = if had_mantissa {
NumberKind::MantissaExponent
} else {
NumberKind::Exponent
};
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Number(kind)));
} else {
return Err(Error::DigitExpectedExponent);
}
}
}
}
}
}
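// An illustrative sketch of how the number lexing above classifies literals; the cases mirror
// a subset of the lexer test-suite and are not exhaustive.
#[cfg(test)]
mod number_sketch {
	use crate::syn::v2::{
		lexer::Lexer,
		token::{NumberKind, TokenKind},
	};

	fn kind_of(src: &str) -> TokenKind {
		Lexer::new(src.as_bytes()).next_token().kind
	}

	#[test]
	fn classification() {
		assert_eq!(kind_of("123"), TokenKind::Number(NumberKind::Integer));
		assert_eq!(kind_of("1.5"), TokenKind::Number(NumberKind::Mantissa));
		assert_eq!(kind_of("1.5e10"), TokenKind::Number(NumberKind::MantissaExponent));
		assert_eq!(kind_of("7f"), TokenKind::Number(NumberKind::Float));
		assert_eq!(kind_of("7dec"), TokenKind::Number(NumberKind::Decimal));
		// An alphabetic character directly after the digits is rejected as an invalid suffix.
		assert_eq!(kind_of("7z"), TokenKind::Invalid);
	}
}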

View file

@@ -0,0 +1,157 @@
use thiserror::Error;
use crate::syn::v2::token::Span;
use std::fmt;
#[derive(Error, Debug)]
pub enum CharError {
#[error("found eof inside multi byte character")]
Eof,
#[error("string is not valid utf-8")]
Unicode,
}
#[derive(Clone)]
pub struct BytesReader<'a> {
data: &'a [u8],
current: usize,
}
impl fmt::Debug for BytesReader<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BytesReader")
.field("used", &self.used())
.field("remaining", &self.remaining())
.finish()
}
}
impl<'a> BytesReader<'a> {
pub fn new(slice: &'a [u8]) -> Self {
BytesReader {
data: slice,
current: 0,
}
}
#[inline]
pub fn full(&self) -> &'a [u8] {
self.data
}
#[inline]
pub fn used(&self) -> &'a [u8] {
&self.data[..self.current]
}
#[inline]
pub fn remaining(&self) -> &'a [u8] {
&self.data[self.current..]
}
#[inline]
pub fn len(&self) -> usize {
self.remaining().len()
}
#[inline]
pub fn offset(&self) -> usize {
self.current
}
#[inline]
pub fn backup(&mut self, offset: usize) {
assert!(offset <= self.offset());
self.current = offset;
}
#[inline]
pub fn is_empty(&self) -> bool {
self.remaining().is_empty()
}
#[inline]
pub fn peek(&self) -> Option<u8> {
self.remaining().get(0).copied()
}
#[inline]
pub fn span(&self, span: Span) -> &[u8] {
&self.data[(span.offset as usize)..(span.offset as usize + span.len as usize)]
}
#[inline]
pub fn next_continue_byte(&mut self) -> Result<u8, CharError> {
const CONTINUE_BYTE_PREFIX_MASK: u8 = 0b1100_0000;
const CONTINUE_BYTE_MASK: u8 = 0b0011_1111;
let byte = self.next().ok_or(CharError::Eof)?;
if byte & CONTINUE_BYTE_PREFIX_MASK != 0b1000_0000 {
// the byte is not a continuation byte, so the input is not valid utf-8.
return Err(CharError::Unicode);
}
Ok(byte & CONTINUE_BYTE_MASK)
}
pub fn convert_to_char(&mut self, start: u8) -> Result<char, CharError> {
if start.is_ascii() {
return Ok(start as char);
}
self.complete_char(start)
}
pub fn complete_char(&mut self, start: u8) -> Result<char, CharError> {
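// Decode the remainder of a utf-8 encoded character by dispatching on the start byte: the
// top five bits distinguish 2-byte (110xx), 3-byte (1110x) and 4-byte (11110) sequences.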
match start & 0b1111_1000 {
0b1100_0000 | 0b1101_0000 | 0b1100_1000 | 0b1101_1000 => {
let mut val = (start & 0b0001_1111) as u32;
val <<= 6;
let next = self.next_continue_byte()?;
val |= next as u32;
char::from_u32(val).ok_or(CharError::Unicode)
}
0b1110_0000 | 0b1110_1000 => {
let mut val = (start & 0b0000_1111) as u32;
val <<= 6;
let next = self.next_continue_byte()?;
val |= next as u32;
val <<= 6;
let next = self.next_continue_byte()?;
val |= next as u32;
char::from_u32(val).ok_or(CharError::Unicode)
}
0b1111_0000 => {
let mut val = (start & 0b0000_0111) as u32;
val <<= 6;
let next = self.next_continue_byte()?;
val |= next as u32;
val <<= 6;
let next = self.next_continue_byte()?;
val |= next as u32;
val <<= 6;
let next = self.next_continue_byte()?;
val |= next as u32;
char::from_u32(val).ok_or(CharError::Unicode)
}
x => panic!("start byte did not start multi byte character: {:b}", x),
}
}
}
impl<'a> Iterator for BytesReader<'a> {
type Item = u8;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
let res = self.peek()?;
self.current += 1;
Some(res)
}
fn size_hint(&self) -> (usize, Option<usize>) {
let len = self.len();
(len, Some(len))
}
}
impl<'a> ExactSizeIterator for BytesReader<'a> {
fn len(&self) -> usize {
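// Delegates to the inherent `len`, which reports the number of remaining bytes.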
self.len()
}
}
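// A small sketch of how the `BytesReader` above is typically driven: peeking, consuming,
// backing up to an earlier offset, and completing a multi-byte utf-8 character.
#[cfg(test)]
mod reader_sketch {
	use super::BytesReader;

	#[test]
	fn peek_next_backup() {
		let mut reader = BytesReader::new("aµ".as_bytes());
		assert_eq!(reader.peek(), Some(b'a'));
		assert_eq!(reader.next(), Some(b'a'));
		let checkpoint = reader.offset();
		// 'µ' is encoded as two bytes; complete the character from its start byte.
		let start = reader.next().unwrap();
		assert_eq!(reader.complete_char(start).unwrap(), 'µ');
		// Rewind to the checkpoint and the same bytes become readable again.
		reader.backup(checkpoint);
		assert_eq!(reader.remaining(), "µ".as_bytes());
	}
}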

View file

@@ -0,0 +1,95 @@
//! Lexing of strand-like characters.
use std::mem;
use crate::syn::v2::token::{Token, TokenKind};
use super::{unicode::chars, Error, Lexer};
impl<'a> Lexer<'a> {
/// Lex a plain strand with either single or double quotes.
pub fn lex_strand(&mut self, is_double: bool) -> Token {
match self.lex_strand_err(is_double) {
Ok(x) => x,
Err(x) => {
self.scratch.clear();
self.invalid_token(x)
}
}
}
/// Lex a strand with either double or single quotes, returning a `Result` instead of a token.
pub fn lex_strand_err(&mut self, is_double: bool) -> Result<Token, Error> {
loop {
let Some(x) = self.reader.next() else {
self.scratch.clear();
return Ok(self.eof_token());
};
if x.is_ascii() {
match x {
b'\'' if !is_double => {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Strand));
}
b'"' if is_double => {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Strand));
}
b'\0' => {
// null bytes not allowed
return Err(Error::UnexpectedCharacter('\0'));
}
b'\\' => {
// Handle escape sequences.
let Some(next) = self.reader.next() else {
self.scratch.clear();
return Ok(self.eof_token());
};
match next {
b'\\' => {
self.scratch.push('\\');
}
b'\'' if !is_double => {
self.scratch.push('\'');
}
b'\"' if is_double => {
self.scratch.push('\"');
}
b'/' => {
self.scratch.push('/');
}
b'b' => {
self.scratch.push(chars::BS);
}
b'f' => {
self.scratch.push(chars::FF);
}
b'n' => {
self.scratch.push(chars::LF);
}
b'r' => {
self.scratch.push(chars::CR);
}
b't' => {
self.scratch.push(chars::TAB);
}
x => {
let char = if x.is_ascii() {
x as char
} else {
self.reader.complete_char(x)?
};
return Err(Error::InvalidEscapeCharacter(char));
}
}
}
x => self.scratch.push(x as char),
}
} else {
let c = self.reader.complete_char(x)?;
self.scratch.push(c);
}
}
}
}
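// A short sketch of the strand lexer above. `lex_strand` is entered after the opening quote
// has already been consumed, so the source below starts directly with the strand contents;
// reading the result back through the `string` slot assumes it is crate-visible like the
// other lexer slots.
#[cfg(test)]
mod strand_sketch {
	use crate::syn::v2::{lexer::Lexer, token::TokenKind};

	#[test]
	fn escapes_are_unescaped() {
		// Contents of a double quoted strand followed by its closing quote.
		let mut lexer = Lexer::new("hello\\nworld\"".as_bytes());
		let token = lexer.lex_strand(true);
		assert_eq!(token.kind, TokenKind::Strand);
		// The `\n` escape sequence has been resolved into a real line feed.
		assert_eq!(lexer.string.take().unwrap(), "hello\nworld");
	}
}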

View file

@@ -0,0 +1,482 @@
use chrono::{FixedOffset, NaiveDate, Offset, TimeZone, Utc};
use crate::syn::v2::token::{t, NumberKind, TokenKind};
macro_rules! test_case(
($source:expr => [$($token:expr),*$(,)?]) => {
let mut lexer = crate::syn::v2::lexer::Lexer::new($source.as_bytes());
let mut i = 0;
$(
let next = lexer.next();
if let Some(next) = next {
let span = std::str::from_utf8(lexer.reader.span(next.span)).unwrap_or("invalid utf8");
if let TokenKind::Invalid = next.kind {
let error = lexer.error.take().unwrap();
assert_eq!(next.kind, $token, "{} = {}:{} => {}", span, i, stringify!($token), error);
} else {
assert_eq!(next.kind, $token, "{} = {}:{}", span, i, stringify!($token));
}
} else {
assert_eq!(next, None);
}
i += 1;
)*
let _ = i;
assert_eq!(lexer.next(),None)
};
);
#[test]
fn operators() {
test_case! {
r#"- + / * ! **
< > <= >= <- <-> ->
= == -= += != +?=
? ?? ?: ?~ ?=
{ } [ ] ( )
; , | || & &&
$
. .. ...
^
"# => [
t!("-"), t!("+"), t!("/"), t!("*"), t!("!"), t!("**"),
t!("<"), t!(">"), t!("<="), t!(">="), t!("<-"), t!("<->"), t!("->"),
t!("="), t!("=="), t!("-="), t!("+="), t!("!="), t!("+?="),
t!("?"), t!("??"), t!("?:"), t!("?~"), t!("?="),
t!("{"), t!("}"), t!("["), t!("]"), t!("("), t!(")"),
t!(";"), t!(","), t!("|"), t!("||"), TokenKind::Invalid, t!("&&"),
t!("$"),
t!("."), t!(".."), t!("..."),
TokenKind::Invalid
]
}
}
#[test]
fn comments() {
test_case! {
r"
+ /* some comment */
- // another comment
+ -- a third comment
-
" => [
t!("+"),
t!("-"),
t!("+"),
t!("-"),
]
}
}
#[test]
fn whitespace() {
test_case! {
"+= \t\n\r -=" => [
t!("+="),
t!("-="),
]
}
}
#[test]
fn identifiers() {
test_case! {
r#"
123123adwad +
akdwkj +
akdwkj1231312313123 +
_a_k_d_wkj1231312313123 +
____wdw____ +
"#
=> [
TokenKind::Invalid,
t!("+"),
TokenKind::Identifier,
t!("+"),
TokenKind::Identifier,
t!("+"),
TokenKind::Identifier,
t!("+"),
TokenKind::Identifier,
t!("+"),
]
}
}
#[test]
fn numbers() {
test_case! {
r#"
123123+32010230.123012031+33043030dec+33043030f+
"#
=> [
TokenKind::Number(NumberKind::Integer),
t!("+"),
TokenKind::Number(NumberKind::Mantissa),
t!("+"),
TokenKind::Number(NumberKind::Decimal),
t!("+"),
TokenKind::Number(NumberKind::Float),
t!("+"),
]
}
test_case! {
"+123129decs+"
=> [
t!("+"),
TokenKind::Invalid,
t!("+"),
]
}
test_case! {
"+39349fs+"
=> [
t!("+"),
TokenKind::Invalid,
t!("+"),
]
}
test_case! {
"+394393df+"
=> [
t!("+"),
TokenKind::Invalid,
t!("+"),
]
}
test_case! {
"+32932932def+"
=> [
t!("+"),
TokenKind::Invalid,
t!("+"),
]
}
test_case! {
"+329239329z+"
=> [
t!("+"),
TokenKind::Invalid,
t!("+"),
]
}
}
#[test]
fn duration() {
test_case! {
r#"
1ns+1µs+1us+1ms+1s+1m+1h+1w+1y
1nsa+1ans+1aus+1usa+1ams+1msa+1am+1ma+1ah+1ha+1aw+1wa+1ay+1ya+1µsa
"#
=> [
TokenKind::Duration,
t!("+"),
TokenKind::Duration,
t!("+"),
TokenKind::Duration,
t!("+"),
TokenKind::Duration,
t!("+"),
TokenKind::Duration,
t!("+"),
TokenKind::Duration,
t!("+"),
TokenKind::Duration,
t!("+"),
TokenKind::Duration,
t!("+"),
TokenKind::Duration,
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
t!("+"),
TokenKind::Invalid,
]
}
}
#[test]
fn keyword() {
test_case! {
r#"select SELECT sElEcT"# => [
t!("SELECT"),
t!("SELECT"),
t!("SELECT"),
]
}
}
#[test]
fn uuid() {
let mut lexer = crate::syn::v2::lexer::Lexer::new(
r#" u"e72bee20-f49b-11ec-b939-0242ac120002" "#.as_bytes(),
);
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::Uuid);
let uuid = lexer.uuid.take().unwrap();
assert_eq!(uuid.0.to_string(), "e72bee20-f49b-11ec-b939-0242ac120002");
let mut lexer = crate::syn::v2::lexer::Lexer::new(
r#" u"b19bc00b-aa98-486c-ae37-c8e1c54295b1" "#.as_bytes(),
);
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::Uuid);
let uuid = lexer.uuid.take().unwrap();
assert_eq!(uuid.0.to_string(), "b19bc00b-aa98-486c-ae37-c8e1c54295b1");
}
#[test]
fn date_time_just_date() {
let mut lexer = crate::syn::v2::lexer::Lexer::new(r#" d"2012-04-23" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23).unwrap().and_hms_nano_opt(0, 0, 0, 0).unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_zone_time() {
let mut lexer = crate::syn::v2::lexer::Lexer::new(r#" d"2020-01-01T00:00:00Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_nano_opt(0, 0, 0, 0).unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_with_time() {
let mut lexer = crate::syn::v2::lexer::Lexer::new(r#" d"2012-04-23T18:25:43Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23).unwrap().and_hms_nano_opt(18, 25, 43, 0).unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_nanos() {
let mut lexer =
crate::syn::v2::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.5631Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 563_100_000)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_utc() {
let mut lexer =
crate::syn::v2::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.0000511Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 51_100)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_pacific() {
let mut lexer =
crate::syn::v2::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.511-08:00" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let offset = FixedOffset::west_opt(8 * 3600).unwrap();
let expected_datetime = offset
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 511_000_000)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_pacific_partial() {
let mut lexer =
crate::syn::v2::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.511+08:30" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let offset = FixedOffset::east_opt(8 * 3600 + 30 * 60).unwrap();
let expected_datetime = offset
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 511_000_000)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_utc_nanoseconds() {
let mut lexer =
crate::syn::v2::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.5110000Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let offset = Utc.fix();
let expected_datetime = offset
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 511_000_000)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_utc_sub_nanoseconds() {
let mut lexer =
crate::syn::v2::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.0000511Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let offset = Utc.fix();
let expected_datetime = offset
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 51_100)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}

View file

@@ -0,0 +1,68 @@
//! Unicode-related utilities.
/// Character constants
pub mod chars {
/// Character tabulation
pub const TAB: char = '\u{0009}';
/// Form feed
pub const FF: char = '\u{000C}';
/// Line feed
pub const LF: char = '\u{000A}';
/// Carriage return
pub const CR: char = '\u{000D}';
/// Line separator
pub const LS: char = '\u{2028}';
/// Backspace
pub const BS: char = '\u{0008}';
/// Paragraph separator
pub const PS: char = '\u{2029}';
/// Next line
pub const NEL: char = '\u{0085}';
/// Line terminators for JavaScript source code.
pub const JS_LINE_TERIMATORS: [char; 4] = [LF, CR, LS, PS];
}
pub mod byte {
/// Character tabulation
pub const TAB: u8 = b'\t';
/// Line tabulation
pub const VT: u8 = 0xB;
/// Form feed
pub const FF: u8 = 0xC;
/// Line feed
pub const LF: u8 = 0xA;
/// Carriage return
pub const CR: u8 = 0xD;
/// Space
pub const SP: u8 = 0x20;
}
/// A trait extending u8 for adding some extra function.
pub trait U8Ext {
/// Returns whether the u8 is the start of an identifier.
fn is_identifier_start(&self) -> bool;
/// Returns whether the u8 can start a number.
fn is_number_start(&self) -> bool;
/// Returns whether the u8 can continue an identifier after the first character.
fn is_identifier_continue(&self) -> bool;
}
impl U8Ext for u8 {
fn is_identifier_start(&self) -> bool {
matches!(self, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
fn is_identifier_continue(&self) -> bool {
matches!(self, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
fn is_number_start(&self) -> bool {
self.is_ascii_digit()
}
}
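// A tiny sketch of the `U8Ext` helpers above, as used by the identifier and number lexers;
// the specific bytes checked are illustrative.
#[cfg(test)]
mod u8_ext_sketch {
	use super::U8Ext;

	#[test]
	fn classification() {
		assert!(b'a'.is_identifier_start());
		assert!(b'_'.is_identifier_start());
		assert!(!b'1'.is_identifier_start());
		// Digits cannot start an identifier but can continue one, and can start a number.
		assert!(b'1'.is_identifier_continue());
		assert!(b'1'.is_number_start());
	}
}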

View file

@@ -0,0 +1,124 @@
use crate::{
sql::Uuid,
syn::v2::token::{Token, TokenKind},
};
use super::{Error as LexError, Lexer};
use thiserror::Error;
#[derive(Error, Debug)]
pub enum Error {
#[error("missing digits")]
MissingDigits,
#[error("digit was not in allowed range")]
InvalidRange,
#[error("expected uuid-strand to end")]
ExpectedStrandEnd,
#[error("missing a uuid seperator")]
MissingSeperator,
}
impl<'a> Lexer<'a> {
/// Lex a uuid strand with either double or single quotes.
///
/// Expects the first delimiter to already have been eaten.
pub fn lex_uuid(&mut self, double: bool) -> Token {
match self.lex_uuid_err(double) {
Ok(x) => {
debug_assert!(self.uuid.is_none());
self.uuid = Some(x);
self.finish_token(TokenKind::Uuid)
}
Err(_) => self.invalid_token(LexError::Uuid(Error::MissingDigits)),
}
}
/// Lex a uuid strand with either double or single quotes, returning a `Result` instead of a
/// token.
///
/// Expects the first delimiter to already have been eaten.
pub fn lex_uuid_err(&mut self, double: bool) -> Result<Uuid, Error> {
let uuid = self.lex_uuid_err_inner()?;
let end_char = if double {
b'"'
} else {
b'\''
};
// closing strand character
if !self.eat(end_char) {
return Err(Error::ExpectedStrandEnd);
}
Ok(uuid)
}
/// Lex a uuid strand without its delimiting quotes, returning a `Result` instead of a
/// token.
///
/// Expects the first delimiter to already have been eaten.
pub fn lex_uuid_err_inner(&mut self) -> Result<Uuid, Error> {
let start = self.reader.offset();
if !self.lex_hex(8) {
return Err(Error::MissingDigits);
}
if !self.eat(b'-') {
return Err(Error::MissingSeperator);
}
if !self.lex_hex(4) {
return Err(Error::MissingDigits);
}
if !self.eat(b'-') {
return Err(Error::MissingSeperator);
}
if !self.eat_when(|x| (b'1'..=b'8').contains(&x)) {
if self.reader.peek().map(|x| x.is_ascii_digit()).unwrap_or(false) {
// byte was an ascii digit but not in the valid range.
return Err(Error::InvalidRange);
}
return Err(Error::MissingDigits);
};
if !self.lex_hex(3) {
return Err(Error::MissingDigits);
}
if !self.eat(b'-') {
return Err(Error::MissingSeperator);
}
if !self.lex_hex(4) {
return Err(Error::MissingDigits);
}
if !self.eat(b'-') {
return Err(Error::MissingSeperator);
}
if !self.lex_hex(12) {
return Err(Error::MissingDigits);
}
let end = self.reader.offset();
// The lexer ensures that the section of bytes is valid utf8 so this should never panic.
let uuid_str = std::str::from_utf8(&self.reader.full()[start..end]).unwrap();
// The lexer ensures that the bytes are a valid uuid so this should never panic.
Ok(Uuid(uuid::Uuid::try_from(uuid_str).unwrap()))
}
/// Lexes a given number of hex characters. Returns true if the lexing was successful, false
/// otherwise.
pub fn lex_hex(&mut self, amount: u8) -> bool {
for _ in 0..amount {
if !self.eat_when(|x| matches!(x,b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) {
return false;
}
}
true
}
}
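// A brief sketch of the uuid lexer above. `lex_uuid` is entered after the opening `u"` prefix
// has been handled by the main lexer, so only the hex groups and the closing quote remain.
#[cfg(test)]
mod uuid_sketch {
	use crate::syn::v2::{lexer::Lexer, token::TokenKind};

	#[test]
	fn lex_uuid_groups() {
		let mut lexer = Lexer::new("e72bee20-f49b-11ec-b939-0242ac120002\"".as_bytes());
		let token = lexer.lex_uuid(true);
		assert_eq!(token.kind, TokenKind::Uuid);
		let uuid = lexer.uuid.take().unwrap();
		assert_eq!(uuid.0.to_string(), "e72bee20-f49b-11ec-b939-0242ac120002");
	}
}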

Some files were not shown because too many files have changed in this diff