Restructure the parser around the concept of token gluing.

Mees Delzenne 2024-06-12 15:52:08 +02:00 committed by GitHub
parent 2184e80f45
commit 3539eac25d
45 changed files with 3137 additions and 2785 deletions
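For context: "token gluing" means the lexer now emits small, uniform tokens (digits, duration suffixes, quote kinds, even whitespace) and the parser glues adjacent ones back together into composite values such as durations, datetimes, and UUIDs. A minimal std-only sketch of the idea; the types below are illustrative, not this codebase's API:

use std::time::Duration;

// Illustrative token model, not the crate's TokenKind.
#[derive(Debug, PartialEq)]
enum Tok {
    Digits(u64),
    Suffix(u64), // unit multiplier in seconds, e.g. 3600 for `h`
}

// Glue alternating Digits/Suffix tokens, e.g. `1h30m`, into one duration.
fn glue_duration(tokens: &[Tok]) -> Option<Duration> {
    let mut secs: u64 = 0;
    let mut iter = tokens.iter();
    while let Some(tok) = iter.next() {
        let Tok::Digits(n) = tok else { return None };
        let Some(Tok::Suffix(mul)) = iter.next() else { return None };
        secs = secs.checked_add(n.checked_mul(*mul)?)?;
    }
    Some(Duration::from_secs(secs))
}

fn main() {
    // `1h30m` lexes as [Digits(1), Suffix(h), Digits(30), Suffix(m)]
    let toks = [Tok::Digits(1), Tok::Suffix(3600), Tok::Digits(30), Tok::Suffix(60)];
    assert_eq!(glue_duration(&toks), Some(Duration::from_secs(5400)));
}

The payoff, visible throughout the diff below, is that the lexer loses its fallible, value-producing paths for every literal form.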

View file

@@ -95,7 +95,8 @@ impl Debug for Regex {
impl Display for Regex {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
write!(f, "/{}/", &self.0)
let t = self.0.to_string().replace('/', "\\/");
write!(f, "/{}/", &t)
}
}
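The change above escapes `/` in the pattern body so that the printed literal can be re-lexed; previously a pattern containing `/` would terminate the literal early. A quick illustration of the round-trip:

fn main() {
    let pattern = "a/b";
    let escaped = pattern.replace('/', "\\/");
    assert_eq!(format!("/{}/", escaped), r"/a\/b/");
}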

View file

@@ -88,6 +88,7 @@ impl Uuid {
impl Display for Uuid {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
write!(f, "u")?;
Display::fmt(&quote_str(&self.0.to_string()), f)
}
}

View file

@@ -49,6 +49,21 @@ impl Location {
pub fn of_offset(source: &str, offset: usize) -> Self {
assert!(offset <= source.len(), "tried to find location of substring in unrelated string");
if offset == source.len() {
// EOF character
let (last_line, column) = LineIterator::new(source)
.enumerate()
.last()
.map(|(idx, (l, _))| (idx, l.len()))
.unwrap_or((0, 0));
return Self {
line: last_line + 1,
column: column + 1,
};
}
// Bytes of input prior to line being iterated.
let mut bytes_prior = 0;
for (line_idx, (line, seperator_len)) in LineIterator::new(source).enumerate() {
@@ -109,6 +124,22 @@ impl Location {
let offset = span.offset as usize;
let end = offset + span.len as usize;
if span.len == 0 && source.len() == span.offset as usize {
// EOF span
let (last_line, column) = LineIterator::new(source)
.enumerate()
.last()
.map(|(idx, (l, _))| (idx, l.len()))
.unwrap_or((0, 0));
return Self {
line: last_line + 1,
column,
}..Self {
line: last_line + 1,
column: column + 1,
};
}
// Bytes of input prior to line being iterated.
let mut bytes_prior = 0;
let mut iterator = LineIterator::new(source).enumerate().peekable();
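Both new branches special-case an offset equal to the source length: a zero-width EOF span maps to one column past the end of the last line. The same computation as a std-only sketch, where `eof_location` is a hypothetical helper and `str::lines` stands in for the module's `LineIterator`:

// Hypothetical helper; `str::lines` approximates LineIterator here.
fn eof_location(source: &str) -> (usize, usize) {
    // 1-based line and column; EOF sits one past the last character.
    let (line_idx, last_len) =
        source.lines().enumerate().last().map(|(i, l)| (i, l.len())).unwrap_or((0, 0));
    (line_idx + 1, last_len + 1)
}

fn main() {
    assert_eq!(eof_location("a\nbc"), (2, 3)); // EOF just after "bc" on line 2
}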

View file

@@ -3,7 +3,7 @@ use crate::syn::{
unicode::{byte, chars},
Error, Lexer,
},
token::{t, Token, TokenKind},
token::{t, DatetimeChars, Token, TokenKind},
};
impl<'a> Lexer<'a> {
@@ -41,8 +41,6 @@ impl<'a> Lexer<'a> {
_ => {}
}
}
self.set_whitespace_span(self.current_span());
self.skip_offset();
}
/// Eats a multi line comment and returns an error if `*/` would be missing.
@@ -57,8 +55,6 @@ impl<'a> Lexer<'a> {
};
if b'/' == byte {
self.reader.next();
self.set_whitespace_span(self.current_span());
self.skip_offset();
return Ok(());
}
}
@@ -100,8 +96,6 @@ impl<'a> Lexer<'a> {
_ => break,
}
}
self.set_whitespace_span(self.current_span());
self.skip_offset();
}
// re-lexes a `/` token to a regex token.
@@ -109,7 +103,6 @@ impl<'a> Lexer<'a> {
debug_assert_eq!(token.kind, t!("/"));
debug_assert_eq!(token.span.offset + 1, self.last_offset);
debug_assert_eq!(token.span.len, 1);
debug_assert_eq!(self.scratch, "");
self.last_offset = token.span.offset;
loop {
@@ -117,21 +110,13 @@ impl<'a> Lexer<'a> {
Some(b'\\') => {
if let Some(b'/') = self.reader.peek() {
self.reader.next();
self.scratch.push('/')
} else {
self.scratch.push('\\')
}
}
Some(b'/') => break,
Some(x) => {
if x.is_ascii() {
self.scratch.push(x as char);
} else {
match self.reader.complete_char(x) {
Ok(x) => {
self.scratch.push(x);
}
Err(e) => return self.invalid_token(e.into()),
if !x.is_ascii() {
if let Err(e) = self.reader.complete_char(x) {
return self.invalid_token(e.into());
}
}
}
@@ -139,14 +124,7 @@ impl<'a> Lexer<'a> {
}
}
match self.scratch.parse() {
Ok(x) => {
self.scratch.clear();
self.regex = Some(x);
self.finish_token(TokenKind::Regex)
}
Err(e) => self.invalid_token(Error::Regex(e)),
}
self.finish_token(TokenKind::Regex)
}
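With gluing, the lexer no longer compiles the pattern into `self.regex`; it only brackets the `/.../` span and emits a `Regex` token, leaving compilation, and hence error reporting, to a later stage. A sketch of that deferred step, assuming the `regex` crate this code already depends on; the helper name is illustrative:

// Hypothetical helper: compile the lexed span only when the parser asks.
fn compile_regex_body(source: &str, start: usize, end: usize) -> Result<regex::Regex, regex::Error> {
    regex::Regex::new(&source[start..end])
}

fn main() {
    // Span recorded by the lexer for `/abc/`: the body sits at bytes 1..4.
    assert!(compile_regex_body("/abc/", 1, 4).is_ok());
}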
/// Lex the next token, starting from the given byte.
@@ -163,7 +141,7 @@ impl<'a> Lexer<'a> {
b'@' => t!("@"),
byte::CR | byte::FF | byte::LF | byte::SP | byte::VT | byte::TAB => {
self.eat_whitespace();
return self.next_token_inner();
TokenKind::WhiteSpace
}
b'|' => match self.reader.peek() {
Some(b'|') => {
@@ -262,7 +240,7 @@ impl<'a> Lexer<'a> {
Some(b'-') => {
self.reader.next();
self.eat_single_line_comment();
return self.next_token_inner();
TokenKind::WhiteSpace
}
Some(b'=') => {
self.reader.next();
@@ -294,12 +272,12 @@ impl<'a> Lexer<'a> {
if let Err(e) = self.eat_multi_line_comment() {
return self.invalid_token(e);
}
return self.next_token_inner();
TokenKind::WhiteSpace
}
Some(b'/') => {
self.reader.next();
self.eat_single_line_comment();
return self.next_token_inner();
TokenKind::WhiteSpace
}
_ => t!("/"),
},
@@ -340,41 +318,140 @@ impl<'a> Lexer<'a> {
}
b'#' => {
self.eat_single_line_comment();
return self.next_token_inner();
TokenKind::WhiteSpace
}
b'`' => return self.lex_surrounded_ident(true),
b'"' => return self.lex_strand(true),
b'\'' => return self.lex_strand(false),
b'd' => {
match self.reader.peek() {
Some(b'"') => {
self.reader.next();
return self.lex_datetime(true);
}
Some(b'\'') => {
self.reader.next();
return self.lex_datetime(false);
}
_ => {}
b'"' => t!("\""),
b'\'' => t!("'"),
b'd' => match self.reader.peek() {
Some(b'"') => {
self.reader.next();
t!("d\"")
}
return self.lex_ident_from_next_byte(b'd');
}
b'u' => {
match self.reader.peek() {
Some(b'"') => {
self.reader.next();
return self.lex_uuid(true);
}
Some(b'\'') => {
self.reader.next();
return self.lex_uuid(false);
}
_ => {}
Some(b'\'') => {
self.reader.next();
t!("d'")
}
Some(b'e') => {
self.reader.next();
let Some(b'c') = self.reader.peek() else {
self.scratch.push('d');
return self.lex_ident_from_next_byte(b'e');
};
self.reader.next();
if self.reader.peek().map(|x| x.is_ascii_alphanumeric()).unwrap_or(false) {
self.scratch.push('d');
self.scratch.push('e');
return self.lex_ident_from_next_byte(b'c');
}
t!("dec")
}
Some(x) if !x.is_ascii_alphabetic() => {
t!("d")
}
None => {
t!("d")
}
_ => {
return self.lex_ident_from_next_byte(b'd');
}
},
b'f' => match self.reader.peek() {
Some(x) if !x.is_ascii_alphanumeric() => {
t!("f")
}
None => t!("f"),
_ => {
return self.lex_ident_from_next_byte(b'f');
}
},
b'n' => match self.reader.peek() {
Some(b's') => {
self.reader.next();
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
self.scratch.push('n');
return self.lex_ident_from_next_byte(b's');
}
t!("ns")
}
_ => {
return self.lex_ident_from_next_byte(b'n');
}
},
b'm' => match self.reader.peek() {
Some(b's') => {
self.reader.next();
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
self.scratch.push('m');
return self.lex_ident_from_next_byte(b's');
}
t!("ms")
}
Some(x) if !x.is_ascii_alphabetic() => {
t!("m")
}
None => {
t!("m")
}
_ => {
return self.lex_ident_from_next_byte(b'm');
}
},
b's' => {
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.lex_ident_from_next_byte(b's');
} else {
t!("s")
}
return self.lex_ident_from_next_byte(b'u');
}
b'h' => {
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.lex_ident_from_next_byte(b'h');
} else {
t!("h")
}
}
b'w' => {
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.lex_ident_from_next_byte(b'w');
} else {
t!("w")
}
}
b'y' => {
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.lex_ident_from_next_byte(b'y');
} else {
t!("y")
}
}
b'u' => match self.reader.peek() {
Some(b'"') => {
self.reader.next();
t!("u\"")
}
Some(b'\'') => {
self.reader.next();
t!("u'")
}
Some(b's') => {
self.reader.next();
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
self.scratch.push('u');
return self.lex_ident_from_next_byte(b's');
}
t!("us")
}
_ => {
return self.lex_ident_from_next_byte(b'u');
}
},
b'r' => match self.reader.peek() {
Some(b'\"') => {
Some(b'"') => {
self.reader.next();
t!("r\"")
}
@@ -382,12 +459,33 @@ impl<'a> Lexer<'a> {
self.reader.next();
t!("r'")
}
_ => return self.lex_ident_from_next_byte(byte),
_ => {
return self.lex_ident_from_next_byte(b'r');
}
},
b'Z' => match self.reader.peek() {
Some(x) if x.is_ascii_alphabetic() => {
return self.lex_ident_from_next_byte(b'Z');
}
_ => TokenKind::DatetimeChars(DatetimeChars::Z),
},
b'T' => match self.reader.peek() {
Some(x) if x.is_ascii_alphabetic() => {
return self.lex_ident_from_next_byte(b'T');
}
_ => TokenKind::DatetimeChars(DatetimeChars::T),
},
b'e' => {
return self.lex_exponent(b'e');
}
b'E' => {
return self.lex_exponent(b'E');
}
b'0'..=b'9' => return self.lex_digits(),
b'a'..=b'z' | b'A'..=b'Z' | b'_' => {
return self.lex_ident_from_next_byte(byte);
}
b'0'..=b'9' => return self.lex_number(byte),
//b'0'..=b'9' => return self.lex_number(byte),
x => return self.invalid_token(Error::UnexpectedCharacter(x as char)),
};
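Rather than recursing back into `next_token_inner` after whitespace or a comment, the lexer now surfaces a `TokenKind::WhiteSpace` token. That keeps the token stream contiguous, which is what lets the parser tell whether two adjacent tokens touch and may therefore be glued. A std-only model of the parser-side consequence (illustrative types, not the crate's):

// Illustrative token kinds, not the crate's TokenKind.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Kind { WhiteSpace, Digits, Plus }

// Skip insignificant tokens whenever the parser does not care about spacing.
fn next_significant(tokens: &mut impl Iterator<Item = Kind>) -> Option<Kind> {
    tokens.find(|k| *k != Kind::WhiteSpace)
}

fn main() {
    let mut toks = [Kind::WhiteSpace, Kind::Digits, Kind::WhiteSpace, Kind::Plus].into_iter();
    assert_eq!(next_significant(&mut toks), Some(Kind::Digits));
    assert_eq!(next_significant(&mut toks), Some(Kind::Plus));
}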

View file

@@ -30,6 +30,18 @@ impl<'a> Lexer<'a> {
'⊄' => t!("⊄"),
'×' => t!("×"),
'÷' => t!("÷"),
'µ' => {
let Some(b's') = self.reader.peek() else {
return self.invalid_token(Error::UnexpectedCharacter('µ'));
};
self.reader.next();
if self.reader.peek().map(|x| x.is_ascii_alphabetic()).unwrap_or(false) {
return self.invalid_token(Error::UnexpectedCharacter('µ'));
}
t!("µs")
}
x => return self.invalid_token(Error::UnexpectedCharacter(x)),
};
self.finish_token(kind)
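For reference, 'µ' (U+00B5) occupies two bytes in UTF-8, which is why the byte-level lexing paths elsewhere in this diff match on its first byte, 0xc2:

fn main() {
    assert_eq!("µ".as_bytes(), &[0xc2, 0xb5][..]);
}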

View file

@@ -1,269 +0,0 @@
use std::ops::RangeInclusive;
use chrono::{FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, Offset, TimeZone, Utc};
use thiserror::Error;
use crate::{
sql::Datetime,
syn::token::{Token, TokenKind},
};
use super::{Error as LexError, Lexer};
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum PartError {
#[error("value outside of allowed range")]
OutsideRange,
#[error("missing digit(s)")]
MissingDigits,
#[error("too many digits")]
TooManyDigits,
}
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum Error {
#[error("invalid year, {0}")]
Year(PartError),
#[error("invalid month, {0}")]
Month(PartError),
#[error("invalid day, {0}")]
Day(PartError),
#[error("invalid hour, {0}")]
Hour(PartError),
#[error("invalid time minute, {0}")]
Minute(PartError),
#[error("invalid second, {0}")]
Second(PartError),
#[error("invalid nano_seconds, {0}")]
NanoSeconds(PartError),
#[error("invalid time-zone hour, {0}")]
TimeZoneHour(PartError),
#[error("invalid time-zone minute, {0}")]
TimeZoneMinute(PartError),
#[error("missing seperator `{}`",*(.0) as char)]
MissingSeparator(u8),
#[error("expected date-time strand to end")]
ExpectedEnd,
#[error("missing time-zone")]
MissingTimeZone,
#[error("date does not exist")]
NonExistantDate,
#[error("time does not exist")]
NonExistantTime,
#[error("time-zone offset too big")]
TimeZoneOutOfRange,
}
impl<'a> Lexer<'a> {
/// Lex a date-time strand.
pub fn lex_datetime(&mut self, double: bool) -> Token {
match self.lex_datetime_err(double) {
Ok(x) => {
self.datetime = Some(x);
self.finish_token(TokenKind::DateTime)
}
Err(e) => self.invalid_token(LexError::DateTime(e)),
}
}
/// Lex datetime without enclosing `"` or `'` but return a result or parser error.
pub fn lex_datetime_raw_err(&mut self) -> Result<Datetime, Error> {
let negative = match self.reader.peek() {
Some(b'+') => {
self.reader.next();
false
}
Some(b'-') => {
self.reader.next();
true
}
_ => false,
};
let mut year = self.lex_datetime_part(4, 0..=9999).map_err(Error::Year)? as i16;
if negative {
year = -year;
}
if !self.eat(b'-') {
return Err(Error::MissingSeparator(b'-'));
}
let month = self.lex_datetime_part(2, 1..=12).map_err(Error::Month)?;
if !self.eat(b'-') {
return Err(Error::MissingSeparator(b'-'));
}
let day = self.lex_datetime_part(2, 1..=31).map_err(Error::Day)?;
if !self.eat(b'T') {
let Some(date) = NaiveDate::from_ymd_opt(year as i32, month as u32, day as u32) else {
return Err(Error::NonExistantDate);
};
let time = NaiveTime::default();
let date_time = NaiveDateTime::new(date, time);
let datetime = Utc
.fix()
.from_local_datetime(&date_time)
.earliest()
// this should never panic with a fixed offset.
.unwrap()
.with_timezone(&Utc);
return Ok(Datetime(datetime));
}
let hour = self.lex_datetime_part(2, 0..=24).map_err(Error::Hour)?;
if !self.eat(b':') {
return Err(Error::MissingSeparator(b':'));
}
let minutes = self.lex_datetime_part(2, 0..=59).map_err(Error::Minute)?;
if !self.eat(b':') {
return Err(Error::MissingSeparator(b':'));
}
let seconds = self.lex_datetime_part(2, 0..=59).map_err(Error::Second)?;
// nano seconds
let nano = if let Some(b'.') = self.reader.peek() {
self.reader.next();
// check if there is at least one digit.
if !matches!(self.reader.peek(), Some(b'0'..=b'9')) {
return Err(Error::NanoSeconds(PartError::MissingDigits));
}
let mut number = 0u32;
for i in 0..9 {
let Some(c) = self.reader.peek() else {
// always invalid token, just let the next section handle the error.
break;
};
if !c.is_ascii_digit() {
// If digits are missing they are counted as 0's
for _ in i..9 {
number *= 10;
}
break;
}
self.reader.next();
number *= 10;
number += (c - b'0') as u32;
}
// ensure nano_seconds are at most 9 digits.
if matches!(self.reader.peek(), Some(b'0'..=b'9')) {
return Err(Error::NanoSeconds(PartError::TooManyDigits));
}
number
} else {
0
};
// time zone
let time_zone = match self.reader.peek() {
Some(b'Z') => {
self.reader.next();
None
}
Some(x @ (b'-' | b'+')) => {
self.reader.next();
let negative = x == b'-';
let hour = self.lex_datetime_part(2, 0..=24).map_err(Error::TimeZoneHour)? as i32;
let Some(b':') = self.reader.next() else {
return Err(Error::MissingSeparator(b':'));
};
let minute =
self.lex_datetime_part(2, 0..=59).map_err(Error::TimeZoneMinute)? as i32;
let time = hour * 3600 + minute * 60;
if negative {
Some(-time)
} else {
Some(time)
}
}
_ => return Err(Error::MissingTimeZone),
};
// calculate the given datetime from individual parts.
let Some(date) = NaiveDate::from_ymd_opt(year as i32, month as u32, day as u32) else {
return Err(Error::NonExistantDate);
};
let Some(time) =
NaiveTime::from_hms_nano_opt(hour as u32, minutes as u32, seconds as u32, nano)
else {
return Err(Error::NonExistantTime);
};
let date_time = NaiveDateTime::new(date, time);
let zone = match time_zone {
None => Utc.fix(),
Some(offset) => if offset < 0 {
FixedOffset::west_opt(-offset)
} else {
FixedOffset::east_opt(offset)
}
.ok_or(Error::TimeZoneOutOfRange)?,
};
let datetime = zone
.from_local_datetime(&date_time)
.earliest()
// this should never panic with a fixed offset.
.unwrap()
.with_timezone(&Utc);
Ok(Datetime(datetime))
}
/// Lex full datetime but return a result instead of a token.
pub fn lex_datetime_err(&mut self, double: bool) -> Result<Datetime, Error> {
let datetime = self.lex_datetime_raw_err()?;
let end_char = if double {
b'"'
} else {
b'\''
};
if !self.eat(end_char) {
return Err(Error::ExpectedEnd);
}
Ok(datetime)
}
/// Lexes a digit part of date time.
///
/// This function eats an amount of digits and then checks if the value the digits represent
/// is within the given range.
pub fn lex_datetime_part(
&mut self,
mut amount: u8,
range: RangeInclusive<u16>,
) -> Result<u16, PartError> {
let mut value = 0u16;
while amount != 0 {
value *= 10;
let Some(char) = self.reader.peek() else {
return Err(PartError::MissingDigits);
};
if !char.is_ascii_digit() {
return Err(PartError::MissingDigits);
}
self.reader.next();
value += (char - b'0') as u16;
amount -= 1;
}
if matches!(self.reader.peek(), Some(b'0'..=b'9')) {
return Err(PartError::TooManyDigits);
}
if !range.contains(&value) {
return Err(PartError::OutsideRange);
}
Ok(value)
}
}
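This file is deleted wholesale: datetimes are now assembled in the parser out of glued tokens rather than lexed as a single literal. The heart of the removed code was fixed-width digit reading with a range check; a std-only sketch of that building block, with `parse_part` as a hypothetical stand-in for the deleted `lex_datetime_part`:

use std::ops::RangeInclusive;

// Hypothetical stand-in for lex_datetime_part, operating on a string slice.
fn parse_part(s: &str, width: usize, range: RangeInclusive<u16>) -> Option<u16> {
    let digits = s.get(..width)?;
    if !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None; // missing digit(s)
    }
    let value: u16 = digits.parse().ok()?;
    range.contains(&value).then_some(value)
}

fn main() {
    assert_eq!(parse_part("2012-04-23", 4, 0..=9999), Some(2012)); // year
    assert_eq!(parse_part("13-04", 2, 1..=12), None); // month out of range
}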

View file

@@ -1,179 +0,0 @@
use std::time::Duration as StdDuration;
use thiserror::Error;
use crate::{
sql::duration::{
Duration, SECONDS_PER_DAY, SECONDS_PER_HOUR, SECONDS_PER_MINUTE, SECONDS_PER_WEEK,
SECONDS_PER_YEAR,
},
syn::token::{Token, TokenKind},
};
use super::{Error as LexError, Lexer};
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum Error {
#[error("invalid duration suffix")]
InvalidSuffix,
#[error("duration value overflowed")]
Overflow,
}
impl<'a> Lexer<'a> {
/// Lex a duration.
///
/// Expects the lexer to have already eaten the digits starting the duration.
pub fn lex_duration(&mut self) -> Token {
let backup = self.reader.offset();
match self.lex_duration_err() {
Ok(x) => {
self.scratch.clear();
self.duration = Some(x);
self.finish_token(TokenKind::Duration)
}
Err(e) => {
if self.flexible_ident {
self.reader.backup(backup);
return self.lex_ident();
}
self.scratch.clear();
self.invalid_token(LexError::Duration(e))
}
}
}
fn invalid_suffix_duration(&mut self) -> Error {
// eat the whole suffix.
while let Some(x) = self.reader.peek() {
if !x.is_ascii_alphanumeric() {
break;
}
self.reader.next();
}
Error::InvalidSuffix
}
/// Lex a duration.
///
/// Should only be called from lexing a number.
///
/// Expects at least one numeric character to have been pushed into scratch.
pub fn lex_duration_err(&mut self) -> Result<Duration, Error> {
let mut duration = StdDuration::ZERO;
let mut current_value = 0u64;
// use the existing eat span to generate the current value.
// span already contains
let mut span = self.current_span();
span.len -= 1;
for b in self.scratch.as_bytes() {
debug_assert!(b.is_ascii_digit(), "`{}` is not a digit", b);
current_value = current_value.checked_mul(10).ok_or(Error::Overflow)?;
current_value = current_value.checked_add((b - b'0') as u64).ok_or(Error::Overflow)?;
}
loop {
let Some(next) = self.reader.peek() else {
return Err(Error::InvalidSuffix);
};
// Match the suffix.
let new_duration = match next {
x @ (b'n' | b'u') => {
// Nano or micro suffix
self.reader.next();
if !self.eat(b's') {
return Err(Error::InvalidSuffix);
};
if x == b'n' {
StdDuration::from_nanos(current_value)
} else {
StdDuration::from_micros(current_value)
}
}
// Starting byte of 'µ'
0xc2 => {
self.reader.next();
// Second byte of 'µ'.
// Always consume as the next byte will always be part of a two byte character.
if !self.eat(0xb5) {
return Err(self.invalid_suffix_duration());
}
if !self.eat(b's') {
return Err(self.invalid_suffix_duration());
}
StdDuration::from_micros(current_value)
}
b'm' => {
self.reader.next();
// Either milli or minute
let is_milli = self.eat(b's');
if is_milli {
StdDuration::from_millis(current_value)
} else {
let Some(number) = current_value.checked_mul(SECONDS_PER_MINUTE) else {
return Err(Error::Overflow);
};
StdDuration::from_secs(number)
}
}
x @ (b's' | b'h' | b'd' | b'w' | b'y') => {
self.reader.next();
// second, hour, day, week or year.
let new_duration = match x {
b's' => Some(StdDuration::from_secs(current_value)),
b'h' => {
current_value.checked_mul(SECONDS_PER_HOUR).map(StdDuration::from_secs)
}
b'd' => {
current_value.checked_mul(SECONDS_PER_DAY).map(StdDuration::from_secs)
}
b'w' => {
current_value.checked_mul(SECONDS_PER_WEEK).map(StdDuration::from_secs)
}
b'y' => {
current_value.checked_mul(SECONDS_PER_YEAR).map(StdDuration::from_secs)
}
_ => unreachable!(),
};
let Some(new_duration) = new_duration else {
return Err(Error::Overflow);
};
new_duration
}
_ => {
return Err(self.invalid_suffix_duration());
}
};
duration = duration.checked_add(new_duration).ok_or(Error::Overflow)?;
let next = self.reader.peek();
match next {
// there was some remaining alphabetic characters after the valid suffix, so the
// suffix is invalid.
Some(b'a'..=b'z' | b'A'..=b'Z' | b'_') => {
return Err(self.invalid_suffix_duration())
}
Some(b'0'..=b'9') => {} // Duration continues.
_ => return Ok(Duration(duration)),
}
current_value = 0;
// Eat all the next numbers
while let Some(b @ b'0'..=b'9') = self.reader.peek() {
self.reader.next();
current_value = current_value.checked_mul(10).ok_or(Error::Overflow)?;
current_value =
current_value.checked_add((b - b'0') as u64).ok_or(Error::Overflow)?;
}
}
}
}
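Duration lexing is deleted for the same reason and rebuilt on glued `Digits` and `DurationSuffix` tokens (see the test changes further down). The overflow discipline of the removed code carries over; a sketch of the checked accumulation, with the unit multiplier assumed to match the old `SECONDS_PER_*` constants:

use std::time::Duration;

// Sketch: fold one `<digits><suffix>` component into a running total.
fn add_component(total: Duration, value: u64, secs_per_unit: u64) -> Option<Duration> {
    let secs = value.checked_mul(secs_per_unit)?; // overflow becomes None
    total.checked_add(Duration::from_secs(secs))
}

fn main() {
    let d = add_component(Duration::ZERO, 90, 60).unwrap(); // 90m
    assert_eq!(d, Duration::from_secs(5400));
    assert_eq!(add_component(Duration::ZERO, u64::MAX, 60), None);
}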

View file

@@ -2,10 +2,16 @@ use std::mem;
use unicase::UniCase;
use crate::syn::lexer::{keywords::KEYWORDS, Error, Lexer};
use crate::syn::token::{NumberKind, Token, TokenKind};
use crate::syn::{
lexer::{keywords::KEYWORDS, Error, Lexer},
token::{Token, TokenKind},
};
use super::unicode::{chars, U8Ext};
use super::unicode::chars;
fn is_identifier_continue(x: u8) -> bool {
matches!(x, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
impl<'a> Lexer<'a> {
/// Lex a parameter in the form of `$[a-zA-Z0-9_]*`
@@ -35,7 +41,6 @@ impl<'a> Lexer<'a> {
/// by `[a-zA-Z0-9_]*`.
pub fn lex_ident_from_next_byte(&mut self, start: u8) -> Token {
debug_assert!(matches!(start, b'a'..=b'z' | b'A'..=b'Z' | b'_'));
debug_assert_eq!(self.scratch, "");
self.scratch.push(start as char);
self.lex_ident()
}
@@ -46,7 +51,7 @@ impl<'a> Lexer<'a> {
pub fn lex_ident(&mut self) -> Token {
loop {
if let Some(x) = self.reader.peek() {
if x.is_identifier_continue() {
if is_identifier_continue(x) {
self.scratch.push(x as char);
self.reader.next();
continue;
@@ -64,7 +69,7 @@ impl<'a> Lexer<'a> {
if self.scratch == "NaN" {
self.scratch.clear();
return self.finish_token(TokenKind::Number(NumberKind::NaN));
return self.finish_token(TokenKind::NaN);
} else {
self.string = Some(mem::take(&mut self.scratch));
return self.finish_token(TokenKind::Identifier);

View file

@@ -1,8 +1,6 @@
use crate::syn::token::VectorTypeKind;
use crate::{
sql::change_feed_include::ChangeFeedInclude,
sql::{language::Language, Algorithm},
syn::token::{DistanceKind, Keyword, TokenKind},
syn::token::{DistanceKind, Keyword, TokenKind, VectorTypeKind},
};
use phf::{phf_map, phf_set};
use unicase::UniCase;
@@ -167,6 +165,7 @@ pub(crate) static KEYWORDS: phf::Map<UniCase<&'static str>, TokenKind> = phf_map
UniCase::ascii("ONLY") => TokenKind::Keyword(Keyword::Only),
UniCase::ascii("OPTION") => TokenKind::Keyword(Keyword::Option),
UniCase::ascii("ORDER") => TokenKind::Keyword(Keyword::Order),
UniCase::ascii("ORIGINAL") => TokenKind::Keyword(Keyword::Original),
UniCase::ascii("PARALLEL") => TokenKind::Keyword(Keyword::Parallel),
UniCase::ascii("PARAM") => TokenKind::Keyword(Keyword::Param),
UniCase::ascii("PASSHASH") => TokenKind::Keyword(Keyword::Passhash),
@@ -366,6 +365,4 @@ pub(crate) static KEYWORDS: phf::Map<UniCase<&'static str>, TokenKind> = phf_map
UniCase::ascii("I32") => TokenKind::VectorType(VectorTypeKind::I32),
UniCase::ascii("I16") => TokenKind::VectorType(VectorTypeKind::I16),
// Change Feed keywords
UniCase::ascii("ORIGINAL") => TokenKind::ChangeFeedInclude(ChangeFeedInclude::Original),
};

View file

@@ -1,13 +1,10 @@
use crate::{
sql::{Datetime, Duration, Regex, Uuid},
syn::token::{Span, Token, TokenKind},
};
use std::time::Duration;
use chrono::{DateTime, Utc};
use thiserror::Error;
mod byte;
mod char;
mod datetime;
mod duration;
mod ident;
mod js;
pub mod keywords;
@@ -15,12 +12,14 @@ mod number;
mod reader;
mod strand;
mod unicode;
mod uuid;
#[cfg(test)]
mod test;
pub use reader::{BytesReader, CharError};
use uuid::Uuid;
use crate::syn::token::{Span, Token, TokenKind};
/// An error returned by the lexer when an invalid token is encountered.
///
@@ -39,16 +38,6 @@ pub enum Error {
InvalidUtf8,
#[error("expected next character to be '{0}'")]
ExpectedEnd(char),
#[error("failed to lex date-time, {0}")]
DateTime(#[from] datetime::Error),
#[error("failed to lex uuid, {0}")]
Uuid(#[from] uuid::Error),
#[error("failed to lex duration, {0}")]
Duration(#[from] duration::Error),
#[error("failed to lex number, {0}")]
Number(#[from] number::Error),
#[error("failed to parse regex, {0}")]
Regex(regex::Error),
}
impl From<CharError> for Error {
@@ -78,16 +67,10 @@ pub struct Lexer<'a> {
pub reader: BytesReader<'a>,
/// The one past the last character of the previous token.
last_offset: u32,
/// The span of whitespace if it was read between two tokens.
whitespace_span: Option<Span>,
/// A buffer used to build the value of tokens which can't be read straight from the source.
/// like for example strings with escape characters.
scratch: String,
/// Allow the next parsed idents to be flexible, i.e. support idents which start with a
/// number.
pub flexible_ident: bool,
// below are a collection of storage for values produced by tokens.
// For performance reasons we want to keep the tokens as small as possible.
// As only some tokens have an additional value associated with them we don't store that value
@@ -103,8 +86,7 @@ pub struct Lexer<'a> {
// actual number value to when the parser can decide on a format.
pub string: Option<String>,
pub duration: Option<Duration>,
pub datetime: Option<Datetime>,
pub regex: Option<Regex>,
pub datetime: Option<DateTime<Utc>>,
pub uuid: Option<Uuid>,
pub error: Option<Error>,
}
@@ -119,15 +101,12 @@ impl<'a> Lexer<'a> {
Lexer {
reader,
last_offset: 0,
whitespace_span: None,
scratch: String::new(),
flexible_ident: false,
string: None,
datetime: None,
duration: None,
regex: None,
uuid: None,
error: None,
duration: None,
datetime: None,
uuid: None,
}
}
@@ -137,13 +116,7 @@ impl<'a> Lexer<'a> {
pub fn reset(&mut self) {
self.last_offset = 0;
self.scratch.clear();
self.flexible_ident = false;
self.whitespace_span = None;
self.string = None;
self.datetime = None;
self.duration = None;
self.regex = None;
self.uuid = None;
self.error = None;
}
@@ -159,30 +132,12 @@ impl<'a> Lexer<'a> {
Lexer {
reader,
last_offset: 0,
whitespace_span: None,
scratch: self.scratch,
flexible_ident: false,
string: self.string,
datetime: self.datetime,
duration: self.duration,
regex: self.regex,
uuid: self.uuid,
error: self.error,
}
}
/// Return the whitespace of the last token buffered, either peeked or popped.
pub fn whitespace_span(&self) -> Option<Span> {
self.whitespace_span
}
/// Used for setting the span of whitespace between tokens. Will extend the current whitespace
/// if there already is one.
fn set_whitespace_span(&mut self, span: Span) {
if let Some(existing) = self.whitespace_span.as_mut() {
*existing = existing.covers(span);
} else {
self.whitespace_span = Some(span);
duration: self.duration,
datetime: self.datetime,
uuid: self.uuid,
}
}
@@ -190,11 +145,6 @@ impl<'a> Lexer<'a> {
///
/// If the lexer is at the end of the source it will always return the Eof token.
pub fn next_token(&mut self) -> Token {
self.whitespace_span = None;
self.next_token_inner()
}
fn next_token_inner(&mut self) -> Token {
let Some(byte) = self.reader.next() else {
return self.eof_token();
};
@@ -219,13 +169,6 @@ impl<'a> Lexer<'a> {
}
}
/// Skip the last consumed bytes in the reader.
///
/// The bytes consumed before this point won't be part of the span.
fn skip_offset(&mut self) {
self.last_offset = self.reader.offset() as u32;
}
/// Return an invalid token.
fn invalid_token(&mut self, error: Error) -> Token {
self.error = Some(error);
@@ -305,112 +248,6 @@ impl<'a> Lexer<'a> {
false
}
}
/// Lex a single `"` or `'` character with possible leading whitespace.
///
/// Used for parsing record strings.
pub fn lex_record_string_close(&mut self) -> Token {
loop {
let Some(byte) = self.reader.next() else {
return self.invalid_token(Error::UnexpectedEof);
};
match byte {
unicode::byte::CR
| unicode::byte::FF
| unicode::byte::LF
| unicode::byte::SP
| unicode::byte::VT
| unicode::byte::TAB => {
self.eat_whitespace();
continue;
}
b'"' => {
return self.finish_token(TokenKind::CloseRecordString {
double: true,
});
}
b'\'' => {
return self.finish_token(TokenKind::CloseRecordString {
double: false,
});
}
b'-' => match self.reader.next() {
Some(b'-') => {
self.eat_single_line_comment();
continue;
}
Some(x) => match self.reader.convert_to_char(x) {
Ok(c) => return self.invalid_token(Error::UnexpectedCharacter(c)),
Err(e) => return self.invalid_token(e.into()),
},
None => return self.invalid_token(Error::UnexpectedEof),
},
b'/' => match self.reader.next() {
Some(b'*') => {
if let Err(e) = self.eat_multi_line_comment() {
return self.invalid_token(e);
}
continue;
}
Some(b'/') => {
self.eat_single_line_comment();
continue;
}
Some(x) => match self.reader.convert_to_char(x) {
Ok(c) => return self.invalid_token(Error::UnexpectedCharacter(c)),
Err(e) => return self.invalid_token(e.into()),
},
None => return self.invalid_token(Error::UnexpectedEof),
},
b'#' => {
self.eat_single_line_comment();
continue;
}
x => match self.reader.convert_to_char(x) {
Ok(c) => return self.invalid_token(Error::UnexpectedCharacter(c)),
Err(e) => return self.invalid_token(e.into()),
},
}
}
}
/// Lex only a datetime without enclosing delimiters.
///
/// Used for reusing lexer lexing code for parsing datetimes. Should not be called during
/// normal parsing.
pub fn lex_only_datetime(&mut self) -> Result<Datetime, Error> {
self.lex_datetime_raw_err().map_err(Error::DateTime)
}
/// Lex only a duration.
///
/// Used for reusing lexer lexing code for parsing durations. Should not be used during normal
/// parsing.
pub fn lex_only_duration(&mut self) -> Result<Duration, Error> {
match self.reader.next() {
Some(x @ b'0'..=b'9') => {
self.scratch.push(x as char);
while let Some(x @ b'0'..=b'9') = self.reader.peek() {
self.reader.next();
self.scratch.push(x as char);
}
self.lex_duration_err().map_err(Error::Duration)
}
Some(x) => {
let char = self.reader.convert_to_char(x)?;
Err(Error::UnexpectedCharacter(char))
}
None => Err(Error::UnexpectedEof),
}
}
/// Lex only a UUID.
///
/// Used for reusing lexer lexing code for parsing UUIDs. Should not be used during normal
/// parsing.
pub fn lex_only_uuid(&mut self) -> Result<Uuid, Error> {
Ok(self.lex_uuid_err_inner()?)
}
}
impl Iterator for Lexer<'_> {

View file

@@ -1,277 +1,24 @@
use crate::syn::{
lexer::{unicode::U8Ext, Error as LexError, Lexer},
token::{NumberKind, Token, TokenKind},
};
use std::mem;
use thiserror::Error;
use crate::syn::token::{Token, TokenKind};
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum Error {
#[error("invalid number suffix")]
InvalidSuffix,
#[error("expected atleast a single digit in the exponent")]
DigitExpectedExponent,
}
use super::Lexer;
impl Lexer<'_> {
pub fn finish_number_token(&mut self, kind: NumberKind) -> Token {
let mut str = mem::take(&mut self.scratch);
str.retain(|x| x != '_');
self.string = Some(str);
self.finish_token(TokenKind::Number(kind))
}
/// Lex only an integer.
/// Use when a number can be followed immediately by a `.` like in a model version.
pub fn lex_only_integer(&mut self) -> Token {
let Some(next) = self.reader.peek() else {
return self.eof_token();
};
// not a number, return a different token kind, for error reporting.
if !next.is_ascii_digit() {
return self.next_token();
}
self.scratch.push(next as char);
self.reader.next();
// eat all the ascii digits
while let Some(x) = self.reader.peek() {
if !x.is_ascii_digit() && x != b'_' {
break;
} else {
self.scratch.push(x as char);
self.reader.next();
}
}
// test for a suffix.
match self.reader.peek() {
Some(b'd' | b'f') => {
// not an integer but parse anyway for error reporting.
return self.lex_suffix(false, false, false);
}
Some(x) if x.is_ascii_alphabetic() => return self.invalid_suffix_token(),
_ => {}
}
self.finish_number_token(NumberKind::Integer)
}
/// Lex a number.
///
/// Expects the digit which started the number as the start argument.
pub fn lex_number(&mut self, start: u8) -> Token {
debug_assert!(start.is_ascii_digit());
debug_assert_eq!(self.scratch, "");
self.scratch.push(start as char);
loop {
let Some(x) = self.reader.peek() else {
return self.finish_number_token(NumberKind::Integer);
};
match x {
b'0'..=b'9' => {
// next digits.
self.reader.next();
self.scratch.push(x as char);
}
x @ (b'e' | b'E') => {
// scientific notation
self.reader.next();
self.scratch.push(x as char);
return self.lex_exponent(false);
}
b'.' => {
// mantissa
let backup = self.reader.offset();
self.reader.next();
let next = self.reader.peek();
if let Some(b'0'..=b'9') = next {
self.scratch.push('.');
return self.lex_mantissa();
} else {
// indexing a number
self.reader.backup(backup);
return self.finish_number_token(NumberKind::Integer);
}
}
b'f' | b'd' => return self.lex_suffix(false, false, false),
// 0xc2 is the start byte of 'µ'
0xc2 | b'n' | b'u' | b'm' | b'h' | b'w' | b'y' | b's' => {
// duration suffix, switch to lexing duration.
return self.lex_duration();
}
b'_' => {
self.reader.next();
}
b'a'..=b'z' | b'A'..=b'Z' => {
if self.flexible_ident {
return self.lex_ident();
} else {
return self.invalid_suffix_token();
}
}
_ => {
return self.finish_number_token(NumberKind::Integer);
}
}
}
}
fn invalid_suffix_token(&mut self) -> Token {
// eat the whole suffix.
while let Some(x) = self.reader.peek() {
if !x.is_ascii_alphanumeric() {
break;
}
pub fn lex_digits(&mut self) -> Token {
while let Some(b'0'..=b'9' | b'_') = self.reader.peek() {
self.reader.next();
}
self.scratch.clear();
self.invalid_token(LexError::Number(Error::InvalidSuffix))
self.finish_token(TokenKind::Digits)
}
/// Lex a number suffix, either 'f' or 'dec'.
fn lex_suffix(&mut self, had_mantissa: bool, had_exponent: bool, had_operator: bool) -> Token {
match self.reader.peek() {
Some(b'f') => {
// float suffix
self.reader.next();
if let Some(true) = self.reader.peek().map(|x| x.is_identifier_continue()) {
if self.flexible_ident && !had_mantissa && !had_operator {
self.scratch.push('f');
self.lex_ident()
} else {
self.invalid_suffix_token()
}
} else {
let kind = if had_mantissa {
NumberKind::FloatMantissa
} else {
NumberKind::Float
};
self.finish_number_token(kind)
}
}
Some(b'd') => {
// decimal suffix
self.reader.next();
let checkpoint = self.reader.offset();
if !self.eat(b'e') {
if !had_mantissa && !had_exponent && !had_operator {
self.reader.backup(checkpoint - 1);
return self.lex_duration();
} else if !had_mantissa && self.flexible_ident {
self.scratch.push('d');
return self.lex_ident();
} else {
return self.invalid_suffix_token();
}
}
if !self.eat(b'c') {
if self.flexible_ident {
self.scratch.push('d');
self.scratch.push('e');
return self.lex_ident();
} else {
return self.invalid_suffix_token();
}
}
if let Some(true) = self.reader.peek().map(|x| x.is_identifier_continue()) {
self.invalid_suffix_token()
} else {
let kind = if had_exponent {
NumberKind::DecimalExponent
} else {
NumberKind::Decimal
};
self.finish_number_token(kind)
}
}
// Caller should ensure this is unreachable
_ => unreachable!(),
}
}
/// Lexes the mantissa of a number, i.e. `.8` in `1.8`
pub fn lex_mantissa(&mut self) -> Token {
loop {
// lex_number already checks if there exists a digit after the dot.
// So this will never fail the first iteration of the loop.
let Some(x) = self.reader.peek() else {
return self.finish_number_token(NumberKind::Mantissa);
};
match x {
b'0'..=b'9' | b'_' => {
// next digit.
self.reader.next();
self.scratch.push(x as char);
}
b'e' | b'E' => {
// scientific notation
self.reader.next();
self.scratch.push('e');
return self.lex_exponent(true);
}
b'f' | b'd' => return self.lex_suffix(true, false, false),
b'a'..=b'z' | b'A'..=b'Z' => {
// invalid token, random identifier characters immediately after number.
self.scratch.clear();
return self.invalid_suffix_token();
}
_ => {
return self.finish_number_token(NumberKind::Mantissa);
}
}
}
}
/// Lexes the exponent of a number, i.e. `e10` in `1.1e10`.
fn lex_exponent(&mut self, had_mantissa: bool) -> Token {
let mut had_operator = false;
let mut peek = self.reader.peek();
if let Some(x @ b'-' | x @ b'+') = peek {
had_operator = true;
self.reader.next();
self.scratch.push(x as char);
peek = self.reader.peek();
}
if let Some(x @ b'0'..=b'9') = peek {
self.reader.next();
self.scratch.push(x as char);
} else {
if self.flexible_ident && !had_mantissa && !had_operator {
pub fn lex_exponent(&mut self, start: u8) -> Token {
if let Some(x) = self.reader.peek() {
if x.is_ascii_alphabetic() || x == b'_' {
self.scratch.push(start as char);
return self.lex_ident();
}
return self.invalid_token(LexError::Number(Error::DigitExpectedExponent));
}
};
loop {
match self.reader.peek() {
Some(x @ (b'0'..=b'9' | b'_')) => {
self.reader.next();
self.scratch.push(x as char);
}
Some(b'f' | b'd') => return self.lex_suffix(had_mantissa, true, had_operator),
Some(x) if x.is_identifier_continue() => {
if self.flexible_ident && !had_operator && !had_mantissa {
return self.lex_ident();
}
return self.invalid_token(LexError::Number(Error::InvalidSuffix));
}
_ => {
let kind = if had_mantissa {
NumberKind::MantissaExponent
} else {
NumberKind::Exponent
};
return self.finish_number_token(kind);
}
}
}
self.finish_token(TokenKind::Exponent)
}
}

View file

@@ -77,7 +77,7 @@ impl<'a> BytesReader<'a> {
self.remaining().first().copied()
}
#[inline]
pub fn span(&self, span: Span) -> &[u8] {
pub fn span(&self, span: Span) -> &'a [u8] {
&self.data[(span.offset as usize)..(span.offset as usize + span.len as usize)]
}
#[inline]
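The lifetime change above is subtle but load-bearing: returning `&'a [u8]` ties the slice to the underlying source buffer rather than to the `&self` borrow, so a caller can hold on to token text while continuing to advance the reader. A minimal reproduction of the pattern (illustrative types):

// Illustrative reader, not the crate's BytesReader.
struct Reader<'a> { data: &'a [u8], pos: usize }

impl<'a> Reader<'a> {
    // The slice borrows from `data` (lifetime 'a), not from `&self`.
    fn span(&self, start: usize, len: usize) -> &'a [u8] {
        &self.data[start..start + len]
    }
    fn advance(&mut self) { self.pos += 1; }
}

fn main() {
    let mut r = Reader { data: b"select", pos: 0 };
    let s = r.span(0, 6);
    r.advance(); // fine: `s` does not borrow from `r`
    assert_eq!(s, &b"select"[..]);
}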

View file

@@ -2,49 +2,45 @@
use std::mem;
use crate::syn::token::{Token, TokenKind};
use crate::syn::token::{QouteKind, Token, TokenKind};
use super::{unicode::chars, Error, Lexer};
impl<'a> Lexer<'a> {
/// Lex a plain strand with either single or double quotes.
pub fn lex_strand(&mut self, is_double: bool) -> Token {
match self.lex_strand_err(is_double) {
Ok(x) => x,
Err(x) => {
self.scratch.clear();
self.invalid_token(x)
}
}
}
pub fn relex_strand(&mut self, token: Token) -> Token {
let is_double = match token.kind {
TokenKind::Qoute(QouteKind::Plain) => false,
TokenKind::Qoute(QouteKind::PlainDouble) => true,
x => panic!("invalid token kind, '{:?}' is not allowed for re-lexing strands", x),
};
self.last_offset = token.span.offset;
/// Lex a strand with either double or single quotes but return a result instead of a token.
pub fn lex_strand_err(&mut self, is_double: bool) -> Result<Token, Error> {
loop {
let Some(x) = self.reader.next() else {
self.scratch.clear();
return Ok(self.eof_token());
return self.eof_token();
};
if x.is_ascii() {
match x {
b'\'' if !is_double => {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Strand));
return self.finish_token(TokenKind::Strand);
}
b'"' if is_double => {
self.string = Some(mem::take(&mut self.scratch));
return Ok(self.finish_token(TokenKind::Strand));
return self.finish_token(TokenKind::Strand);
}
b'\0' => {
// null bytes not allowed
return Err(Error::UnexpectedCharacter('\0'));
return self.invalid_token(Error::UnexpectedCharacter('\0'));
}
b'\\' => {
// Handle escape sequences.
let Some(next) = self.reader.next() else {
self.scratch.clear();
return Ok(self.eof_token());
return self.eof_token();
};
match next {
b'\\' => {
@@ -78,17 +74,22 @@ impl<'a> Lexer<'a> {
let char = if x.is_ascii() {
x as char
} else {
self.reader.complete_char(x)?
match self.reader.complete_char(x) {
Ok(x) => x,
Err(e) => return self.invalid_token(e.into()),
}
};
return Err(Error::InvalidEscapeCharacter(char));
return self.invalid_token(Error::InvalidEscapeCharacter(char));
}
}
}
x => self.scratch.push(x as char),
}
} else {
let c = self.reader.complete_char(x)?;
self.scratch.push(c);
match self.reader.complete_char(x) {
Ok(x) => self.scratch.push(x),
Err(e) => return self.invalid_token(e.into()),
}
}
}
}
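`relex_strand` is gluing applied to strings: the lexer first emits a bare quote token, and only when the parser decides a plain strand is expected does it hand that token back for a second pass starting at the recorded offset. A std-only model of the two-pass idea (an illustrative function, not this module's API):

// Illustrative second pass: re-read the strand body from the quote offset.
fn relex_strand(source: &str, quote_offset: usize) -> Option<&str> {
    let quote = source.as_bytes()[quote_offset] as char; // '"' or '\''
    let body = &source[quote_offset + 1..];
    let end = body.find(quote)?;
    Some(&body[..end])
}

fn main() {
    assert_eq!(relex_strand(r#"RETURN "hello""#, 7), Some("hello"));
}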

View file

@@ -1,6 +1,4 @@
use chrono::{FixedOffset, NaiveDate, Offset, TimeZone, Utc};
use crate::syn::token::{t, NumberKind, TokenKind};
use crate::syn::token::{t, DurationSuffix, TokenKind};
macro_rules! test_case(
($source:expr => [$($token:expr),*$(,)?]) => {
@@ -40,23 +38,23 @@ fn operators() {
^
"# => [
t!("-"), t!("+"), t!("/"), t!("*"), t!("!"), t!("**"),
t!("-"), t!(" "), t!("+"),t!(" "), t!("/"),t!(" "), t!("*"),t!(" "), t!("!"),t!(" "), t!("**"), t!(" "),
t!("<"), t!(">"), t!("<="), t!(">="), t!("<-"), t!("<->"), t!("->"),
t!("<"), t!(" "), t!(">"), t!(" "), t!("<="), t!(" "), t!(">="), t!(" "), t!("<-"), t!(" "), t!("<->"), t!(" "), t!("->"), t!(" "),
t!("="), t!("=="), t!("-="), t!("+="), t!("!="), t!("+?="),
t!("="), t!(" "), t!("=="), t!(" "), t!("-="), t!(" "), t!("+="), t!(" "), t!("!="), t!(" "), t!("+?="), t!(" "),
t!("?"), t!("??"), t!("?:"), t!("?~"), t!("?="),
t!("?"), t!(" "), t!("??"), t!(" "), t!("?:"), t!(" "), t!("?~"), t!(" "), t!("?="), t!(" "),
t!("{"), t!("}"), t!("["), t!("]"), t!("("), t!(")"),
t!("{"), t!(" "), t!("}"), t!(" "), t!("["), t!(" "), t!("]"), t!(" "), t!("("), t!(" "), t!(")"), t!(" "),
t!(";"), t!(","), t!("|"), t!("||"), TokenKind::Invalid, t!("&&"),
t!(";"), t!(" "), t!(","), t!(" "), t!("|"), t!(" "), t!("||"), t!(" "), TokenKind::Invalid, t!(" "), t!("&&"), t!(" "),
t!("$"),
t!("$"), t!(" "),
t!("."), t!(".."), t!("..."),
t!("."), t!(" "), t!(".."), t!(" "), t!("..."), t!(" "),
TokenKind::Invalid
TokenKind::Invalid, t!(" ")
]
}
}
@@ -70,10 +68,10 @@ fn comments() {
+ -- a third comment
-
" => [
t!("+"),
t!("-"),
t!("+"),
t!("-"),
t!(" "), t!("+"), t!(" "), t!(" "), t!(" "),
t!("-"), t!(" "), t!(" "), t!(" "),
t!("+"), t!(" "), t!(" "), t!(" "),
t!("-"), t!(" ")
]
}
}
@@ -82,7 +80,7 @@ fn comments() {
fn whitespace() {
test_case! {
"+= \t\n\r -=" => [
t!("+="),
t!("+="), t!(" "),
t!("-="),
]
}
@@ -92,23 +90,22 @@ fn whitespace() {
fn identifiers() {
test_case! {
r#"
123123adwad +
akdwkj +
akdwkj1231312313123 +
_a_k_d_wkj1231312313123 +
____wdw____ +
123123adwad+akdwkj+akdwkj1231312313123+_a_k_d_wkj1231312313123+____wdw____+
"#
=> [
TokenKind::Invalid,
t!(" "),
TokenKind::Digits, // 123123
TokenKind::Identifier, // adwad
t!("+"),
TokenKind::Identifier,
TokenKind::Identifier, // akdwkj
t!("+"),
TokenKind::Identifier,
TokenKind::Identifier, // akdwkj1231312313123
t!("+"),
TokenKind::Identifier,
TokenKind::Identifier, // _a_k_d_wkj1231312313123
t!("+"),
TokenKind::Identifier,
TokenKind::Identifier, // ____wdw____
t!("+"),
t!(" "),
]
}
}
@@ -116,20 +113,24 @@ fn identifiers() {
#[test]
fn numbers() {
test_case! {
r#"
123123+32010230.123012031+33043030dec+33043030f+303e10dec+
"#
r#"123123+32010230.123012031+33043030dec+33043030f+303e10dec+"#
=> [
TokenKind::Number(NumberKind::Integer),
TokenKind::Digits, // 123123
t!("+"),
TokenKind::Number(NumberKind::Mantissa),
TokenKind::Digits, // 32010230
t!("."),
TokenKind::Digits, // 123012031
t!("+"),
TokenKind::Number(NumberKind::Decimal),
TokenKind::Digits, // 33043030
t!("dec"),
t!("+"),
TokenKind::Number(NumberKind::Float),
TokenKind::Digits, // 33043030
t!("f"),
t!("+"),
TokenKind::Number(NumberKind::DecimalExponent),
TokenKind::Digits, // 303
TokenKind::Exponent, // e
TokenKind::Digits, // 10
t!("dec"),
t!("+"),
]
}
@@ -138,7 +139,8 @@ fn numbers() {
"+123129decs+"
=> [
t!("+"),
TokenKind::Invalid,
TokenKind::Digits, // 123129
TokenKind::Identifier, // decs
t!("+"),
]
}
@@ -147,7 +149,8 @@ fn numbers() {
"+39349fs+"
=> [
t!("+"),
TokenKind::Invalid,
TokenKind::Digits, // 39349
TokenKind::Identifier, // fs
t!("+"),
]
}
@@ -156,7 +159,8 @@ fn numbers() {
"+394393df+"
=> [
t!("+"),
TokenKind::Invalid,
TokenKind::Digits, // 394393
TokenKind::Identifier, // df
t!("+"),
]
}
@@ -165,7 +169,8 @@ fn numbers() {
"+32932932def+"
=> [
t!("+"),
TokenKind::Invalid,
TokenKind::Digits, // 32932932
TokenKind::Identifier, // def
t!("+"),
]
}
@@ -174,7 +179,8 @@ fn numbers() {
"+329239329z+"
=> [
t!("+"),
TokenKind::Invalid,
TokenKind::Digits, // 329239329
TokenKind::Identifier, // z
t!("+"),
]
}
@@ -189,53 +195,82 @@ fn duration() {
1nsa+1ans+1aus+1usa+1ams+1msa+1am+1ma+1ah+1ha+1aw+1wa+1ay+1ya+1µsa
"#
=> [
TokenKind::Duration,
t!(" "),
TokenKind::Digits,
TokenKind::DurationSuffix(DurationSuffix::Nano),
t!("+"),
TokenKind::Duration,
TokenKind::Digits,
TokenKind::DurationSuffix(DurationSuffix::MicroUnicode),
t!("+"),
TokenKind::Duration,
TokenKind::Digits,
TokenKind::DurationSuffix(DurationSuffix::Micro),
t!("+"),
TokenKind::Duration,
TokenKind::Digits,
TokenKind::DurationSuffix(DurationSuffix::Milli),
t!("+"),
TokenKind::Duration,
TokenKind::Digits,
TokenKind::DurationSuffix(DurationSuffix::Second),
t!("+"),
TokenKind::Duration,
TokenKind::Digits,
TokenKind::DurationSuffix(DurationSuffix::Minute),
t!("+"),
TokenKind::Duration,
TokenKind::Digits,
TokenKind::DurationSuffix(DurationSuffix::Hour),
t!("+"),
TokenKind::Duration,
TokenKind::Digits,
TokenKind::DurationSuffix(DurationSuffix::Week),
t!("+"),
TokenKind::Duration,
TokenKind::Digits,
TokenKind::DurationSuffix(DurationSuffix::Year),
TokenKind::Invalid,
t!(" "),
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Invalid,
TokenKind::Digits,
TokenKind::Identifier,
t!("+"),
TokenKind::Digits,
TokenKind::Invalid,
TokenKind::Identifier,
t!(" "),
]
}
}
@@ -244,238 +279,9 @@ fn duration() {
fn keyword() {
test_case! {
r#"select SELECT sElEcT"# => [
t!("SELECT"),
t!("SELECT"),
t!("SELECT"),t!(" "),
t!("SELECT"),t!(" "),
t!("SELECT"),
]
}
}
#[test]
fn uuid() {
let mut lexer =
crate::syn::lexer::Lexer::new(r#" u"e72bee20-f49b-11ec-b939-0242ac120002" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::Uuid);
let uuid = lexer.uuid.take().unwrap();
assert_eq!(uuid.0.to_string(), "e72bee20-f49b-11ec-b939-0242ac120002");
let mut lexer =
crate::syn::lexer::Lexer::new(r#" u"b19bc00b-aa98-486c-ae37-c8e1c54295b1" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::Uuid);
let uuid = lexer.uuid.take().unwrap();
assert_eq!(uuid.0.to_string(), "b19bc00b-aa98-486c-ae37-c8e1c54295b1");
}
#[test]
fn date_time_just_date() {
let mut lexer = crate::syn::lexer::Lexer::new(r#" d"2012-04-23" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23).unwrap().and_hms_nano_opt(0, 0, 0, 0).unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_zone_time() {
let mut lexer = crate::syn::lexer::Lexer::new(r#" d"2020-01-01T00:00:00Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2020, 1, 1).unwrap().and_hms_nano_opt(0, 0, 0, 0).unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_with_time() {
let mut lexer = crate::syn::lexer::Lexer::new(r#" d"2012-04-23T18:25:43Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23).unwrap().and_hms_nano_opt(18, 25, 43, 0).unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_nanos() {
let mut lexer = crate::syn::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.5631Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {} @ ", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 563_100_000)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_utc() {
let mut lexer =
crate::syn::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.0000511Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let expected_datetime = Utc
.fix()
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 51_100)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_pacific() {
let mut lexer =
crate::syn::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.511-08:00" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let offset = FixedOffset::west_opt(8 * 3600).unwrap();
let expected_datetime = offset
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 511_000_000)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_pacific_partial() {
let mut lexer =
crate::syn::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.511+08:30" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let offset = FixedOffset::east_opt(8 * 3600 + 30 * 60).unwrap();
let expected_datetime = offset
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 511_000_000)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_utc_nanoseconds() {
let mut lexer =
crate::syn::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.5110000Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let offset = Utc.fix();
let expected_datetime = offset
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 511_000_000)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}
#[test]
fn date_time_timezone_utc_sub_nanoseconds() {
let mut lexer =
crate::syn::lexer::Lexer::new(r#" d"2012-04-23T18:25:43.0000511Z" "#.as_bytes());
let token = lexer.next_token();
if let Some(error) = lexer.error {
println!("ERROR: {}", error);
}
assert_eq!(token.kind, TokenKind::DateTime);
let datetime = lexer.datetime.take().unwrap();
let offset = Utc.fix();
let expected_datetime = offset
.from_local_datetime(
&NaiveDate::from_ymd_opt(2012, 4, 23)
.unwrap()
.and_hms_nano_opt(18, 25, 43, 51_100)
.unwrap(),
)
.earliest()
.unwrap()
.with_timezone(&Utc);
assert_eq!(datetime.0, expected_datetime);
}

View file

@@ -40,29 +40,3 @@ pub mod byte {
/// Space
pub const SP: u8 = 0x20;
}
/// A trait extending u8 for adding some extra function.
pub trait U8Ext {
/// Returns whether the u8 is the start of an identifier.
fn is_identifier_start(&self) -> bool;
/// Returns whether the u8 can start a number.
fn is_number_start(&self) -> bool;
/// Returns if the u8 can continue an identifier after the first character.
fn is_identifier_continue(&self) -> bool;
}
impl U8Ext for u8 {
fn is_identifier_start(&self) -> bool {
matches!(self, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
fn is_identifier_continue(&self) -> bool {
matches!(self, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
fn is_number_start(&self) -> bool {
self.is_ascii_digit()
}
}

View file

@@ -1,125 +0,0 @@
use crate::{
sql::Uuid,
syn::token::{Token, TokenKind},
};
use super::{Error as LexError, Lexer};
use thiserror::Error;
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum Error {
#[error("missing digits")]
MissingDigits,
#[error("digit was not in allowed range")]
InvalidRange,
#[error("expected uuid-strand to end")]
ExpectedStrandEnd,
#[error("missing a uuid separator")]
MissingSeperator,
}
impl<'a> Lexer<'a> {
/// Lex a uuid strand with either double or single quotes.
///
/// Expects the first delimiter to already have been eaten.
pub fn lex_uuid(&mut self, double: bool) -> Token {
match self.lex_uuid_err(double) {
Ok(x) => {
debug_assert!(self.uuid.is_none());
self.uuid = Some(x);
self.finish_token(TokenKind::Uuid)
}
Err(_) => self.invalid_token(LexError::Uuid(Error::MissingDigits)),
}
}
/// Lex a uuid strand with either double or single quotes but return a result instead of a
/// token.
///
/// Expects the first delimiter to already have been eaten.
pub fn lex_uuid_err(&mut self, double: bool) -> Result<Uuid, Error> {
let uuid = self.lex_uuid_err_inner()?;
let end_char = if double {
b'"'
} else {
b'\''
};
// closing strand character
if !self.eat(end_char) {
return Err(Error::ExpectedStrandEnd);
}
Ok(uuid)
}
/// Lex a uuid strand without delimiting quotes but return a result instead of a
/// token.
///
/// Expects the first delimiter to already have been eaten.
pub fn lex_uuid_err_inner(&mut self) -> Result<Uuid, Error> {
let start = self.reader.offset();
if !self.lex_hex(8) {
return Err(Error::MissingDigits);
}
if !self.eat(b'-') {
return Err(Error::MissingSeperator);
}
if !self.lex_hex(4) {
return Err(Error::MissingDigits);
}
if !self.eat(b'-') {
return Err(Error::MissingSeperator);
}
if !self.eat_when(|x| (b'1'..=b'8').contains(&x)) {
if self.reader.peek().map(|x| x.is_ascii_digit()).unwrap_or(false) {
// byte was an ascii digit but not in the valid range.
return Err(Error::InvalidRange);
}
return Err(Error::MissingDigits);
};
if !self.lex_hex(3) {
return Err(Error::MissingDigits);
}
if !self.eat(b'-') {
return Err(Error::MissingSeperator);
}
if !self.lex_hex(4) {
return Err(Error::MissingDigits);
}
if !self.eat(b'-') {
return Err(Error::MissingSeperator);
}
if !self.lex_hex(12) {
return Err(Error::MissingDigits);
}
let end = self.reader.offset();
// The lexer ensures that the section of bytes is valid utf8 so this should never panic.
let uuid_str = std::str::from_utf8(&self.reader.full()[start..end]).unwrap();
// The lexer ensures that the bytes are a valid uuid so this should never panic.
Ok(Uuid(uuid::Uuid::try_from(uuid_str).unwrap()))
}
/// Lexes a given amount of hex characters. Returns true if the lexing was successful,
/// false otherwise.
pub fn lex_hex(&mut self, amount: u8) -> bool {
for _ in 0..amount {
if !self.eat_when(|x| x.is_ascii_hexdigit()) {
return false;
}
}
true
}
}
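`lex_hex` above checked a fixed-length run of hex digits, the building block of the UUID shape 8-4-4-4-12. An equivalent std-only check (hypothetical helper name):

// Hypothetical helper: true if `s` starts with `amount` hex digits.
fn is_hex_run(s: &str, amount: usize) -> bool {
    s.len() >= amount && s.bytes().take(amount).all(|b| b.is_ascii_hexdigit())
}

fn main() {
    assert!(is_hex_run("e72bee20-f49b", 8));
    assert!(!is_hex_run("e72bee2x", 8));
}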

View file

@@ -1,5 +1,3 @@
#![allow(dead_code)]
//! Module containing the implementation of the surrealql tokens, lexer, and parser.
use crate::{
@@ -21,8 +19,7 @@ pub trait Parse<T> {
#[cfg(test)]
mod test;
use lexer::Lexer;
use parser::{ParseError, ParseErrorKind, Parser};
use parser::Parser;
use reblessive::Stack;
/// Takes a string and returns if it could be a reserved keyword in certain contexts.
@@ -134,26 +131,16 @@ pub fn idiom(input: &str) -> Result<Idiom, Error> {
/// Parse a datetime without enclosing delimiters from a string.
pub fn datetime_raw(input: &str) -> Result<Datetime, Error> {
debug!("parsing datetime, input = {input}");
let mut lexer = Lexer::new(input.as_bytes());
lexer
.lex_datetime_raw_err()
.map_err(|e| {
ParseError::new(
ParseErrorKind::InvalidToken(lexer::Error::DateTime(e)),
lexer.current_span(),
)
})
.map_err(|e| e.render_on(input))
.map_err(Error::InvalidQuery)
let mut parser = Parser::new(input.as_bytes());
parser.parse_inner_datetime().map_err(|e| e.render_on(input)).map_err(Error::InvalidQuery)
}
/// Parse a duration from a string.
pub fn duration(input: &str) -> Result<Duration, Error> {
debug!("parsing duration, input = {input}");
let mut lexer = Lexer::new(input.as_bytes());
lexer
.lex_only_duration()
.map_err(|e| ParseError::new(ParseErrorKind::InvalidToken(e), lexer.current_span()))
let mut parser = Parser::new(input.as_bytes());
parser
.next_token_value::<Duration>()
.map_err(|e| e.render_on(input))
.map_err(Error::InvalidQuery)
}

View file

@@ -1,360 +0,0 @@
use crate::{
sql::{
language::Language, Datetime, Duration, Ident, Number, Param, Regex, Strand, Table, Uuid,
},
syn::{
parser::mac::unexpected,
token::{t, NumberKind, Token, TokenKind},
},
};
use super::{ParseError, ParseErrorKind, ParseResult, Parser};
/// A trait for parsing single tokens with a specific value.
pub trait TokenValue: Sized {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self>;
}
impl TokenValue for Ident {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Keyword(_)
| TokenKind::Language(_)
| TokenKind::Algorithm(_)
| TokenKind::Distance(_)
| TokenKind::VectorType(_) => {
let str = parser.lexer.reader.span(token.span);
// Lexer should ensure that the token is valid utf-8
let str = std::str::from_utf8(str).unwrap().to_owned();
Ok(Ident(str))
}
TokenKind::Identifier => {
let str = parser.lexer.string.take().unwrap();
Ok(Ident(str))
}
x => {
unexpected!(parser, x, "a identifier");
}
}
}
}
impl TokenValue for Table {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
parser.token_value::<Ident>(token).map(|x| Table(x.0))
}
}
impl TokenValue for u64 {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Number(NumberKind::Integer) => {
let number = parser.lexer.string.take().unwrap().parse().map_err(|e| {
ParseError::new(
ParseErrorKind::InvalidInteger {
error: e,
},
token.span,
)
})?;
Ok(number)
}
x => unexpected!(parser, x, "an integer"),
}
}
}
impl TokenValue for u32 {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Number(NumberKind::Integer) => {
let number = parser.lexer.string.take().unwrap().parse().map_err(|e| {
ParseError::new(
ParseErrorKind::InvalidInteger {
error: e,
},
token.span,
)
})?;
Ok(number)
}
x => unexpected!(parser, x, "an integer"),
}
}
}
impl TokenValue for u16 {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Number(NumberKind::Integer) => {
let number = parser.lexer.string.take().unwrap().parse().map_err(|e| {
ParseError::new(
ParseErrorKind::InvalidInteger {
error: e,
},
token.span,
)
})?;
Ok(number)
}
x => unexpected!(parser, x, "an integer"),
}
}
}
impl TokenValue for u8 {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Number(NumberKind::Integer) => {
let number = parser.lexer.string.take().unwrap().parse().map_err(|e| {
ParseError::new(
ParseErrorKind::InvalidInteger {
error: e,
},
token.span,
)
})?;
Ok(number)
}
x => unexpected!(parser, x, "an integer"),
}
}
}
impl TokenValue for f32 {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Number(NumberKind::NaN) => Ok(f32::NAN),
TokenKind::Number(
NumberKind::Integer
| NumberKind::Float
| NumberKind::FloatMantissa
| NumberKind::Mantissa
| NumberKind::MantissaExponent,
) => {
let number = parser.lexer.string.take().unwrap().parse().map_err(|e| {
ParseError::new(
ParseErrorKind::InvalidFloat {
error: e,
},
token.span,
)
})?;
Ok(number)
}
x => unexpected!(parser, x, "a floating point number"),
}
}
}
impl TokenValue for f64 {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Number(NumberKind::NaN) => Ok(f64::NAN),
TokenKind::Number(
NumberKind::Integer
| NumberKind::Float
| NumberKind::FloatMantissa
| NumberKind::Mantissa
| NumberKind::MantissaExponent,
) => {
let number = parser.lexer.string.take().unwrap().parse().map_err(|e| {
ParseError::new(
ParseErrorKind::InvalidFloat {
error: e,
},
token.span,
)
})?;
Ok(number)
}
x => unexpected!(parser, x, "a floating point number"),
}
}
}
impl TokenValue for Language {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Language(x) => Ok(x),
// `NO` can both be used as a keyword and as a language.
t!("NO") => Ok(Language::Norwegian),
x => unexpected!(parser, x, "a language"),
}
}
}
impl TokenValue for Number {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Number(NumberKind::NaN) => Ok(Number::Float(f64::NAN)),
TokenKind::Number(NumberKind::Integer) => {
let source = parser.lexer.string.take().unwrap();
if let Ok(x) = source.parse() {
return Ok(Number::Int(x));
}
// integer overflowed, fallback to floating point
// As far as I can tell this will never fail for valid integers.
let x = source.parse().map_err(|e| {
ParseError::new(
ParseErrorKind::InvalidFloat {
error: e,
},
token.span,
)
})?;
Ok(Number::Float(x))
}
TokenKind::Number(
NumberKind::Mantissa
| NumberKind::MantissaExponent
| NumberKind::Float
| NumberKind::FloatMantissa,
) => {
let source = parser.lexer.string.take().unwrap();
// As far as I can tell this will never fail for valid floats.
let x = source.parse().map_err(|e| {
ParseError::new(
ParseErrorKind::InvalidFloat {
error: e,
},
token.span,
)
})?;
Ok(Number::Float(x))
}
TokenKind::Number(NumberKind::Decimal) => {
let source = parser.lexer.string.take().unwrap();
// As far as I can tell this will never fail for valid decimals.
let x: rust_decimal::Decimal = source.parse().map_err(|error| {
ParseError::new(
ParseErrorKind::InvalidDecimal {
error,
},
token.span,
)
})?;
Ok(Number::Decimal(x))
}
TokenKind::Number(NumberKind::DecimalExponent) => {
let source = parser.lexer.string.take().unwrap();
// As far as I can tell this will never fail for valid decimals.
let x = rust_decimal::Decimal::from_scientific(&source).map_err(|error| {
ParseError::new(
ParseErrorKind::InvalidDecimal {
error,
},
token.span,
)
})?;
Ok(Number::Decimal(x))
}
x => unexpected!(parser, x, "a number"),
}
}
}
impl TokenValue for Param {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Parameter => {
let param = parser.lexer.string.take().unwrap();
Ok(Param(Ident(param)))
}
x => unexpected!(parser, x, "a parameter"),
}
}
}
impl TokenValue for Duration {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
let TokenKind::Duration = token.kind else {
unexpected!(parser, token.kind, "a duration")
};
let duration = parser.lexer.duration.take().expect("token data was already consumed");
Ok(duration)
}
}
impl TokenValue for Datetime {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
let TokenKind::DateTime = token.kind else {
unexpected!(parser, token.kind, "a duration")
};
let datetime = parser.lexer.datetime.take().expect("token data was already consumed");
Ok(datetime)
}
}
impl TokenValue for Strand {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
match token.kind {
TokenKind::Strand => {
let strand = parser.lexer.string.take().unwrap();
Ok(Strand(strand))
}
x => unexpected!(parser, x, "a strand"),
}
}
}
impl TokenValue for Uuid {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
let TokenKind::Uuid = token.kind else {
unexpected!(parser, token.kind, "a duration")
};
Ok(parser.lexer.uuid.take().expect("token data was already consumed"))
}
}
impl TokenValue for Regex {
fn from_token(parser: &mut Parser<'_>, token: Token) -> ParseResult<Self> {
let TokenKind::Regex = token.kind else {
unexpected!(parser, token.kind, "a regex")
};
Ok(parser.lexer.regex.take().expect("token data was already consumed"))
}
}
impl Parser<'_> {
/// Parse a token value from the next token in the parser.
pub fn next_token_value<V: TokenValue>(&mut self) -> ParseResult<V> {
let next = self.peek();
let res = V::from_token(self, next);
if res.is_ok() {
self.pop_peek();
}
res
}
pub fn parse_signed_float(&mut self) -> ParseResult<f64> {
let neg = self.eat(t!("-"));
if !neg {
self.eat(t!("+"));
}
let res: f64 = self.next_token_value()?;
if neg {
Ok(-res)
} else {
Ok(res)
}
}
/// Parse a token value from the given token.
pub fn token_value<V: TokenValue>(&mut self, token: Token) -> ParseResult<V> {
V::from_token(self, token)
}
/// Returns if the peeked token can be a identifier.
pub fn peek_can_be_ident(&mut self) -> bool {
matches!(
self.peek_kind(),
TokenKind::Keyword(_)
| TokenKind::Language(_)
| TokenKind::Algorithm(_)
| TokenKind::Distance(_)
| TokenKind::VectorType(_)
| TokenKind::Identifier
)
}
}

View file

@ -0,0 +1,192 @@
use std::ops::RangeInclusive;
use chrono::{FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, Offset, TimeZone, Utc};
use crate::{
sql::Datetime,
syn::{
parser::{
mac::{expected_whitespace, unexpected},
ParseError, ParseErrorKind, ParseResult, Parser,
},
token::{t, DatetimeChars, TokenKind},
},
};
impl Parser<'_> {
pub fn parse_datetime(&mut self) -> ParseResult<Datetime> {
let start = self.peek();
let double = match start.kind {
t!("d\"") => true,
t!("d'") => false,
x => unexpected!(self, x, "a datetime"),
};
self.pop_peek();
let datetime = self.parse_inner_datetime()?;
if double {
expected_whitespace!(self, t!("\""));
} else {
expected_whitespace!(self, t!("'"));
}
Ok(datetime)
}
/// Parses the datetime without surrounding quotes.
pub fn parse_inner_datetime(&mut self) -> ParseResult<Datetime> {
let start_date = self.peek_whitespace().span;
let year_neg = self.eat_whitespace(t!("-"));
if !year_neg {
self.eat_whitespace(t!("+"));
}
let year = self.parse_datetime_digits(4, 0..=9999)?;
expected_whitespace!(self, t!("-"));
let month = self.parse_datetime_digits(2, 1..=12)?;
expected_whitespace!(self, t!("-"));
let day = self.parse_datetime_digits(2, 1..=31)?;
let date_span = start_date.covers(self.last_span());
let year = if year_neg {
-(year as i32)
} else {
year as i32
};
let date = NaiveDate::from_ymd_opt(year, month as u32, day as u32)
.ok_or_else(|| ParseError::new(ParseErrorKind::InvalidDatetimeDate, date_span))?;
if !self.eat(TokenKind::DatetimeChars(DatetimeChars::T)) {
let time = NaiveTime::default();
let date_time = NaiveDateTime::new(date, time);
let datetime =
Utc.fix().from_local_datetime(&date_time).earliest().unwrap().with_timezone(&Utc);
return Ok(Datetime(datetime));
}
let start_time = self.peek_whitespace().span;
let hour = self.parse_datetime_digits(2, 0..=24)?;
expected_whitespace!(self, t!(":"));
let minute = self.parse_datetime_digits(2, 0..=59)?;
expected_whitespace!(self, t!(":"));
let second = self.parse_datetime_digits(2, 0..=59)?;
let nanos = if self.eat_whitespace(t!(".")) {
let digits_token = expected_whitespace!(self, TokenKind::Digits);
let slice = self.span_bytes(digits_token.span);
if slice.len() > 9 {
return Err(ParseError::new(
ParseErrorKind::TooManyNanosecondsDatetime,
digits_token.span,
));
}
let mut number = 0u32;
for i in 0..9 {
let Some(c) = slice.get(i).copied() else {
// If digits are missing they are counted as 0's
for _ in i..9 {
number *= 10;
}
break;
};
number *= 10;
number += (c - b'0') as u32;
}
number
} else {
0
};
let time_span = start_time.covers(self.last_span());
let time =
NaiveTime::from_hms_nano_opt(hour as u32, minute as u32, second as u32, nanos)
.ok_or_else(|| ParseError::new(ParseErrorKind::InvalidDatetimeTime, time_span))?;
let peek = self.peek_whitespace();
let timezone = match peek.kind {
t!("+") => self.parse_datetime_timezone(false)?,
t!("-") => self.parse_datetime_timezone(true)?,
TokenKind::DatetimeChars(DatetimeChars::Z) => {
self.pop_peek();
Utc.fix()
}
x => unexpected!(self, x, "`Z` or a timezone"),
};
let date_time = NaiveDateTime::new(date, time);
let datetime = timezone
.from_local_datetime(&date_time)
.earliest()
// this should never panic with a fixed offset.
.unwrap()
.with_timezone(&Utc);
Ok(Datetime(datetime))
}
fn parse_datetime_timezone(&mut self, neg: bool) -> ParseResult<FixedOffset> {
self.pop_peek();
let hour = self.parse_datetime_digits(2, 0..=23)?;
expected_whitespace!(self, t!(":"));
let minute = self.parse_datetime_digits(2, 0..=59)?;
// The range checks on the digits ensure that the offset can't exceed 23:59 so below
// unwraps won't panic.
if neg {
Ok(FixedOffset::west_opt((hour * 3600 + minute * 60) as i32).unwrap())
} else {
Ok(FixedOffset::east_opt((hour * 3600 + minute * 60) as i32).unwrap())
}
}
fn parse_datetime_digits(
&mut self,
len: usize,
range: RangeInclusive<usize>,
) -> ParseResult<usize> {
let t = self.peek_whitespace();
match t.kind {
TokenKind::Digits => {}
x => unexpected!(self, x, "datetime digits"),
}
let digits_str = self.span_str(t.span);
if digits_str.len() != len {
return Err(ParseError::new(
ParseErrorKind::InvalidDatetimePart {
len,
},
t.span,
));
}
self.pop_peek();
// This should always parse as it has been validated by the lexer.
let value = digits_str.parse().unwrap();
if !range.contains(&value) {
return Err(ParseError::new(
ParseErrorKind::OutrangeDatetimePart {
range,
},
t.span,
));
}
Ok(value)
}
}
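// Two small, self-contained checks of the arithmetic used above, assuming only
// chrono's public API. These mirror the logic; they are not the parser itself.
#[cfg(test)]
mod datetime_arithmetic_example {
use chrono::FixedOffset;
// Missing fractional digits count as zeros: ".5" means 500_000_000 nanoseconds.
#[test]
fn nanosecond_padding() {
let slice = b"5";
let mut number = 0u32;
for i in 0..9 {
let Some(c) = slice.get(i).copied() else {
for _ in i..9 {
number *= 10;
}
break;
};
number *= 10;
number += (c - b'0') as u32;
}
assert_eq!(number, 500_000_000);
}
// A `+02:30` timezone is 2 * 3600 + 30 * 60 = 9000 seconds east of UTC.
#[test]
fn timezone_offset() {
let offset = FixedOffset::east_opt(2 * 3600 + 30 * 60).unwrap();
assert_eq!(offset.local_minus_utc(), 9000);
}
}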

View file

@ -0,0 +1,192 @@
use crate::{
sql::{language::Language, Datetime, Duration, Ident, Param, Regex, Strand, Table, Uuid},
syn::{
parser::{mac::unexpected, ParseError, ParseErrorKind, ParseResult, Parser},
token::{t, QouteKind, TokenKind},
},
};
mod datetime;
mod number;
mod uuid;
/// A trait for parsing single tokens with a specific value.
pub trait TokenValue: Sized {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self>;
}
impl TokenValue for Ident {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
match parser.glue_ident(false)?.kind {
TokenKind::Identifier => {
parser.pop_peek();
let str = parser.lexer.string.take().unwrap();
Ok(Ident(str))
}
TokenKind::Keyword(_) | TokenKind::Language(_) | TokenKind::Algorithm(_) => {
let s = parser.pop_peek().span;
Ok(Ident(parser.span_str(s).to_owned()))
}
x => {
unexpected!(parser, x, "an identifier");
}
}
}
}
impl TokenValue for Table {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
parser.next_token_value::<Ident>().map(|x| Table(x.0))
}
}
impl TokenValue for Language {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
match parser.peek_kind() {
TokenKind::Language(x) => {
parser.pop_peek();
Ok(x)
}
// `NO` can both be used as a keyword and as a language.
t!("NO") => {
parser.pop_peek();
Ok(Language::Norwegian)
}
x => unexpected!(parser, x, "a language"),
}
}
}
impl TokenValue for Param {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
match parser.peek_kind() {
TokenKind::Parameter => {
parser.pop_peek();
let param = parser.lexer.string.take().unwrap();
Ok(Param(Ident(param)))
}
x => unexpected!(parser, x, "a parameter"),
}
}
}
impl TokenValue for Duration {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
match parser.glue_duration()?.kind {
TokenKind::Duration => {
parser.pop_peek();
Ok(Duration(parser.lexer.duration.unwrap()))
}
x => unexpected!(parser, x, "a duration"),
}
}
}
impl TokenValue for Datetime {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
parser.parse_datetime()
}
}
impl TokenValue for Strand {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
let token = parser.peek();
match token.kind {
TokenKind::Qoute(QouteKind::Plain | QouteKind::PlainDouble) => {
parser.pop_peek();
let t = parser.lexer.relex_strand(token);
let TokenKind::Strand = t.kind else {
unexpected!(parser, t.kind, "a strand")
};
Ok(Strand(parser.lexer.string.take().unwrap()))
}
TokenKind::Strand => {
parser.pop_peek();
Ok(Strand(parser.lexer.string.take().unwrap()))
}
x => unexpected!(parser, x, "a strand"),
}
}
}
impl TokenValue for Uuid {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
parser.parse_uuid()
}
}
impl TokenValue for Regex {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
match parser.peek().kind {
t!("/") => {
let pop = parser.pop_peek();
assert!(!parser.has_peek());
let token = parser.lexer.relex_regex(pop);
let mut span = token.span;
// remove the starting and ending `/` characters.
span.offset += 1;
span.len -= 2;
let regex = parser
.span_str(span)
.parse()
.map_err(|e| ParseError::new(ParseErrorKind::InvalidRegex(e), token.span))?;
Ok(regex)
}
x => unexpected!(parser, x, "a regex"),
}
}
}
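// A small check of the delimiter trimming above: for a regex token spanning
// `/abc/`, bumping the offset by one and shrinking the length by two leaves
// exactly the inner pattern. Plain string slicing stands in for `Span` here.
#[cfg(test)]
mod regex_trim_example {
#[test]
fn trims_delimiters() {
let source = "/abc/";
let offset = 1;
let len = source.len() - 2;
assert_eq!(&source[offset..offset + len], "abc");
}
}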
impl Parser<'_> {
/// Parse a token value from the next token in the parser.
pub fn next_token_value<V: TokenValue>(&mut self) -> ParseResult<V> {
V::from_token(self)
}
}
#[cfg(test)]
mod test {
#[test]
fn identifiers() {
use crate::sql;
fn assert_ident_parses_correctly(ident: &str) {
use crate::syn::Parser;
use reblessive::Stack;
let mut parser = Parser::new(ident.as_bytes());
let mut stack = Stack::new();
let r = stack
.enter(|ctx| async move { parser.parse_query(ctx).await })
.finish()
.expect(&format!("failed on {}", ident));
assert_eq!(
r,
sql::Query(sql::Statements(vec![sql::Statement::Value(sql::Value::Idiom(
sql::Idiom(vec![sql::Part::Field(sql::Ident(ident.to_string()))])
))]))
)
}
assert_ident_parses_correctly("select123");
assert_ident_parses_correctly("e123");
assert_ident_parses_correctly("dec123");
assert_ident_parses_correctly("f123");
assert_ident_parses_correctly("y123");
assert_ident_parses_correctly("w123");
assert_ident_parses_correctly("d123");
assert_ident_parses_correctly("h123");
assert_ident_parses_correctly("m123");
assert_ident_parses_correctly("s123");
assert_ident_parses_correctly("ms123");
assert_ident_parses_correctly("us123");
assert_ident_parses_correctly("ns123");
}
}

View file

@ -0,0 +1,184 @@
use rust_decimal::Decimal;
use std::{
borrow::Cow,
num::{ParseFloatError, ParseIntError},
str::FromStr,
};
use crate::{
sql::Number,
syn::{
parser::{mac::unexpected, ParseError, ParseErrorKind, ParseResult, Parser},
token::{t, NumberKind, TokenKind},
},
};
use super::TokenValue;
fn prepare_number_str(str: &str) -> Cow<str> {
if str.contains('_') {
Cow::Owned(str.chars().filter(|x| *x != '_').collect())
} else {
Cow::Borrowed(str)
}
}
/// Generic integer parsing method,
/// works for all unsigned integers.
fn parse_integer<I>(parser: &mut Parser<'_>) -> ParseResult<I>
where
I: FromStr<Err = ParseIntError>,
{
let mut peek = parser.peek();
if let t!("-") = peek.kind {
unexpected!(parser, t!("-"), "an integer" => "only positive integers are allowed here")
}
if let t!("+") = peek.kind {
peek = parser.peek_whitespace();
}
match peek.kind {
TokenKind::Digits => {
parser.pop_peek();
assert!(!parser.has_peek());
let p = parser.peek_whitespace();
match p.kind {
t!(".") => {
unexpected!(parser, p.kind, "an integer")
}
t!("dec") => {
unexpected!(parser, p.kind, "an integer" => "decimal numbers not supported here")
}
x if Parser::tokenkind_continues_ident(x) => {
unexpected!(parser, p.kind, "an integer")
}
_ => {}
}
// remove any '_' separator characters before parsing
let res = prepare_number_str(parser.span_str(peek.span))
.parse()
.map_err(ParseErrorKind::InvalidInteger)
.map_err(|e| ParseError::new(e, peek.span))?;
Ok(res)
}
x => unexpected!(parser, x, "an integer"),
}
}
impl TokenValue for u64 {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
parse_integer(parser)
}
}
impl TokenValue for u32 {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
parse_integer(parser)
}
}
impl TokenValue for u16 {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
parse_integer(parser)
}
}
impl TokenValue for u8 {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
parse_integer(parser)
}
}
/// Generic float parsing method,
/// works for both f32 and f64
fn parse_float<F>(parser: &mut Parser<'_>) -> ParseResult<F>
where
F: FromStr<Err = ParseFloatError>,
{
let peek = parser.peek();
// find initial digits
match peek.kind {
TokenKind::NaN => {
// consume the NaN token before returning, as the other arms do.
parser.pop_peek();
return Ok("NaN".parse().unwrap());
}
TokenKind::Digits | t!("+") | t!("-") => {}
x => unexpected!(parser, x, "a floating point number"),
};
let float_token = parser.glue_float()?;
match float_token.kind {
TokenKind::Number(NumberKind::Float) => {
parser.pop_peek();
}
x => unexpected!(parser, x, "a floating point number"),
};
let span = parser.span_str(float_token.span);
// remove the possible "f" number suffix and any '_' characters
prepare_number_str(span.strip_suffix('f').unwrap_or(span))
.parse()
.map_err(ParseErrorKind::InvalidFloat)
.map_err(|e| ParseError::new(e, float_token.span))
}
impl TokenValue for f32 {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
parse_float(parser)
}
}
impl TokenValue for f64 {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
parse_float(parser)
}
}
impl TokenValue for Number {
fn from_token(parser: &mut Parser<'_>) -> ParseResult<Self> {
let number = parser.glue_number()?;
let number_kind = match number.kind {
TokenKind::NaN => {
parser.pop_peek();
return Ok(Number::Float(f64::NAN));
}
TokenKind::Number(x) => x,
x => unexpected!(parser, x, "a number"),
};
parser.pop_peek();
let span = parser.span_str(number.span);
match number_kind {
NumberKind::Decimal => {
let str = prepare_number_str(span.strip_suffix("dec").unwrap_or(span));
let decimal = if str.contains('e') {
Decimal::from_scientific(str.as_ref()).map_err(|e| {
ParseError::new(ParseErrorKind::InvalidDecimal(e), number.span)
})?
} else {
Decimal::from_str(str.as_ref()).map_err(|e| {
ParseError::new(ParseErrorKind::InvalidDecimal(e), number.span)
})?
};
Ok(Number::Decimal(decimal))
}
NumberKind::Float => {
let float = prepare_number_str(span.strip_suffix('f').unwrap_or(span))
.parse()
.map_err(|e| ParseError::new(ParseErrorKind::InvalidFloat(e), number.span))?;
Ok(Number::Float(float))
}
NumberKind::Integer => {
let integer = prepare_number_str(span.strip_suffix('f').unwrap_or(span))
.parse()
.map_err(|e| ParseError::new(ParseErrorKind::InvalidInteger(e), number.span))?;
Ok(Number::Int(integer))
}
}
}
}
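// A small check of the suffix and separator handling above: the "f" suffix is
// stripped and underscores removed before the final `parse` call. This mirrors
// the logic; it is not part of the parser's own test suite.
#[cfg(test)]
mod number_preparation_example {
use super::prepare_number_str;
#[test]
fn strips_underscores_and_suffix() {
let s = "1_000_000f";
let s = s.strip_suffix('f').unwrap_or(s);
assert_eq!(prepare_number_str(s).as_ref(), "1000000");
assert_eq!(prepare_number_str(s).parse::<i64>().unwrap(), 1_000_000);
}
}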

View file

@ -0,0 +1,195 @@
use crate::{
sql::Uuid,
syn::{
parser::{
mac::{expected_whitespace, unexpected},
ParseError, ParseErrorKind, ParseResult, Parser,
},
token::{t, DurationSuffix, NumberSuffix, TokenKind},
},
};
impl Parser<'_> {
/// Parses a uuid strand.
pub fn parse_uuid(&mut self) -> ParseResult<Uuid> {
let quote_token = self.peek_whitespace();
let double = match quote_token.kind {
t!("u\"") => true,
t!("u'") => false,
x => unexpected!(self, x, "a uuid"),
};
self.pop_peek();
// number of bytes is 4-2-2-2-6
let mut uuid_buffer = [0u8; 16];
self.eat_uuid_hex(&mut uuid_buffer[0..4])?;
expected_whitespace!(self, t!("-"));
self.eat_uuid_hex(&mut uuid_buffer[4..6])?;
expected_whitespace!(self, t!("-"));
self.eat_uuid_hex(&mut uuid_buffer[6..8])?;
expected_whitespace!(self, t!("-"));
self.eat_uuid_hex(&mut uuid_buffer[8..10])?;
expected_whitespace!(self, t!("-"));
self.eat_uuid_hex(&mut uuid_buffer[10..16])?;
if double {
expected_whitespace!(self, t!("\""));
} else {
expected_whitespace!(self, t!("'"));
}
Ok(Uuid(uuid::Uuid::from_bytes(uuid_buffer)))
}
/// Eats a uuid hex section, enough to fill the given buffer with bytes.
fn eat_uuid_hex(&mut self, buffer: &mut [u8]) -> ParseResult<()> {
// A function to convert a hex digit to its number representation.
fn ascii_to_hex(b: u8) -> Option<u8> {
if b.is_ascii_digit() {
return Some(b - b'0');
}
if (b'a'..=b'f').contains(&b) {
return Some(b - (b'a' - 10));
}
if (b'A'..=b'F').contains(&b) {
return Some(b - (b'A' - 10));
}
None
}
// The amount of characters required is twice the buffer length,
// since every character encodes half a byte.
let required_len = buffer.len() * 2;
// The next token should be digits or an identifier.
// If it is digits, an identifier might follow it.
let start_token = self.peek_whitespace();
let mut cur = start_token;
loop {
let next = self.peek_whitespace();
match next.kind {
TokenKind::Identifier => {
cur = self.pop_peek();
break;
}
TokenKind::Exponent
| TokenKind::Digits
| TokenKind::DurationSuffix(DurationSuffix::Day)
| TokenKind::NumberSuffix(NumberSuffix::Float) => {
cur = self.pop_peek();
}
TokenKind::Language(_) | TokenKind::Keyword(_) => {
// There are some keywords and language keywords which could be part of the
// hex section.
if !self.span_bytes(next.span).iter().all(|x| x.is_ascii_hexdigit()) {
unexpected!(self, TokenKind::Identifier, "UUID hex digits");
}
cur = self.pop_peek();
break;
}
t!("-") | t!("\"") | t!("'") => break,
_ => unexpected!(self, TokenKind::Identifier, "UUID hex digits"),
}
}
// Get the span that covered all eaten tokens.
let digits_span = start_token.span.covers(cur.span);
let digits_bytes = self.span_str(digits_span).as_bytes();
// For error handling, the invalid hex character error should be reported first,
// before the incorrect segment length error, even if both apply.
if !digits_bytes.iter().all(|x| x.is_ascii_hexdigit()) {
return Err(ParseError::new(
ParseErrorKind::Unexpected {
found: TokenKind::Strand,
expected: "UUID hex digits",
},
digits_span,
));
}
if digits_bytes.len() != required_len {
return Err(ParseError::new(
ParseErrorKind::InvalidUuidPart {
len: required_len,
},
digits_span,
));
}
// write into the buffer
for (i, b) in buffer.iter_mut().enumerate() {
*b = ascii_to_hex(digits_bytes[i * 2]).unwrap() << 4
| ascii_to_hex(digits_bytes[i * 2 + 1]).unwrap();
}
Ok(())
}
}
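// A standalone illustration of the nibble packing at the end of `eat_uuid_hex`:
// two hex characters form one byte, high nibble first.
#[cfg(test)]
mod packing_example {
#[test]
fn hex_pair_to_byte() {
fn ascii_to_hex(b: u8) -> Option<u8> {
match b {
b'0'..=b'9' => Some(b - b'0'),
b'a'..=b'f' => Some(b - (b'a' - 10)),
b'A'..=b'F' => Some(b - (b'A' - 10)),
_ => None,
}
}
let digits = b"3e";
let byte = ascii_to_hex(digits[0]).unwrap() << 4 | ascii_to_hex(digits[1]).unwrap();
assert_eq!(byte, 0x3e);
}
}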
#[cfg(test)]
mod test {
use crate::syn::parser::Parser;
#[test]
fn uuid_parsing() {
fn assert_uuid_parses(s: &str) {
let uuid_str = format!("u'{s}'");
let mut parser = Parser::new(uuid_str.as_bytes());
let uuid = parser.parse_uuid().unwrap();
assert_eq!(uuid::Uuid::parse_str(s).unwrap(), *uuid);
}
assert_uuid_parses("0531956f-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("0531956d-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("0531956e-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("0531956a-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("053195f1-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("053195d1-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("053195e1-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("053195a1-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("f0531951-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("d0531951-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("e0531951-20ec-4575-bb68-3e6b49d813fa");
assert_uuid_parses("a0531951-20ec-4575-bb68-3e6b49d813fa");
}
#[test]
fn test_uuid_characters() {
let hex_characters =
[b'0', b'a', b'b', b'c', b'd', b'e', b'f', b'A', b'B', b'C', b'D', b'E', b'F'];
let mut uuid_string: Vec<u8> = "u'0531956f-20ec-4575-bb68-3e6b49d813fa'".to_string().into();
fn assert_uuid_parses(s: &[u8]) {
let mut parser = Parser::new(s);
parser.parse_uuid().unwrap();
}
for i in hex_characters.iter() {
for j in hex_characters.iter() {
for k in hex_characters.iter() {
uuid_string[3] = *i;
uuid_string[4] = *j;
uuid_string[5] = *k;
assert_uuid_parses(&uuid_string)
}
}
}
}
}

View file

@ -1,8 +1,11 @@
use super::{ParseResult, Parser};
use crate::{
sql::{Constant, Function, Ident, Value},
sql::{Constant, Function, Value},
syn::{
parser::{mac::expected, ParseError, ParseErrorKind},
parser::{
mac::{expected, unexpected},
ParseError, ParseErrorKind,
},
token::{t, Span},
},
};
@ -440,15 +443,16 @@ impl Parser<'_> {
pub async fn parse_builtin(&mut self, stk: &mut Stk, start: Span) -> ParseResult<Value> {
let mut last_span = start;
while self.eat(t!("::")) {
self.next_token_value::<Ident>()?;
let t = self.glue_ident(false)?;
if !t.kind.can_be_identifier() {
unexpected!(self, t.kind, "an identifier")
}
self.pop_peek();
last_span = self.last_span();
}
let span = start.covers(last_span);
let slice = self.lexer.reader.span(span);
// parser implementations guarantee that the slice is a valid utf8 string.
let str = std::str::from_utf8(slice).unwrap();
let str = self.span_str(span);
match PATHS.get_entry(&UniCase::ascii(str)) {
Some((_, PathKind::Constant(x))) => Ok(Value::Constant(x.clone())),

View file

@ -7,6 +7,7 @@ use crate::syn::{
use std::{
fmt::Write,
num::{ParseFloatError, ParseIntError},
ops::RangeInclusive,
};
#[derive(Debug)]
@ -48,17 +49,12 @@ pub enum ParseErrorKind {
should_close: Span,
},
/// An error for parsing an integer
InvalidInteger {
error: ParseIntError,
},
InvalidInteger(ParseIntError),
/// An error for parsing an float
InvalidFloat {
error: ParseFloatError,
},
InvalidFloat(ParseFloatError),
/// An error for parsing an decimal.
InvalidDecimal {
error: rust_decimal::Error,
},
InvalidDecimal(rust_decimal::Error),
InvalidIdent,
DisallowedStatement {
found: TokenKind,
expected: TokenKind,
@ -70,13 +66,27 @@ pub enum ParseErrorKind {
InvalidPath {
possibly: Option<&'static str>,
},
InvalidRegex(regex::Error),
MissingField {
field: Span,
idiom: String,
kind: MissingKind,
},
InvalidUuidPart {
len: usize,
},
InvalidDatetimePart {
len: usize,
},
OutrangeDatetimePart {
range: RangeInclusive<usize>,
},
TooManyNanosecondsDatetime,
InvalidDatetimeDate,
InvalidDatetimeTime,
ExceededObjectDepthLimit,
ExceededQueryDepthLimit,
DurationOverflow,
NoWhitespace,
}
@ -102,7 +112,7 @@ impl ParseError {
/// Create a rendered error from the string this error was generated from.
pub fn render_on_inner(source: &str, kind: &ParseErrorKind, at: Span) -> RenderedError {
match &kind {
match kind {
ParseErrorKind::Unexpected {
found,
expected,
@ -208,9 +218,7 @@ impl ParseError {
snippets: vec![snippet],
}
}
ParseErrorKind::InvalidInteger {
ref error,
} => {
ParseErrorKind::InvalidInteger(ref error) => {
let text = format!("failed to parse integer, {error}");
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
@ -219,9 +227,7 @@ impl ParseError {
snippets: vec![snippet],
}
}
ParseErrorKind::InvalidFloat {
ref error,
} => {
ParseErrorKind::InvalidFloat(ref error) => {
let text = format!("failed to parse floating point, {error}");
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
@ -230,9 +236,7 @@ impl ParseError {
snippets: vec![snippet],
}
}
ParseErrorKind::InvalidDecimal {
ref error,
} => {
ParseErrorKind::InvalidDecimal(ref error) => {
let text = format!("failed to parse decimal number, {error}");
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
@ -241,6 +245,15 @@ impl ParseError {
snippets: vec![snippet],
}
}
ParseErrorKind::InvalidRegex(ref error) => {
let text = format!("failed to parse regex, {error}");
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
RenderedError {
text: text.to_string(),
snippets: vec![snippet],
}
}
ParseErrorKind::NoWhitespace => {
let text = "Whitespace is dissallowed in this position";
let locations = Location::range_of_span(source, at);
@ -297,6 +310,96 @@ impl ParseError {
snippets: vec![snippet_error, snippet_hint],
}
}
ParseErrorKind::DurationOverflow => {
let text = "Duration specified exceeds maximum allowed value";
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
RenderedError {
text: text.to_string(),
snippets: vec![snippet],
}
}
ParseErrorKind::InvalidIdent => {
let text = "Duration specified exceeds maximum allowed value";
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
RenderedError {
text: text.to_string(),
snippets: vec![snippet],
}
}
ParseErrorKind::InvalidUuidPart {
len,
} => {
let text = format!(
"Uuid hex section not the correct length, needs to be {len} characters"
);
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
RenderedError {
text,
snippets: vec![snippet],
}
}
ParseErrorKind::InvalidDatetimePart {
len,
} => {
let text = format!(
"Datetime digits section not the correct length, needs to be {len} characters"
);
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
RenderedError {
text,
snippets: vec![snippet],
}
}
ParseErrorKind::OutrangeDatetimePart {
range,
} => {
let text = format!(
"Datetime digits not within valid range {}..={}",
range.start(),
range.end()
);
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
RenderedError {
text,
snippets: vec![snippet],
}
}
ParseErrorKind::TooManyNanosecondsDatetime => {
let text = "Too many digits in Datetime nanoseconds".to_owned();
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(
source,
locations,
Some("Nanoseconds can at most be 9 characters"),
);
RenderedError {
text,
snippets: vec![snippet],
}
}
ParseErrorKind::InvalidDatetimeDate => {
let text = "Invalid Datetime date".to_owned();
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
RenderedError {
text,
snippets: vec![snippet],
}
}
ParseErrorKind::InvalidDatetimeTime => {
let text = "Datetime time outside of valid time range".to_owned();
let locations = Location::range_of_span(source, at);
let snippet = Snippet::from_source_location_range(source, locations, None);
RenderedError {
text,
snippets: vec![snippet],
}
}
}
}
}
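// A hedged usage sketch of the rendering path above; `render_on` wraps
// `render_on_inner` with the error's own kind and span (see the `render_on`
// reference in `syn/mod.rs` earlier in this diff):
//
// let err = ParseError::new(ParseErrorKind::InvalidDatetimeDate, span);
// let rendered = err.render_on(source);
// // `rendered` now carries the message text plus source snippets.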

View file

@ -8,9 +8,8 @@ use crate::sql::{value::TryNeg, Cast, Expression, Number, Operator, Value};
use crate::syn::token::Token;
use crate::syn::{
parser::{mac::expected, ParseErrorKind, ParseResult, Parser},
token::{t, NumberKind, TokenKind},
token::{t, TokenKind},
};
use std::cmp::Ordering;
impl Parser<'_> {
/// Parsers a generic value.
@ -124,20 +123,47 @@ impl Parser<'_> {
fn prefix_binding_power(&mut self, token: TokenKind) -> Option<((), u8)> {
match token {
t!("!") | t!("+") | t!("-") => Some(((), 19)),
t!("<") if self.peek_token_at(1).kind != t!("FUTURE") => Some(((), 20)),
t!("<") => {
if self.peek_token_at(1).kind != t!("FUTURE") {
Some(((), 20))
} else {
None
}
}
_ => None,
}
}
async fn parse_prefix_op(&mut self, ctx: &mut Stk, min_bp: u8) -> ParseResult<Value> {
const I64_ABS_MAX: u64 = 9223372036854775808;
let token = self.next();
let token = self.peek();
let operator = match token.kind {
t!("+") => Operator::Add,
t!("-") => Operator::Neg,
t!("!") => Operator::Not,
t!("+") => {
// +123 is a single number token, so parse it as such
let p = self.peek_whitespace_token_at(1);
if matches!(p.kind, TokenKind::Digits) {
return self.next_token_value::<Number>().map(Value::Number);
}
self.pop_peek();
Operator::Add
}
t!("-") => {
// -123 is a single number token, so parse it as such
let p = self.peek_whitespace_token_at(1);
if matches!(p.kind, TokenKind::Digits) {
return self.next_token_value::<Number>().map(Value::Number);
}
self.pop_peek();
Operator::Neg
}
t!("!") => {
self.pop_peek();
Operator::Not
}
t!("<") => {
self.pop_peek();
let kind = self.parse_kind(ctx, token.span).await?;
let value = ctx.run(|ctx| self.pratt_parse_expr(ctx, min_bp)).await?;
let cast = Cast(kind, value);
@ -147,29 +173,6 @@ impl Parser<'_> {
_ => unreachable!(),
};
// HACK: The way we handle numbers in the parser has one downside: We can't parse i64::MIN
// directly.
// The tokens [`-`, `1232`] are parsed independently where - is parsed as a unary operator then 1232
// as a positive i64 integer. This results in a problem when 9223372036854775808 is the
// positive integer. This is larger than i64::MAX so the parser falls back to parsing a
// floating point number. However -9223372036854775808 does fit in an i64, but the parser,
// when parsing the number, is unaware that the number will be negative.
// To handle this correctly we parse the negation operator followed by an integer here so
// we can make sure this specific case is handled correctly.
if let Operator::Neg = operator {
// parse -12301230 immediately as a negative number,
if let TokenKind::Number(NumberKind::Integer) = self.peek_kind() {
let token = self.next();
let number = self.token_value::<u64>(token)?;
let number = match number.cmp(&I64_ABS_MAX) {
Ordering::Less => Number::Int(-(number as i64)),
Ordering::Equal => Number::Int(i64::MIN),
Ordering::Greater => self.token_value::<Number>(token)?.try_neg().unwrap(),
};
return Ok(Value::Number(number));
}
}
let v = ctx.run(|ctx| self.pratt_parse_expr(ctx, min_bp)).await?;
// HACK: For compatibility with the old parser, apply the + and - operators immediately if the
@ -195,17 +198,18 @@ impl Parser<'_> {
})))
}
}
pub fn parse_knn(&mut self, token: Token) -> ParseResult<Operator> {
let amount = self.next_token_value()?;
let op = if self.eat(t!(",")) {
let token = self.next();
match &token.kind {
TokenKind::Distance(k) => {
match self.peek_kind() {
TokenKind::Distance(ref k) => {
self.pop_peek();
let d = self.convert_distance(k).map(Some)?;
Operator::Knn(amount, d)
},
TokenKind::Number(NumberKind::Integer) => {
let ef = self.token_value(token)?;
TokenKind::Digits | TokenKind::Number(_) => {
let ef = self.next_token_value()?;
Operator::Ann(amount, ef)
}
_ => {

View file

@ -3,8 +3,11 @@ use reblessive::Stk;
use crate::{
sql::{Function, Ident, Model},
syn::{
parser::mac::{expected, unexpected},
token::{t, NumberKind, TokenKind},
parser::{
mac::{expected, expected_whitespace, unexpected},
ParseError, ParseErrorKind,
},
token::{t, TokenKind},
},
};
@ -52,26 +55,38 @@ impl Parser<'_> {
}
let start = expected!(self, t!("<")).span;
let token = self.lexer.lex_only_integer();
let major = match token.kind {
TokenKind::Number(NumberKind::Integer) => self.token_value::<u64>(token)?,
x => unexpected!(self, x, "a integer"),
let token = self.next();
let major: u32 = match token.kind {
TokenKind::Digits => std::str::from_utf8(self.lexer.reader.span(token.span))
.unwrap()
.parse()
.map_err(ParseErrorKind::InvalidInteger)
.map_err(|e| ParseError::new(e, token.span))?,
x => unexpected!(self, x, "an integer"),
};
expected!(self, t!("."));
expected_whitespace!(self, t!("."));
let token = self.lexer.lex_only_integer();
let minor = match token.kind {
TokenKind::Number(NumberKind::Integer) => self.token_value::<u64>(token)?,
x => unexpected!(self, x, "a integer"),
let token = self.next_whitespace();
let minor: u32 = match token.kind {
TokenKind::Digits => std::str::from_utf8(self.lexer.reader.span(token.span))
.unwrap()
.parse()
.map_err(ParseErrorKind::InvalidInteger)
.map_err(|e| ParseError::new(e, token.span))?,
x => unexpected!(self, x, "an integer"),
};
expected!(self, t!("."));
expected_whitespace!(self, t!("."));
let token = self.lexer.lex_only_integer();
let patch = match token.kind {
TokenKind::Number(NumberKind::Integer) => self.token_value::<u64>(token)?,
x => unexpected!(self, x, "a integer"),
let token = self.next_whitespace();
let patch: u32 = match token.kind {
TokenKind::Digits => std::str::from_utf8(self.lexer.reader.span(token.span))
.unwrap()
.parse()
.map_err(ParseErrorKind::InvalidInteger)
.map_err(|e| ParseError::new(e, token.span))?,
x => unexpected!(self, x, "an integer"),
};
self.expect_closing_delimiter(t!(">"), start)?;

View file

@ -267,14 +267,22 @@ impl Parser<'_> {
self.pop_peek();
Part::Last
}
t!("123") => Part::Index(self.next_token_value()?),
t!("+") | TokenKind::Digits | TokenKind::Number(_) => {
Part::Index(self.next_token_value()?)
}
t!("-") => {
if let TokenKind::Digits = self.peek_whitespace_token_at(1).kind {
unexpected!(self, t!("-"),"$, * or a number" => "an index can't be negative");
}
unexpected!(self, t!("-"), "$, * or a number");
}
t!("?") | t!("WHERE") => {
self.pop_peek();
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
Part::Where(value)
}
t!("$param") => Part::Value(Value::Param(self.next_token_value()?)),
TokenKind::Strand => Part::Value(Value::Strand(self.next_token_value()?)),
TokenKind::Qoute(_x) => Part::Value(Value::Strand(self.next_token_value()?)),
_ => {
let idiom = self.parse_basic_idiom()?;
Part::Value(Value::Idiom(idiom))
@ -318,10 +326,18 @@ impl Parser<'_> {
self.pop_peek();
Part::Last
}
t!("123") => {
let number = self.token_value(token)?;
TokenKind::Digits | t!("+") | TokenKind::Number(_) => {
let number = self.next_token_value()?;
Part::Index(number)
}
t!("-") => {
let peek_digit = self.peek_whitespace_token_at(1);
if let TokenKind::Digits = peek_digit.kind {
let span = self.recent_span().covers(peek_digit.span);
unexpected!(@ span, self, t!("-"),"$, * or a number" => "an index can't be negative");
}
unexpected!(self, t!("-"), "$, * or a number");
}
x => unexpected!(self, x, "$, * or a number"),
};
self.expect_closing_delimiter(t!("]"), token.span)?;
@ -356,10 +372,18 @@ impl Parser<'_> {
self.pop_peek();
Part::All
}
t!("123") => {
TokenKind::Digits | t!("+") | TokenKind::Number(_) => {
let number = self.next_token_value()?;
Part::Index(number)
}
t!("-") => {
let peek_digit = self.peek_whitespace_token_at(1);
if let TokenKind::Digits = peek_digit.kind {
let span = self.recent_span().covers(peek_digit.span);
unexpected!(@ span, self, t!("-"),"$, * or a number" => "an index can't be negative");
}
unexpected!(self, t!("-"), "$, * or a number");
}
x => unexpected!(self, x, "$, * or a number"),
};
self.expect_closing_delimiter(t!("]"), token.span)?;

View file

@ -6,34 +6,55 @@ use crate::{
sql::{Array, Ident, Object, Strand, Value},
syn::{
parser::mac::expected,
token::{t, Span, TokenKind},
token::{t, QouteKind, Span, TokenKind},
},
};
use super::{ParseResult, Parser};
use super::{mac::unexpected, ParseResult, Parser};
impl Parser<'_> {
pub async fn parse_json(&mut self, ctx: &mut Stk) -> ParseResult<Value> {
let token = self.next();
let token = self.peek();
match token.kind {
t!("NULL") => Ok(Value::Null),
t!("true") => Ok(Value::Bool(true)),
t!("false") => Ok(Value::Bool(false)),
t!("{") => self.parse_json_object(ctx, token.span).await.map(Value::Object),
t!("[") => self.parse_json_array(ctx, token.span).await.map(Value::Array),
TokenKind::Duration => self.token_value(token).map(Value::Duration),
TokenKind::DateTime => self.token_value(token).map(Value::Datetime),
TokenKind::Strand => {
t!("NULL") => {
self.pop_peek();
Ok(Value::Null)
}
t!("true") => {
self.pop_peek();
Ok(Value::Bool(true))
}
t!("false") => {
self.pop_peek();
Ok(Value::Bool(false))
}
t!("{") => {
self.pop_peek();
self.parse_json_object(ctx, token.span).await.map(Value::Object)
}
t!("[") => {
self.pop_peek();
self.parse_json_array(ctx, token.span).await.map(Value::Array)
}
TokenKind::Qoute(QouteKind::Plain | QouteKind::PlainDouble) => {
let strand: Strand = self.next_token_value()?;
if self.legacy_strands {
self.parse_legacy_strand(ctx).await
} else {
Ok(Value::Strand(Strand(self.lexer.string.take().unwrap())))
if let Some(x) = self.reparse_legacy_strand(ctx, &strand.0).await {
return Ok(x);
}
}
Ok(Value::Strand(strand))
}
TokenKind::Digits | TokenKind::Number(_) => {
let peek = self.glue()?;
match peek.kind {
TokenKind::Duration => Ok(Value::Duration(self.next_token_value()?)),
TokenKind::Number(_) => Ok(Value::Number(self.next_token_value()?)),
x => unexpected!(self, x, "a number"),
}
}
TokenKind::Number(_) => self.token_value(token).map(Value::Number),
TokenKind::Uuid => self.token_value(token).map(Value::Uuid),
_ => {
let ident = self.token_value::<Ident>(token)?.0;
let ident = self.next_token_value::<Ident>()?.0;
self.parse_thing_from_ident(ctx, ident).await.map(Value::Thing)
}
}

View file

@ -1,12 +1,21 @@
/// A macro for indicating that the parser encountered a token which it didn't expect.
macro_rules! unexpected {
($parser:expr, $found:expr, $expected:expr) => {
(@ $span:expr, $parser:expr, $found:expr, $expected:expr $(=> $explain:expr)?) => {{
unexpected!(@@withSpan, $span, $parser,$found, $expected $(=> $explain)?)
}};
($parser:expr, $found:expr, $expected:expr $(=> $explain:expr)?) => {{
let span = $parser.recent_span();
unexpected!(@@withSpan, span, $parser,$found, $expected $(=> $explain)?)
}};
(@@withSpan, $span:expr, $parser:expr, $found:expr, $expected:expr) => {
match $found {
$crate::syn::token::TokenKind::Invalid => {
let error = $parser.lexer.error.take().unwrap();
return Err($crate::syn::parser::ParseError::new(
$crate::syn::parser::ParseErrorKind::InvalidToken(error),
$parser.recent_span(),
$span
));
}
$crate::syn::token::TokenKind::Eof => {
@ -15,7 +24,7 @@ macro_rules! unexpected {
$crate::syn::parser::ParseErrorKind::UnexpectedEof {
expected,
},
$parser.recent_span(),
$span
));
}
x => {
@ -25,7 +34,39 @@ macro_rules! unexpected {
found: x,
expected,
},
$parser.recent_span(),
$span
));
}
}
};
(@@withSpan, $span:expr, $parser:expr, $found:expr, $expected:expr => $explain:expr) => {
match $found {
$crate::syn::token::TokenKind::Invalid => {
let error = $parser.lexer.error.take().unwrap();
return Err($crate::syn::parser::ParseError::new(
$crate::syn::parser::ParseErrorKind::InvalidToken(error),
$span
));
}
$crate::syn::token::TokenKind::Eof => {
let expected = $expected;
return Err($crate::syn::parser::ParseError::new(
$crate::syn::parser::ParseErrorKind::UnexpectedEof {
expected,
},
$span
));
}
x => {
let expected = $expected;
return Err($crate::syn::parser::ParseError::new(
$crate::syn::parser::ParseErrorKind::UnexpectedExplain {
found: x,
expected,
explain: $explain,
},
$span
));
}
}
@ -64,6 +105,38 @@ macro_rules! expected {
}};
}
/// A macro for requiring a certain token to be next, returning an error otherwise.
/// Unlike [`expected!`], this macro does not skip whitespace tokens.
macro_rules! expected_whitespace {
($parser:expr, $($kind:tt)*) => {{
let token = $parser.next_whitespace();
match token.kind {
$($kind)* => token,
$crate::syn::parser::TokenKind::Invalid => {
let error = $parser.lexer.error.take().unwrap();
return Err($crate::syn::parser::ParseError::new(
$crate::syn::parser::ParseErrorKind::InvalidToken(error),
$parser.recent_span(),
));
}
x => {
let expected = $($kind)*.as_str();
let kind = if let $crate::syn::token::TokenKind::Eof = x {
$crate::syn::parser::ParseErrorKind::UnexpectedEof {
expected,
}
} else {
$crate::syn::parser::ParseErrorKind::Unexpected {
found: x,
expected,
}
};
return Err($crate::syn::parser::ParseError::new(kind, $parser.last_span()));
}
}
}};
}
#[cfg(test)]
#[macro_export]
macro_rules! test_parse {
@ -149,40 +222,8 @@ macro_rules! enter_query_recursion {
}};
}
#[macro_export]
macro_rules! enter_flexible_ident{
($name:ident = $this:expr => ($enabled:expr){ $($t:tt)* }) => {{
struct Dropper<'a, 'b>(&'a mut $crate::syn::parser::Parser<'b>,bool);
impl Drop for Dropper<'_, '_> {
fn drop(&mut self) {
self.0.lexer.flexible_ident = self.1;
}
}
impl<'a> ::std::ops::Deref for Dropper<'_,'a>{
type Target = $crate::syn::parser::Parser<'a>;
fn deref(&self) -> &Self::Target{
self.0
}
}
impl<'a> ::std::ops::DerefMut for Dropper<'_,'a>{
fn deref_mut(&mut self) -> &mut Self::Target{
self.0
}
}
let enabled = $this.lexer.flexible_ident;
$this.lexer.flexible_ident = $enabled;
#[allow(unused_mut)]
let mut $name = Dropper($this,enabled);
{
$($t)*
}
}};
}
pub(super) use expected;
pub(super) use expected_whitespace;
pub(super) use unexpected;
#[cfg(test)]

View file

@ -4,6 +4,8 @@
//! Most of the functions of the SurrealQL parser peek a token from the lexer and then decide to
//! take a path depending on which token is next.
//!
//! # Implementation Details
//!
//! There are a bunch of common patterns for which this module has some convenience functions.
//! - Whenever only one token can be next you should use the [`expected!`] macro. This macro
//! ensures that the given token type is next and if not returns a parser error.
@ -15,6 +17,39 @@
//! - If a closing delimiting token is expected use [`Parser::expect_closing_delimiter`]. This
//! function will raise an error if the expected delimiter isn't the next token. This error will
//! also point to which delimiter the parser expected to be closed.
//!
//! ## Far Token Peek
//!
//! Occasionally the parser needs to check further ahead than peeking allows.
//! This is done with the [`Parser::peek_token_at`] function. This function peeks a given number
//! of tokens ahead of the current one, up to 3 tokens further than a normal peek.
//!
//! ## WhiteSpace Tokens
//!
//! The lexer produces whitespace tokens. These are tokens which are normally ignored in most
//! places in the syntax, as they have no bearing on the meaning of a statement. [`Parser::next`]
//! and [`Parser::peek`] automatically skip over any whitespace tokens. However in some places,
//! like in a record-id and when gluing tokens, these whitespace tokens are required for correct
//! parsing. In those cases [`Parser::next_whitespace`] and the other `_whitespace` functions
//! are used. These functions don't skip whitespace tokens, but they also do not undo whitespace
//! tokens which might already have been skipped. Implementers must be careful not to call a
//! function which requires whitespace tokens when those tokens may already have been skipped.
//!
//! ## Token Gluing
//!
//! Tokens produced by the lexer are in some places more fine-grained than normal. Numbers,
//! identifiers and strand-like productions can be made up of multiple smaller tokens. A
//! floating point number, for example, can be made up of at most 3 digits tokens, a dot token,
//! an exponent token, a number suffix token and two `-` or `+` tokens. Whenever these tokens
//! are required the parser calls a `glue_` method which will take the current peeked token and
//! replace it with a more complex glued-together token if possible.
//!
//! ## Use of reblessive
//!
//! This parser uses reblessive to be able to parse deeply nested queries without overflowing
//! the stack. This means that all functions which might recurse, i.e. which on some path can
//! call themselves again, are async functions taking an argument from reblessive, so that
//! recursive calls don't consume more native stack with each level of depth.
use self::token_buffer::TokenBuffer;
use crate::{
@ -39,6 +74,7 @@ mod object;
mod prime;
mod stmt;
mod thing;
mod token;
mod token_buffer;
#[cfg(test)]
@ -99,16 +135,23 @@ impl<'a> Parser<'a> {
}
}
/// Disallow a query to have objects deeper that limit.
/// Arrays also count towards objects. So `[{foo: [] }]` would be 3 deep.
pub fn with_object_recursion_limit(mut self, limit: usize) -> Self {
self.object_recursion = limit;
self
}
/// Disallow a query from being deeper than the given limit.
/// A query recurses when a statement contains another statement within itself.
/// Examples are subqueries and block-like constructs such as block statements and if statements.
pub fn with_query_recursion_limit(mut self, limit: usize) -> Self {
self.query_recursion = limit;
self
}
/// Parse strands like the old parser, where a strand which looks like a UUID, record-id, or
/// datetime will be parsed as that value instead of as a plain strand.
pub fn with_allow_legacy_strand(mut self, value: bool) -> Self {
self.legacy_strands = value;
self
@ -120,6 +163,8 @@ impl<'a> Parser<'a> {
}
/// Set whether to allow record-ids which don't adhere to regular ident rules.
/// Setting this to true will allow parsing of, for example, `foo:0bar`. This would be rejected
/// by normal identifier rules as most identifiers can't start with a number.
pub fn allow_fexible_record_id(&mut self, value: bool) {
self.flexible_record_id = value;
}
@ -149,11 +194,31 @@ impl<'a> Parser<'a> {
/// Returns the next token and advance the parser one token forward.
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self) -> Token {
let res = loop {
let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
if res.kind != TokenKind::WhiteSpace {
break res;
}
};
self.last_span = res.span;
res
}
/// Returns the next token and advance the parser one token forward.
///
/// This function is like next but returns whitespace tokens which are normally skipped.
#[allow(clippy::should_implement_trait)]
pub fn next_whitespace(&mut self) -> Token {
let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
self.last_span = res.span;
res
}
/// Returns if there is a token in the token buffer, meaning that a token was peeked.
pub fn has_peek(&self) -> bool {
!self.token_buffer.is_empty()
}
/// Consume the current peeked value and advance the parser one token forward.
///
/// Should only be called after peeking a value.
@ -165,6 +230,30 @@ impl<'a> Parser<'a> {
/// Returns the next token without consuming it.
pub fn peek(&mut self) -> Token {
loop {
let Some(x) = self.token_buffer.first() else {
let res = loop {
let res = self.lexer.next_token();
if res.kind != TokenKind::WhiteSpace {
break res;
}
};
self.token_buffer.push(res);
return res;
};
if x.kind == TokenKind::WhiteSpace {
self.token_buffer.pop();
continue;
}
break x;
}
}
/// Returns the next token without consuming it.
///
/// This function is like peek but returns whitespace tokens which are normally skipped.
/// Does not undo tokens skipped in a previous normal peek.
pub fn peek_whitespace(&mut self) -> Token {
let Some(x) = self.token_buffer.first() else {
let res = self.lexer.next_token();
self.token_buffer.push(res);
@ -175,19 +264,30 @@ impl<'a> Parser<'a> {
/// Return the token kind of the next token without consuming it.
pub fn peek_kind(&mut self) -> TokenKind {
let Some(x) = self.token_buffer.first().map(|x| x.kind) else {
let res = self.lexer.next_token();
self.token_buffer.push(res);
return res.kind;
};
x
self.peek().kind
}
/// Returns the next n'th token without consuming it.
/// `peek_token_at(0)` is equivalent to `peek`.
pub fn peek_token_at(&mut self, at: u8) -> Token {
for _ in self.token_buffer.len()..=at {
self.token_buffer.push(self.lexer.next_token());
let r = loop {
let r = self.lexer.next_token();
if r.kind != TokenKind::WhiteSpace {
break r;
}
};
self.token_buffer.push(r);
}
self.token_buffer.at(at).unwrap()
}
/// Returns the next n'th token without consuming it, including whitespace tokens.
/// `peek_whitespace_token_at(0)` is equivalent to `peek_whitespace`.
pub fn peek_whitespace_token_at(&mut self, at: u8) -> Token {
for _ in self.token_buffer.len()..=at {
let r = self.lexer.next_token();
self.token_buffer.push(r);
}
self.token_buffer.at(at).unwrap()
}
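// A hedged illustration of the peek variants above: for the input `a  b`,
// after consuming `a`, `peek_whitespace` surfaces the whitespace token while
// `peek` skips it and surfaces the token for `b`.
//
// let mut parser = Parser::new(b"a  b");
// let _a = parser.next();
// assert_eq!(parser.peek_whitespace().kind, TokenKind::WhiteSpace);
// assert_ne!(parser.peek().kind, TokenKind::WhiteSpace);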
@ -206,14 +306,49 @@ impl<'a> Parser<'a> {
/// Eat the next token if it is of the given kind.
/// Returns whether a token was eaten.
pub fn eat(&mut self, token: TokenKind) -> bool {
if token == self.peek().kind {
let peek = self.peek();
if token == peek.kind {
self.token_buffer.pop();
self.last_span = peek.span;
true
} else {
false
}
}
/// Eat the next token if it is of the given kind.
/// Returns whether a token was eaten.
///
/// Unlike [`Parser::eat`] this doesn't skip whitespace tokens
pub fn eat_whitespace(&mut self, token: TokenKind) -> bool {
let peek = self.peek_whitespace();
if token == peek.kind {
self.token_buffer.pop();
self.last_span = peek.span;
true
} else {
false
}
}
/// Forces the next token to be the given one.
/// Used in token gluing to replace the current one with the glued token.
fn prepend_token(&mut self, token: Token) {
self.token_buffer.push_front(token);
}
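// A hedged sketch (not the actual glue implementation, which lives in the
// `token` submodule) of how a `glue_` method is expected to use
// `prepend_token`: consume the fine-grained tokens, build one token whose span
// covers them all, and push it back so the next `peek` sees the glued token.
//
// let start = self.pop_peek();      // e.g. a `Digits` token
// let end = self.next_whitespace(); // e.g. the trailing number suffix
// self.prepend_token(Token {
//     kind: TokenKind::Number(NumberKind::Float),
//     span: start.span.covers(end.span),
// });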
/// Returns the string for a given span of the source.
/// Will panic if the given span was not valid for the source, or invalid utf8
fn span_str(&self, span: Span) -> &'a str {
std::str::from_utf8(self.span_bytes(span)).expect("invalid span segment for source")
}
/// Returns the bytes for a given span of the source.
/// Will panic if the given span was not valid for the source.
fn span_bytes(&self, span: Span) -> &'a [u8] {
self.lexer.reader.span(span)
}
/// Checks if the next token is of the given kind. If it isn't it returns a UnclosedDelimiter
/// error.
fn expect_closing_delimiter(&mut self, kind: TokenKind, should_close: Span) -> ParseResult<()> {
@ -229,17 +364,6 @@ impl<'a> Parser<'a> {
Ok(())
}
/// Ensure that there was no whitespace parser between the last token and the current one.
///
/// This is used in places where whitespace is prohibited like inside a record id.
fn no_whitespace(&mut self) -> ParseResult<()> {
if let Some(span) = self.lexer.whitespace_span() {
Err(ParseError::new(ParseErrorKind::NoWhitespace, span))
} else {
Ok(())
}
}
/// Recover the parser state to after a given span.
pub fn backup_after(&mut self, span: Span) {
self.token_buffer.clear();
@ -295,7 +419,8 @@ impl<'a> Parser<'a> {
}) => {
// Ensure that the last token was fully parsed.
self.backup_after(at);
if self.peek().kind != TokenKind::Eof || self.lexer.whitespace_span().is_some() {
let peek = self.peek_whitespace();
if peek.kind != TokenKind::Eof && peek.kind != TokenKind::WhiteSpace {
// if there is a next token which is neither whitespace nor eof we can be sure
// that the error is not the result of a token only being partially present.
return PartialResult::Ready {

View file

@ -5,7 +5,7 @@ use reblessive::Stk;
use crate::{
enter_object_recursion,
sql::{Block, Geometry, Object, Strand, Value},
sql::{Block, Geometry, Number, Object, Strand, Value},
syn::{
parser::{mac::expected, ParseError, ParseErrorKind, ParseResult, Parser},
token::{t, Span, TokenKind},
@ -30,7 +30,10 @@ impl Parser<'_> {
})
}
// Check first if it can be an object.
// glue possible complex tokens.
self.glue()?;
// Now check if it can be an object.
if self.peek_token_at(1).kind == t!(":") {
enter_object_recursion!(this = self => {
return this.parse_object_or_geometry(ctx, start).await;
@ -41,6 +44,373 @@ impl Parser<'_> {
self.parse_block(ctx, start).await.map(Box::new).map(Value::Block)
}
async fn parse_object_or_geometry_after_type(
&mut self,
ctx: &mut Stk,
start: Span,
key: String,
) -> ParseResult<Value> {
expected!(self, t!(":"));
// for it to be a geometry the next value must be strand-like.
let (t!("\"") | t!("'")) = self.peek_kind() else {
return self
.parse_object_from_key(ctx, key, BTreeMap::new(), start)
.await
.map(Value::Object);
};
// We know it is a strand so check if the type is one of the allowed geometry types.
// If it is, most of these take roughly the same type of value and produce a
// similar output, which is parsed with parse_geometry_after_type.
//
// GeometryCollection however has a different object key for its value, so it is handled
// apart from the others.
let type_value = self.next_token_value::<Strand>()?.0;
match type_value.as_str() {
"Point" => {
// we matched a type correctly but the field containing the geometry value
// can still be wrong.
//
// we can unwrap strand since we just matched it to not be an err.
self.parse_geometry_after_type(ctx, start, key, type_value, Self::to_point, |x| {
Value::Geometry(Geometry::Point(x))
})
.await
}
"LineString" => {
self.parse_geometry_after_type(ctx, start, key, type_value, Self::to_line, |x| {
Value::Geometry(Geometry::Line(x))
})
.await
}
"Polygon" => {
self.parse_geometry_after_type(ctx, start, key, type_value, Self::to_polygon, |x| {
Value::Geometry(Geometry::Polygon(x))
})
.await
}
"MultiPoint" => {
self.parse_geometry_after_type(
ctx,
start,
key,
type_value,
Self::to_multipoint,
|x| Value::Geometry(Geometry::MultiPoint(x)),
)
.await
}
"MultiLineString" => {
self.parse_geometry_after_type(
ctx,
start,
key,
type_value,
Self::to_multiline,
|x| Value::Geometry(Geometry::MultiLine(x)),
)
.await
}
"MultiPolygon" => {
self.parse_geometry_after_type(
ctx,
start,
key,
type_value,
Self::to_multipolygon,
|x| Value::Geometry(Geometry::MultiPolygon(x)),
)
.await
}
"GeometryCollection" => {
if !self.eat(t!(",")) {
// missing next field, not a geometry.
return self
.parse_object_from_map(
ctx,
BTreeMap::from([(key, Value::Strand(type_value.into()))]),
start,
)
.await
.map(Value::Object);
}
let coord_key = self.parse_object_key()?;
if coord_key != "geometries" {
expected!(self, t!(":"));
// invalid field key, not a Geometry
return self
.parse_object_from_key(
ctx,
coord_key,
BTreeMap::from([(key, Value::Strand(type_value.into()))]),
start,
)
.await
.map(Value::Object);
}
expected!(self, t!(":"));
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
// check for an object end, if it doesn't end it is not a geometry.
if !self.eat(t!(",")) {
self.expect_closing_delimiter(t!("}"), start)?;
} else {
if self.peek_kind() != t!("}") {
// A comma and then no brace: more than two fields, not a geometry.
return self
.parse_object_from_map(
ctx,
BTreeMap::from([
(key, Value::Strand(type_value.into())),
(coord_key, value),
]),
start,
)
.await
.map(Value::Object);
}
self.pop_peek();
}
// try to convert to the right value.
if let Value::Array(x) = value {
// test first to avoid cloning.
if x.iter().all(|x| matches!(x, Value::Geometry(_))) {
let geometries =
x.0.into_iter()
.map(|x| {
if let Value::Geometry(x) = x {
x
} else {
unreachable!()
}
})
.collect();
return Ok(Value::Geometry(Geometry::Collection(geometries)));
}
return Ok(Value::Object(Object(BTreeMap::from([
(key, Value::Strand(type_value.into())),
(coord_key, Value::Array(x)),
]))));
}
// Couldn't convert so it is a normal object.
Ok(Value::Object(Object(BTreeMap::from([
(key, Value::Strand(type_value.into())),
(coord_key, value),
]))))
}
// key was not one of the allowed keys so it is a normal object.
_ => {
let object = BTreeMap::from([(key, Value::Strand(type_value.into()))]);
if self.eat(t!(",")) {
self.parse_object_from_map(ctx, object, start).await.map(Value::Object)
} else {
self.expect_closing_delimiter(t!("}"), start)?;
Ok(Value::Object(Object(object)))
}
}
}
}
async fn parse_object_or_geometry_after_coordinates(
&mut self,
ctx: &mut Stk,
start: Span,
key: String,
) -> ParseResult<Value> {
expected!(self, t!(":"));
// found a coordinates field, the next value must be coordinates but we don't know
// of which kind until we match the type.
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
if !self.eat(t!(",")) {
// no comma, the object must end early.
self.expect_closing_delimiter(t!("}"), start)?;
return Ok(Value::Object(Object(BTreeMap::from([(key, value)]))));
}
if self.eat(t!("}")) {
// object ends early.
return Ok(Value::Object(Object(BTreeMap::from([(key, value)]))));
}
let type_key = self.parse_object_key()?;
if type_key != "type" {
expected!(self, t!(":"));
// not the right field, return object.
return self
.parse_object_from_key(ctx, type_key, BTreeMap::from([(key, value)]), start)
.await
.map(Value::Object);
}
expected!(self, t!(":"));
let (t!("\"") | t!("'")) = self.peek_kind() else {
// not the right kind of value, fall back to parsing an object.
return self
.parse_object_from_key(ctx, type_key, BTreeMap::from([(key, value)]), start)
.await
.map(Value::Object);
};
let type_value = self.next_token_value::<Strand>()?.0;
let ate_comma = self.eat(t!(","));
// match the type and then match the coordinates field to a value of that type.
match type_value.as_str() {
"Point" => {
if self.eat(t!("}")) {
if let Some(point) = Self::to_point(&value) {
return Ok(Value::Geometry(Geometry::Point(point)));
}
}
}
"LineString" => {
if self.eat(t!("}")) {
if let Some(point) = Self::to_line(&value) {
return Ok(Value::Geometry(Geometry::Line(point)));
}
}
}
"Polygon" => {
if self.eat(t!("}")) {
if let Some(point) = Self::to_polygon(&value) {
return Ok(Value::Geometry(Geometry::Polygon(point)));
}
}
}
"MultiPoint" => {
if self.eat(t!("}")) {
if let Some(point) = Self::to_multipoint(&value) {
return Ok(Value::Geometry(Geometry::MultiPoint(point)));
}
}
}
"MultiLineString" => {
if self.eat(t!("}")) {
if let Some(point) = Self::to_multiline(&value) {
return Ok(Value::Geometry(Geometry::MultiLine(point)));
}
}
}
"MultiPolygon" => {
if self.eat(t!("}")) {
if let Some(point) = Self::to_multipolygon(&value) {
return Ok(Value::Geometry(Geometry::MultiPolygon(point)));
}
}
}
_ => {}
};
// type field or coordinates value didn't match, or the object continues after two
// fields.
if !ate_comma {
self.expect_closing_delimiter(t!("}"), start)?;
return Ok(Value::Object(Object(BTreeMap::from([
(key, value),
(type_key, Value::Strand(type_value.into())),
]))));
}
self.parse_object_from_map(
ctx,
BTreeMap::from([(key, value), (type_key, Value::Strand(type_value.into()))]),
start,
)
.await
.map(Value::Object)
}
async fn parse_object_or_geometry_after_geometries(
&mut self,
ctx: &mut Stk,
start: Span,
key: String,
) -> ParseResult<Value> {
// 'geometries' key can only happen in a GeometryCollection, so try to parse that.
expected!(self, t!(":"));
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
// if the object ends here, it is not a geometry.
if !self.eat(t!(",")) || self.peek_kind() == t!("}") {
self.expect_closing_delimiter(t!("}"), start)?;
return Ok(Value::Object(Object(BTreeMap::from([(key, value)]))));
}
// parse the next object key
let type_key = self.parse_object_key()?;
// if it isn't 'type' this object is not a geometry, so bail.
if type_key != "type" {
expected!(self, t!(":"));
return self
.parse_object_from_key(ctx, type_key, BTreeMap::from([(key, value)]), start)
.await
.map(Value::Object);
}
expected!(self, t!(":"));
// check if the next value is a strand.
let (t!("\"") | t!("'")) = self.peek_kind() else {
// not the right kind of value, fall back to parsing an object.
return self
.parse_object_from_key(ctx, type_key, BTreeMap::from([(key, value)]), start)
.await
.map(Value::Object);
};
let type_value = self.next_token_value::<Strand>()?.0;
let ate_comma = self.eat(t!(","));
if type_value == "GeometryCollection" && self.eat(t!("}")) {
if let Value::Array(ref x) = value {
if x.iter().all(|x| matches!(x, Value::Geometry(_))) {
let Value::Array(x) = value else {
unreachable!()
};
let geometries = x
.into_iter()
.map(|x| {
if let Value::Geometry(x) = x {
x
} else {
unreachable!()
}
})
.collect();
return Ok(Value::Geometry(Geometry::Collection(geometries)));
}
}
}
// Either the type value or the geometry value didn't match.
// Either way, the current object is not a geometry.
if !ate_comma {
self.expect_closing_delimiter(t!("}"), start)?;
return Ok(Value::Object(Object(BTreeMap::from([
(key, value),
(type_key, Value::Strand(type_value.into())),
]))));
}
self.parse_object_from_map(
ctx,
BTreeMap::from([(key, value), (type_key, Value::Strand(type_value.into()))]),
start,
)
.await
.map(Value::Object)
}
/// Parse a production starting with an `{` as either an object or a geometry.
///
/// This function tries to match an object to a geometry-like object and if it is unable
@ -48,374 +418,20 @@ impl Parser<'_> {
async fn parse_object_or_geometry(&mut self, ctx: &mut Stk, start: Span) -> ParseResult<Value> {
// empty object was already matched previously so next must be a key.
let key = self.parse_object_key()?;
expected!(self, t!(":"));
// the order of fields of a geometry does not matter, so check if the key is any of the geometry-like keys:
// "type": could be the type of the geometry.
// "coordinates": could be the coordinates of the geometry.
// "geometries": could be the members of a geometry collection.
match key.as_str() {
"type" => {
// for it to be a geometry the next value must be strand-like.
let token = self.peek();
let strand = self.token_value::<Strand>(token);
match strand.as_ref().map(|x| x.as_str()) {
Ok("Point") => {
// we matched a type correctly but the field containing the geometry value
// can still be wrong.
//
// we can unwrap strand since we just matched it to not be an err.
self.parse_geometry_after_type(
ctx,
start,
key,
strand.unwrap(),
Self::to_point,
|x| Value::Geometry(Geometry::Point(x)),
)
.await
}
Ok("LineString") => {
self.parse_geometry_after_type(
ctx,
start,
key,
strand.unwrap(),
Self::to_line,
|x| Value::Geometry(Geometry::Line(x)),
)
.await
}
Ok("Polygon") => {
self.parse_geometry_after_type(
ctx,
start,
key,
strand.unwrap(),
Self::to_polygon,
|x| Value::Geometry(Geometry::Polygon(x)),
)
.await
}
Ok("MultiPoint") => {
self.parse_geometry_after_type(
ctx,
start,
key,
strand.unwrap(),
Self::to_multipoint,
|x| Value::Geometry(Geometry::MultiPoint(x)),
)
.await
}
Ok("MultiLineString") => {
self.parse_geometry_after_type(
ctx,
start,
key,
strand.unwrap(),
Self::to_multiline,
|x| Value::Geometry(Geometry::MultiLine(x)),
)
.await
}
Ok("MultiPolygon") => {
self.parse_geometry_after_type(
ctx,
start,
key,
strand.unwrap(),
Self::to_multipolygon,
|x| Value::Geometry(Geometry::MultiPolygon(x)),
)
.await
}
Ok("GeometryCollection") => {
self.next();
let strand = strand.unwrap();
if !self.eat(t!(",")) {
// missing next field, not a geometry.
return self
.parse_object_from_map(
ctx,
BTreeMap::from([(key, Value::Strand(strand))]),
start,
)
.await
.map(Value::Object);
}
let coord_key = self.parse_object_key()?;
expected!(self, t!(":"));
if coord_key != "geometries" {
// invalid field key, not a Geometry
return self
.parse_object_from_key(
ctx,
coord_key,
BTreeMap::from([(key, Value::Strand(strand))]),
start,
)
.await
.map(Value::Object);
}
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
let comma = self.eat(t!(","));
if !self.eat(t!("}")) {
if !comma {
// No brace after no comma, missing brace.
return Err(ParseError::new(
ParseErrorKind::UnclosedDelimiter {
expected: t!("}"),
should_close: start,
},
self.last_span(),
));
}
// A comma and then no brace: more than two fields, not a geometry.
return self
.parse_object_from_map(
ctx,
BTreeMap::from([
(key, Value::Strand(strand)),
(coord_key, value),
]),
start,
)
.await
.map(Value::Object);
}
if let Value::Array(x) = value {
// test first to avoid cloning.
if x.iter().all(|x| matches!(x, Value::Geometry(_))) {
let geometries =
x.0.into_iter()
.map(|x| {
if let Value::Geometry(x) = x {
x
} else {
unreachable!()
}
})
.collect();
return Ok(Value::Geometry(Geometry::Collection(geometries)));
}
return Ok(Value::Object(Object(BTreeMap::from([
(key, Value::Strand(strand)),
(coord_key, Value::Array(x)),
]))));
}
Ok(Value::Object(Object(BTreeMap::from([
(key, Value::Strand(strand)),
(coord_key, value),
]))))
}
Ok(_) => {
self.pop_peek();
if !self.eat(t!(",")) {
self.expect_closing_delimiter(t!("}"), start)?;
Ok(Value::Object(Object(BTreeMap::from([(
key,
Value::Strand(strand.unwrap()),
)]))))
} else {
self.parse_object_from_map(
ctx,
BTreeMap::from([(key, Value::Strand(strand.unwrap()))]),
start,
)
.await
.map(Value::Object)
}
}
_ => self
.parse_object_from_key(ctx, key, BTreeMap::new(), start)
.await
.map(Value::Object),
}
}
"coordinates" => {
// found a coordinates field, the next value must be coordinates but we don't know
// of which kind until we match the type.
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
if !self.eat(t!(",")) {
// no comma, the object must end early.
self.expect_closing_delimiter(t!("}"), start)?;
return Ok(Value::Object(Object(BTreeMap::from([(key, value)]))));
}
if self.eat(t!("}")) {
// object ends early.
return Ok(Value::Object(Object(BTreeMap::from([(key, value)]))));
}
let type_key = self.parse_object_key()?;
"type" => self.parse_object_or_geometry_after_type(ctx, start, key).await,
"coordinates" => self.parse_object_or_geometry_after_coordinates(ctx, start, key).await,
"geometries" => self.parse_object_or_geometry_after_geometries(ctx, start, key).await,
_ => {
expected!(self, t!(":"));
if type_key != "type" {
// not the right field, return object.
return self
.parse_object_from_key(ctx, type_key, BTreeMap::from([(key, value)]), start)
.await
.map(Value::Object);
}
let peek = self.peek();
let strand = self.token_value::<Strand>(peek);
// match the type and then match the coordinates field to a value of that type.
let (ate_comma, type_value) = match strand.as_ref().map(|x| x.as_str()) {
Ok("Point") => {
self.next();
let ate_comma = self.eat(t!(","));
if self.eat(t!("}")) {
if let Some(point) = Self::to_point(&value) {
return Ok(Value::Geometry(Geometry::Point(point)));
}
}
// At this point the value does not match, or there are more fields.
// since we matched `Ok("Point")` strand cannot be an error so this unwrap
// will never panic.
(ate_comma, Value::Strand(strand.unwrap()))
}
Ok("LineString") => {
self.next();
let ate_comma = self.eat(t!(","));
if self.eat(t!("}")) {
if let Some(point) = Self::to_line(&value) {
return Ok(Value::Geometry(Geometry::Line(point)));
}
}
(ate_comma, Value::Strand(strand.unwrap()))
}
Ok("Polygon") => {
self.next();
let ate_comma = self.eat(t!(","));
if self.eat(t!("}")) {
if let Some(point) = Self::to_polygon(&value) {
return Ok(Value::Geometry(Geometry::Polygon(point)));
}
}
(ate_comma, Value::Strand(strand.unwrap()))
}
Ok("MultiPoint") => {
self.next();
let ate_comma = self.eat(t!(","));
if self.eat(t!("}")) {
if let Some(point) = Self::to_multipolygon(&value) {
return Ok(Value::Geometry(Geometry::MultiPolygon(point)));
}
}
(ate_comma, Value::Strand(strand.unwrap()))
}
Ok("MultiLineString") => {
self.next();
let ate_comma = self.eat(t!(","));
if self.eat(t!("}")) {
if let Some(point) = Self::to_multiline(&value) {
return Ok(Value::Geometry(Geometry::MultiLine(point)));
}
}
(ate_comma, Value::Strand(strand.unwrap()))
}
Ok("MultiPolygon") => {
self.next();
let ate_comma = self.eat(t!(","));
if self.eat(t!("}")) {
if let Some(point) = Self::to_multipolygon(&value) {
return Ok(Value::Geometry(Geometry::MultiPolygon(point)));
}
}
(ate_comma, Value::Strand(strand.unwrap()))
}
_ => {
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
(self.eat(t!(",")), value)
}
};
// type field or coordinates value didn't match or the object continues after to
// fields.
if !ate_comma {
self.expect_closing_delimiter(t!("}"), start)?;
return Ok(Value::Object(Object(BTreeMap::from([
(key, value),
(type_key, type_value),
]))));
}
self.parse_object_from_map(
ctx,
BTreeMap::from([(key, value), (type_key, type_value)]),
start,
)
.await
.map(Value::Object)
self.parse_object_from_key(ctx, key, BTreeMap::new(), start)
.await
.map(Value::Object)
}
"geometries" => {
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
if !self.eat(t!(",")) {
self.expect_closing_delimiter(t!("}"), start)?;
return Ok(Value::Object(Object(BTreeMap::from([(key, value)]))));
}
let type_key = self.parse_object_key()?;
expected!(self, t!(":"));
if type_key != "type" {
return self
.parse_object_from_key(ctx, type_key, BTreeMap::from([(key, value)]), start)
.await
.map(Value::Object);
}
let peek = self.peek();
let strand = self.token_value::<Strand>(peek);
let (ate_comma, type_value) =
if let Ok("GeometryCollection") = strand.as_ref().map(|x| x.as_str()) {
self.next();
let ate_comma = self.eat(t!(","));
if self.eat(t!("}")) {
if let Value::Array(ref x) = value {
if x.iter().all(|x| matches!(x, Value::Geometry(_))) {
let Value::Array(x) = value else {
unreachable!()
};
let geometries = x
.into_iter()
.map(|x| {
if let Value::Geometry(x) = x {
x
} else {
unreachable!()
}
})
.collect();
return Ok(Value::Geometry(Geometry::Collection(geometries)));
}
}
}
(ate_comma, Value::Strand(strand.unwrap()))
} else {
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
(self.eat(t!(",")), value)
};
if !ate_comma {
self.expect_closing_delimiter(t!("}"), start)?;
return Ok(Value::Object(Object(BTreeMap::from([
(key, value),
(type_key, type_value),
]))));
}
self.parse_object_from_map(
ctx,
BTreeMap::from([(key, value), (type_key, type_value)]),
start,
)
.await
.map(Value::Object)
}
_ => self
.parse_object_from_key(ctx, key, BTreeMap::new(), start)
.await
.map(Value::Object),
}
}
@ -424,7 +440,7 @@ impl Parser<'_> {
ctx: &mut Stk,
start: Span,
key: String,
strand: Strand,
strand: String,
capture: F,
map: Fm,
) -> ParseResult<Value>
@ -432,27 +448,29 @@ impl Parser<'_> {
F: FnOnce(&Value) -> Option<R>,
Fm: FnOnce(R) -> Value,
{
// eat the strand with the type name.
self.next();
if !self.eat(t!(",")) {
// there is not second field. not a geometry
self.expect_closing_delimiter(t!("}"), start)?;
return Ok(Value::Object(Object(BTreeMap::from([(key, Value::Strand(strand))]))));
return Ok(Value::Object(Object(BTreeMap::from([(
key,
Value::Strand(strand.into()),
)]))));
}
let coord_key = self.parse_object_key()?;
expected!(self, t!(":"));
if coord_key != "coordinates" {
expected!(self, t!(":"));
// next field was not correct, fallback to parsing plain object.
return self
.parse_object_from_key(
ctx,
coord_key,
BTreeMap::from([(key, Value::Strand(strand))]),
BTreeMap::from([(key, Value::Strand(strand.into()))]),
start,
)
.await
.map(Value::Object);
}
expected!(self, t!(":"));
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
let comma = self.eat(t!(","));
if !self.eat(t!("}")) {
@ -470,7 +488,7 @@ impl Parser<'_> {
return self
.parse_object_from_map(
ctx,
BTreeMap::from([(key, Value::Strand(strand)), (coord_key, value)]),
BTreeMap::from([(key, Value::Strand(strand.into())), (coord_key, value)]),
start,
)
.await
@ -480,7 +498,7 @@ impl Parser<'_> {
let Some(v) = capture(&value) else {
// failed to match the geometry value, just a plain object.
return Ok(Value::Object(Object(BTreeMap::from([
(key, Value::Strand(strand)),
(key, Value::Strand(strand.into())),
(coord_key, value),
]))));
};
@ -648,7 +666,7 @@ impl Parser<'_> {
/// Parses the key of an object, i.e. `field` in the object `{ field: 1 }`.
pub fn parse_object_key(&mut self) -> ParseResult<String> {
let token = self.peek();
let token = self.glue()?;
match token.kind {
TokenKind::Keyword(_)
| TokenKind::Language(_)
@ -661,14 +679,18 @@ impl Parser<'_> {
let str = std::str::from_utf8(str).unwrap().to_owned();
Ok(str)
}
TokenKind::Identifier | TokenKind::Strand => {
TokenKind::Identifier => {
self.pop_peek();
let str = self.lexer.string.take().unwrap();
Ok(str)
}
TokenKind::Number(_) => {
self.pop_peek();
Ok(self.lexer.string.take().unwrap())
t!("\"") | t!("'") | TokenKind::Strand => {
let str = self.next_token_value::<Strand>()?.0;
Ok(str)
}
TokenKind::Digits | TokenKind::Number(_) => {
let number = self.next_token_value::<Number>()?.to_string();
Ok(number)
}
x => unexpected!(self, x, "an object key"),
}

View file

@ -5,16 +5,15 @@ use super::{ParseResult, Parser};
use crate::{
enter_object_recursion, enter_query_recursion,
sql::{
Array, Dir, Function, Geometry, Ident, Idiom, Mock, Part, Script, Strand, Subquery, Table,
Value,
Array, Dir, Function, Geometry, Ident, Idiom, Mock, Number, Part, Script, Strand, Subquery,
Table, Value,
},
syn::{
lexer::Lexer,
parser::{
mac::{expected, unexpected},
ParseError, ParseErrorKind,
},
token::{t, NumberKind, Span, TokenKind},
token::{t, Span, TokenKind},
},
};
@ -24,14 +23,6 @@ impl Parser<'_> {
/// Parse the primary value of a `WHAT` clause; these are values which are more restricted in what expressions they can contain.
pub async fn parse_what_primary(&mut self, ctx: &mut Stk) -> ParseResult<Value> {
match self.peek_kind() {
TokenKind::Duration => {
let duration = self.next_token_value()?;
Ok(Value::Duration(duration))
}
TokenKind::DateTime => {
let datetime = self.next_token_value()?;
Ok(Value::Datetime(datetime))
}
t!("r\"") => {
self.pop_peek();
let thing = self.parse_record_string(ctx, true).await?;
@ -42,6 +33,14 @@ impl Parser<'_> {
let thing = self.parse_record_string(ctx, false).await?;
Ok(Value::Thing(thing))
}
t!("d\"") | t!("d'") => {
let datetime = self.next_token_value()?;
Ok(Value::Datetime(datetime))
}
t!("u\"") | t!("u'") => {
let uuid = self.next_token_value()?;
Ok(Value::Uuid(uuid))
}
t!("$param") => {
let param = self.next_token_value()?;
Ok(Value::Param(param))
@ -73,11 +72,7 @@ impl Parser<'_> {
let start = self.pop_peek().span;
self.parse_mock(start).map(Value::Mock)
}
t!("/") => {
let token = self.pop_peek();
let regex = self.lexer.relex_regex(token);
self.token_value(regex).map(Value::Regex)
}
t!("/") => self.next_token_value().map(Value::Regex),
t!("RETURN")
| t!("SELECT")
| t!("CREATE")
@ -93,15 +88,19 @@ impl Parser<'_> {
t!("fn") => self.parse_custom_function(ctx).await.map(|x| Value::Function(Box::new(x))),
t!("ml") => self.parse_model(ctx).await.map(|x| Value::Model(Box::new(x))),
x => {
if !self.peek_can_be_ident() {
if !self.peek_can_start_ident() {
unexpected!(self, x, "a value")
}
let token = self.next();
match self.peek_kind() {
t!("::") | t!("(") => self.parse_builtin(ctx, token.span).await,
let span = self.glue()?.span;
match self.peek_token_at(1).kind {
t!("::") | t!("(") => {
self.pop_peek();
self.parse_builtin(ctx, span).await
}
t!(":") => {
let str = self.token_value::<Ident>(token)?.0;
let str = self.next_token_value::<Ident>()?.0;
self.parse_thing_or_range(ctx, str).await
}
x => {
@ -110,7 +109,7 @@ impl Parser<'_> {
// always an invalid production so just return error.
unexpected!(self, x, "a value");
} else {
Ok(Value::Table(self.token_value(token)?))
Ok(Value::Table(self.next_token_value()?))
}
}
}
@ -118,6 +117,15 @@ impl Parser<'_> {
}
}
pub fn parse_number_like_prime(&mut self) -> ParseResult<Value> {
let token = self.glue_numeric()?;
match token.kind {
TokenKind::Number(_) => self.next_token_value().map(Value::Number),
TokenKind::Duration => self.next_token_value().map(Value::Duration),
x => unexpected!(self, x, "a value"),
}
}
/// Parse an expressions
pub async fn parse_idiom_expression(&mut self, ctx: &mut Stk) -> ParseResult<Value> {
let token = self.peek();
@ -147,35 +155,6 @@ impl Parser<'_> {
let block = self.parse_block(ctx, next).await?;
return Ok(Value::Future(Box::new(crate::sql::Future(block))));
}
TokenKind::Strand => {
self.pop_peek();
if self.legacy_strands {
return self.parse_legacy_strand(ctx).await;
} else {
let strand = self.token_value(token)?;
return Ok(Value::Strand(strand));
}
}
TokenKind::Duration => {
self.pop_peek();
let duration = self.token_value(token)?;
Value::Duration(duration)
}
TokenKind::Number(_) => {
self.pop_peek();
let number = self.token_value(token)?;
Value::Number(number)
}
TokenKind::Uuid => {
self.pop_peek();
let uuid = self.token_value(token)?;
Value::Uuid(uuid)
}
TokenKind::DateTime => {
self.pop_peek();
let datetime = self.token_value(token)?;
Value::Datetime(datetime)
}
t!("r\"") => {
self.pop_peek();
let thing = self.parse_record_string(ctx, true).await?;
@ -186,9 +165,32 @@ impl Parser<'_> {
let thing = self.parse_record_string(ctx, false).await?;
Value::Thing(thing)
}
t!("$param") => {
t!("d\"") | t!("d'") => {
let datetime = self.next_token_value()?;
Value::Datetime(datetime)
}
t!("u\"") | t!("u'") => {
let uuid = self.next_token_value()?;
Value::Uuid(uuid)
}
t!("'") | t!("\"") | TokenKind::Strand => {
let s = self.next_token_value::<Strand>()?;
if self.legacy_strands {
if let Some(x) = self.reparse_legacy_strand(ctx, &s.0).await {
return Ok(x);
}
}
Value::Strand(s)
}
t!("+") | t!("-") | TokenKind::Number(_) | TokenKind::Digits | TokenKind::Duration => {
self.parse_number_like_prime()?
}
TokenKind::NaN => {
self.pop_peek();
let param = self.token_value(token)?;
return Ok(Value::Number(Number::Float(f64::NAN)));
}
t!("$param") => {
let param = self.next_token_value()?;
Value::Param(param)
}
t!("FUNCTION") => {
@ -234,11 +236,7 @@ impl Parser<'_> {
self.pop_peek();
self.parse_inner_subquery_or_coordinate(ctx, token.span).await?
}
t!("/") => {
self.pop_peek();
let regex = self.lexer.relex_regex(token);
self.token_value(regex).map(Value::Regex)?
}
t!("/") => self.next_token_value().map(Value::Regex)?,
t!("RETURN")
| t!("SELECT")
| t!("CREATE")
@ -260,20 +258,24 @@ impl Parser<'_> {
self.parse_model(ctx).await.map(|x| Value::Model(Box::new(x)))?
}
_ => {
self.pop_peek();
match self.peek_kind() {
t!("::") | t!("(") => self.parse_builtin(ctx, token.span).await?,
self.glue()?;
match self.peek_token_at(1).kind {
t!("::") | t!("(") => {
self.pop_peek();
self.parse_builtin(ctx, token.span).await?
}
t!(":") => {
let str = self.token_value::<Ident>(token)?.0;
let str = self.next_token_value::<Ident>()?.0;
self.parse_thing_or_range(ctx, str).await?
}
x => {
if x.has_data() {
unexpected!(self, x, "a value");
} else if self.table_as_field {
Value::Idiom(Idiom(vec![Part::Field(self.token_value(token)?)]))
Value::Idiom(Idiom(vec![Part::Field(self.next_token_value()?)]))
} else {
Value::Table(self.token_value(token)?)
Value::Table(self.next_token_value()?)
}
}
}
@ -426,69 +428,46 @@ impl Parser<'_> {
let stmt = self.parse_rebuild_stmt()?;
Subquery::Rebuild(stmt)
}
t!("+") | t!("-") => {
// handle possible coordinate in the shape of ([-+]?number,[-+]?number)
if let TokenKind::Number(kind) = self.peek_token_at(1).kind {
// take the value so we don't overwrite it if the next token happens to be a
// strand or an ident, both of which would be invalid syntax here.
let number_value = self.lexer.string.take().unwrap();
if self.peek_token_at(2).kind == t!(",") {
match kind {
NumberKind::Decimal | NumberKind::NaN => {
return Err(ParseError::new(
ParseErrorKind::UnexpectedExplain {
found: TokenKind::Number(kind),
expected: "a non-decimal, non-nan number",
explain: "coordinate numbers can't be NaN or a decimal",
},
peek.span,
));
}
_ => {}
}
TokenKind::Digits | TokenKind::Number(_) | t!("+") | t!("-") => {
let number_token = self.glue()?;
if matches!(self.peek_kind(), TokenKind::Number(_))
&& self.peek_token_at(1).kind == t!(",")
{
let number = self.next_token_value::<Number>()?;
// eat ','
self.next();
self.lexer.string = Some(number_value);
let a = self.parse_signed_float()?;
self.next();
let b = self.parse_signed_float()?;
self.expect_closing_delimiter(t!(")"), start)?;
return Ok(Value::Geometry(Geometry::Point(Point::from((a, b)))));
}
self.lexer.string = Some(number_value);
}
Subquery::Value(ctx.run(|ctx| self.parse_value_field(ctx)).await?)
}
TokenKind::Number(kind) => {
// handle possible coordinate in the shape of ([-+]?number,[-+]?number)
// take the value so we don't overwrite it if the next token happens to be a
// strand or an ident, both of which would be invalid syntax here.
let number_value = self.lexer.string.take().unwrap();
if self.peek_token_at(1).kind == t!(",") {
match kind {
NumberKind::Decimal | NumberKind::NaN => {
match number {
Number::Decimal(_) => {
return Err(ParseError::new(
ParseErrorKind::UnexpectedExplain {
found: TokenKind::Number(kind),
found: TokenKind::Digits,
expected: "a non-decimal, non-nan number",
explain: "coordinate numbers can't be NaN or a decimal",
},
peek.span,
number_token.span,
));
}
Number::Float(x) if x.is_nan() => {
return Err(ParseError::new(
ParseErrorKind::UnexpectedExplain {
found: TokenKind::Digits,
expected: "a non-decimal, non-nan number",
explain: "coordinate numbers can't be NaN or a decimal",
},
number_token.span,
));
}
_ => {}
}
self.pop_peek();
// was a comma, put the strand back for code reuse.
self.lexer.string = Some(number_value);
let a = self.token_value::<f64>(peek)?;
// eat the comma.
self.next();
let b = self.parse_signed_float()?;
let x = number.as_float();
let y = self.next_token_value::<f64>()?;
self.expect_closing_delimiter(t!(")"), start)?;
return Ok(Value::Geometry(Geometry::Point(Point::from((a, b)))));
return Ok(Value::Geometry(Geometry::Point(Point::from((x, y)))));
} else {
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
Subquery::Value(value)
}
self.lexer.string = Some(number_value);
Subquery::Value(ctx.run(|ctx| self.parse_value_field(ctx)).await?)
}
_ => {
let value = ctx.run(|ctx| self.parse_value_field(ctx)).await?;
@ -628,18 +607,17 @@ impl Parser<'_> {
/// Parses a strand with legacy rules, parsing to a record id, datetime or uuid if the string
/// matches.
pub async fn parse_legacy_strand(&mut self, ctx: &mut Stk) -> ParseResult<Value> {
let text = self.lexer.string.take().unwrap();
pub async fn reparse_legacy_strand(&mut self, ctx: &mut Stk, text: &str) -> Option<Value> {
if let Ok(x) = Parser::new(text.as_bytes()).parse_thing(ctx).await {
return Ok(Value::Thing(x));
return Some(Value::Thing(x));
}
if let Ok(x) = Lexer::new(text.as_bytes()).lex_only_datetime() {
return Ok(Value::Datetime(x));
if let Ok(x) = Parser::new(text.as_bytes()).next_token_value() {
return Some(Value::Datetime(x));
}
if let Ok(x) = Lexer::new(text.as_bytes()).lex_only_uuid() {
return Ok(Value::Uuid(x));
if let Ok(x) = Parser::new(text.as_bytes()).next_token_value() {
return Some(Value::Uuid(x));
}
Ok(Value::Strand(Strand(text)))
None
}
async fn parse_script(&mut self, ctx: &mut Stk) -> ParseResult<Function> {
@ -741,7 +719,7 @@ mod tests {
fn regex_complex() {
let sql = r"/(?i)test\/[a-z]+\/\s\d\w{1}.*/";
let out = Value::parse(sql);
assert_eq!(r"/(?i)test/[a-z]+/\s\d\w{1}.*/", format!("{}", out));
assert_eq!(r"/(?i)test\/[a-z]+\/\s\d\w{1}.*/", format!("{}", out));
let Value::Regex(regex) = out else {
panic!()
};

View file

@ -15,22 +15,20 @@ impl Parser<'_> {
) -> ParseResult<InsertStatement> {
let relation = self.eat(t!("RELATION"));
let ignore = self.eat(t!("IGNORE"));
let into = match self.eat(t!("INTO")) {
false => None,
true => {
let next = self.next();
// TODO: Explain that more complicated expressions are not allowed here.
Some(match next.kind {
t!("$param") => {
let param = self.token_value(next)?;
Value::Param(param)
}
_ => {
let table = self.token_value(next)?;
Value::Table(table)
}
})
}
let into = if self.eat(t!("INTO")) {
let r = match self.peek().kind {
t!("$param") => {
let param = self.next_token_value()?;
Value::Param(param)
}
_ => {
let table = self.next_token_value()?;
Value::Table(table)
}
};
Some(r)
} else {
None
};
let data = match self.peek_kind() {

View file

@ -476,12 +476,8 @@ impl Parser<'_> {
/// Expects `KILL` to already be consumed.
pub(crate) fn parse_kill_stmt(&mut self) -> ParseResult<KillStatement> {
let id = match self.peek_kind() {
TokenKind::Uuid => self.next_token_value().map(Value::Uuid)?,
t!("$param") => {
let token = self.pop_peek();
let param = self.token_value(token)?;
Value::Param(param)
}
t!("u\"") | t!("u'") => self.next_token_value().map(Value::Uuid)?,
t!("$param") => self.next_token_value().map(Value::Param)?,
x => unexpected!(self, x, "a UUID or a parameter"),
};
Ok(KillStatement {
@ -614,10 +610,12 @@ impl Parser<'_> {
expected!(self, t!("SINCE"));
let next = self.next();
let next = self.peek();
let since = match next.kind {
TokenKind::Number(_) => ShowSince::Versionstamp(self.token_value(next)?),
TokenKind::DateTime => ShowSince::Timestamp(self.token_value(next)?),
TokenKind::Digits | TokenKind::Number(_) => {
ShowSince::Versionstamp(self.next_token_value()?)
}
t!("d\"") | t!("d'") => ShowSince::Timestamp(self.next_token_value()?),
x => unexpected!(self, x, "a version stamp or a date-time"),
};

View file

@ -4,9 +4,9 @@ use reblessive::Stk;
use crate::{
sql::{
change_feed_include::ChangeFeedInclude, changefeed::ChangeFeed, index::Distance,
index::VectorType, Base, Cond, Data, Duration, Fetch, Fetchs, Field, Fields, Group, Groups,
Ident, Idiom, Output, Permission, Permissions, Tables, Timeout, Value, View,
changefeed::ChangeFeed, index::Distance, index::VectorType, Base, Cond, Data, Duration,
Fetch, Fetchs, Field, Fields, Group, Groups, Ident, Idiom, Output, Permission, Permissions,
Tables, Timeout, Value, View,
},
syn::{
parser::{
@ -343,7 +343,7 @@ impl Parser<'_> {
pub fn parse_changefeed(&mut self) -> ParseResult<ChangeFeed> {
let expiry = self.next_token_value::<Duration>()?.0;
let store_diff = if self.eat(t!("INCLUDE")) {
expected!(self, TokenKind::ChangeFeedInclude(ChangeFeedInclude::Original));
expected!(self, t!("ORIGINAL"));
true
} else {
false

View file

@ -107,6 +107,7 @@ impl Parser<'_> {
}
pub async fn parse_thing_or_table(&mut self, ctx: &mut Stk) -> ParseResult<Value> {
self.glue()?;
if self.peek_token_at(1).kind == t!(":") {
self.parse_thing(ctx).await.map(Value::Thing)
} else {

View file

@ -2,14 +2,13 @@ use reblessive::Stk;
use super::{ParseResult, Parser};
use crate::{
enter_flexible_ident,
sql::{id::Gen, Id, Ident, Range, Thing, Value},
syn::{
parser::{
mac::{expected, unexpected},
mac::{expected, expected_whitespace, unexpected},
ParseError, ParseErrorKind,
},
token::{t, NumberKind, TokenKind},
token::{t, TokenKind},
},
};
use std::{cmp::Ordering, ops::Bound};
@ -17,33 +16,20 @@ use std::{cmp::Ordering, ops::Bound};
impl Parser<'_> {
pub async fn parse_record_string(&mut self, ctx: &mut Stk, double: bool) -> ParseResult<Thing> {
let thing = self.parse_thing(ctx).await?;
// can't have any tokens in the buffer, since the next token must be produced by a specific
// call.
debug_assert_eq!(self.token_buffer.len(), 0);
// manually handle the trailing `"`.
let token = self.lexer.lex_record_string_close();
if token.kind == TokenKind::Invalid {
return Err(ParseError::new(
ParseErrorKind::InvalidToken(self.lexer.error.take().unwrap()),
token.span,
));
}
if token.kind == t!("'r") && double {
unexpected!(self, token.kind, "a single quote")
}
if token.kind == t!("\"r") && !double {
unexpected!(self, token.kind, "a double quote")
}
debug_assert!(matches!(token.kind, TokenKind::CloseRecordString { .. }));
debug_assert!(self.last_span().is_followed_by(&self.peek_whitespace().span));
if double {
expected_whitespace!(self, t!("\""));
} else {
expected_whitespace!(self, t!("'"));
};
Ok(thing)
}
fn peek_can_start_id(&mut self) -> bool {
self.peek_can_be_ident()
|| matches!(
self.peek_kind(),
TokenKind::Number(_) | t!("{") | t!("[") | TokenKind::Duration
)
fn kind_cast_start_id(kind: TokenKind) -> bool {
Self::tokenkind_can_start_ident(kind)
|| matches!(kind, TokenKind::Digits | t!("{") | t!("[") | t!("+") | t!("-"))
}
pub async fn parse_thing_or_range(
@ -51,157 +37,133 @@ impl Parser<'_> {
stk: &mut Stk,
ident: String,
) -> ParseResult<Value> {
expected!(self, t!(":"));
expected_whitespace!(self, t!(":"));
enter_flexible_ident!(this = self =>(self.flexible_record_id){
this.peek();
this.no_whitespace()?;
if this.eat(t!("..")) {
let end = if this.eat(t!("=")) {
this.no_whitespace()?;
let id = stk.run(|stk| this.parse_id(stk)).await?;
Bound::Included(id)
} else if this.peek_can_start_id() {
this.no_whitespace()?;
let id = stk.run(|stk| this.parse_id(stk)).await?;
Bound::Excluded(id)
} else {
Bound::Unbounded
};
return Ok(Value::Range(Box::new(Range {
tb: ident,
beg: Bound::Unbounded,
end,
})));
}
let beg = if this.peek_can_start_id(){
let id = stk.run(|ctx| this.parse_id(ctx)).await?;
if this.eat(t!(">")) {
this.no_whitespace()?;
Bound::Excluded(id)
} else {
Bound::Included(id)
}
} else {
Bound::Unbounded
};
if this.eat(t!("..")) {
let end = if this.eat(t!("=")) {
this.no_whitespace()?;
let id = stk.run(|ctx| this.parse_id(ctx)).await?;
Bound::Included(id)
} else if this.peek_can_start_id(){
this.no_whitespace()?;
let id = stk.run(|ctx| this.parse_id(ctx)).await?;
Bound::Excluded(id)
} else {
Bound::Unbounded
};
Ok(Value::Range(Box::new(Range {
tb: ident,
beg,
end,
})))
// If the id part starts with a range operator, this is a range with no start bound
if self.eat_whitespace(t!("..")) {
// Check for inclusive
let end = if self.eat_whitespace(t!("=")) {
let id = stk.run(|stk| self.parse_id(stk)).await?;
Bound::Included(id)
} else if Self::kind_cast_start_id(self.peek_whitespace().kind) {
let id = stk.run(|stk| self.parse_id(stk)).await?;
Bound::Excluded(id)
} else {
let id = match beg {
Bound::Unbounded => {
if this.peek_kind() == t!("$param") {
return Err(ParseError::new(
Bound::Unbounded
};
return Ok(Value::Range(Box::new(Range {
tb: ident,
beg: Bound::Unbounded,
end,
})));
}
// Didn't eat range yet so we need to parse the id.
let beg = if Self::kind_cast_start_id(self.peek_whitespace().kind) {
let id = stk.run(|ctx| self.parse_id(ctx)).await?;
// check for exclusive
if self.eat_whitespace(t!(">")) {
Bound::Excluded(id)
} else {
Bound::Included(id)
}
} else {
Bound::Unbounded
};
// Check if this is actually a range.
// If we already ate the exclusive it must be a range.
if self.eat_whitespace(t!("..")) {
let end = if self.eat_whitespace(t!("=")) {
let id = stk.run(|ctx| self.parse_id(ctx)).await?;
Bound::Included(id)
} else if Self::kind_cast_start_id(self.peek_whitespace().kind) {
let id = stk.run(|ctx| self.parse_id(ctx)).await?;
Bound::Excluded(id)
} else {
Bound::Unbounded
};
Ok(Value::Range(Box::new(Range {
tb: ident,
beg,
end,
})))
} else {
let id = match beg {
Bound::Unbounded => {
if self.peek_whitespace().kind == t!("$param") {
return Err(ParseError::new(
ParseErrorKind::UnexpectedExplain {
found: t!("$param"),
expected: "a record-id id",
explain: "you can create a record-id from a param with the function 'type::thing'",
},
this.recent_span(),
self.recent_span(),
));
}
}
// we haven't matched anything so far so we still want any type of id.
unexpected!(this, this.peek_kind(), "a record-id id")
}
Bound::Excluded(_) => {
// we have matched a bounded id but we don't see a range operator.
unexpected!(this, this.peek_kind(), "the range operator `..`")
}
Bound::Included(id) => id,
};
Ok(Value::Thing(Thing {
tb: ident,
id,
}))
}
})
// we haven't matched anything so far so we still want any type of id.
unexpected!(self, self.peek_whitespace().kind, "a record-id id")
}
Bound::Excluded(_) => {
// we have matched a bounded id but we don't see a range operator.
unexpected!(self, self.peek_whitespace().kind, "the range operator `..`")
}
Bound::Included(id) => id,
};
Ok(Value::Thing(Thing {
tb: ident,
id,
}))
}
}
/// Parse a range
pub async fn parse_range(&mut self, ctx: &mut Stk) -> ParseResult<Range> {
let tb = self.next_token_value::<Ident>()?.0;
expected!(self, t!(":"));
expected_whitespace!(self, t!(":"));
enter_flexible_ident!(this = self =>(self.flexible_record_id){
this.peek();
this.no_whitespace()?;
// Check for beginning id
let beg = if Self::tokenkind_can_start_ident(self.peek_whitespace().kind) {
let id = ctx.run(|ctx| self.parse_id(ctx)).await?;
let beg = if this.peek_can_be_ident() {
this.peek();
this.no_whitespace()?;
let id = ctx.run(|ctx| this.parse_id(ctx)).await?;
this.peek();
this.no_whitespace()?;
if this.eat(t!(">")) {
Bound::Excluded(id)
} else {
Bound::Included(id)
}
if self.eat_whitespace(t!(">")) {
Bound::Excluded(id)
} else {
Bound::Unbounded
};
Bound::Included(id)
}
} else {
Bound::Unbounded
};
this.peek();
this.no_whitespace()?;
expected_whitespace!(self, t!(".."));
expected!(this, t!(".."));
let inclusive = self.eat_whitespace(t!("="));
this.peek();
this.no_whitespace()?;
let inclusive = this.eat(t!("="));
this.peek();
this.no_whitespace()?;
let end = if this.peek_can_be_ident() {
let id = ctx.run(|ctx| this.parse_id(ctx)).await?;
if inclusive {
Bound::Included(id)
} else {
Bound::Excluded(id)
}
// parse ending id.
let end = if Self::tokenkind_can_start_ident(self.peek_whitespace().kind) {
let id = ctx.run(|ctx| self.parse_id(ctx)).await?;
if inclusive {
Bound::Included(id)
} else {
Bound::Unbounded
};
Bound::Excluded(id)
}
} else {
Bound::Unbounded
};
Ok(Range {
tb,
beg,
end,
})
Ok(Range {
tb,
beg,
end,
})
}
pub async fn parse_thing(&mut self, ctx: &mut Stk) -> ParseResult<Thing> {
let ident = self.next_token_value::<Ident>()?.0;
enter_flexible_ident!(this = self =>(self.flexible_record_id){
this.parse_thing_from_ident(ctx, ident).await
})
self.parse_thing_from_ident(ctx, ident).await
}
pub async fn parse_thing_from_ident(
@ -211,12 +173,7 @@ impl Parser<'_> {
) -> ParseResult<Thing> {
expected!(self, t!(":"));
let id = enter_flexible_ident!(this = self =>(self.flexible_record_id){
this.peek();
this.no_whitespace()?;
ctx.run(|ctx| this.parse_id(ctx)).await
})?;
let id = ctx.run(|ctx| self.parse_id(ctx)).await?;
Ok(Thing {
tb: ident,
@ -225,87 +182,110 @@ impl Parser<'_> {
}
pub async fn parse_id(&mut self, stk: &mut Stk) -> ParseResult<Id> {
let token = self.next();
let token = self.peek_whitespace();
match token.kind {
t!("{") => {
let object = enter_flexible_ident!(this = self => (false){
this.parse_object(stk, token.span).await
})?;
self.pop_peek();
// object record id
let object = self.parse_object(stk, token.span).await?;
Ok(Id::Object(object))
}
t!("[") => {
let array = enter_flexible_ident!(this = self => (false){
this.parse_array(stk, token.span).await
})?;
self.pop_peek();
// array record id
let array = self.parse_array(stk, token.span).await?;
Ok(Id::Array(array))
}
t!("+") => {
self.peek();
self.no_whitespace()?;
expected!(self, TokenKind::Number(NumberKind::Integer));
let text = self.lexer.string.take().unwrap();
if let Ok(number) = text.parse() {
self.pop_peek();
// starting with a + so it must be a number
let digits_token = self.peek_whitespace();
match digits_token.kind {
TokenKind::Digits => {}
x => unexpected!(self, x, "an integer"),
}
let next = self.peek_whitespace();
match next.kind {
t!(".") | TokenKind::Exponent | TokenKind::NumberSuffix(_) => {
// TODO(delskayn) explain that record-ids can't have mantissas,
// exponents or a number suffix
unexpected!(self, next.kind, "an integer");
}
x if Self::tokenkind_continues_ident(x) => {
let span = token.span.covers(next.span);
unexpected!(@span, self, x, "an integer");
}
// allowed
_ => {}
}
let digits_str = self.span_str(digits_token.span);
if let Ok(number) = digits_str.parse() {
Ok(Id::Number(number))
} else {
Ok(Id::String(text))
Ok(Id::String(digits_str.to_owned()))
}
}
t!("-") => {
self.peek();
self.no_whitespace()?;
expected!(self, TokenKind::Number(NumberKind::Integer));
let text = self.lexer.string.take().unwrap();
if let Ok(number) = text.parse::<u64>() {
self.pop_peek();
// starting with a - so it must be a number
let digits_token = self.peek_whitespace();
match digits_token.kind {
TokenKind::Digits => {}
x => unexpected!(self, x, "an integer"),
}
let next = self.peek_whitespace();
match next.kind {
t!(".") | TokenKind::Exponent | TokenKind::NumberSuffix(_) => {
// TODO(delskayn) explain that record-ids can't have mantissas,
// exponents or a number suffix
unexpected!(self, next.kind, "an integer");
}
x if Self::tokenkind_continues_ident(x) => {
let span = token.span.covers(next.span);
unexpected!(@span, self, x, "an integer");
}
// allowed
_ => {}
}
let digits_str = self.span_str(digits_token.span);
if let Ok(number) = digits_str.parse::<u64>() {
// Parse to u64 and check if the value is equal to `-i64::MIN` via u64 as
// `-i64::MIN` doesn't fit in an i64
match number.cmp(&((i64::MAX as u64) + 1)) {
Ordering::Less => Ok(Id::Number(-(number as i64))),
Ordering::Equal => Ok(Id::Number(i64::MIN)),
Ordering::Greater => Ok(Id::String(format!("-{}", text))),
Ordering::Greater => Ok(Id::String(format!("-{}", digits_str))),
}
} else {
Ok(Id::String(text))
Ok(Id::String(format!("-{}", digits_str)))
}
}
TokenKind::Number(NumberKind::Integer) => {
// Ids handle numbers more loosely than other parts of the code.
// If the number can't fit in an i64 it will instead be parsed as a string.
let text = self.lexer.string.take().unwrap();
if let Ok(number) = text.parse() {
TokenKind::Digits => {
let next = self.peek_whitespace_token_at(1);
if Self::tokenkind_can_start_ident(next.kind) {
let glued = self.glue_ident(self.flexible_record_id)?;
if let TokenKind::Identifier = glued.kind {
self.pop_peek();
return Ok(Id::String(self.lexer.string.take().unwrap()));
} else {
unexpected!(self, glued.kind, "a record-id id")
}
}
self.pop_peek();
let digits_str = self.span_str(token.span);
if let Ok(number) = digits_str.parse::<i64>() {
Ok(Id::Number(number))
} else {
Ok(Id::String(text))
Ok(Id::String(digits_str.to_owned()))
}
}
TokenKind::Number(NumberKind::Exponent) if self.flexible_record_id => {
let text = self.lexer.string.take().unwrap();
if text.bytes().any(|x| !x.is_ascii_alphanumeric()) {
unexpected!(self, token.kind, "a identifier");
}
Ok(Id::String(text))
}
TokenKind::Number(NumberKind::Decimal) if self.flexible_record_id => {
let mut text = self.lexer.string.take().unwrap();
text.push('d');
text.push('e');
text.push('c');
Ok(Id::String(text))
}
TokenKind::Number(NumberKind::DecimalExponent) if self.flexible_record_id => {
let mut text = self.lexer.string.take().unwrap();
if text.bytes().any(|x| !x.is_ascii_alphanumeric()) {
unexpected!(self, token.kind, "a identifier");
}
text.push('d');
text.push('e');
text.push('c');
Ok(Id::String(text))
}
TokenKind::Number(NumberKind::Float) if self.flexible_record_id => {
let mut text = self.lexer.string.take().unwrap();
text.push('f');
Ok(Id::String(text))
}
TokenKind::Duration if self.flexible_record_id => {
self.lexer.duration = None;
let slice = self.lexer.reader.span(token.span);
@ -317,23 +297,27 @@ impl Parser<'_> {
Ok(Id::String(text))
}
t!("ULID") => {
self.pop_peek();
// TODO: error message about how to use `ulid` as an identifier.
expected!(self, t!("("));
expected!(self, t!(")"));
Ok(Id::Generate(Gen::Ulid))
}
t!("UUID") => {
self.pop_peek();
expected!(self, t!("("));
expected!(self, t!(")"));
Ok(Id::Generate(Gen::Uuid))
}
t!("RAND") => {
self.pop_peek();
expected!(self, t!("("));
expected!(self, t!(")"));
Ok(Id::Generate(Gen::Rand))
}
_ => {
let ident = self.token_value::<Ident>(token)?.0;
self.glue_ident(self.flexible_record_id)?;
let ident = self.next_token_value::<Ident>()?.0;
Ok(Id::String(ident))
}
}
@ -582,5 +566,18 @@ mod tests {
assert_ident_parses_correctly("1ns1h");
assert_ident_parses_correctly("000e8");
assert_ident_parses_correctly("000e8bla");
assert_ident_parses_correctly("y123");
assert_ident_parses_correctly("w123");
assert_ident_parses_correctly("d123");
assert_ident_parses_correctly("h123");
assert_ident_parses_correctly("m123");
assert_ident_parses_correctly("s123");
assert_ident_parses_correctly("ms123");
assert_ident_parses_correctly("us123");
assert_ident_parses_correctly("ns123");
assert_ident_parses_correctly("dec123");
assert_ident_parses_correctly("f123");
assert_ident_parses_correctly("e123");
}
}

View file

@ -0,0 +1,527 @@
//! Implements token gluing logic.
use crate::{
sql::duration::{
SECONDS_PER_DAY, SECONDS_PER_HOUR, SECONDS_PER_MINUTE, SECONDS_PER_WEEK, SECONDS_PER_YEAR,
},
syn::{
parser::{mac::unexpected, ParseError, ParseErrorKind, ParseResult, Parser},
token::{t, DurationSuffix, NumberKind, NumberSuffix, Token, TokenKind},
},
};
use std::time::Duration as StdDuration;
impl Parser<'_> {
/// Returns if a token kind can start an identifier.
pub fn tokenkind_can_start_ident(t: TokenKind) -> bool {
matches!(
t,
TokenKind::Keyword(_)
| TokenKind::Language(_)
| TokenKind::Algorithm(_)
| TokenKind::Distance(_)
| TokenKind::VectorType(_)
| TokenKind::Identifier
| TokenKind::Exponent
| TokenKind::DatetimeChars(_)
| TokenKind::NumberSuffix(_)
| TokenKind::DurationSuffix(
// All except the unicode micro suffix
DurationSuffix::Nano
| DurationSuffix::Micro | DurationSuffix::Milli
| DurationSuffix::Second | DurationSuffix::Minute
| DurationSuffix::Hour | DurationSuffix::Day
| DurationSuffix::Week | DurationSuffix::Year
)
)
}
/// Returns if a token kind can continue an identifier.
pub fn tokenkind_continues_ident(t: TokenKind) -> bool {
matches!(
t,
TokenKind::Keyword(_)
| TokenKind::Language(_)
| TokenKind::Algorithm(_)
| TokenKind::Distance(_)
| TokenKind::VectorType(_)
| TokenKind::Identifier
| TokenKind::DatetimeChars(_)
| TokenKind::Exponent
| TokenKind::NumberSuffix(_)
| TokenKind::NaN | TokenKind::DurationSuffix(
// All except the unicode micro suffix
DurationSuffix::Nano
| DurationSuffix::Micro
| DurationSuffix::Milli
| DurationSuffix::Second
| DurationSuffix::Minute
| DurationSuffix::Hour
| DurationSuffix::Day
| DurationSuffix::Week
)
)
}
/// Returns if the peeked token can start an identifier.
pub fn peek_can_start_ident(&mut self) -> bool {
Self::tokenkind_can_start_ident(self.peek_kind())
}
/// Returns if the peeked token can continue an identifier.
pub fn peek_continues_ident(&mut self) -> bool {
Self::tokenkind_continues_ident(self.peek_kind())
}
/// Glue a token and immediately consume it.
pub fn glue_next(&mut self) -> ParseResult<Token> {
self.glue()?;
Ok(self.next())
}
/// Glues the next tokens together, returning the resulting token without consuming it.
pub fn glue(&mut self) -> ParseResult<Token> {
let token = self.peek();
match token.kind {
TokenKind::Exponent
| TokenKind::NumberSuffix(_)
| TokenKind::DurationSuffix(_)
| TokenKind::DatetimeChars(_) => self.glue_ident(false),
TokenKind::Digits => self.glue_numeric(),
t!("\"") | t!("'") => {
self.pop_peek();
let t = self.lexer.relex_strand(token);
let TokenKind::Strand = t.kind else {
unexpected!(self, t.kind, "a strand")
};
self.prepend_token(t);
Ok(t)
}
t!("+") | t!("-") => {
if let TokenKind::Digits = self.peek_whitespace_token_at(1).kind {
self.glue_number()
} else {
Ok(token)
}
}
_ => Ok(token),
}
}
/// Glues all consecutive tokens which can make up an ident into a single string.
pub fn glue_ident(&mut self, flexible: bool) -> ParseResult<Token> {
let start = self.peek();
let mut token_buffer = match start.kind {
TokenKind::Exponent | TokenKind::NumberSuffix(_) => {
self.pop_peek();
self.span_str(start.span).to_owned()
}
TokenKind::Digits if flexible => {
self.pop_peek();
self.span_str(start.span).to_owned()
}
TokenKind::DurationSuffix(x) if x.can_be_ident() => {
self.pop_peek();
self.span_str(start.span).to_owned()
}
_ => return Ok(start),
};
debug_assert!(
start.is_followed_by(&self.peek_whitespace()),
"a whitespace token was eaten where eating it would disturb parsing\n {:?}@{:?} => {:?}@{:?}",
start.kind,
start.span,
self.peek_whitespace().kind,
self.peek_whitespace().span
);
let mut prev = start;
loop {
let p = self.peek_whitespace();
match p.kind {
// These token_kinds always complete an ident, no more identifier parts can happen
// after this.
TokenKind::Identifier => {
self.pop_peek();
let buffer = self.lexer.string.take().unwrap();
token_buffer.push_str(&buffer);
prev = p;
break;
}
TokenKind::Keyword(_)
| TokenKind::Language(_)
| TokenKind::Algorithm(_)
| TokenKind::Distance(_)
| TokenKind::VectorType(_)
| TokenKind::NumberSuffix(_) => {
self.pop_peek();
let str = self.span_str(p.span);
token_buffer.push_str(str);
prev = p;
break;
}
// These tokens might have some more parts following them
TokenKind::Exponent => {
self.pop_peek();
let str = self.span_str(p.span);
token_buffer.push_str(str);
prev = p;
}
TokenKind::DurationSuffix(suffix) => {
self.pop_peek();
if !suffix.can_be_ident() {
return Err(ParseError::new(ParseErrorKind::InvalidIdent, p.span));
}
token_buffer.push_str(suffix.as_str());
prev = p;
}
TokenKind::Digits => {
self.pop_peek();
let str = self.span_str(p.span);
token_buffer.push_str(str);
prev = p;
}
_ => break,
}
}
let token = Token {
kind: TokenKind::Identifier,
span: start.span.covers(prev.span),
};
self.lexer.string = Some(token_buffer);
self.prepend_token(token);
Ok(token)
}
pub fn glue_numeric(&mut self) -> ParseResult<Token> {
let peek = self.peek();
match peek.kind {
TokenKind::Digits => {
if matches!(self.peek_whitespace_token_at(1).kind, TokenKind::DurationSuffix(_)) {
return self.glue_duration();
}
self.glue_number()
}
t!("+") | t!("-") => self.glue_number(),
_ => Ok(peek),
}
}
pub fn glue_number(&mut self) -> ParseResult<Token> {
let start = self.peek();
match start.kind {
t!("+") | t!("-") => {
self.pop_peek();
debug_assert!(
start.is_followed_by(&self.peek_whitespace()),
"a whitespace token was eaten where eating it would disturb parsing\n {:?}@{:?} => {:?}@{:?}",
start.kind,
start.span,
self.peek_whitespace().kind,
self.peek_whitespace().span
);
let n = self.peek_whitespace();
if n.kind != TokenKind::Digits {
unexpected!(self, start.kind, "a number")
}
self.pop_peek();
}
TokenKind::Digits => {
self.pop_peek();
debug_assert!(
start.is_followed_by(&self.peek_whitespace()),
"a whitespace token was eaten where eating it would disturb parsing\n {:?}@{:?} => {:?}@{:?}",
start.kind,
start.span,
self.peek_whitespace().kind,
self.peek_whitespace().span
);
}
_ => return Ok(start),
};
let mut kind = NumberKind::Integer;
// Check for mantissa
if let t!(".") = self.peek_whitespace().kind {
self.pop_peek();
let next = self.peek_whitespace();
if next.kind != TokenKind::Digits {
unexpected!(self, next.kind, "digits after the dot");
}
self.pop_peek();
kind = NumberKind::Float;
}
// Check for exponent
if let TokenKind::Exponent = self.peek_whitespace().kind {
self.pop_peek();
let exponent_token = self.peek_whitespace();
match exponent_token.kind {
t!("+") | t!("-") => {
self.pop_peek();
let exponent_token = self.peek_whitespace();
if exponent_token.kind != TokenKind::Digits {
unexpected!(self, exponent_token.kind, "digits after the exponent")
}
}
TokenKind::Digits => {}
x => unexpected!(self, x, "digits after the exponent"),
}
self.pop_peek();
kind = NumberKind::Float;
}
// Check for number suffix
let suffix_token = self.peek_whitespace();
if let TokenKind::NumberSuffix(suffix) = suffix_token.kind {
self.pop_peek();
match suffix {
NumberSuffix::Float => {
kind = NumberKind::Float;
}
NumberSuffix::Decimal => {
kind = NumberKind::Decimal;
}
}
}
// Check that no ident-like tokens follow
let next = self.peek_whitespace();
if Self::tokenkind_continues_ident(next.kind) {
unexpected!(self, next.kind, "number to end")
}
let token = Token {
kind: TokenKind::Number(kind),
span: start.span.covers(self.last_span()),
};
self.prepend_token(token);
Ok(token)
}
pub fn glue_duration(&mut self) -> ParseResult<Token> {
let mut duration = StdDuration::ZERO;
let start = self.peek();
match start.kind {
TokenKind::Digits => {
self.pop_peek();
}
_ => return Ok(start),
};
debug_assert!(
start.is_followed_by(&self.peek_whitespace()),
"a whitespace token was eaten where eating it would disturb parsing"
);
let mut cur = start;
loop {
let p = self.peek_whitespace();
let suffix = match p.kind {
TokenKind::DurationSuffix(x) => x,
x => unexpected!(self, x, "a duration suffix"),
};
self.pop_peek();
let digits_str = self.span_str(cur.span);
let digits_value: u64 = digits_str
.parse()
.map_err(ParseErrorKind::InvalidInteger)
.map_err(|e| ParseError::new(e, p.span))?;
let addition = match suffix {
DurationSuffix::Nano => StdDuration::from_nanos(digits_value),
DurationSuffix::Micro | DurationSuffix::MicroUnicode => {
StdDuration::from_micros(digits_value)
}
DurationSuffix::Milli => StdDuration::from_millis(digits_value),
DurationSuffix::Second => StdDuration::from_secs(digits_value),
DurationSuffix::Minute => {
let minutes =
digits_value.checked_mul(SECONDS_PER_MINUTE).ok_or_else(|| {
let span = start.span.covers(p.span);
ParseError::new(ParseErrorKind::DurationOverflow, span)
})?;
StdDuration::from_secs(minutes)
}
DurationSuffix::Hour => {
let hours = digits_value.checked_mul(SECONDS_PER_HOUR).ok_or_else(|| {
let span = start.span.covers(p.span);
ParseError::new(ParseErrorKind::DurationOverflow, span)
})?;
StdDuration::from_secs(hours)
}
DurationSuffix::Day => {
let days = digits_value.checked_mul(SECONDS_PER_DAY).ok_or_else(|| {
let span = start.span.covers(p.span);
ParseError::new(ParseErrorKind::DurationOverflow, span)
})?;
StdDuration::from_secs(days)
}
DurationSuffix::Week => {
let weeks = digits_value.checked_mul(SECONDS_PER_WEEK).ok_or_else(|| {
let span = start.span.covers(p.span);
ParseError::new(ParseErrorKind::DurationOverflow, span)
})?;
StdDuration::from_secs(weeks)
}
DurationSuffix::Year => {
let years = digits_value.checked_mul(SECONDS_PER_YEAR).ok_or_else(|| {
let span = start.span.covers(p.span);
ParseError::new(ParseErrorKind::DurationOverflow, span)
})?;
StdDuration::from_secs(years)
}
};
duration = duration.checked_add(addition).ok_or_else(|| {
let span = start.span.covers(p.span);
ParseError::new(ParseErrorKind::DurationOverflow, span)
})?;
match self.peek_whitespace().kind {
TokenKind::Digits => {
cur = self.pop_peek();
}
x if Parser::tokenkind_continues_ident(x) => {
let span = start.span.covers(p.span);
unexpected!(@span, self, x, "a duration")
}
_ => break,
}
}
let span = start.span.covers(cur.span);
let token = Token {
kind: TokenKind::Duration,
span,
};
self.lexer.duration = Some(duration);
self.prepend_token(token);
Ok(token)
}
/// Glues the next tokens which would make up a float together into a single buffer.
/// Returns an error if the tokens would make up an invalid float.
pub fn glue_float(&mut self) -> ParseResult<Token> {
let start = self.peek();
match start.kind {
t!("+") | t!("-") => {
self.pop_peek();
debug_assert!(
start.is_followed_by(&self.peek_whitespace()),
"a whitespace token was eaten where eating it would disturb parsing"
);
let digits_token = self.peek_whitespace();
if TokenKind::Digits != digits_token.kind {
let span = start.span.covers(digits_token.span);
unexpected!(@span, self, digits_token.kind, "a floating point number")
}
self.pop_peek();
}
TokenKind::Digits => {
self.pop_peek();
debug_assert!(
start.is_followed_by(&self.peek_whitespace()),
"a whitespace token was eaten where eating it would disturb parsing"
);
}
TokenKind::NumberSuffix(NumberSuffix::Float) => {
return Ok(start);
}
_ => return Ok(start),
}
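// The sign and leading digits have been consumed; each part below, the
// fraction, the exponent, and the `f` suffix, is optional.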
// check for mantissa
if let t!(".") = self.peek_whitespace().kind {
self.pop_peek();
let digits_token = self.peek_whitespace();
if TokenKind::Digits != digits_token.kind {
unexpected!(self, digits_token.kind, "a floating point number")
}
self.pop_peek();
};
// check for exponent
if let TokenKind::Exponent = self.peek_whitespace().kind {
self.pop_peek();
let mut digits_token = self.peek_whitespace();
if let t!("+") | t!("-") = digits_token.kind {
self.pop_peek();
digits_token = self.peek_whitespace();
}
if TokenKind::Digits != digits_token.kind {
unexpected!(self, digits_token.kind, "a floating point number")
}
self.pop_peek();
}
// check for a number suffix
if let TokenKind::NumberSuffix(suffix) = self.peek_whitespace().kind {
match suffix {
NumberSuffix::Float => {
self.pop_peek();
}
NumberSuffix::Decimal => {
unexpected!(self, t!("dec"), "a floating point number")
}
}
}
let t = self.peek_whitespace();
if Self::tokenkind_continues_ident(t.kind) {
unexpected!(self, t.kind, "a floating point number to end")
}
let span = start.span.covers(self.last_span());
let token = Token {
kind: TokenKind::Number(NumberKind::Float),
span,
};
self.prepend_token(token);
Ok(token)
}
pub fn glue_plain_strand(&mut self) -> ParseResult<Token> {
let start = self.peek();
match start.kind {
t!("\"") | t!("'") => {}
_ => return Ok(start),
};
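// Hand the quote token back to the lexer so the whole strand is re-lexed into a single token.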
let token = self.lexer.relex_strand(start);
self.prepend_token(token);
Ok(token)
}
}

View file

@ -27,6 +27,16 @@ impl<const S: usize> TokenBuffer<S> {
self.write = next_write;
}
#[inline]
pub fn push_front(&mut self, token: Token) {
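// Step the read index back one slot, wrapping around the fixed-size ring buffer.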
let next_read = self.read.checked_sub(1).unwrap_or((S - 1) as u8);
if next_read == self.write {
panic!("token buffer full");
}
self.buffer[next_read as usize] = token;
self.read = next_read;
}
#[inline]
pub fn pop(&mut self) -> Option<Token> {
if self.write == self.read {
@ -57,6 +67,10 @@ impl<const S: usize> TokenBuffer<S> {
}
}
pub fn is_empty(&self) -> bool {
self.write == self.read
}
pub fn at(&mut self, at: u8) -> Option<Token> {
if at >= self.len() {
return None;

View file

@ -125,6 +125,7 @@ keyword! {
Only => "ONLY",
Option => "OPTION",
Order => "ORDER",
Original => "ORIGINAL",
Parallel => "PARALLEL",
Param => "PARAM",
Passhash => "PASSHASH",

View file

@ -1,5 +1,8 @@
/// A shorthand for token kinds.
macro_rules! t {
(" ") => {
$crate::syn::token::TokenKind::WhiteSpace
};
("invalid") => {
$crate::syn::token::TokenKind::Invalid
};
@ -26,27 +29,80 @@ macro_rules! t {
};
("r\"") => {
$crate::syn::token::TokenKind::Qoute($crate::syn::token::QouteKind::RecordIdDouble)
};
("r'") => {
$crate::syn::token::TokenKind::Qoute($crate::syn::token::QouteKind::RecordId)
};
("u\"") => {
$crate::syn::token::TokenKind::Qoute($crate::syn::token::QouteKind::UuidDouble)
};
("u'") => {
$crate::syn::token::TokenKind::Qoute($crate::syn::token::QouteKind::Uuid)
};
("d\"") => {
$crate::syn::token::TokenKind::Qoute($crate::syn::token::QouteKind::DateTimeDouble)
};
("d'") => {
$crate::syn::token::TokenKind::Qoute($crate::syn::token::QouteKind::DateTime)
};
("\"") => {
$crate::syn::token::TokenKind::Qoute($crate::syn::token::QouteKind::PlainDouble)
};
("'") => {
$crate::syn::token::TokenKind::Qoute($crate::syn::token::QouteKind::Plain)
};
("\"r") => {
$crate::syn::token::TokenKind::CloseString {
double: true,
}
};
("'r") => {
$crate::syn::token::TokenKind::CloseString {
double: false,
}
};
("ns") => {
$crate::syn::token::TokenKind::DurationSuffix($crate::syn::token::DurationSuffix::Nano)
};
("us") => {
$crate::syn::token::TokenKind::DurationSuffix($crate::syn::token::DurationSuffix::Micro)
};
("µs") => {
$crate::syn::token::TokenKind::DurationSuffix(
$crate::syn::token::DurationSuffix::MicroUnicode,
)
};
("ms") => {
$crate::syn::token::TokenKind::DurationSuffix($crate::syn::token::DurationSuffix::Milli)
};
("s") => {
$crate::syn::token::TokenKind::DurationSuffix($crate::syn::token::DurationSuffix::Second)
};
("m") => {
$crate::syn::token::TokenKind::DurationSuffix($crate::syn::token::DurationSuffix::Minute)
};
("h") => {
$crate::syn::token::TokenKind::DurationSuffix($crate::syn::token::DurationSuffix::Hour)
};
("d") => {
$crate::syn::token::TokenKind::DurationSuffix($crate::syn::token::DurationSuffix::Day)
};
("w") => {
$crate::syn::token::TokenKind::DurationSuffix($crate::syn::token::DurationSuffix::Week)
};
("y") => {
$crate::syn::token::TokenKind::DurationSuffix($crate::syn::token::DurationSuffix::Year)
};
("f") => {
$crate::syn::token::TokenKind::NumberSuffix($crate::syn::token::NumberSuffix::Float)
};
("dec") => {
$crate::syn::token::TokenKind::NumberSuffix($crate::syn::token::NumberSuffix::Decimal)
};
("<") => {
$crate::syn::token::TokenKind::LeftChefron
};
@ -144,9 +200,6 @@ macro_rules! t {
("$param") => {
$crate::syn::token::TokenKind::Parameter
};
("123") => {
$crate::syn::token::TokenKind::Number(_)
};
("!") => {
$crate::syn::token::TokenKind::Operator($crate::syn::token::Operator::Not)

View file

@ -6,10 +6,8 @@ mod keyword;
pub(crate) use keyword::keyword_t;
pub use keyword::Keyword;
mod mac;
use crate::sql::change_feed_include::ChangeFeedInclude;
use crate::sql::{language::Language, Algorithm};
pub(crate) use mac::t;
/// A location in the source passed to the lexer.
#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)]
@ -52,6 +50,18 @@ impl Span {
len: 0,
}
}
/// Returns if the given span is the next span after this one.
pub fn is_followed_by(&self, other: &Self) -> bool {
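// `other` must start at exactly the byte where `self` ends.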
let end = self.offset as usize + self.len as usize;
other.offset as usize == end
}
/// Returns if this span immediately follows the given span.
pub fn follows_from(&self, other: &Self) -> bool {
let end = other.offset as usize + other.len as usize;
self.offset as usize == end
}
}
#[repr(u8)]
@ -244,58 +254,134 @@ impl VectorTypeKind {
}
#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)]
#[non_exhaustive]
pub enum DurationSuffix {
Nano,
Micro,
MicroUnicode,
Milli,
Second,
Minute,
Hour,
Day,
Week,
Year,
}
impl DurationSuffix {
pub fn can_be_ident(&self) -> bool {
!matches!(self, DurationSuffix::MicroUnicode)
}
pub fn as_str(&self) -> &'static str {
match self {
DurationSuffix::Nano => "ns",
DurationSuffix::Micro => "us",
DurationSuffix::MicroUnicode => "µs",
DurationSuffix::Milli => "ms",
DurationSuffix::Second => "s",
DurationSuffix::Minute => "m",
DurationSuffix::Hour => "h",
DurationSuffix::Day => "d",
DurationSuffix::Week => "w",
DurationSuffix::Year => "y",
}
}
}
#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)]
pub enum NumberSuffix {
Float,
Decimal,
}
impl Algorithm {
pub fn as_str(&self) -> &'static str {
match self {
Self::EdDSA => "EDDSA",
Self::Es256 => "ES256",
Self::Es384 => "ES384",
Self::Es512 => "ES512",
Self::Hs256 => "HS256",
Self::Hs384 => "HS384",
Self::Hs512 => "HS512",
Self::Ps256 => "PS256",
Self::Ps384 => "PS384",
Self::Ps512 => "PS512",
Self::Rs256 => "RS256",
Self::Rs384 => "RS384",
Self::Rs512 => "RS512",
}
}
}
#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)]
pub enum QouteKind {
/// `'`
Plain,
/// `"`
PlainDouble,
/// `r'`
RecordId,
/// `r"`
RecordIdDouble,
/// `u'`
Uuid,
/// `u"`
UuidDouble,
/// `d'`
DateTime,
/// `d"`
DateTimeDouble,
}
impl QouteKind {
pub fn as_str(&self) -> &'static str {
match self {
QouteKind::Plain | QouteKind::PlainDouble => "a strand",
QouteKind::RecordId | QouteKind::RecordIdDouble => "a record-id strand",
QouteKind::Uuid | QouteKind::UuidDouble => "a uuid",
QouteKind::DateTime | QouteKind::DateTimeDouble => "a datetime",
}
}
}
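/// The kind of number contained in a glued number token.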
#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)]
pub enum NumberKind {
Decimal,
Float,
Integer,
}
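/// Characters which can occur as bare identifier-like tokens inside a datetime,
/// such as the `T` separator and the `Z` UTC designator.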
#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)]
pub enum DatetimeChars {
T,
Z,
}
/// The type of token
#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)]
#[non_exhaustive]
pub enum TokenKind {
WhiteSpace,
Keyword(Keyword),
Algorithm(Algorithm),
ChangeFeedInclude(ChangeFeedInclude),
Language(Language),
Distance(DistanceKind),
VectorType(VectorTypeKind),
Operator(Operator),
OpenDelim(Delim),
CloseDelim(Delim),
/// a token denoting the opening of a string, i.e. `"` or `r"`
Qoute(QouteKind),
/// Not produced by the lexer but only the result of token gluing.
Number(NumberKind),
/// Not produced by the lexer but only the result of token gluing.
Duration,
/// Not produced by the lexer but only the result of token gluing.
Strand,
Regex,
/// A parameter like `$name`.
Parameter,
Identifier,
/// `<`
LeftChefron,
@ -337,6 +423,18 @@ pub enum TokenKind {
Invalid,
/// A token which indicates the end of the file.
Eof,
/// A token consisting of one or more ASCII digits.
Digits,
/// An identifier-like token which matches a duration suffix.
DurationSuffix(DurationSuffix),
/// An identifier-like token which matches a part of a datetime, such as `T` or `Z`.
DatetimeChars(DatetimeChars),
/// An identifier-like token which matches an exponent.
Exponent,
/// An identifier-like token which matches a number suffix.
NumberSuffix(NumberSuffix),
/// The Not-A-Number number token.
NaN,
}
/// An assertion statically checking that the size of TokenKind remains two bytes
@ -344,15 +442,7 @@ const _TOKEN_KIND_SIZE_ASSERT: [(); 2] = [(); std::mem::size_of::<TokenKind>()];
impl TokenKind {
pub fn has_data(&self) -> bool {
matches!(self, TokenKind::Identifier | TokenKind::Duration)
}
pub fn can_be_identifier(&self) -> bool {
@ -362,6 +452,8 @@ impl TokenKind {
| TokenKind::Keyword(_)
| TokenKind::Language(_)
| TokenKind::Algorithm(_)
| TokenKind::DatetimeChars(_)
| TokenKind::Distance(_),
)
}
@ -397,17 +489,9 @@ impl TokenKind {
TokenKind::CloseDelim(Delim::Paren) => ")",
TokenKind::CloseDelim(Delim::Brace) => "}",
TokenKind::CloseDelim(Delim::Bracket) => "]",
TokenKind::DurationSuffix(x) => x.as_str(),
TokenKind::Strand => "a strand",
TokenKind::Parameter => "a parameter",
TokenKind::Duration => "a duration",
TokenKind::Number(_) => "a number",
TokenKind::Identifier => "an identifier",
TokenKind::Regex => "a regex",
@ -431,7 +515,15 @@ impl TokenKind {
TokenKind::At => "@",
TokenKind::Invalid => "Invalid",
TokenKind::Eof => "Eof",
TokenKind::ChangeFeedInclude(_) => "change feed include",
TokenKind::WhiteSpace => "whitespace",
TokenKind::Qoute(x) => x.as_str(),
TokenKind::Duration => "a duration",
TokenKind::Digits => "a number",
TokenKind::NaN => "NaN",
// Below are small broken-up tokens which are, most of the time, identifiers.
TokenKind::DatetimeChars(_) => "an identifier",
TokenKind::Exponent => "an identifier",
TokenKind::NumberSuffix(_) => "an identifier",
}
}
}
@ -460,4 +552,12 @@ impl Token {
pub fn is_eof(&self) -> bool {
matches!(self.kind, TokenKind::Eof)
}
pub fn is_followed_by(&self, other: &Token) -> bool {
self.span.is_followed_by(&other.span)
}
pub fn follows_from(&self, other: &Token) -> bool {
self.span.follows_from(&other.span)
}
}

View file

@ -182,8 +182,6 @@ where
}),
Err(_) => Err(crate::Error::from(Error::NotLiveQuery(idx))),
};
response.live_queries.insert(idx, res);
}

View file

@ -416,7 +416,8 @@ impl Drop for Test {
/// Drops the instance of the struct
/// This method will panic if there are remaining responses that have not been checked.
fn drop(&mut self) {
// Check for a panic to make sure the test doesn't cause a double panic.
if !std::thread::panicking() && !self.responses.is_empty() {
panic!("Not every response has been checked");
}
}

View file

@ -1365,7 +1365,7 @@ async fn select_with_uuid_value() -> Result<(), Error> {
plan: {
index: 'sessionUid',
operator: '=',
value: u'00ad70db-f435-442e-9012-1cd853102084'
},
table: 'sessions'
},
@ -1388,7 +1388,7 @@ async fn select_with_uuid_value() -> Result<(), Error> {
r#"[
{
"id": sessions:1,
"sessionUid": "00ad70db-f435-442e-9012-1cd853102084"
"sessionUid": u"00ad70db-f435-442e-9012-1cd853102084"
}
]"#,
);