From ee8e6f00d7a5507eb20d098ad2801ad484a18396 Mon Sep 17 00:00:00 2001 From: Mees Delzenne Date: Tue, 13 Aug 2024 22:50:40 +0200 Subject: [PATCH] Fix params not supporting delimiters (#4411) --- core/src/syn/lexer/byte.rs | 29 +++++++++++++++++++++++------ core/src/syn/lexer/ident.rs | 19 +++++++++++++++---- core/src/syn/lexer/reader.rs | 4 +++- core/src/syn/parser/mac.rs | 2 +- core/src/syn/parser/test/mod.rs | 24 ++++++++++++++++++++++++ 5 files changed, 66 insertions(+), 12 deletions(-) diff --git a/core/src/syn/lexer/byte.rs b/core/src/syn/lexer/byte.rs index db5dc1e3..824f181f 100644 --- a/core/src/syn/lexer/byte.rs +++ b/core/src/syn/lexer/byte.rs @@ -6,6 +6,8 @@ use crate::syn::{ token::{t, DatetimeChars, Token, TokenKind}, }; +use super::CharError; + impl<'a> Lexer<'a> { /// Eats a single line comment. pub fn eat_single_line_comment(&mut self) { @@ -312,13 +314,28 @@ impl<'a> Lexer<'a> { } _ => t!(":"), }, - b'$' => { - if self.reader.peek().map(|x| x.is_ascii_alphabetic() || x == b'_').unwrap_or(false) - { - return self.lex_param(); + b'$' => match self.reader.peek() { + Some(b'_') => return self.lex_param(), + Some(b'`') => { + self.reader.next(); + return self.lex_surrounded_param(true); } - t!("$") - } + Some(x) if x.is_ascii_alphabetic() => return self.lex_param(), + Some(x) if !x.is_ascii() => { + let backup = self.reader.offset(); + self.reader.next(); + match self.reader.complete_char(x) { + Ok('⟨') => return self.lex_surrounded_param(false), + Err(CharError::Eof) => return self.invalid_token(Error::InvalidUtf8), + Err(CharError::Unicode) => return self.invalid_token(Error::InvalidUtf8), + _ => { + self.reader.backup(backup); + t!("$") + } + } + } + _ => t!("$"), + }, b'#' => { self.eat_single_line_comment(); TokenKind::WhiteSpace diff --git a/core/src/syn/lexer/ident.rs b/core/src/syn/lexer/ident.rs index f491d647..f07f3ed4 100644 --- a/core/src/syn/lexer/ident.rs +++ b/core/src/syn/lexer/ident.rs @@ -33,6 +33,17 @@ impl<'a> Lexer<'a> { } } + pub fn lex_surrounded_param(&mut self, is_backtick: bool) -> Token { + debug_assert_eq!(self.scratch, ""); + match self.lex_surrounded_ident_err(is_backtick) { + Ok(_) => self.finish_token(TokenKind::Parameter), + Err(e) => { + self.scratch.clear(); + self.invalid_token(e) + } + } + } + /// Lex an not surrounded identifier in the form of `[a-zA-Z0-9_]*` /// /// The start byte should already a valid byte of the identifier. @@ -80,7 +91,7 @@ impl<'a> Lexer<'a> { /// Lex an ident which is surround by delimiters. pub fn lex_surrounded_ident(&mut self, is_backtick: bool) -> Token { match self.lex_surrounded_ident_err(is_backtick) { - Ok(x) => x, + Ok(_) => self.finish_token(TokenKind::Identifier), Err(e) => { self.scratch.clear(); self.invalid_token(e) @@ -89,7 +100,7 @@ impl<'a> Lexer<'a> { } /// Lex an ident surrounded either by `⟨⟩` or `\`\`` - pub fn lex_surrounded_ident_err(&mut self, is_backtick: bool) -> Result { + pub fn lex_surrounded_ident_err(&mut self, is_backtick: bool) -> Result<(), Error> { loop { let Some(x) = self.reader.next() else { let end_char = if is_backtick { @@ -103,7 +114,7 @@ impl<'a> Lexer<'a> { match x { b'`' if is_backtick => { self.string = Some(mem::take(&mut self.scratch)); - return Ok(self.finish_token(TokenKind::Identifier)); + return Ok(()); } b'\0' => { // null bytes not allowed @@ -162,7 +173,7 @@ impl<'a> Lexer<'a> { let c = self.reader.complete_char(x)?; if !is_backtick && c == '⟩' { self.string = Some(mem::take(&mut self.scratch)); - return Ok(self.finish_token(TokenKind::Identifier)); + return Ok(()); } self.scratch.push(c); } diff --git a/core/src/syn/lexer/reader.rs b/core/src/syn/lexer/reader.rs index 65b564f5..97382ffc 100644 --- a/core/src/syn/lexer/reader.rs +++ b/core/src/syn/lexer/reader.rs @@ -76,10 +76,12 @@ impl<'a> BytesReader<'a> { pub fn peek(&self) -> Option { self.remaining().first().copied() } + #[inline] pub fn span(&self, span: Span) -> &'a [u8] { &self.data[(span.offset as usize)..(span.offset as usize + span.len as usize)] } + #[inline] pub fn next_continue_byte(&mut self) -> Result { const CONTINUE_BYTE_PREFIX_MASK: u8 = 0b1100_0000; @@ -87,7 +89,7 @@ impl<'a> BytesReader<'a> { let byte = self.next().ok_or(CharError::Eof)?; if byte & CONTINUE_BYTE_PREFIX_MASK != 0b1000_0000 { - return Err(CharError::Eof); + return Err(CharError::Unicode); } Ok(byte & CONTINUE_BYTE_MASK) diff --git a/core/src/syn/parser/mac.rs b/core/src/syn/parser/mac.rs index b1c9d1ab..98a66c0b 100644 --- a/core/src/syn/parser/mac.rs +++ b/core/src/syn/parser/mac.rs @@ -140,7 +140,7 @@ macro_rules! expected_whitespace { #[cfg(test)] #[macro_export] macro_rules! test_parse { - ($func:ident$( ( $($e:expr),* $(,)? ))? , $t:literal) => {{ + ($func:ident$( ( $($e:expr),* $(,)? ))? , $t:expr) => {{ let mut parser = $crate::syn::parser::Parser::new($t.as_bytes()); let mut stack = reblessive::Stack::new(); stack.enter(|ctx| parser.$func(ctx,$($($e),*)*)).finish() diff --git a/core/src/syn/parser/test/mod.rs b/core/src/syn/parser/test/mod.rs index 3cf99acb..b7e89d36 100644 --- a/core/src/syn/parser/test/mod.rs +++ b/core/src/syn/parser/test/mod.rs @@ -1,3 +1,5 @@ +use nom::AsBytes; + use crate::{sql, syn::parser::mac::test_parse}; mod limit; @@ -11,3 +13,25 @@ fn multiple_semicolons() { let expected = sql::Query(sql::Statements(vec![])); assert_eq!(res, expected); } + +#[test] +fn escaped_params() { + let src = r#"LET $⟨R-_fYU8Wa31kg7tz0JI6Kme⟩ = 5; + RETURN $⟨R-_fYU8Wa31kg7tz0JI6Kme⟩"#; + + for (idx, b) in src.as_bytes().iter().enumerate() { + println!("{:0>4}: {:0>8b}", idx, b); + } + + test_parse!(parse_query, src).unwrap(); +} + +#[test] +fn escaped_params_backtick() { + test_parse!( + parse_query, + r#"LET $`R-_fYU8Wa31kg7tz0JI6Kme` = 5; + RETURN $`R-_fYU8Wa31kg7tz0JI6Kme`"# + ) + .unwrap(); +}