surrealpatch/core/src/syn/parser/mod.rs

//! Module implementing the SurrealQL parser.
//!
//! The SurrealQL parser is a relatively simple recursive descent parser.
//! Most of the functions of the SurrealQL parser peek a token from the lexer and then decide
//! which path to take depending on which token is next.
//!
//! # Implementation Details
//!
//! There are a number of common patterns for which this module has some convenience functions,
//! combined in the sketch after this list:
//! - Whenever only one token can be next you should use the `expected!` macro. This macro
//! ensures that the given token type is next and if not returns a parser error.
//! - Whenever a limited set of tokens can be next it is common to match the token kind and then
//! have a catch-all arm which calls the macro `unexpected!`. This macro will raise a parse
//! error with information about the type of token it received and what was expected.
//! - If a single token can optionally be next use [`Parser::eat`]. This function returns a bool
//! indicating whether the given token kind was eaten.
//! - If a closing delimiting token is expected use `Parser::expect_closing_delimiter`. This
//! function will raise an error if the expected delimiter isn't the next token. This error will
//! also point to the opening delimiter which the parser expected to be closed.
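//!
//! As an illustration, a hypothetical parse function combining these helpers could look as
//! follows. This is a sketch: `parse_pair` and `parse_value` are illustrative names rather
//! than actual parser methods.
//!
//! ```ignore
//! async fn parse_pair(&mut self, ctx: &mut Stk) -> ParseResult<(Value, Value)> {
//!     // Only `(` can start this production.
//!     let open = expected!(self, t!("(")).span;
//!     let first = ctx.run(|ctx| self.parse_value(ctx)).await?;
//!     // Only a limited set of tokens may follow the first value.
//!     let token = self.next();
//!     match token.kind {
//!         t!(",") => {}
//!         _ => unexpected!(self, token, "a comma"),
//!     }
//!     let second = ctx.run(|ctx| self.parse_value(ctx)).await?;
//!     // Point the error at the unclosed `(` if `)` is missing.
//!     self.expect_closing_delimiter(t!(")"), open)?;
//!     Ok((first, second))
//! }
//! ```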
//!
//! ## Far Token Peek
//!
//! Occasionally the parser needs to check further ahead than a single peek allows.
//! This is done with the `Parser::peek_token_at` function. This function peeks a given number
//! of tokens further than normal, up to 3 tokens ahead.
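//!
//! For example, a production which can only be distinguished by a token two positions ahead
//! could be resolved like this (a sketch; the token kind is illustrative):
//!
//! ```ignore
//! if self.peek_token_at(2).kind == t!(":") {
//!     // take the path which expects a `:` two tokens ahead
//! }
//! ```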
//!
//! ## WhiteSpace Tokens
//!
//! The lexer produces whitespace tokens. These are tokens which are normally ignored in most
//! places in the syntax, as they have no bearing on the meaning of a statement. [`Parser::next`] and
//! [`Parser::peek`] automatically skip over any whitespace tokens. However in some places, like
//! in a record-id and when gluing tokens, these whitespace tokens are required for correct
//! parsing. In those cases the function [`Parser::next_whitespace`] and the other functions with a
//! `_whitespace` suffix are used. These functions don't skip whitespace tokens. However these functions do not
//! undo whitespace tokens which might already have been skipped. Implementers must be careful not to call a
//! function which requires whitespace tokens when they may already have been skipped.
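//!
//! A sketch of a whitespace-sensitive check (the surrounding record-id production is
//! illustrative):
//!
//! ```ignore
//! // Inside a record-id whitespace is significant, so peek with a
//! // whitespace-preserving function instead of a normal peek.
//! if self.peek_whitespace().kind == TokenKind::WhiteSpace {
//!     // whitespace here terminates the record-id
//! }
//! ```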
//!
//! ## Compound Tokens and Token Gluing
//!
//! SurrealQL has a number of tokens which have complex rules for when they are allowed and for
//! the value they contain. Such tokens are named compound tokens, and examples include a
//! JavaScript body, strand-like tokens, regexes, and numbers.
//!
//! These tokens need to be manually requested from the lexer with the [`Lexer::lex_compound`]
//! function.
//!
//! This manual requesting of tokens leads to a problem when used in conjunction with peeking. Take
//! for instance the production `{ "foo": "bar"}`. `"foo"` is a compound token, so when initially
//! encountered the lexer only returns a `"` token, and that token then needs to be collected into
//! the full strand token. However the parser needs to figure out whether we are parsing an object
//! or a block, so it needs to look past the compound token to see if the next token is `:`. This is
//! where gluing comes in. Calling `Parser::glue` checks if the next token could start a compound
//! token and, if so, combines the pieces into a single token. This can only be done in places where
//! we know that encountering a leading token of a compound token will result in the 'default' compound token.
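//!
//! A sketch of the object-or-block decision described above, assuming `Parser::glue` returns
//! a `ParseResult`:
//!
//! ```ignore
//! self.glue()?; // turn a leading `"` into a full strand token
//! if self.peek1().kind == t!(":") {
//!     // `{ "foo": ... }` starts an object
//! } else {
//!     // otherwise this is a block
//! }
//! ```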
use self::token_buffer::TokenBuffer;
use crate::{
sql::{self, Datetime, Duration, Strand, Uuid},
syn::{
error::{bail, SyntaxError},
lexer::{compound::NumberKind, Lexer},
token::{t, Span, Token, TokenKind},
},
};
use reblessive::Stk;
mod basic;
mod builtin;
mod expression;
mod function;
mod glue;
mod idiom;
mod json;
mod kind;
pub(crate) mod mac;
mod object;
mod prime;
mod stmt;
mod thing;
mod token;
mod token_buffer;
pub(crate) use mac::{enter_object_recursion, enter_query_recursion, unexpected};
#[cfg(test)]
pub mod test;
/// The result returned by most parser functions.
pub type ParseResult<T> = Result<T, SyntaxError>;
/// A result of trying to parse a possibly partial query.
#[derive(Debug)]
#[non_exhaustive]
pub enum PartialResult<T> {
/// Not enough data was available to parse a full statement.
MoreData,
/// A statement was parsed, consuming `used` bytes of the source.
Ok {
value: T,
used: usize,
},
/// Parsing failed after consuming `used` bytes of the source.
Err {
err: SyntaxError,
used: usize,
},
}
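/// The value of a compound token which has already been glued, stored until the token is consumed.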
#[derive(Default)]
pub enum GluedValue {
Duration(Duration),
Datetime(Datetime),
Uuid(Uuid),
Number(NumberKind),
Strand(Strand),
#[default]
None,
}
/// The SurrealQL parser.
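///
/// A minimal usage sketch; the `reblessive::Stack` driver is an assumption about the
/// caller's setup, `parse_query` itself only needs a `&mut Stk` to run on:
///
/// ```ignore
/// let mut parser = Parser::new("SELECT * FROM person;".as_bytes());
/// let mut stack = reblessive::Stack::new();
/// let query = stack.enter(|stk| parser.parse_query(stk)).finish()?;
/// ```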
pub struct Parser<'a> {
lexer: Lexer<'a>,
last_span: Span,
token_buffer: TokenBuffer<4>,
glued_value: GluedValue,
pub(crate) table_as_field: bool,
legacy_strands: bool,
flexible_record_id: bool,
object_recursion: usize,
query_recursion: usize,
}
impl<'a> Parser<'a> {
/// Create a new parser from a given source.
pub fn new(source: &'a [u8]) -> Self {
Parser {
lexer: Lexer::new(source),
last_span: Span::empty(),
token_buffer: TokenBuffer::new(),
glued_value: GluedValue::None,
table_as_field: false,
legacy_strands: false,
flexible_record_id: true,
object_recursion: 100,
query_recursion: 20,
}
}
/// Disallow a query to have objects deeper than the given limit.
/// Arrays also count towards objects. So `[{foo: [] }]` would be 3 deep.
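///
/// A usage sketch:
///
/// ```ignore
/// // Reject queries nested more than 32 objects/arrays deep.
/// let parser = Parser::new(source).with_object_recursion_limit(32);
/// ```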
pub fn with_object_recursion_limit(mut self, limit: usize) -> Self {
self.object_recursion = limit;
self
}
/// Disallow a query from being deeper than the given limit.
/// A query recurses when a statement contains another statement within itself.
/// Examples are subqueries and blocks, such as block statements and if statements.
pub fn with_query_recursion_limit(mut self, limit: usize) -> Self {
self.query_recursion = limit;
self
}
/// Parse strands like the old parser, where a strand which looks like a UUID, record-id, or
/// datetime will be parsed as that type instead of as a strand.
pub fn with_allow_legacy_strand(mut self, value: bool) -> Self {
self.legacy_strands = value;
self
}
/// Set whether to parse strands as legacy strands.
pub fn allow_legacy_strand(&mut self, value: bool) {
self.legacy_strands = value;
}
/// Set whether to allow record-ids which don't adhere to regular ident rules.
/// Setting this to true will allow parsing of, for example, `foo:0bar`. This would be rejected
/// by normal identifier rules as most identifiers can't start with a number.
pub fn allow_flexible_record_id(&mut self, value: bool) {
self.flexible_record_id = value;
}
/// Reset the parser state. Doesn't change the position of the parser in the buffer.
pub fn reset(&mut self) {
self.last_span = Span::empty();
self.token_buffer.clear();
self.table_as_field = false;
self.lexer.reset();
}
/// Change the source of the parser reusing the existing buffers.
pub fn change_source(self, source: &[u8]) -> Parser {
Parser {
lexer: self.lexer.change_source(source),
last_span: Span::empty(),
token_buffer: TokenBuffer::new(),
glued_value: GluedValue::None,
legacy_strands: self.legacy_strands,
flexible_record_id: self.flexible_record_id,
table_as_field: false,
object_recursion: self.object_recursion,
query_recursion: self.query_recursion,
}
}
/// Returns the next token and advances the parser one token forward.
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self) -> Token {
let res = loop {
let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
if res.kind != TokenKind::WhiteSpace {
break res;
}
};
self.last_span = res.span;
res
}
/// Returns the next token and advances the parser one token forward.
///
/// This function is like `next` but returns whitespace tokens, which are normally skipped.
#[allow(clippy::should_implement_trait)]
pub fn next_whitespace(&mut self) -> Token {
let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
self.last_span = res.span;
res
}
/// Returns whether there is a token in the token buffer, meaning that a token was peeked.
pub fn has_peek(&self) -> bool {
!self.token_buffer.is_empty()
}
/// Consume the current peeked value and advance the parser one token forward.
///
/// Should only be called after peeking a value.
pub fn pop_peek(&mut self) -> Token {
let res = self.token_buffer.pop().unwrap();
self.last_span = res.span;
res
}
/// Returns the next token without consuming it.
pub fn peek(&mut self) -> Token {
loop {
let Some(x) = self.token_buffer.first() else {
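// The buffer is empty: pull tokens from the lexer, skipping whitespace.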
let res = loop {
let res = self.lexer.next_token();
if res.kind != TokenKind::WhiteSpace {
break res;
}
};
self.token_buffer.push(res);
return res;
};
if x.kind == TokenKind::WhiteSpace {
self.token_buffer.pop();
continue;
}
break x;
}
}
/// Returns the next token without consuming it.
///
/// This function is like `peek` but returns whitespace tokens, which are normally skipped.
/// Does not undo tokens skipped in a previous normal peek.
pub fn peek_whitespace(&mut self) -> Token {
let Some(x) = self.token_buffer.first() else {
let res = self.lexer.next_token();
self.token_buffer.push(res);
return res;
};
x
}
/// Return the token kind of the next token without consuming it.
pub fn peek_kind(&mut self) -> TokenKind {
self.peek().kind
}
/// Returns the next n'th token without consuming it.
/// `peek_token_at(0)` is equivalent to `peek`.
pub(crate) fn peek_token_at(&mut self, at: u8) -> Token {
for _ in self.token_buffer.len()..=at {
let r = loop {
let r = self.lexer.next_token();
if r.kind != TokenKind::WhiteSpace {
break r;
}
};
self.token_buffer.push(r);
}
self.token_buffer.at(at).unwrap()
}
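/// Returns the token directly after the next token, without consuming either.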
pub fn peek1(&mut self) -> Token {
self.peek_token_at(1)
}
/// Returns the next n'th token without consuming it, including whitespace tokens.
/// `peek_whitespace_token_at(0)` is equivalent to `peek_whitespace`.
pub fn peek_whitespace_token_at(&mut self, at: u8) -> Token {
for _ in self.token_buffer.len()..=at {
let r = self.lexer.next_token();
self.token_buffer.push(r);
}
self.token_buffer.at(at).unwrap()
}
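/// Like [`Parser::peek1`] but does not skip whitespace tokens.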
pub fn peek_whitespace1(&mut self) -> Token {
self.peek_whitespace_token_at(1)
}
/// Returns the span of the next token if it was already peeked, otherwise returns the span of
/// the last consumed token.
pub fn recent_span(&mut self) -> Span {
self.token_buffer.first().map(|x| x.span).unwrap_or(self.last_span)
}
/// Returns the span of the last consumed token.
pub fn last_span(&mut self) -> Span {
self.last_span
}
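/// Checks that the lexer has reached the end of its source, returning an error if it has not.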
pub fn assert_finished(&self) -> ParseResult<()> {
self.lexer.assert_finished()
}
/// Eat the next token if it is of the given kind.
/// Returns whether a token was eaten.
pub fn eat(&mut self, token: TokenKind) -> bool {
let peek = self.peek();
if token == peek.kind {
self.token_buffer.pop();
self.last_span = peek.span;
true
} else {
false
}
}
/// Eat the next token if it is of the given kind.
/// Returns whether a token was eaten.
///
/// Unlike [`Parser::eat`] this doesn't skip whitespace tokens
pub fn eat_whitespace(&mut self, token: TokenKind) -> bool {
let peek = self.peek_whitespace();
if token == peek.kind {
self.token_buffer.pop();
self.last_span = peek.span;
true
} else {
false
}
}
/// Forces the next token to be the given one.
/// Used in token gluing to replace the current one with the glued token.
fn prepend_token(&mut self, token: Token) {
self.token_buffer.push_front(token);
}
/// Checks if the next token is of the given kind. If it isn't, it returns an UnclosedDelimiter
/// error.
fn expect_closing_delimiter(&mut self, kind: TokenKind, should_close: Span) -> ParseResult<()> {
let peek = self.peek();
if peek.kind != kind {
bail!("Unexpected token `{}` expected delimiter `{kind}`",
peek.kind,
@self.recent_span(),
@should_close => "expected this delimiter to close"
);
}
self.pop_peek();
Ok(())
}
/// Recover the parser state to after a given span.
pub fn backup_after(&mut self, span: Span) {
self.token_buffer.clear();
self.lexer.backup_after(span);
}
/// Parse a full query.
///
/// This is the primary entry point of the parser.
pub async fn parse_query(&mut self, ctx: &mut Stk) -> ParseResult<sql::Query> {
let statements = self.parse_stmt_list(ctx).await?;
Ok(sql::Query(statements))
}
/// Parse a single statement.
pub async fn parse_statement(&mut self, ctx: &mut Stk) -> ParseResult<sql::Statement> {
self.parse_stmt(ctx).await
}
/// Parse a possibly partial statement.
///
/// This will try to parse a statement if a full statement can be parsed from the buffer the
/// parser is operating on.
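///
/// A hypothetical driver loop (a sketch; `read_more_data`, `handle_statement`, and the error
/// handling are illustrative):
///
/// ```ignore
/// loop {
///     match parser.parse_partial_statement(ctx).await {
///         PartialResult::MoreData => read_more_data(&mut buffer),
///         PartialResult::Ok { value, used } => {
///             // `used` is the offset into the source consumed so far.
///             handle_statement(value);
///         }
///         PartialResult::Err { err, used } => {
///             report_error(err, used);
///             break;
///         }
///     }
/// }
/// ```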
pub async fn parse_partial_statement(
&mut self,
ctx: &mut Stk,
) -> PartialResult<sql::Statement> {
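// Eat any leading semicolons separating statements.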
while self.eat(t!(";")) {}
let res = ctx.run(|ctx| self.parse_stmt(ctx)).await;
let v = match res {
Err(e) => {
let peek = self.peek_whitespace1();
if e.is_data_pending()
|| matches!(peek.kind, TokenKind::Eof | TokenKind::WhiteSpace)
{
return PartialResult::MoreData;
}
return PartialResult::Err {
err: e,
used: self.lexer.reader.offset(),
};
}
Ok(x) => x,
};
if self.eat(t!(";")) {
return PartialResult::Ok {
value: v,
used: self.lexer.reader.offset(),
};
}
PartialResult::MoreData
}
}