2023-01-08 17:11:35 +00:00
|
|
|
use crate::sql::error::Error::Parser;
|
2022-01-16 20:31:50 +00:00
|
|
|
use crate::sql::error::IResult;
|
2023-05-31 07:36:29 +00:00
|
|
|
use crate::sql::escape::quote_str;
|
2020-06-29 15:36:01 +00:00
|
|
|
use nom::branch::alt;
|
2023-04-30 18:33:48 +00:00
|
|
|
use nom::bytes::complete::{escaped_transform, is_not, tag, take, take_while_m_n};
|
2022-09-26 00:23:57 +00:00
|
|
|
use nom::character::complete::char;
|
|
|
|
use nom::combinator::value;
|
2023-04-30 18:33:48 +00:00
|
|
|
use nom::sequence::preceded;
|
2023-02-16 12:22:23 +00:00
|
|
|
use nom::Err::Failure;
|
2020-06-29 15:36:01 +00:00
|
|
|
use serde::{Deserialize, Serialize};
|
2022-10-04 21:51:18 +00:00
|
|
|
use std::fmt::{self, Display, Formatter};
|
2022-05-05 04:30:32 +00:00
|
|
|
use std::ops::Deref;
|
2023-04-30 18:33:48 +00:00
|
|
|
use std::ops::{self, RangeInclusive};
|
2020-06-29 15:36:01 +00:00
|
|
|
use std::str;
|
|
|
|
|
2023-03-30 10:41:44 +00:00
|
|
|
/// Serde type-name token for `Strand`; the same string appears in the
/// `#[serde(rename = ...)]` attribute on the `Strand` struct.
pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Strand";
|
|
|
|
|
2022-09-26 00:23:57 +00:00
|
|
|
/// Delimiter that opens and closes a single-quoted strand.
const SINGLE: char = '\'';
|
2023-05-09 17:48:14 +00:00
|
|
|
/// Characters that end a run of ordinary text inside a single-quoted strand:
/// the single quote, the backslash and NUL. Passed to `is_not` in `strand_single`.
const SINGLE_ESC_NUL: &str = "'\\\0";
|
2021-05-24 08:18:58 +00:00
|
|
|
|
2022-09-26 00:23:57 +00:00
|
|
|
/// Delimiter that opens and closes a double-quoted strand.
const DOUBLE: char = '"';
|
2023-05-09 17:48:14 +00:00
|
|
|
/// Characters that end a run of ordinary text inside a double-quoted strand:
/// the double quote, the backslash and NUL. Passed to `is_not` in `strand_double`.
const DOUBLE_ESC_NUL: &str = "\"\\\0";
|
2021-05-24 08:18:58 +00:00
|
|
|
|
2023-04-30 18:33:48 +00:00
|
|
|
/// Range of 16-bit values that `char_unicode_bare` treats as the first half of
/// a surrogate pair, requiring a second `\u` escape to follow.
const LEADING_SURROGATES: RangeInclusive<u16> = 0xD800..=0xDBFF;
|
|
|
|
/// Range of 16-bit values that `char_unicode_bare` accepts as the second half
/// of a surrogate pair.
const TRAILING_SURROGATES: RangeInclusive<u16> = 0xDC00..=0xDFFF;
|
2023-02-16 12:22:23 +00:00
|
|
|
|
2023-05-09 17:48:14 +00:00
|
|
|
/// A string that doesn't contain NUL bytes.
|
|
|
|
#[derive(Clone, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
|
2023-04-29 15:58:22 +00:00
|
|
|
#[serde(rename = "$surrealdb::private::sql::Strand")]
|
2023-05-09 17:48:14 +00:00
|
|
|
pub struct Strand(#[serde(with = "no_nul_bytes")] pub String);
|
2020-06-29 15:36:01 +00:00
|
|
|
|
2021-03-29 15:43:37 +00:00
|
|
|
impl From<String> for Strand {
|
|
|
|
fn from(s: String) -> Self {
|
2023-05-09 17:48:14 +00:00
|
|
|
debug_assert!(!s.contains('\0'));
|
2022-05-05 04:30:32 +00:00
|
|
|
Strand(s)
|
2021-03-29 15:43:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-05-25 09:42:10 +00:00
|
|
|
impl From<&str> for Strand {
|
2020-06-29 15:36:01 +00:00
|
|
|
fn from(s: &str) -> Self {
|
2023-05-09 17:48:14 +00:00
|
|
|
debug_assert!(!s.contains('\0'));
|
2022-10-04 21:51:18 +00:00
|
|
|
Self::from(String::from(s))
|
2022-05-05 04:30:32 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Deref for Strand {
|
|
|
|
type Target = String;
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
|
|
&self.0
|
2020-06-29 15:36:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-26 14:01:09 +00:00
|
|
|
impl From<Strand> for String {
|
|
|
|
fn from(s: Strand) -> Self {
|
|
|
|
s.0
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-13 17:36:41 +00:00
|
|
|
impl Strand {
|
2022-10-19 09:55:19 +00:00
|
|
|
/// Get the underlying String slice
|
2022-01-13 17:36:41 +00:00
|
|
|
pub fn as_str(&self) -> &str {
|
2022-05-05 04:30:32 +00:00
|
|
|
self.0.as_str()
|
|
|
|
}
|
2022-10-19 09:55:19 +00:00
|
|
|
/// Returns the underlying String
|
2022-05-05 04:30:32 +00:00
|
|
|
pub fn as_string(self) -> String {
|
|
|
|
self.0
|
2022-01-13 17:36:41 +00:00
|
|
|
}
|
2022-10-19 09:55:19 +00:00
|
|
|
/// Convert the Strand to a raw String
|
2022-07-04 01:03:26 +00:00
|
|
|
pub fn to_raw(self) -> String {
|
|
|
|
self.0
|
|
|
|
}
|
2022-01-13 17:36:41 +00:00
|
|
|
}
|
|
|
|
|
2022-10-04 21:51:18 +00:00
|
|
|
impl Display for Strand {
|
|
|
|
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
2023-05-31 07:36:29 +00:00
|
|
|
Display::fmt("e_str(&self.0), f)
|
2020-06-29 15:36:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-13 17:36:41 +00:00
|
|
|
impl ops::Add for Strand {
|
|
|
|
type Output = Self;
|
2023-05-09 17:48:14 +00:00
|
|
|
fn add(mut self, other: Self) -> Self {
|
|
|
|
self.0.push_str(other.as_str());
|
|
|
|
self
|
2022-01-13 17:36:41 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-29 15:36:01 +00:00
|
|
|
pub fn strand(i: &str) -> IResult<&str, Strand> {
|
|
|
|
let (i, v) = strand_raw(i)?;
|
2022-05-05 04:30:32 +00:00
|
|
|
Ok((i, Strand(v)))
|
2020-06-29 15:36:01 +00:00
|
|
|
}
|
|
|
|
|
2021-05-24 08:18:58 +00:00
|
|
|
pub fn strand_raw(i: &str) -> IResult<&str, String> {
|
2022-09-26 00:23:57 +00:00
|
|
|
alt((strand_blank, strand_single, strand_double))(i)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn strand_blank(i: &str) -> IResult<&str, String> {
|
|
|
|
alt((
|
|
|
|
|i| {
|
|
|
|
let (i, _) = char(SINGLE)(i)?;
|
|
|
|
let (i, _) = char(SINGLE)(i)?;
|
|
|
|
Ok((i, String::new()))
|
|
|
|
},
|
|
|
|
|i| {
|
|
|
|
let (i, _) = char(DOUBLE)(i)?;
|
|
|
|
let (i, _) = char(DOUBLE)(i)?;
|
|
|
|
Ok((i, String::new()))
|
|
|
|
},
|
|
|
|
))(i)
|
2020-06-29 15:36:01 +00:00
|
|
|
}
|
|
|
|
|
2021-05-24 08:18:58 +00:00
|
|
|
fn strand_single(i: &str) -> IResult<&str, String> {
|
2022-09-26 00:23:57 +00:00
|
|
|
let (i, _) = char(SINGLE)(i)?;
|
|
|
|
let (i, v) = escaped_transform(
|
2023-05-09 17:48:14 +00:00
|
|
|
is_not(SINGLE_ESC_NUL),
|
2022-09-26 00:23:57 +00:00
|
|
|
'\\',
|
|
|
|
alt((
|
2023-04-30 18:33:48 +00:00
|
|
|
char_unicode,
|
2022-09-27 11:38:17 +00:00
|
|
|
value('\u{5c}', char('\\')),
|
2022-09-26 00:23:57 +00:00
|
|
|
value('\u{27}', char('\'')),
|
|
|
|
value('\u{2f}', char('/')),
|
|
|
|
value('\u{08}', char('b')),
|
|
|
|
value('\u{0c}', char('f')),
|
|
|
|
value('\u{0a}', char('n')),
|
|
|
|
value('\u{0d}', char('r')),
|
|
|
|
value('\u{09}', char('t')),
|
|
|
|
)),
|
|
|
|
)(i)?;
|
|
|
|
let (i, _) = char(SINGLE)(i)?;
|
|
|
|
Ok((i, v))
|
2021-05-24 08:18:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
fn strand_double(i: &str) -> IResult<&str, String> {
|
2022-09-26 00:23:57 +00:00
|
|
|
let (i, _) = char(DOUBLE)(i)?;
|
|
|
|
let (i, v) = escaped_transform(
|
2023-05-09 17:48:14 +00:00
|
|
|
is_not(DOUBLE_ESC_NUL),
|
2022-09-26 00:23:57 +00:00
|
|
|
'\\',
|
|
|
|
alt((
|
2023-04-30 18:33:48 +00:00
|
|
|
char_unicode,
|
2022-09-27 11:38:17 +00:00
|
|
|
value('\u{5c}', char('\\')),
|
2022-09-26 00:23:57 +00:00
|
|
|
value('\u{22}', char('\"')),
|
|
|
|
value('\u{2f}', char('/')),
|
|
|
|
value('\u{08}', char('b')),
|
|
|
|
value('\u{0c}', char('f')),
|
|
|
|
value('\u{0a}', char('n')),
|
|
|
|
value('\u{0d}', char('r')),
|
|
|
|
value('\u{09}', char('t')),
|
|
|
|
)),
|
|
|
|
)(i)?;
|
|
|
|
let (i, _) = char(DOUBLE)(i)?;
|
|
|
|
Ok((i, v))
|
|
|
|
}
|
|
|
|
|
2023-04-30 18:33:48 +00:00
|
|
|
fn char_unicode(i: &str) -> IResult<&str, char> {
|
|
|
|
preceded(char('u'), alt((char_unicode_bracketed, char_unicode_bare)))(i)
|
|
|
|
}
|
|
|
|
|
|
|
|
// \uABCD or \uDBFF\uDFFF (surrogate pair)
|
|
|
|
fn char_unicode_bare(i: &str) -> IResult<&str, char> {
|
|
|
|
// Take exactly 4 bytes
|
|
|
|
let (i, v) = take(4usize)(i)?;
|
|
|
|
// Parse them as hex, where an error indicates invalid hex digits
|
|
|
|
let v: u16 = u16::from_str_radix(v, 16).map_err(|_| Failure(Parser(i)))?;
|
|
|
|
|
|
|
|
if LEADING_SURROGATES.contains(&v) {
|
|
|
|
let leading = v;
|
|
|
|
|
|
|
|
// Read the next \u.
|
|
|
|
let (i, _) = tag("\\u")(i)?;
|
|
|
|
// Take exactly 4 more bytes
|
|
|
|
let (i, v) = take(4usize)(i)?;
|
|
|
|
// Parse them as hex, where an error indicates invalid hex digits
|
|
|
|
let trailing = u16::from_str_radix(v, 16).map_err(|_| Failure(Parser(i)))?;
|
|
|
|
if !TRAILING_SURROGATES.contains(&trailing) {
|
|
|
|
return Err(Failure(Parser(i)));
|
|
|
|
}
|
|
|
|
// Compute the codepoint.
|
|
|
|
// https://datacadamia.com/data/type/text/surrogate#from_surrogate_to_character_code
|
|
|
|
let codepoint = 0x10000
|
|
|
|
+ ((leading as u32 - *LEADING_SURROGATES.start() as u32) << 10)
|
|
|
|
+ trailing as u32
|
|
|
|
- *TRAILING_SURROGATES.start() as u32;
|
|
|
|
// Convert to char
|
|
|
|
let v = char::from_u32(codepoint).ok_or(Failure(Parser(i)))?;
|
|
|
|
// Return the char
|
|
|
|
Ok((i, v))
|
|
|
|
} else {
|
|
|
|
// We can convert this to char or error in the case of invalid Unicode character
|
2023-05-09 17:48:14 +00:00
|
|
|
let v = char::from_u32(v as u32).filter(|c| *c != 0 as char).ok_or(Failure(Parser(i)))?;
|
2023-04-30 18:33:48 +00:00
|
|
|
// Return the char
|
|
|
|
Ok((i, v))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// \u{10ffff}
|
|
|
|
fn char_unicode_bracketed(i: &str) -> IResult<&str, char> {
|
|
|
|
// Read the { character
|
|
|
|
let (i, _) = char('{')(i)?;
|
|
|
|
// Let's up to 6 ascii hexadecimal characters
|
2022-10-16 20:05:31 +00:00
|
|
|
let (i, v) = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit())(i)?;
|
2023-04-30 18:33:48 +00:00
|
|
|
// We can convert this to u32 as the max is 0xffffff
|
|
|
|
let v = u32::from_str_radix(v, 16).unwrap();
|
|
|
|
// We can convert this to char or error in the case of invalid Unicode character
|
2023-05-09 17:48:14 +00:00
|
|
|
let v = char::from_u32(v).filter(|c| *c != 0 as char).ok_or(Failure(Parser(i)))?;
|
2023-04-30 18:33:48 +00:00
|
|
|
// Read the } character
|
|
|
|
let (i, _) = char('}')(i)?;
|
2022-10-16 20:05:31 +00:00
|
|
|
// Return the char
|
2022-09-26 00:23:57 +00:00
|
|
|
Ok((i, v))
|
2020-06-29 15:36:01 +00:00
|
|
|
}
|
|
|
|
|
2023-05-09 17:48:14 +00:00
|
|
|
// serde(with = no_nul_bytes) will (de)serialize with no NUL bytes.
|
|
|
|
pub(crate) mod no_nul_bytes {
|
|
|
|
use serde::{
|
|
|
|
de::{self, Visitor},
|
|
|
|
Deserializer, Serializer,
|
|
|
|
};
|
|
|
|
use std::fmt;
|
|
|
|
|
2023-05-10 16:58:04 +00:00
|
|
|
pub(crate) fn serialize<S>(s: &str, serializer: S) -> Result<S::Ok, S::Error>
|
2023-05-09 17:48:14 +00:00
|
|
|
where
|
|
|
|
S: Serializer,
|
|
|
|
{
|
|
|
|
debug_assert!(!s.contains('\0'));
|
|
|
|
serializer.serialize_str(s)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<String, D::Error>
|
|
|
|
where
|
|
|
|
D: Deserializer<'de>,
|
|
|
|
{
|
|
|
|
struct NoNulBytesVisitor;
|
|
|
|
|
|
|
|
impl<'de> Visitor<'de> for NoNulBytesVisitor {
|
|
|
|
type Value = String;
|
|
|
|
|
|
|
|
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
|
|
|
formatter.write_str("a string without any NUL bytes")
|
|
|
|
}
|
|
|
|
|
|
|
|
fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
|
|
|
|
where
|
|
|
|
E: de::Error,
|
|
|
|
{
|
|
|
|
if value.contains('\0') {
|
|
|
|
Err(de::Error::custom("contained NUL byte"))
|
|
|
|
} else {
|
|
|
|
Ok(value.to_owned())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn visit_string<E>(self, value: String) -> Result<Self::Value, E>
|
|
|
|
where
|
|
|
|
E: de::Error,
|
|
|
|
{
|
|
|
|
if value.contains('\0') {
|
|
|
|
Err(de::Error::custom("contained NUL byte"))
|
|
|
|
} else {
|
|
|
|
Ok(value)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
deserializer.deserialize_string(NoNulBytesVisitor)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-05-24 08:18:58 +00:00
|
|
|
#[cfg(test)]
mod tests {

    use super::*;

    #[test]
    fn strand_empty() {
        let (_, out) = strand(r#""""#).unwrap();
        assert_eq!(r#"''"#, out.to_string());
        assert_eq!(out, Strand::from(""));
    }

    #[test]
    fn strand_single() {
        let (_, out) = strand(r#"'test'"#).unwrap();
        assert_eq!(r#"'test'"#, out.to_string());
        assert_eq!(out, Strand::from("test"));
    }

    #[test]
    fn strand_double() {
        let (_, out) = strand(r#""test""#).unwrap();
        assert_eq!(r#"'test'"#, out.to_string());
        assert_eq!(out, Strand::from("test"));
    }

    #[test]
    fn strand_quoted_single() {
        let (_, out) = strand(r#"'te\'st'"#).unwrap();
        assert_eq!(r#""te'st""#, out.to_string());
        assert_eq!(out, Strand::from(r#"te'st"#));
    }

    #[test]
    fn strand_quoted_double() {
        let (_, out) = strand(r#""te\"st""#).unwrap();
        assert_eq!(r#"'te"st'"#, out.to_string());
        assert_eq!(out, Strand::from(r#"te"st"#));
    }

    #[test]
    fn strand_quoted_escaped() {
        let (_, out) = strand(r#""te\"st\n\tand\bsome\u05d9""#).unwrap();
        assert_eq!("'te\"st\n\tand\u{08}some\u{05d9}'", out.to_string());
        assert_eq!(out, Strand::from("te\"st\n\tand\u{08}some\u{05d9}"));
    }

    #[test]
    fn strand_nul_byte() {
        // A raw NUL and both escaped spellings of NUL must all be rejected.
        for sql in ["'a\0b'", "'a\\u0000b'", "'a\\u{0}b'"] {
            assert!(strand(sql).is_err());
        }
    }

    #[test]
    fn strand_fuzz_escape() {
        // Sample the codepoint space, skipping values that are not chars.
        for c in (1..=char::MAX as u32).step_by(101).filter_map(char::from_u32) {
            let code = c as u32;
            let expected = format!("a{c}b");

            // Bracketed form: \u{...}
            let utf32 = format!("\"a\\u{{{code:x}}}b\"");
            let (rest, s) = strand(&utf32).unwrap();
            assert_eq!(rest, "");
            assert_eq!(s.as_str(), &expected);

            // Bare form: one \uXXXX per UTF-16 code unit.
            let mut buf = [0; 2];
            let units: String =
                c.encode_utf16(&mut buf).iter().map(|unit| format!("\\u{unit:04x}")).collect();
            let utf16 = format!("\"a{units}b\"");
            let (rest, s) = strand(&utf16).unwrap();
            assert_eq!(rest, "");
            assert_eq!(s.as_str(), &expected);
        }

        // Unpaired surrogate.
        assert!(strand("\"\\u{DBFF}\"").is_err());
    }
}
|