parent
7a86ed3a3d
commit
12bf8dea6b
1 changed files with 87 additions and 30 deletions
|
@ -2,16 +2,15 @@ use crate::sql::error::Error::Parser;
|
||||||
use crate::sql::error::IResult;
|
use crate::sql::error::IResult;
|
||||||
use crate::sql::escape::escape_str;
|
use crate::sql::escape::escape_str;
|
||||||
use nom::branch::alt;
|
use nom::branch::alt;
|
||||||
use nom::bytes::complete::escaped_transform;
|
use nom::bytes::complete::{escaped_transform, is_not, tag, take, take_while_m_n};
|
||||||
use nom::bytes::complete::is_not;
|
|
||||||
use nom::bytes::complete::take_while_m_n;
|
|
||||||
use nom::character::complete::char;
|
use nom::character::complete::char;
|
||||||
use nom::combinator::value;
|
use nom::combinator::value;
|
||||||
|
use nom::sequence::preceded;
|
||||||
use nom::Err::Failure;
|
use nom::Err::Failure;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::fmt::{self, Display, Formatter};
|
use std::fmt::{self, Display, Formatter};
|
||||||
use std::ops;
|
|
||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
|
use std::ops::{self, RangeInclusive};
|
||||||
use std::str;
|
use std::str;
|
||||||
|
|
||||||
pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Strand";
|
pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Strand";
|
||||||
|
@ -22,7 +21,8 @@ const SINGLE_ESC: &str = r#"\'"#;
|
||||||
const DOUBLE: char = '"';
|
const DOUBLE: char = '"';
|
||||||
const DOUBLE_ESC: &str = r#"\""#;
|
const DOUBLE_ESC: &str = r#"\""#;
|
||||||
|
|
||||||
const SURROGATES: [u32; 2] = [55296, 57343];
|
const LEADING_SURROGATES: RangeInclusive<u16> = 0xD800..=0xDBFF;
|
||||||
|
const TRAILING_SURROGATES: RangeInclusive<u16> = 0xDC00..=0xDFFF;
|
||||||
|
|
||||||
#[derive(Clone, Debug, Default, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)]
|
#[derive(Clone, Debug, Default, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)]
|
||||||
#[serde(rename = "$surrealdb::private::sql::Strand")]
|
#[serde(rename = "$surrealdb::private::sql::Strand")]
|
||||||
|
@ -111,7 +111,7 @@ fn strand_single(i: &str) -> IResult<&str, String> {
|
||||||
is_not(SINGLE_ESC),
|
is_not(SINGLE_ESC),
|
||||||
'\\',
|
'\\',
|
||||||
alt((
|
alt((
|
||||||
strand_unicode,
|
char_unicode,
|
||||||
value('\u{5c}', char('\\')),
|
value('\u{5c}', char('\\')),
|
||||||
value('\u{27}', char('\'')),
|
value('\u{27}', char('\'')),
|
||||||
value('\u{2f}', char('/')),
|
value('\u{2f}', char('/')),
|
||||||
|
@ -132,7 +132,7 @@ fn strand_double(i: &str) -> IResult<&str, String> {
|
||||||
is_not(DOUBLE_ESC),
|
is_not(DOUBLE_ESC),
|
||||||
'\\',
|
'\\',
|
||||||
alt((
|
alt((
|
||||||
strand_unicode,
|
char_unicode,
|
||||||
value('\u{5c}', char('\\')),
|
value('\u{5c}', char('\\')),
|
||||||
value('\u{22}', char('\"')),
|
value('\u{22}', char('\"')),
|
||||||
value('\u{2f}', char('/')),
|
value('\u{2f}', char('/')),
|
||||||
|
@ -147,30 +147,59 @@ fn strand_double(i: &str) -> IResult<&str, String> {
|
||||||
Ok((i, v))
|
Ok((i, v))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn strand_unicode(i: &str) -> IResult<&str, char> {
|
fn char_unicode(i: &str) -> IResult<&str, char> {
|
||||||
// Read the \u character
|
preceded(char('u'), alt((char_unicode_bracketed, char_unicode_bare)))(i)
|
||||||
let (i, _) = char('u')(i)?;
|
}
|
||||||
// Let's read the next 6 ascii hexadecimal characters
|
|
||||||
|
// \uABCD or \uDBFF\uDFFF (surrogate pair)
|
||||||
|
fn char_unicode_bare(i: &str) -> IResult<&str, char> {
|
||||||
|
// Take exactly 4 bytes
|
||||||
|
let (i, v) = take(4usize)(i)?;
|
||||||
|
// Parse them as hex, where an error indicates invalid hex digits
|
||||||
|
let v: u16 = u16::from_str_radix(v, 16).map_err(|_| Failure(Parser(i)))?;
|
||||||
|
|
||||||
|
if LEADING_SURROGATES.contains(&v) {
|
||||||
|
let leading = v;
|
||||||
|
|
||||||
|
// Read the next \u.
|
||||||
|
let (i, _) = tag("\\u")(i)?;
|
||||||
|
// Take exactly 4 more bytes
|
||||||
|
let (i, v) = take(4usize)(i)?;
|
||||||
|
// Parse them as hex, where an error indicates invalid hex digits
|
||||||
|
let trailing = u16::from_str_radix(v, 16).map_err(|_| Failure(Parser(i)))?;
|
||||||
|
if !TRAILING_SURROGATES.contains(&trailing) {
|
||||||
|
return Err(Failure(Parser(i)));
|
||||||
|
}
|
||||||
|
// Compute the codepoint.
|
||||||
|
// https://datacadamia.com/data/type/text/surrogate#from_surrogate_to_character_code
|
||||||
|
let codepoint = 0x10000
|
||||||
|
+ ((leading as u32 - *LEADING_SURROGATES.start() as u32) << 10)
|
||||||
|
+ trailing as u32
|
||||||
|
- *TRAILING_SURROGATES.start() as u32;
|
||||||
|
// Convert to char
|
||||||
|
let v = char::from_u32(codepoint).ok_or(Failure(Parser(i)))?;
|
||||||
|
// Return the char
|
||||||
|
Ok((i, v))
|
||||||
|
} else {
|
||||||
|
// We can convert this to char or error in the case of invalid Unicode character
|
||||||
|
let v = char::from_u32(v as u32).ok_or(Failure(Parser(i)))?;
|
||||||
|
// Return the char
|
||||||
|
Ok((i, v))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// \u{10ffff}
|
||||||
|
fn char_unicode_bracketed(i: &str) -> IResult<&str, char> {
|
||||||
|
// Read the { character
|
||||||
|
let (i, _) = char('{')(i)?;
|
||||||
|
// Let's up to 6 ascii hexadecimal characters
|
||||||
let (i, v) = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit())(i)?;
|
let (i, v) = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit())(i)?;
|
||||||
// We can convert this to u32 as we only have 6 chars
|
// We can convert this to u32 as the max is 0xffffff
|
||||||
let v = match u32::from_str_radix(v, 16) {
|
let v = u32::from_str_radix(v, 16).unwrap();
|
||||||
// We found an invalid unicode sequence
|
// We can convert this to char or error in the case of invalid Unicode character
|
||||||
Err(_) => return Err(Failure(Parser(i))),
|
let v = char::from_u32(v).ok_or(Failure(Parser(i)))?;
|
||||||
// The unicode sequence was valid
|
// Read the } character
|
||||||
Ok(v) => match v {
|
let (i, _) = char('}')(i)?;
|
||||||
// This is a surrogate, so convert to a space
|
|
||||||
v if v >= SURROGATES[0] && v <= SURROGATES[1] => 32,
|
|
||||||
// This is a valid UTF-8 / UTF-16 character
|
|
||||||
_ => v,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
// We can convert this to char as we know it is valid
|
|
||||||
let v = match std::char::from_u32(v) {
|
|
||||||
// We found an invalid unicode sequence
|
|
||||||
None => return Err(Failure(Parser(i))),
|
|
||||||
// The unicode sequence was valid
|
|
||||||
Some(v) => v,
|
|
||||||
};
|
|
||||||
// Return the char
|
// Return the char
|
||||||
Ok((i, v))
|
Ok((i, v))
|
||||||
}
|
}
|
||||||
|
@ -239,4 +268,32 @@ mod tests {
|
||||||
assert_eq!("'te\"st\n\tand\u{08}some\u{05d9}'", format!("{}", out));
|
assert_eq!("'te\"st\n\tand\u{08}some\u{05d9}'", format!("{}", out));
|
||||||
assert_eq!(out, Strand::from("te\"st\n\tand\u{08}some\u{05d9}"));
|
assert_eq!(out, Strand::from("te\"st\n\tand\u{08}some\u{05d9}"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn strand_fuzz_escape() {
|
||||||
|
for n in (0..=char::MAX as u32).step_by(101) {
|
||||||
|
if let Some(c) = char::from_u32(n) {
|
||||||
|
let expected = format!("a{c}b");
|
||||||
|
|
||||||
|
let utf32 = format!("\"a\\u{{{n:x}}}b\"");
|
||||||
|
let (rest, s) = strand(&utf32).unwrap();
|
||||||
|
assert_eq!(rest, "");
|
||||||
|
assert_eq!(s.as_str(), &expected);
|
||||||
|
|
||||||
|
let mut utf16 = String::with_capacity(16);
|
||||||
|
utf16 += "\"a";
|
||||||
|
let mut buf = [0; 2];
|
||||||
|
for &mut n in c.encode_utf16(&mut buf) {
|
||||||
|
utf16 += &format!("\\u{n:04x}");
|
||||||
|
}
|
||||||
|
utf16 += "b\"";
|
||||||
|
let (rest, s) = strand(&utf16).unwrap();
|
||||||
|
assert_eq!(rest, "");
|
||||||
|
assert_eq!(s.as_str(), &expected);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unpaired surrogate.
|
||||||
|
assert!(strand("\"\\u{DBFF}\"").is_err());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue