Fixes highlighting bug (out of range for slice length...) (#2169)

This commit is contained in:
Emmanuel Keller 2023-06-23 12:58:51 +01:00 committed by GitHub
parent be3e2637b9
commit f688eb4ef5
9 changed files with 164 additions and 99 deletions

Cargo.lock (generated)

@@ -3847,9 +3847,9 @@ dependencies = [
 [[package]]
 name = "rquickjs"
-version = "0.3.0"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9a9b746ee3a4f6dc8b8f9b369d13c043a2f21fb645253aeac3f1404b12cff7d9"
+checksum = "6db7788c2818f4546daabe9ae2d1ee2f4db61ab1998d4b483494c4193cc38dab"
 dependencies = [
 "rquickjs-core",
 "rquickjs-macro",
@@ -3857,9 +3857,9 @@ dependencies = [
 [[package]]
 name = "rquickjs-core"
-version = "0.3.0"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f4f48d37b8e455a76544505915dcd7a82b7a3a1898cf2b4a0aba2d7bd66dcc4"
+checksum = "b12cf8646fe0af5bcff2822ccd162990f0679a1f9287c7257f4f4193a9d31ea9"
 dependencies = [
 "async-lock",
 "relative-path",
@@ -3868,9 +3868,9 @@ dependencies = [
 [[package]]
 name = "rquickjs-macro"
-version = "0.3.0"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ecbbe8ec4319fe42d3ec165dc7fa473580c2fa09747a39aa93f81ab422792a8"
+checksum = "80564583a91b0ae6b2d6b9b3d0f8ffd69a4b17202cc63a12df78dfa8983885fc"
 dependencies = [
 "darling",
 "fnv",
@@ -3886,9 +3886,9 @@ dependencies = [
 [[package]]
 name = "rquickjs-sys"
-version = "0.3.0"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f65ccee59bf4eb61361e7d516fea4a77273b96fe17811fc7da2558369914f55"
+checksum = "b747058afd4d988d056e4972ec8516a5a86fdfc103c1c1485bfee8966a0743ae"
 dependencies = [
 "bindgen 0.65.1",
 "cc",


@@ -76,7 +76,7 @@ fuzzy-matcher = "0.3.7"
 geo = { version = "0.25.0", features = ["use-serde"] }
 indexmap = { version = "1.9.3", features = ["serde"] }
 indxdb = { version = "0.3.0", optional = true }
-js = { version = "0.3.0" , package = "rquickjs", features = ["array-buffer", "bindgen", "classes", "futures", "loader", "macro", "parallel", "properties","rust-alloc"], optional = true }
+js = { version = "0.3.1" , package = "rquickjs", features = ["array-buffer", "bindgen", "classes", "futures", "loader", "macro", "parallel", "properties","rust-alloc"], optional = true }
 jsonwebtoken = "8.3.0"
 lexicmp = "0.1.0"
 log = "0.4.17"


@@ -469,9 +469,13 @@ pub enum Error {
 },
 /// Represents an error when analyzing a value
-#[error("A string can't be analyzed: {0}")]
+#[error("A value can't be analyzed: {0}")]
 AnalyzerError(String),
+/// Represents an error when trying to highlight a value
+#[error("A value can't be highlighted: {0}")]
+HighlightError(String),
 /// Represents an underlying error with Bincode serializing / deserializing
 #[error("Bincode error: {0}")]
 Bincode(#[from] BincodeError),
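Note: a minimal standalone sketch of the pattern the new variant follows, assuming the enum uses thiserror's derive as the #[error(...)] attributes suggest; MyError is a made-up stand-in, not the crate's actual Error type.

use thiserror::Error;

// Hypothetical stand-in for the crate's Error enum, mirroring the two variants above.
#[allow(dead_code)]
#[derive(Error, Debug)]
enum MyError {
    /// Represents an error when analyzing a value
    #[error("A value can't be analyzed: {0}")]
    AnalyzerError(String),
    /// Represents an error when trying to highlight a value
    #[error("A value can't be highlighted: {0}")]
    HighlightError(String),
}

fn main() {
    let e = MyError::HighlightError("position overflow: 7 - len: 3".to_string());
    // Display is generated from the #[error(...)] attribute.
    assert_eq!(e.to_string(), "A value can't be highlighted: position overflow: 7 - len: 3");
}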


@@ -1,3 +1,4 @@
+use crate::err::Error;
 use crate::idx::ft::analyzer::tokenizer::Tokens;
 use crate::sql::filter::Filter as SqlFilter;
 use crate::sql::language::Language;
@@ -60,13 +61,13 @@ impl Filter {
 }
 }
-pub(super) fn apply_filters(mut t: Tokens, f: &Option<Vec<Filter>>) -> Tokens {
+pub(super) fn apply_filters(mut t: Tokens, f: &Option<Vec<Filter>>) -> Result<Tokens, Error> {
 if let Some(f) = f {
 for f in f {
-t = t.filter(f);
+t = t.filter(f)?;
 }
 }
-t
+Ok(t)
 }
 pub(super) fn apply_filter(&self, c: &str) -> FilterResult {
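For readers skimming the signature change: apply_filters now threads a Result through the filter chain, so a failed token extraction surfaces as an error instead of a panic. A rough standalone sketch of that control flow, with simplified stand-in types (filter_once, apply_filters and the one-field Error below are invented for illustration, not the crate's API):

#[derive(Debug)]
struct Error(String);

// Simplified stand-in for a single filter pass over a token list.
fn filter_once(tokens: Vec<String>) -> Result<Vec<String>, Error> {
    // A real filter could fail while extracting a token from the input string.
    tokens
        .into_iter()
        .map(|t| if t.is_empty() { Err(Error("empty token".into())) } else { Ok(t.to_lowercase()) })
        .collect()
}

fn apply_filters(mut tokens: Vec<String>, passes: usize) -> Result<Vec<String>, Error> {
    for _ in 0..passes {
        tokens = filter_once(tokens)?; // propagate instead of panicking
    }
    Ok(tokens)
}

fn main() {
    assert!(apply_filters(vec!["Foo".into(), "Bãr".into()], 2).is_ok());
    assert!(apply_filters(vec!["".into()], 1).is_err());
}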


@@ -42,7 +42,7 @@ impl Analyzer {
 tx: &mut Transaction,
 query_string: String,
 ) -> Result<(Vec<TermId>, bool), Error> {
-let tokens = self.analyze(query_string);
+let tokens = self.analyze(query_string)?;
 // We first collect every unique terms
 // as it can contains duplicates
 let mut terms = HashSet::new();
@@ -53,7 +53,7 @@ impl Analyzer {
 // Now we can extract the term ids
 let mut res = Vec::with_capacity(terms.len());
 for term in terms {
-if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)).await? {
+if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)?).await? {
 res.push(term_id);
 } else {
 missing = false;
@@ -80,7 +80,7 @@ impl Analyzer {
 for tks in &inputs {
 for tk in tks.list() {
 dl += 1;
-let s = tks.get_token_string(tk);
+let s = tks.get_token_string(tk)?;
 match tf.entry(s) {
 Entry::Vacant(e) => {
 e.insert(1);
@@ -117,7 +117,7 @@ impl Analyzer {
 for (i, tks) in inputs.iter().enumerate() {
 for tk in tks.list() {
 dl += 1;
-let s = tks.get_token_string(tk);
+let s = tks.get_token_string(tk)?;
 let o = tk.new_offset(i as u32);
 match tfos.entry(s) {
 Entry::Vacant(e) => {
@@ -141,33 +141,34 @@ impl Analyzer {
 fn analyze_content(&self, field_content: &Array, tks: &mut Vec<Tokens>) -> Result<(), Error> {
 for v in &field_content.0 {
-self.analyze_value(v, tks);
+self.analyze_value(v, tks)?;
 }
 Ok(())
 }
-fn analyze_value(&self, val: &Value, tks: &mut Vec<Tokens>) {
+fn analyze_value(&self, val: &Value, tks: &mut Vec<Tokens>) -> Result<(), Error> {
 match val {
-Value::Strand(s) => tks.push(self.analyze(s.0.clone())),
-Value::Number(n) => tks.push(self.analyze(n.to_string())),
-Value::Bool(b) => tks.push(self.analyze(b.to_string())),
+Value::Strand(s) => tks.push(self.analyze(s.0.clone())?),
+Value::Number(n) => tks.push(self.analyze(n.to_string())?),
+Value::Bool(b) => tks.push(self.analyze(b.to_string())?),
 Value::Array(a) => {
 for v in &a.0 {
-self.analyze_value(v, tks);
+self.analyze_value(v, tks)?;
 }
 }
 _ => {}
-}
+};
+Ok(())
 }
-fn analyze(&self, input: String) -> Tokens {
+fn analyze(&self, input: String) -> Result<Tokens, Error> {
 if let Some(t) = &self.t {
 if !input.is_empty() {
 let t = Tokenizer::tokenize(t, input);
 return Filter::apply_filters(t, &self.f);
 }
 }
-Tokens::new(input)
+Ok(Tokens::new(input))
 }
 }
@@ -180,10 +181,10 @@ mod tests {
 let (_, az) = analyzer(def).unwrap();
 let a: Analyzer = az.into();
-let tokens = a.analyze(input.to_string());
+let tokens = a.analyze(input.to_string()).unwrap();
 let mut res = vec![];
 for t in tokens.list() {
-res.push(tokens.get_token_string(t));
+res.push(tokens.get_token_string(t).unwrap());
 }
 assert_eq!(&res, expected);
 }


@@ -1,3 +1,4 @@
+use crate::err::Error;
 use crate::idx::ft::analyzer::filter::{Filter, FilterResult, Term};
 use crate::idx::ft::offsets::{Offset, Position};
 use crate::sql::tokenizer::Tokenizer as SqlTokenizer;
@@ -17,18 +18,18 @@ impl Tokens {
 }
 }
-pub(super) fn get_token_string<'a>(&'a self, t: &'a Token) -> &str {
+pub(super) fn get_token_string<'a>(&'a self, t: &'a Token) -> Result<&str, Error> {
 t.get_str(&self.i)
 }
-pub(super) fn filter(self, f: &Filter) -> Tokens {
+pub(super) fn filter(self, f: &Filter) -> Result<Tokens, Error> {
 let mut tks = Vec::new();
 let mut res = vec![];
 for t in self.t {
 if t.is_empty() {
 continue;
 }
-let c = t.get_str(&self.i);
+let c = t.get_str(&self.i)?;
 let r = f.apply_filter(c);
 res.push((t, r));
 }
@@ -55,10 +56,10 @@ impl Tokens {
 FilterResult::Ignore => {}
 };
 }
-Tokens {
+Ok(Tokens {
 i: self.i,
 t: tks,
-}
+})
 }
 pub(super) fn list(&self) -> &Vec<Token> {
@@ -68,36 +69,86 @@ impl Tokens {
 #[derive(Clone, Debug, PartialOrd, PartialEq, Eq, Ord, Hash)]
 pub(super) enum Token {
-Ref(Position, Position),
-String(Position, Position, String),
+Ref {
+chars: (Position, Position),
+bytes: (Position, Position),
+},
+String {
+chars: (Position, Position),
+bytes: (Position, Position),
+term: String,
+},
 }
 impl Token {
-fn new_token(&self, t: String) -> Self {
+fn new_token(&self, term: String) -> Self {
 match self {
-Token::Ref(s, e) => Token::String(*s, *e, t),
-Token::String(s, e, _) => Token::String(*s, *e, t),
+Token::Ref {
+chars,
+bytes,
+} => Token::String {
+chars: *chars,
+bytes: *bytes,
+term,
+},
+Token::String {
+chars,
+bytes,
+..
+} => Token::String {
+chars: *chars,
+bytes: *bytes,
+term,
+},
 }
 }
 pub(super) fn new_offset(&self, i: u32) -> Offset {
 match self {
-Token::Ref(s, e) => Offset::new(i, *s, *e),
-Token::String(s, e, _) => Offset::new(i, *s, *e),
+Token::Ref {
+chars,
+..
+} => Offset::new(i, chars.0, chars.1),
+Token::String {
+chars,
+..
+} => Offset::new(i, chars.0, chars.1),
 }
 }
 fn is_empty(&self) -> bool {
 match self {
-Token::Ref(start, end) => start == end,
-Token::String(_, _, s) => s.is_empty(),
+Token::Ref {
+chars,
+..
+} => chars.0 == chars.1,
+Token::String {
+term,
+..
+} => term.is_empty(),
 }
 }
-pub(super) fn get_str<'a>(&'a self, i: &'a str) -> &str {
+pub(super) fn get_str<'a>(&'a self, i: &'a str) -> Result<&str, Error> {
 match self {
-Token::Ref(s, e) => &i[(*s as usize)..(*e as usize)],
-Token::String(_, _, s) => s,
+Token::Ref {
+bytes,
+..
+} => {
+let s = bytes.0 as usize;
+let e = bytes.1 as usize;
+let l = i.len();
+if s >= l || e > l {
+return Err(Error::AnalyzerError(format!(
+"Unable to extract the token. The offset position ({s},{e}) is out of range ({l})."
+)));
+}
+Ok(&i[(bytes.0 as usize)..(bytes.1 as usize)])
+}
+Token::String {
+term,
+..
+} => Ok(term),
 }
 }
 }
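The get_str change above is the core of the fix: a &str can only be sliced at byte offsets, and the new bytes range plus the bounds check turn a would-be slice panic into an AnalyzerError. A standalone illustration of the same idea (the local get_str helper below is a simplified sketch, not the crate's code):

fn main() {
    let i = "Foo Bãr";
    // 'ã' is a single char but two bytes, so char and byte positions diverge:
    assert_eq!(i.chars().count(), 7);
    assert_eq!(i.len(), 8); // len() counts bytes

    // &str can only be sliced at byte offsets; "Bãr" spans bytes 4..8.
    assert_eq!(&i[4..8], "Bãr");

    // Byte 6 falls inside 'ã', so slicing up to it would panic at runtime.
    assert!(!i.is_char_boundary(6));

    // Defensive extraction in the spirit of the new Token::get_str:
    // validate the byte offsets before slicing instead of panicking.
    fn get_str(i: &str, s: usize, e: usize) -> Result<&str, String> {
        let l = i.len();
        if s >= l || e > l {
            return Err(format!(
                "Unable to extract the token. The offset position ({s},{e}) is out of range ({l})."
            ));
        }
        Ok(&i[s..e])
    }
    assert_eq!(get_str(i, 4, 8).unwrap(), "Bãr");
    assert!(get_str(i, 4, 9).is_err());
}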
@@ -129,28 +180,40 @@ impl Tokenizer {
 pub(super) fn tokenize(t: &[SqlTokenizer], i: String) -> Tokens {
 let mut w = Tokenizer::new(t);
-let mut last_pos = 0;
-let mut current_pos = 0;
+let mut last_char_pos = 0;
+let mut last_byte_pos = 0;
+let mut current_char_pos = 0;
+let mut current_byte_pos = 0;
 let mut t = Vec::new();
 for c in i.chars() {
+let char_len = c.len_utf8() as Position;
 let is_valid = Self::is_valid(c);
 let should_split = w.should_split(c);
 if should_split || !is_valid {
 // The last pos may be more advanced due to the is_valid process
-if last_pos < current_pos {
-t.push(Token::Ref(last_pos, current_pos));
+if last_char_pos < current_char_pos {
+t.push(Token::Ref {
+chars: (last_char_pos, current_char_pos),
+bytes: (last_byte_pos, current_byte_pos),
+});
 }
-last_pos = current_pos;
+last_char_pos = current_char_pos;
+last_byte_pos = current_byte_pos;
 // If the character is not valid for indexing (space, control...)
 // Then we increase the last position to the next character
 if !is_valid {
-last_pos += c.len_utf8() as Position;
+last_char_pos += 1;
+last_byte_pos += char_len;
 }
 }
-current_pos += c.len_utf8() as Position;
+current_char_pos += 1;
+current_byte_pos += char_len;
 }
-if current_pos != last_pos {
-t.push(Token::Ref(last_pos, current_pos));
+if current_char_pos != last_char_pos {
+t.push(Token::Ref {
+chars: (last_char_pos, current_char_pos),
+bytes: (last_byte_pos, current_byte_pos),
+});
 }
 Tokens {
 i,
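To see why tokenize now tracks the two coordinate systems side by side, here is a rough whitespace-only splitter in the same spirit (simplified: no SQL tokenizers, no validity filtering, all names invented) that records a (chars, bytes) range pair per word; the pairs agree for ASCII and diverge as soon as a multi-byte character appears:

fn main() {
    let input = "Foo Bãr";
    let mut char_pos = 0u32;
    let mut byte_pos = 0u32;
    let mut words: Vec<((u32, u32), (u32, u32))> = Vec::new();
    let mut start = (0u32, 0u32);
    for c in input.chars() {
        if c.is_whitespace() {
            if start.0 < char_pos {
                words.push(((start.0, char_pos), (start.1, byte_pos)));
            }
            // Skip the separator itself.
            start = (char_pos + 1, byte_pos + c.len_utf8() as u32);
        }
        char_pos += 1;                   // chars advance by one
        byte_pos += c.len_utf8() as u32; // bytes advance by the UTF-8 width
    }
    if start.0 < char_pos {
        words.push(((start.0, char_pos), (start.1, byte_pos)));
    }
    // "Foo" agrees in both units; "Bãr" does not: chars 4..7 but bytes 4..8.
    assert_eq!(words, vec![((0, 3), (0, 3)), ((4, 7), (4, 8))]);
}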


@@ -32,17 +32,17 @@ impl Highlighter {
 self.offseter.highlight(os);
 }
-fn extract(val: Value, vals: &mut Vec<Option<String>>) {
+fn extract(val: Value, vals: &mut Vec<String>) {
 match val {
-Value::Strand(s) => vals.push(Some(s.0)),
-Value::Number(n) => vals.push(Some(n.to_string())),
-Value::Bool(b) => vals.push(Some(b.to_string())),
+Value::Strand(s) => vals.push(s.0),
+Value::Number(n) => vals.push(n.to_string()),
+Value::Bool(b) => vals.push(b.to_string()),
 Value::Array(a) => {
 for v in a.0 {
 Self::extract(v, vals);
 }
 }
-_ => vals.push(None),
+_ => {}
 }
 }
 }
@@ -60,24 +60,35 @@ impl TryFrom<Highlighter> for Value {
 }
 let mut res = Vec::with_capacity(vals.len());
 for (idx, val) in vals.into_iter().enumerate() {
-let idx = idx as u32;
-if let Some(v) = val {
-if let Some(m) = hl.offseter.offsets.get(&idx) {
-let mut v: Vec<char> = v.chars().collect();
-let mut d = 0;
-for (s, e) in m {
-let p = (*s as usize) + d;
-v.splice(p..p, hl.prefix.clone());
-d += hl.prefix.len();
-let p = (*e as usize) + d;
-v.splice(p..p, hl.suffix.clone());
-d += hl.suffix.len();
-}
-let s: String = v.iter().collect();
-res.push(Value::from(s));
-} else {
-res.push(Value::from(v));
-}
-}
+if let Some(m) = hl.offseter.offsets.get(&(idx as u32)) {
+let mut v: Vec<char> = val.chars().collect();
+let mut l = v.len();
+let mut d = 0;
+// We use a closure to append the prefix and the suffix
+let mut append = |s: u32, ix: &Vec<char>| -> Result<(), Error> {
+let p = (s as usize) + d;
+if p > l {
+return Err(Error::HighlightError(format!(
+"position overflow: {s} - len: {l}"
+)));
+}
+v.splice(p..p, ix.clone());
+let xl = ix.len();
+d += xl;
+l += xl;
+Ok(())
+};
+for (s, e) in m {
+append(*s, &hl.prefix)?;
+append(*e, &hl.suffix)?;
+}
+let s: String = v.iter().collect();
+res.push(Value::from(s));
+} else {
+res.push(Value::from(val));
+}
 }
 Ok(match res.len() {
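A standalone sketch of the new splice-with-drift logic, using plain types (prefix/suffix as char vectors, a String error; all names invented): char offsets index into the Vec<char> of the field value, each insertion shifts later offsets by the inserted length, and an out-of-range offset now returns a HighlightError-style error instead of the "out of range for slice length" panic that byte-based offsets used to trigger here.

fn main() -> Result<(), String> {
    let prefix: Vec<char> = "<em>".chars().collect();
    let suffix: Vec<char> = "</em>".chars().collect();

    // Char offsets of the matched term "Bãr" inside the field value.
    let offsets = [(4u32, 7u32)];

    let val = "Foo Bãr";
    let mut v: Vec<char> = val.chars().collect();
    let mut l = v.len();
    let mut d = 0; // drift: everything already inserted shifts later offsets right

    let mut append = |s: u32, ix: &Vec<char>, v: &mut Vec<char>| -> Result<(), String> {
        let p = (s as usize) + d;
        if p > l {
            return Err(format!("position overflow: {s} - len: {l}"));
        }
        v.splice(p..p, ix.iter().cloned());
        d += ix.len();
        l += ix.len();
        Ok(())
    };
    for (s, e) in offsets {
        append(s, &prefix, &mut v)?;
        append(e, &suffix, &mut v)?;
    }
    let highlighted: String = v.iter().collect();
    assert_eq!(highlighted, "Foo <em>Bãr</em>");
    Ok(())
}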


@@ -132,23 +132,8 @@ mod tests {
 #[test]
 fn test_offset_records() {
-let o = OffsetRecords(vec![
-Offset {
-index: 0,
-start: 1,
-end: 2,
-},
-Offset {
-index: 0,
-start: 11,
-end: 22,
-},
-Offset {
-index: 1,
-start: 3,
-end: 4,
-},
-]);
+let o =
+OffsetRecords(vec![Offset::new(0, 1, 2), Offset::new(0, 11, 22), Offset::new(1, 3, 4)]);
 let v: Val = o.clone().try_into().unwrap();
 let o2 = v.try_into().unwrap();
 assert_eq!(o, o2)


@@ -95,10 +95,10 @@ async fn select_where_matches_without_using_index_iterator() -> Result<(), Error
 #[tokio::test]
 async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> {
 let sql = r"
-CREATE blog:1 SET content = ['Hello World!', 'Be Bop', 'Foo Bar'];
+CREATE blog:1 SET content = ['Hello World!', 'Be Bop', 'Foo Bãr'];
 DEFINE ANALYZER simple TOKENIZERS blank,class;
 DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
-SELECT id, search::highlight('<em>', '</em>', 1) AS content FROM blog WHERE content @1@ 'Hello Bar' EXPLAIN;
+SELECT id, search::highlight('<em>', '</em>', 1) AS content FROM blog WHERE content @1@ 'Hello Bãr' EXPLAIN;
 ";
 let dbs = Datastore::new("memory").await?;
 let ses = Session::for_kv().with_ns("test").with_db("test");
@@ -116,7 +116,7 @@ async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> {
 content: [
 '<em>Hello</em> World!',
 'Be Bop',
-'Foo <em>Bar</em>'
+'Foo <em>Bãr</em>'
 ]
 },
 {
@@ -127,7 +127,7 @@ async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> {
 plan: {
 index: 'blog_content',
 operator: '@1@',
-value: 'Hello Bar'
+value: 'Hello Bãr'
 },
 table: 'blog',
 },
@@ -144,11 +144,11 @@ async fn select_where_matches_using_index_offsets() -> Result<(), Error> {
 #[tokio::test]
 async fn select_where_matches_using_index_offsets() -> Result<(), Error> {
 let sql = r"
-CREATE blog:1 SET title = 'Blog title!', content = ['Hello World!', 'Be Bop', 'Foo Bar'];
+CREATE blog:1 SET title = 'Blog title!', content = ['Hello World!', 'Be Bop', 'Foo Bãr'];
 DEFINE ANALYZER simple TOKENIZERS blank,class;
 DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
 DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
-SELECT id, search::offsets(0) AS title, search::offsets(1) AS content FROM blog WHERE title @0@ 'title' AND content @1@ 'Hello Bar' EXPLAIN;
+SELECT id, search::offsets(0) AS title, search::offsets(1) AS content FROM blog WHERE title @0@ 'title' AND content @1@ 'Hello Bãr' EXPLAIN;
 ";
 let dbs = Datastore::new("memory").await?;
 let ses = Session::for_kv().with_ns("test").with_db("test");
@@ -179,7 +179,7 @@ async fn select_where_matches_using_index_offsets() -> Result<(), Error> {
 plan: {
 index: 'blog_content',
 operator: '@1@',
-value: 'Hello Bar'
+value: 'Hello Bãr'
 },
 table: 'blog',
 },
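Why the test data switches from 'Bar' to 'Bãr': only a multi-byte character makes char offsets and byte offsets disagree, which is exactly the case the old highlighting code mishandled. A quick sanity check of that divergence, outside SurrealDB:

fn main() {
    let content = "Foo Bãr";
    assert_eq!(content.chars().count(), 7); // char offsets, used when indexing the Vec<char> for highlighting
    assert_eq!(content.len(), 8);           // byte offsets, used when slicing the &str during tokenization
    // With plain ASCII ('Foo Bar') both counts are equal, so the old bug never showed up.
    assert_eq!("Foo Bar".chars().count(), "Foo Bar".len());
}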