From f688eb4ef5ae9bc5a74e12b29f388f07e5380f9a Mon Sep 17 00:00:00 2001 From: Emmanuel Keller Date: Fri, 23 Jun 2023 12:58:51 +0100 Subject: [PATCH] Fixes highlighting bug (out or range for slice length...) (#2169) --- Cargo.lock | 16 ++-- lib/Cargo.toml | 2 +- lib/src/err/mod.rs | 6 +- lib/src/idx/ft/analyzer/filter.rs | 7 +- lib/src/idx/ft/analyzer/mod.rs | 31 ++++---- lib/src/idx/ft/analyzer/tokenizer.rs | 115 +++++++++++++++++++++------ lib/src/idx/ft/highlighter.rs | 53 +++++++----- lib/src/idx/ft/offsets.rs | 19 +---- lib/tests/matches.rs | 14 ++-- 9 files changed, 164 insertions(+), 99 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 665a3f79..aa9433ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3847,9 +3847,9 @@ dependencies = [ [[package]] name = "rquickjs" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a9b746ee3a4f6dc8b8f9b369d13c043a2f21fb645253aeac3f1404b12cff7d9" +checksum = "6db7788c2818f4546daabe9ae2d1ee2f4db61ab1998d4b483494c4193cc38dab" dependencies = [ "rquickjs-core", "rquickjs-macro", @@ -3857,9 +3857,9 @@ dependencies = [ [[package]] name = "rquickjs-core" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f4f48d37b8e455a76544505915dcd7a82b7a3a1898cf2b4a0aba2d7bd66dcc4" +checksum = "b12cf8646fe0af5bcff2822ccd162990f0679a1f9287c7257f4f4193a9d31ea9" dependencies = [ "async-lock", "relative-path", @@ -3868,9 +3868,9 @@ dependencies = [ [[package]] name = "rquickjs-macro" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ecbbe8ec4319fe42d3ec165dc7fa473580c2fa09747a39aa93f81ab422792a8" +checksum = "80564583a91b0ae6b2d6b9b3d0f8ffd69a4b17202cc63a12df78dfa8983885fc" dependencies = [ "darling", "fnv", @@ -3886,9 +3886,9 @@ dependencies = [ [[package]] name = "rquickjs-sys" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f65ccee59bf4eb61361e7d516fea4a77273b96fe17811fc7da2558369914f55" +checksum = "b747058afd4d988d056e4972ec8516a5a86fdfc103c1c1485bfee8966a0743ae" dependencies = [ "bindgen 0.65.1", "cc", diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 689da52b..2547a54a 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -76,7 +76,7 @@ fuzzy-matcher = "0.3.7" geo = { version = "0.25.0", features = ["use-serde"] } indexmap = { version = "1.9.3", features = ["serde"] } indxdb = { version = "0.3.0", optional = true } -js = { version = "0.3.0" , package = "rquickjs", features = ["array-buffer", "bindgen", "classes", "futures", "loader", "macro", "parallel", "properties","rust-alloc"], optional = true } +js = { version = "0.3.1" , package = "rquickjs", features = ["array-buffer", "bindgen", "classes", "futures", "loader", "macro", "parallel", "properties","rust-alloc"], optional = true } jsonwebtoken = "8.3.0" lexicmp = "0.1.0" log = "0.4.17" diff --git a/lib/src/err/mod.rs b/lib/src/err/mod.rs index 7a1ed90a..44c117b3 100644 --- a/lib/src/err/mod.rs +++ b/lib/src/err/mod.rs @@ -469,9 +469,13 @@ pub enum Error { }, /// Represents an error when analyzing a value - #[error("A string can't be analyzed: {0}")] + #[error("A value can't be analyzed: {0}")] AnalyzerError(String), + /// Represents an error when trying to highlight a value + #[error("A value can't be highlighted: {0}")] + HighlightError(String), + /// Represents an underlying error with Bincode serializing / deserializing #[error("Bincode error: 
{0}")] Bincode(#[from] BincodeError), diff --git a/lib/src/idx/ft/analyzer/filter.rs b/lib/src/idx/ft/analyzer/filter.rs index 9134cc18..1d780932 100644 --- a/lib/src/idx/ft/analyzer/filter.rs +++ b/lib/src/idx/ft/analyzer/filter.rs @@ -1,3 +1,4 @@ +use crate::err::Error; use crate::idx::ft::analyzer::tokenizer::Tokens; use crate::sql::filter::Filter as SqlFilter; use crate::sql::language::Language; @@ -60,13 +61,13 @@ impl Filter { } } - pub(super) fn apply_filters(mut t: Tokens, f: &Option>) -> Tokens { + pub(super) fn apply_filters(mut t: Tokens, f: &Option>) -> Result { if let Some(f) = f { for f in f { - t = t.filter(f); + t = t.filter(f)?; } } - t + Ok(t) } pub(super) fn apply_filter(&self, c: &str) -> FilterResult { diff --git a/lib/src/idx/ft/analyzer/mod.rs b/lib/src/idx/ft/analyzer/mod.rs index e70b9da6..2327cd54 100644 --- a/lib/src/idx/ft/analyzer/mod.rs +++ b/lib/src/idx/ft/analyzer/mod.rs @@ -42,7 +42,7 @@ impl Analyzer { tx: &mut Transaction, query_string: String, ) -> Result<(Vec, bool), Error> { - let tokens = self.analyze(query_string); + let tokens = self.analyze(query_string)?; // We first collect every unique terms // as it can contains duplicates let mut terms = HashSet::new(); @@ -53,7 +53,7 @@ impl Analyzer { // Now we can extract the term ids let mut res = Vec::with_capacity(terms.len()); for term in terms { - if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)).await? { + if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)?).await? { res.push(term_id); } else { missing = false; @@ -80,7 +80,7 @@ impl Analyzer { for tks in &inputs { for tk in tks.list() { dl += 1; - let s = tks.get_token_string(tk); + let s = tks.get_token_string(tk)?; match tf.entry(s) { Entry::Vacant(e) => { e.insert(1); @@ -117,7 +117,7 @@ impl Analyzer { for (i, tks) in inputs.iter().enumerate() { for tk in tks.list() { dl += 1; - let s = tks.get_token_string(tk); + let s = tks.get_token_string(tk)?; let o = tk.new_offset(i as u32); match tfos.entry(s) { Entry::Vacant(e) => { @@ -141,33 +141,34 @@ impl Analyzer { fn analyze_content(&self, field_content: &Array, tks: &mut Vec) -> Result<(), Error> { for v in &field_content.0 { - self.analyze_value(v, tks); + self.analyze_value(v, tks)?; } Ok(()) } - fn analyze_value(&self, val: &Value, tks: &mut Vec) { + fn analyze_value(&self, val: &Value, tks: &mut Vec) -> Result<(), Error> { match val { - Value::Strand(s) => tks.push(self.analyze(s.0.clone())), - Value::Number(n) => tks.push(self.analyze(n.to_string())), - Value::Bool(b) => tks.push(self.analyze(b.to_string())), + Value::Strand(s) => tks.push(self.analyze(s.0.clone())?), + Value::Number(n) => tks.push(self.analyze(n.to_string())?), + Value::Bool(b) => tks.push(self.analyze(b.to_string())?), Value::Array(a) => { for v in &a.0 { - self.analyze_value(v, tks); + self.analyze_value(v, tks)?; } } _ => {} - } + }; + Ok(()) } - fn analyze(&self, input: String) -> Tokens { + fn analyze(&self, input: String) -> Result { if let Some(t) = &self.t { if !input.is_empty() { let t = Tokenizer::tokenize(t, input); return Filter::apply_filters(t, &self.f); } } - Tokens::new(input) + Ok(Tokens::new(input)) } } @@ -180,10 +181,10 @@ mod tests { let (_, az) = analyzer(def).unwrap(); let a: Analyzer = az.into(); - let tokens = a.analyze(input.to_string()); + let tokens = a.analyze(input.to_string()).unwrap(); let mut res = vec![]; for t in tokens.list() { - res.push(tokens.get_token_string(t)); + res.push(tokens.get_token_string(t).unwrap()); } assert_eq!(&res, expected); } 
diff --git a/lib/src/idx/ft/analyzer/tokenizer.rs b/lib/src/idx/ft/analyzer/tokenizer.rs index 586d7ad0..c89835a6 100644 --- a/lib/src/idx/ft/analyzer/tokenizer.rs +++ b/lib/src/idx/ft/analyzer/tokenizer.rs @@ -1,3 +1,4 @@ +use crate::err::Error; use crate::idx::ft::analyzer::filter::{Filter, FilterResult, Term}; use crate::idx::ft::offsets::{Offset, Position}; use crate::sql::tokenizer::Tokenizer as SqlTokenizer; @@ -17,18 +18,18 @@ impl Tokens { } } - pub(super) fn get_token_string<'a>(&'a self, t: &'a Token) -> &str { + pub(super) fn get_token_string<'a>(&'a self, t: &'a Token) -> Result<&str, Error> { t.get_str(&self.i) } - pub(super) fn filter(self, f: &Filter) -> Tokens { + pub(super) fn filter(self, f: &Filter) -> Result { let mut tks = Vec::new(); let mut res = vec![]; for t in self.t { if t.is_empty() { continue; } - let c = t.get_str(&self.i); + let c = t.get_str(&self.i)?; let r = f.apply_filter(c); res.push((t, r)); } @@ -55,10 +56,10 @@ impl Tokens { FilterResult::Ignore => {} }; } - Tokens { + Ok(Tokens { i: self.i, t: tks, - } + }) } pub(super) fn list(&self) -> &Vec { @@ -68,36 +69,86 @@ impl Tokens { #[derive(Clone, Debug, PartialOrd, PartialEq, Eq, Ord, Hash)] pub(super) enum Token { - Ref(Position, Position), - String(Position, Position, String), + Ref { + chars: (Position, Position), + bytes: (Position, Position), + }, + String { + chars: (Position, Position), + bytes: (Position, Position), + term: String, + }, } impl Token { - fn new_token(&self, t: String) -> Self { + fn new_token(&self, term: String) -> Self { match self { - Token::Ref(s, e) => Token::String(*s, *e, t), - Token::String(s, e, _) => Token::String(*s, *e, t), + Token::Ref { + chars, + bytes, + } => Token::String { + chars: *chars, + bytes: *bytes, + term, + }, + Token::String { + chars, + bytes, + .. + } => Token::String { + chars: *chars, + bytes: *bytes, + term, + }, } } pub(super) fn new_offset(&self, i: u32) -> Offset { match self { - Token::Ref(s, e) => Offset::new(i, *s, *e), - Token::String(s, e, _) => Offset::new(i, *s, *e), + Token::Ref { + chars, + .. + } => Offset::new(i, chars.0, chars.1), + Token::String { + chars, + .. + } => Offset::new(i, chars.0, chars.1), } } fn is_empty(&self) -> bool { match self { - Token::Ref(start, end) => start == end, - Token::String(_, _, s) => s.is_empty(), + Token::Ref { + chars, + .. + } => chars.0 == chars.1, + Token::String { + term, + .. + } => term.is_empty(), } } - pub(super) fn get_str<'a>(&'a self, i: &'a str) -> &str { + pub(super) fn get_str<'a>(&'a self, i: &'a str) -> Result<&str, Error> { match self { - Token::Ref(s, e) => &i[(*s as usize)..(*e as usize)], - Token::String(_, _, s) => s, + Token::Ref { + bytes, + .. + } => { + let s = bytes.0 as usize; + let e = bytes.1 as usize; + let l = i.len(); + if s >= l || e > l { + return Err(Error::AnalyzerError(format!( + "Unable to extract the token. The offset position ({s},{e}) is out of range ({l})." + ))); + } + Ok(&i[(bytes.0 as usize)..(bytes.1 as usize)]) + } + Token::String { + term, + .. 
+ } => Ok(term), } } } @@ -129,28 +180,40 @@ impl Tokenizer { pub(super) fn tokenize(t: &[SqlTokenizer], i: String) -> Tokens { let mut w = Tokenizer::new(t); - let mut last_pos = 0; - let mut current_pos = 0; + let mut last_char_pos = 0; + let mut last_byte_pos = 0; + let mut current_char_pos = 0; + let mut current_byte_pos = 0; let mut t = Vec::new(); for c in i.chars() { + let char_len = c.len_utf8() as Position; let is_valid = Self::is_valid(c); let should_split = w.should_split(c); if should_split || !is_valid { // The last pos may be more advanced due to the is_valid process - if last_pos < current_pos { - t.push(Token::Ref(last_pos, current_pos)); + if last_char_pos < current_char_pos { + t.push(Token::Ref { + chars: (last_char_pos, current_char_pos), + bytes: (last_byte_pos, current_byte_pos), + }); } - last_pos = current_pos; + last_char_pos = current_char_pos; + last_byte_pos = current_byte_pos; // If the character is not valid for indexing (space, control...) // Then we increase the last position to the next character if !is_valid { - last_pos += c.len_utf8() as Position; + last_char_pos += 1; + last_byte_pos += char_len; } } - current_pos += c.len_utf8() as Position; + current_char_pos += 1; + current_byte_pos += char_len; } - if current_pos != last_pos { - t.push(Token::Ref(last_pos, current_pos)); + if current_char_pos != last_char_pos { + t.push(Token::Ref { + chars: (last_char_pos, current_char_pos), + bytes: (last_byte_pos, current_byte_pos), + }); } Tokens { i, diff --git a/lib/src/idx/ft/highlighter.rs b/lib/src/idx/ft/highlighter.rs index 6cac00de..a9c5af15 100644 --- a/lib/src/idx/ft/highlighter.rs +++ b/lib/src/idx/ft/highlighter.rs @@ -32,17 +32,17 @@ impl Highlighter { self.offseter.highlight(os); } - fn extract(val: Value, vals: &mut Vec>) { + fn extract(val: Value, vals: &mut Vec) { match val { - Value::Strand(s) => vals.push(Some(s.0)), - Value::Number(n) => vals.push(Some(n.to_string())), - Value::Bool(b) => vals.push(Some(b.to_string())), + Value::Strand(s) => vals.push(s.0), + Value::Number(n) => vals.push(n.to_string()), + Value::Bool(b) => vals.push(b.to_string()), Value::Array(a) => { for v in a.0 { Self::extract(v, vals); } } - _ => vals.push(None), + _ => {} } } } @@ -60,24 +60,35 @@ impl TryFrom for Value { } let mut res = Vec::with_capacity(vals.len()); for (idx, val) in vals.into_iter().enumerate() { - let idx = idx as u32; - if let Some(v) = val { - if let Some(m) = hl.offseter.offsets.get(&idx) { - let mut v: Vec = v.chars().collect(); - let mut d = 0; - for (s, e) in m { - let p = (*s as usize) + d; - v.splice(p..p, hl.prefix.clone()); - d += hl.prefix.len(); - let p = (*e as usize) + d; - v.splice(p..p, hl.suffix.clone()); - d += hl.suffix.len(); + if let Some(m) = hl.offseter.offsets.get(&(idx as u32)) { + let mut v: Vec = val.chars().collect(); + let mut l = v.len(); + let mut d = 0; + + // We use a closure to append the prefix and the suffix + let mut append = |s: u32, ix: &Vec| -> Result<(), Error> { + let p = (s as usize) + d; + if p > l { + return Err(Error::HighlightError(format!( + "position overflow: {s} - len: {l}" + ))); } - let s: String = v.iter().collect(); - res.push(Value::from(s)); - } else { - res.push(Value::from(v)); + v.splice(p..p, ix.clone()); + let xl = ix.len(); + d += xl; + l += xl; + Ok(()) + }; + + for (s, e) in m { + append(*s, &hl.prefix)?; + append(*e, &hl.suffix)?; } + + let s: String = v.iter().collect(); + res.push(Value::from(s)); + } else { + res.push(Value::from(val)); } } Ok(match res.len() { diff --git 
a/lib/src/idx/ft/offsets.rs b/lib/src/idx/ft/offsets.rs index 59cce4f4..21adfd0f 100644 --- a/lib/src/idx/ft/offsets.rs +++ b/lib/src/idx/ft/offsets.rs @@ -132,23 +132,8 @@ mod tests { #[test] fn test_offset_records() { - let o = OffsetRecords(vec![ - Offset { - index: 0, - start: 1, - end: 2, - }, - Offset { - index: 0, - start: 11, - end: 22, - }, - Offset { - index: 1, - start: 3, - end: 4, - }, - ]); + let o = + OffsetRecords(vec![Offset::new(0, 1, 2), Offset::new(0, 11, 22), Offset::new(1, 3, 4)]); let v: Val = o.clone().try_into().unwrap(); let o2 = v.try_into().unwrap(); assert_eq!(o, o2) diff --git a/lib/tests/matches.rs b/lib/tests/matches.rs index 820aeeb9..6153a3f8 100644 --- a/lib/tests/matches.rs +++ b/lib/tests/matches.rs @@ -95,10 +95,10 @@ async fn select_where_matches_without_using_index_iterator() -> Result<(), Error #[tokio::test] async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> { let sql = r" - CREATE blog:1 SET content = ['Hello World!', 'Be Bop', 'Foo Bar']; + CREATE blog:1 SET content = ['Hello World!', 'Be Bop', 'Foo Bãr']; DEFINE ANALYZER simple TOKENIZERS blank,class; DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; - SELECT id, search::highlight('', '', 1) AS content FROM blog WHERE content @1@ 'Hello Bar' EXPLAIN; + SELECT id, search::highlight('', '', 1) AS content FROM blog WHERE content @1@ 'Hello Bãr' EXPLAIN; "; let dbs = Datastore::new("memory").await?; let ses = Session::for_kv().with_ns("test").with_db("test"); @@ -116,7 +116,7 @@ async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> { content: [ 'Hello World!', 'Be Bop', - 'Foo Bar' + 'Foo Bãr' ] }, { @@ -127,7 +127,7 @@ async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> { plan: { index: 'blog_content', operator: '@1@', - value: 'Hello Bar' + value: 'Hello Bãr' }, table: 'blog', }, @@ -144,11 +144,11 @@ async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> { #[tokio::test] async fn select_where_matches_using_index_offsets() -> Result<(), Error> { let sql = r" - CREATE blog:1 SET title = 'Blog title!', content = ['Hello World!', 'Be Bop', 'Foo Bar']; + CREATE blog:1 SET title = 'Blog title!', content = ['Hello World!', 'Be Bop', 'Foo Bãr']; DEFINE ANALYZER simple TOKENIZERS blank,class; DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; - SELECT id, search::offsets(0) AS title, search::offsets(1) AS content FROM blog WHERE title @0@ 'title' AND content @1@ 'Hello Bar' EXPLAIN; + SELECT id, search::offsets(0) AS title, search::offsets(1) AS content FROM blog WHERE title @0@ 'title' AND content @1@ 'Hello Bãr' EXPLAIN; "; let dbs = Datastore::new("memory").await?; let ses = Session::for_kv().with_ns("test").with_db("test"); @@ -179,7 +179,7 @@ async fn select_where_matches_using_index_offsets() -> Result<(), Error> { plan: { index: 'blog_content', operator: '@1@', - value: 'Hello Bar' + value: 'Hello Bãr' }, table: 'blog', },
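
For reference, the failure mode behind this change: the previous tokenizer advanced a single pair of positions by `c.len_utf8()`, i.e. byte offsets, and the highlighter later used those same offsets to index a `Vec<char>`, which goes out of range as soon as the input contains multi-byte UTF-8 such as "Foo Bãr". Below is a minimal standalone sketch of the dual bookkeeping the patch introduces, using hypothetical names (`ToyToken`, `toy_tokenize`) rather than the crate's own types:

// Illustrative sketch only; the patched tokenizer in
// lib/src/idx/ft/analyzer/tokenizer.rs keeps the same two pairs of positions
// on Token::Ref, but with more splitting rules than whitespace.
#[derive(Debug)]
struct ToyToken {
    chars: (u32, u32), // character positions, safe for the char-based highlighter
    bytes: (u32, u32), // byte positions, safe for slicing the UTF-8 source string
}

fn toy_tokenize(input: &str) -> Vec<ToyToken> {
    let mut tokens = Vec::new();
    let (mut last_char, mut last_byte) = (0u32, 0u32);
    let (mut char_pos, mut byte_pos) = (0u32, 0u32);
    for c in input.chars() {
        let len = c.len_utf8() as u32;
        if c.is_whitespace() {
            if last_char < char_pos {
                tokens.push(ToyToken {
                    chars: (last_char, char_pos),
                    bytes: (last_byte, byte_pos),
                });
            }
            // Skip over the separator itself.
            last_char = char_pos + 1;
            last_byte = byte_pos + len;
        }
        char_pos += 1;
        byte_pos += len;
    }
    if last_char < char_pos {
        tokens.push(ToyToken {
            chars: (last_char, char_pos),
            bytes: (last_byte, byte_pos),
        });
    }
    tokens
}

fn main() {
    let text = "Foo Bãr";
    for t in toy_tokenize(text) {
        // Byte ranges index the UTF-8 string; char ranges count characters.
        let term = &text[t.bytes.0 as usize..t.bytes.1 as usize];
        println!("{term}: chars={:?} bytes={:?}", t.chars, t.bytes);
    }
}

Run against "Foo Bãr", the second token reports chars (4, 7) and bytes (4, 8); storing only the byte pair, as before, and using it against a 7-element char vector is exactly the discrepancy that produced the out-of-range panic.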
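On the highlighter side, the patch also stops trusting stored positions blindly: each insertion point is checked against the current length of the character vector and reported as the new `Error::HighlightError` instead of panicking inside `Vec::splice`. A rough equivalent, with a hypothetical `toy_highlight` standing in for the `TryFrom<Highlighter> for Value` logic in lib/src/idx/ft/highlighter.rs:

// Illustrative sketch only; hypothetical helper names.
fn insert_at(v: &mut Vec<char>, p: usize, insert: &str) -> Result<usize, String> {
    if p > v.len() {
        // Report the bad offset instead of letting Vec::splice panic.
        return Err(format!("position overflow: {p} - len: {}", v.len()));
    }
    v.splice(p..p, insert.chars());
    Ok(insert.chars().count())
}

fn toy_highlight(text: &str, ranges: &[(u32, u32)], pre: &str, suf: &str) -> Result<String, String> {
    let mut v: Vec<char> = text.chars().collect();
    let mut d = 0usize; // drift added by previously inserted markers
    for &(s, e) in ranges {
        d += insert_at(&mut v, s as usize + d, pre)?;
        d += insert_at(&mut v, e as usize + d, suf)?;
    }
    Ok(v.into_iter().collect())
}

fn main() {
    // Character-based offsets for "Bãr" in "Foo Bãr" are (4, 7) and highlight cleanly;
    // the old byte-based offsets (4, 8) exceed the 7-char vector and now return an error.
    println!("{:?}", toy_highlight("Foo Bãr", &[(4, 7)], "<em>", "</em>"));
    println!("{:?}", toy_highlight("Foo Bãr", &[(4, 8)], "<em>", "</em>"));
}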