Fixes highlighting bug (out of range for slice length...) (#2169)

Emmanuel Keller 2023-06-23 12:58:51 +01:00 committed by GitHub
parent be3e2637b9
commit f688eb4ef5
9 changed files with 164 additions and 99 deletions
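
For context on the failure mode: the tokenizer previously recorded only byte positions, while the highlighter spliced the prefix/suffix markers into a `Vec<char>` using those same positions, which overflows as soon as the indexed text contains multi-byte UTF-8 characters (hence the test strings below using 'ã'). The commit therefore tracks char and byte offsets separately: byte offsets for slicing the original string in `get_str`, char offsets for the `Offset`s consumed by the highlighter. A minimal standalone sketch, not taken from the codebase, illustrating the mismatch:

```rust
// Sketch only (not SurrealDB code): why byte offsets cannot be used as
// indices into a Vec<char> once the text contains multi-byte UTF-8 characters.
fn main() {
    let text = "Foo Bãr";

    // Byte-based offsets, as the tokenizer previously produced them.
    let byte_start = text.find("Bãr").unwrap(); // 4
    let byte_end = byte_start + "Bãr".len();    // 8, because 'ã' is 2 bytes

    // Char-based offsets, which are what the highlighter needs when it
    // splices prefixes/suffixes into a Vec<char>.
    let chars: Vec<char> = text.chars().collect();
    assert_eq!(chars.len(), 7);
    assert!(byte_end > chars.len()); // splicing at byte_end would be out of range

    // Slicing the original &str by byte offsets is still fine, as long as the
    // range stays within bounds and on char boundaries.
    assert_eq!(&text[byte_start..byte_end], "Bãr");
}
```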

Cargo.lock (generated)

@ -3847,9 +3847,9 @@ dependencies = [
[[package]]
name = "rquickjs"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a9b746ee3a4f6dc8b8f9b369d13c043a2f21fb645253aeac3f1404b12cff7d9"
checksum = "6db7788c2818f4546daabe9ae2d1ee2f4db61ab1998d4b483494c4193cc38dab"
dependencies = [
"rquickjs-core",
"rquickjs-macro",
@ -3857,9 +3857,9 @@ dependencies = [
[[package]]
name = "rquickjs-core"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f4f48d37b8e455a76544505915dcd7a82b7a3a1898cf2b4a0aba2d7bd66dcc4"
checksum = "b12cf8646fe0af5bcff2822ccd162990f0679a1f9287c7257f4f4193a9d31ea9"
dependencies = [
"async-lock",
"relative-path",
@ -3868,9 +3868,9 @@ dependencies = [
[[package]]
name = "rquickjs-macro"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ecbbe8ec4319fe42d3ec165dc7fa473580c2fa09747a39aa93f81ab422792a8"
checksum = "80564583a91b0ae6b2d6b9b3d0f8ffd69a4b17202cc63a12df78dfa8983885fc"
dependencies = [
"darling",
"fnv",
@ -3886,9 +3886,9 @@ dependencies = [
[[package]]
name = "rquickjs-sys"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f65ccee59bf4eb61361e7d516fea4a77273b96fe17811fc7da2558369914f55"
checksum = "b747058afd4d988d056e4972ec8516a5a86fdfc103c1c1485bfee8966a0743ae"
dependencies = [
"bindgen 0.65.1",
"cc",


@ -76,7 +76,7 @@ fuzzy-matcher = "0.3.7"
geo = { version = "0.25.0", features = ["use-serde"] }
indexmap = { version = "1.9.3", features = ["serde"] }
indxdb = { version = "0.3.0", optional = true }
js = { version = "0.3.0" , package = "rquickjs", features = ["array-buffer", "bindgen", "classes", "futures", "loader", "macro", "parallel", "properties","rust-alloc"], optional = true }
js = { version = "0.3.1" , package = "rquickjs", features = ["array-buffer", "bindgen", "classes", "futures", "loader", "macro", "parallel", "properties","rust-alloc"], optional = true }
jsonwebtoken = "8.3.0"
lexicmp = "0.1.0"
log = "0.4.17"


@ -469,9 +469,13 @@ pub enum Error {
},
/// Represents an error when analyzing a value
#[error("A string can't be analyzed: {0}")]
#[error("A value can't be analyzed: {0}")]
AnalyzerError(String),
/// Represents an error when trying to highlight a value
#[error("A value can't be highlighted: {0}")]
HighlightError(String),
/// Represents an underlying error with Bincode serializing / deserializing
#[error("Bincode error: {0}")]
Bincode(#[from] BincodeError),


@ -1,3 +1,4 @@
use crate::err::Error;
use crate::idx::ft::analyzer::tokenizer::Tokens;
use crate::sql::filter::Filter as SqlFilter;
use crate::sql::language::Language;
@ -60,13 +61,13 @@ impl Filter {
}
}
pub(super) fn apply_filters(mut t: Tokens, f: &Option<Vec<Filter>>) -> Tokens {
pub(super) fn apply_filters(mut t: Tokens, f: &Option<Vec<Filter>>) -> Result<Tokens, Error> {
if let Some(f) = f {
for f in f {
t = t.filter(f);
t = t.filter(f)?;
}
}
t
Ok(t)
}
pub(super) fn apply_filter(&self, c: &str) -> FilterResult {


@ -42,7 +42,7 @@ impl Analyzer {
tx: &mut Transaction,
query_string: String,
) -> Result<(Vec<TermId>, bool), Error> {
let tokens = self.analyze(query_string);
let tokens = self.analyze(query_string)?;
// We first collect every unique term
// as the input can contain duplicates
let mut terms = HashSet::new();
@ -53,7 +53,7 @@ impl Analyzer {
// Now we can extract the term ids
let mut res = Vec::with_capacity(terms.len());
for term in terms {
if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)).await? {
if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)?).await? {
res.push(term_id);
} else {
missing = false;
@ -80,7 +80,7 @@ impl Analyzer {
for tks in &inputs {
for tk in tks.list() {
dl += 1;
let s = tks.get_token_string(tk);
let s = tks.get_token_string(tk)?;
match tf.entry(s) {
Entry::Vacant(e) => {
e.insert(1);
@ -117,7 +117,7 @@ impl Analyzer {
for (i, tks) in inputs.iter().enumerate() {
for tk in tks.list() {
dl += 1;
let s = tks.get_token_string(tk);
let s = tks.get_token_string(tk)?;
let o = tk.new_offset(i as u32);
match tfos.entry(s) {
Entry::Vacant(e) => {
@ -141,33 +141,34 @@ impl Analyzer {
fn analyze_content(&self, field_content: &Array, tks: &mut Vec<Tokens>) -> Result<(), Error> {
for v in &field_content.0 {
self.analyze_value(v, tks);
self.analyze_value(v, tks)?;
}
Ok(())
}
fn analyze_value(&self, val: &Value, tks: &mut Vec<Tokens>) {
fn analyze_value(&self, val: &Value, tks: &mut Vec<Tokens>) -> Result<(), Error> {
match val {
Value::Strand(s) => tks.push(self.analyze(s.0.clone())),
Value::Number(n) => tks.push(self.analyze(n.to_string())),
Value::Bool(b) => tks.push(self.analyze(b.to_string())),
Value::Strand(s) => tks.push(self.analyze(s.0.clone())?),
Value::Number(n) => tks.push(self.analyze(n.to_string())?),
Value::Bool(b) => tks.push(self.analyze(b.to_string())?),
Value::Array(a) => {
for v in &a.0 {
self.analyze_value(v, tks);
self.analyze_value(v, tks)?;
}
}
_ => {}
}
};
Ok(())
}
fn analyze(&self, input: String) -> Tokens {
fn analyze(&self, input: String) -> Result<Tokens, Error> {
if let Some(t) = &self.t {
if !input.is_empty() {
let t = Tokenizer::tokenize(t, input);
return Filter::apply_filters(t, &self.f);
}
}
Tokens::new(input)
Ok(Tokens::new(input))
}
}
@ -180,10 +181,10 @@ mod tests {
let (_, az) = analyzer(def).unwrap();
let a: Analyzer = az.into();
let tokens = a.analyze(input.to_string());
let tokens = a.analyze(input.to_string()).unwrap();
let mut res = vec![];
for t in tokens.list() {
res.push(tokens.get_token_string(t));
res.push(tokens.get_token_string(t).unwrap());
}
assert_eq!(&res, expected);
}


@ -1,3 +1,4 @@
use crate::err::Error;
use crate::idx::ft::analyzer::filter::{Filter, FilterResult, Term};
use crate::idx::ft::offsets::{Offset, Position};
use crate::sql::tokenizer::Tokenizer as SqlTokenizer;
@ -17,18 +18,18 @@ impl Tokens {
}
}
pub(super) fn get_token_string<'a>(&'a self, t: &'a Token) -> &str {
pub(super) fn get_token_string<'a>(&'a self, t: &'a Token) -> Result<&str, Error> {
t.get_str(&self.i)
}
pub(super) fn filter(self, f: &Filter) -> Tokens {
pub(super) fn filter(self, f: &Filter) -> Result<Tokens, Error> {
let mut tks = Vec::new();
let mut res = vec![];
for t in self.t {
if t.is_empty() {
continue;
}
let c = t.get_str(&self.i);
let c = t.get_str(&self.i)?;
let r = f.apply_filter(c);
res.push((t, r));
}
@ -55,10 +56,10 @@ impl Tokens {
FilterResult::Ignore => {}
};
}
Tokens {
Ok(Tokens {
i: self.i,
t: tks,
}
})
}
pub(super) fn list(&self) -> &Vec<Token> {
@ -68,36 +69,86 @@ impl Tokens {
#[derive(Clone, Debug, PartialOrd, PartialEq, Eq, Ord, Hash)]
pub(super) enum Token {
Ref(Position, Position),
String(Position, Position, String),
Ref {
chars: (Position, Position),
bytes: (Position, Position),
},
String {
chars: (Position, Position),
bytes: (Position, Position),
term: String,
},
}
impl Token {
fn new_token(&self, t: String) -> Self {
fn new_token(&self, term: String) -> Self {
match self {
Token::Ref(s, e) => Token::String(*s, *e, t),
Token::String(s, e, _) => Token::String(*s, *e, t),
Token::Ref {
chars,
bytes,
} => Token::String {
chars: *chars,
bytes: *bytes,
term,
},
Token::String {
chars,
bytes,
..
} => Token::String {
chars: *chars,
bytes: *bytes,
term,
},
}
}
pub(super) fn new_offset(&self, i: u32) -> Offset {
match self {
Token::Ref(s, e) => Offset::new(i, *s, *e),
Token::String(s, e, _) => Offset::new(i, *s, *e),
Token::Ref {
chars,
..
} => Offset::new(i, chars.0, chars.1),
Token::String {
chars,
..
} => Offset::new(i, chars.0, chars.1),
}
}
fn is_empty(&self) -> bool {
match self {
Token::Ref(start, end) => start == end,
Token::String(_, _, s) => s.is_empty(),
Token::Ref {
chars,
..
} => chars.0 == chars.1,
Token::String {
term,
..
} => term.is_empty(),
}
}
pub(super) fn get_str<'a>(&'a self, i: &'a str) -> &str {
pub(super) fn get_str<'a>(&'a self, i: &'a str) -> Result<&str, Error> {
match self {
Token::Ref(s, e) => &i[(*s as usize)..(*e as usize)],
Token::String(_, _, s) => s,
Token::Ref {
bytes,
..
} => {
let s = bytes.0 as usize;
let e = bytes.1 as usize;
let l = i.len();
if s >= l || e > l {
return Err(Error::AnalyzerError(format!(
"Unable to extract the token. The offset position ({s},{e}) is out of range ({l})."
)));
}
Ok(&i[(bytes.0 as usize)..(bytes.1 as usize)])
}
Token::String {
term,
..
} => Ok(term),
}
}
}
@ -129,28 +180,40 @@ impl Tokenizer {
pub(super) fn tokenize(t: &[SqlTokenizer], i: String) -> Tokens {
let mut w = Tokenizer::new(t);
let mut last_pos = 0;
let mut current_pos = 0;
let mut last_char_pos = 0;
let mut last_byte_pos = 0;
let mut current_char_pos = 0;
let mut current_byte_pos = 0;
let mut t = Vec::new();
for c in i.chars() {
let char_len = c.len_utf8() as Position;
let is_valid = Self::is_valid(c);
let should_split = w.should_split(c);
if should_split || !is_valid {
// The last position may already have advanced due to the is_valid check
if last_pos < current_pos {
t.push(Token::Ref(last_pos, current_pos));
if last_char_pos < current_char_pos {
t.push(Token::Ref {
chars: (last_char_pos, current_char_pos),
bytes: (last_byte_pos, current_byte_pos),
});
}
last_pos = current_pos;
last_char_pos = current_char_pos;
last_byte_pos = current_byte_pos;
// If the character is not valid for indexing (space, control...),
// then we advance the last position to the next character
if !is_valid {
last_pos += c.len_utf8() as Position;
last_char_pos += 1;
last_byte_pos += char_len;
}
}
current_pos += c.len_utf8() as Position;
current_char_pos += 1;
current_byte_pos += char_len;
}
if current_pos != last_pos {
t.push(Token::Ref(last_pos, current_pos));
if current_char_pos != last_char_pos {
t.push(Token::Ref {
chars: (last_char_pos, current_char_pos),
bytes: (last_byte_pos, current_byte_pos),
});
}
Tokens {
i,


@ -32,17 +32,17 @@ impl Highlighter {
self.offseter.highlight(os);
}
fn extract(val: Value, vals: &mut Vec<Option<String>>) {
fn extract(val: Value, vals: &mut Vec<String>) {
match val {
Value::Strand(s) => vals.push(Some(s.0)),
Value::Number(n) => vals.push(Some(n.to_string())),
Value::Bool(b) => vals.push(Some(b.to_string())),
Value::Strand(s) => vals.push(s.0),
Value::Number(n) => vals.push(n.to_string()),
Value::Bool(b) => vals.push(b.to_string()),
Value::Array(a) => {
for v in a.0 {
Self::extract(v, vals);
}
}
_ => vals.push(None),
_ => {}
}
}
}
@ -60,24 +60,35 @@ impl TryFrom<Highlighter> for Value {
}
let mut res = Vec::with_capacity(vals.len());
for (idx, val) in vals.into_iter().enumerate() {
let idx = idx as u32;
if let Some(v) = val {
if let Some(m) = hl.offseter.offsets.get(&idx) {
let mut v: Vec<char> = v.chars().collect();
if let Some(m) = hl.offseter.offsets.get(&(idx as u32)) {
let mut v: Vec<char> = val.chars().collect();
let mut l = v.len();
let mut d = 0;
for (s, e) in m {
let p = (*s as usize) + d;
v.splice(p..p, hl.prefix.clone());
d += hl.prefix.len();
let p = (*e as usize) + d;
v.splice(p..p, hl.suffix.clone());
d += hl.suffix.len();
// We use a closure to append the prefix and the suffix
let mut append = |s: u32, ix: &Vec<char>| -> Result<(), Error> {
let p = (s as usize) + d;
if p > l {
return Err(Error::HighlightError(format!(
"position overflow: {s} - len: {l}"
)));
}
v.splice(p..p, ix.clone());
let xl = ix.len();
d += xl;
l += xl;
Ok(())
};
for (s, e) in m {
append(*s, &hl.prefix)?;
append(*e, &hl.suffix)?;
}
let s: String = v.iter().collect();
res.push(Value::from(s));
} else {
res.push(Value::from(v));
}
res.push(Value::from(val));
}
}
Ok(match res.len() {


@ -132,23 +132,8 @@ mod tests {
#[test]
fn test_offset_records() {
let o = OffsetRecords(vec![
Offset {
index: 0,
start: 1,
end: 2,
},
Offset {
index: 0,
start: 11,
end: 22,
},
Offset {
index: 1,
start: 3,
end: 4,
},
]);
let o =
OffsetRecords(vec![Offset::new(0, 1, 2), Offset::new(0, 11, 22), Offset::new(1, 3, 4)]);
let v: Val = o.clone().try_into().unwrap();
let o2 = v.try_into().unwrap();
assert_eq!(o, o2)


@ -95,10 +95,10 @@ async fn select_where_matches_without_using_index_iterator() -> Result<(), Error
#[tokio::test]
async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> {
let sql = r"
CREATE blog:1 SET content = ['Hello World!', 'Be Bop', 'Foo Bar'];
CREATE blog:1 SET content = ['Hello World!', 'Be Bop', 'Foo Bãr'];
DEFINE ANALYZER simple TOKENIZERS blank,class;
DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
SELECT id, search::highlight('<em>', '</em>', 1) AS content FROM blog WHERE content @1@ 'Hello Bar' EXPLAIN;
SELECT id, search::highlight('<em>', '</em>', 1) AS content FROM blog WHERE content @1@ 'Hello Bãr' EXPLAIN;
";
let dbs = Datastore::new("memory").await?;
let ses = Session::for_kv().with_ns("test").with_db("test");
@ -116,7 +116,7 @@ async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> {
content: [
'<em>Hello</em> World!',
'Be Bop',
'Foo <em>Bar</em>'
'Foo <em>Bãr</em>'
]
},
{
@ -127,7 +127,7 @@ async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> {
plan: {
index: 'blog_content',
operator: '@1@',
value: 'Hello Bar'
value: 'Hello Bãr'
},
table: 'blog',
},
@ -144,11 +144,11 @@ async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> {
#[tokio::test]
async fn select_where_matches_using_index_offsets() -> Result<(), Error> {
let sql = r"
CREATE blog:1 SET title = 'Blog title!', content = ['Hello World!', 'Be Bop', 'Foo Bar'];
CREATE blog:1 SET title = 'Blog title!', content = ['Hello World!', 'Be Bop', 'Foo Bãr'];
DEFINE ANALYZER simple TOKENIZERS blank,class;
DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
SELECT id, search::offsets(0) AS title, search::offsets(1) AS content FROM blog WHERE title @0@ 'title' AND content @1@ 'Hello Bar' EXPLAIN;
SELECT id, search::offsets(0) AS title, search::offsets(1) AS content FROM blog WHERE title @0@ 'title' AND content @1@ 'Hello Bãr' EXPLAIN;
";
let dbs = Datastore::new("memory").await?;
let ses = Session::for_kv().with_ns("test").with_db("test");
@ -179,7 +179,7 @@ async fn select_where_matches_using_index_offsets() -> Result<(), Error> {
plan: {
index: 'blog_content',
operator: '@1@',
value: 'Hello Bar'
value: 'Hello Bãr'
},
table: 'blog',
},