bug fix: partial highlight with ngram and other cleanups ()

Emmanuel Keller 2024-03-20 13:11:02 +00:00 committed by GitHub
parent ec3bb1f659
commit d9eb60f2a5
4 changed files with 276 additions and 38 deletions
core/src/idx/ft/analyzer
lib/tests


@ -1,10 +1,17 @@
use crate::err::Error;
use crate::idx::ft::analyzer::tokenizer::Tokens;
use crate::idx::ft::offsets::Position;
use crate::sql::filter::Filter as SqlFilter;
use crate::sql::language::Language;
use deunicode::deunicode;
use rust_stemmers::{Algorithm, Stemmer};
#[derive(Clone, Copy)]
pub(super) enum FilteringStage {
Indexing,
Querying,
}
pub(super) enum Filter {
Stemmer(Stemmer),
Ascii,
@ -61,10 +68,24 @@ impl Filter {
}
}
pub(super) fn apply_filters(mut t: Tokens, f: &Option<Vec<Filter>>) -> Result<Tokens, Error> {
if let Some(f) = f {
for f in f {
t = t.filter(f)?;
fn is_stage(&self, stage: FilteringStage) -> bool {
if let FilteringStage::Querying = stage {
!matches!(self, Filter::EdgeNgram(_, _) | Filter::Ngram(_, _))
} else {
true
}
}
pub(super) fn apply_filters(
mut t: Tokens,
f: &Option<Vec<Filter>>,
stage: FilteringStage,
) -> Result<Tokens, Error> {
if let Some(filters) = f {
for filter in filters {
if filter.is_stage(stage) {
t = t.filter(filter)?;
}
}
}
Ok(t)
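
The stage gate above is the heart of the fix: filters are now stage-aware, and n-gram style expansion only runs while indexing. At query time a term such as 'el' is therefore matched as-is against the n-grams that were stored when the document was indexed, instead of being expanded a second time. A minimal standalone sketch of the gating, with stand-in types rather than the real Filter/Tokens API:

#[derive(Clone, Copy)]
enum FilteringStage {
    Indexing,
    Querying,
}

enum Filter {
    Lowercase,
    Ngram(u16, u16),
    EdgeNgram(u16, u16),
}

impl Filter {
    // N-gram style filters only make sense while indexing: the query string is
    // matched against n-grams that already exist in the index.
    fn is_stage(&self, stage: FilteringStage) -> bool {
        match stage {
            FilteringStage::Querying => {
                !matches!(self, Filter::Ngram(_, _) | Filter::EdgeNgram(_, _))
            }
            FilteringStage::Indexing => true,
        }
    }
}

fn main() {
    // FILTERS lowercase,ngram(2,3): both filters run when indexing,
    // only lowercase survives when querying.
    let filters = [Filter::Lowercase, Filter::Ngram(2, 3)];
    let querying: Vec<bool> =
        filters.iter().map(|f| f.is_stage(FilteringStage::Querying)).collect();
    assert_eq!(querying, vec![true, false]);
}
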
@ -88,7 +109,7 @@ impl Filter {
} else if s.eq(c) {
FilterResult::Term(Term::Unchanged)
} else {
FilterResult::Term(Term::NewTerm(s))
FilterResult::Term(Term::NewTerm(s, 0))
}
}
@ -121,20 +142,17 @@ impl Filter {
return FilterResult::Ignore;
}
let mut ng = vec![];
let r1 = 0..(l - min);
let r1 = 0..=(l - min);
let max = max as usize;
for s in r1 {
let mut e = s + max;
if e > l {
e = l;
}
let r2 = (s + min)..(e + 1);
let e = (s + max).min(l);
let r2 = (s + min)..=e;
for p in r2 {
let n = &c[s..p];
if c.eq(n) {
ng.push(Term::Unchanged);
} else {
ng.push(Term::NewTerm(n.iter().collect()));
ng.push(Term::NewTerm(n.iter().collect(), s as Position));
}
}
}
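
Two things change in the n-gram loop above: the outer range becomes inclusive (0..=(l - min)), so the last window of each token is no longer dropped (this is what adds "ea", "ta" and "st" to the expected terms in the test further down), and every generated term now carries its character start offset within the source token. The rewritten inner range ((s + min)..=e) is equivalent to the old (s + min)..(e + 1) and is just a cleanup. A standalone sketch of the fixed expansion, not the actual implementation:

fn ngrams(term: &str, min: usize, max: usize) -> Vec<(String, usize)> {
    let chars: Vec<char> = term.chars().collect();
    let len = chars.len();
    let mut out = Vec::new();
    if len < min {
        return out;
    }
    // Inclusive range: the window starting at len - min is now produced as well.
    for start in 0..=(len - min) {
        let end = (start + max).min(len);
        for stop in (start + min)..=end {
            // Each n-gram remembers where it starts inside the source term.
            out.push((chars[start..stop].iter().collect(), start));
        }
    }
    out
}

fn main() {
    // ngram(2,3) on "lea": the trailing "ea" (start offset 1) was previously missing.
    assert_eq!(
        ngrams("lea", 2, 3),
        vec![("le".to_string(), 0), ("lea".to_string(), 0), ("ea".to_string(), 1)]
    );
}
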
@ -149,10 +167,7 @@ impl Filter {
if l < min {
return FilterResult::Ignore;
}
let mut max = max as usize;
if max > l {
max = l;
}
let max = (max as usize).min(l);
let mut ng = vec![];
let r = min..(max + 1);
for p in r {
@ -160,7 +175,7 @@ impl Filter {
if c.eq(n) {
ng.push(Term::Unchanged);
} else {
ng.push(Term::NewTerm(n.iter().collect()));
ng.push(Term::NewTerm(n.iter().collect(), 0));
}
}
FilterResult::Terms(ng)
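
The edge-n-gram branch, by contrast, is only tidied up: clamping max with .min(l) behaves exactly like the old if block, and its terms keep a start offset of 0 because edge n-grams are always prefixes of the token. A sketch under the same stand-in assumptions as above:

fn edge_ngrams(term: &str, min: usize, max: usize) -> Vec<(String, usize)> {
    let chars: Vec<char> = term.chars().collect();
    let len = chars.len();
    let mut out = Vec::new();
    if len < min {
        return out;
    }
    // Same clamp as the old `if max > l { max = l }`, written with .min().
    let max = max.min(len);
    for end in min..=max {
        // Edge n-grams are prefixes, so they always start at offset 0.
        out.push((chars[..end].iter().collect(), 0));
    }
    out
}

fn main() {
    // edgengram(2,3) on "lea": prefixes only, all anchored at offset 0.
    assert_eq!(
        edge_ngrams("lea", 2, 3),
        vec![("le".to_string(), 0), ("lea".to_string(), 0)]
    );
}
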
@ -175,12 +190,13 @@ pub(super) enum FilterResult {
pub(super) enum Term {
Unchanged,
NewTerm(String),
NewTerm(String, Position),
}
#[cfg(test)]
mod tests {
use crate::idx::ft::analyzer::tests::test_analyzer;
use crate::idx::ft::analyzer::tests::{test_analyzer, test_analyzer_tokens};
use crate::idx::ft::analyzer::tokenizer::Token;
#[tokio::test]
async fn test_arabic_stemmer() {
@ -702,7 +718,93 @@ mod tests {
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS lowercase,ngram(2,3);",
"Ālea iacta est",
&["āl", "āle", "le", "lea", "ia", "iac", "ac", "act", "ct", "cta", "es", "est"],
&[
"āl", "āle", "le", "lea", "ea", "ia", "iac", "ac", "act", "ct", "cta", "ta", "es",
"est", "st",
],
)
.await;
}
#[tokio::test]
async fn test_ngram_tokens() {
test_analyzer_tokens(
"ANALYZER test TOKENIZERS blank,class FILTERS lowercase,ngram(2,3);",
"Ālea iacta",
&vec![
Token::String {
chars: (0, 0, 4),
bytes: (0, 5),
term: "āl".to_string(),
len: 2,
},
Token::String {
chars: (0, 0, 4),
bytes: (0, 5),
term: "āle".to_string(),
len: 3,
},
Token::String {
chars: (0, 1, 4),
bytes: (0, 5),
term: "le".to_string(),
len: 2,
},
Token::String {
chars: (0, 1, 4),
bytes: (0, 5),
term: "lea".to_string(),
len: 3,
},
Token::String {
chars: (0, 2, 4),
bytes: (0, 5),
term: "ea".to_string(),
len: 2,
},
Token::String {
chars: (5, 5, 10),
bytes: (6, 11),
term: "ia".to_string(),
len: 2,
},
Token::String {
chars: (5, 5, 10),
bytes: (6, 11),
term: "iac".to_string(),
len: 3,
},
Token::String {
chars: (5, 6, 10),
bytes: (6, 11),
term: "ac".to_string(),
len: 2,
},
Token::String {
chars: (5, 6, 10),
bytes: (6, 11),
term: "act".to_string(),
len: 3,
},
Token::String {
chars: (5, 7, 10),
bytes: (6, 11),
term: "ct".to_string(),
len: 2,
},
Token::String {
chars: (5, 7, 10),
bytes: (6, 11),
term: "cta".to_string(),
len: 3,
},
Token::String {
chars: (5, 8, 10),
bytes: (6, 11),
term: "ta".to_string(),
len: 2,
},
],
)
.await;
}
@ -716,4 +818,32 @@ mod tests {
)
.await;
}
#[tokio::test]
async fn test_lowercase_tokens() {
test_analyzer_tokens(
"ANALYZER test TOKENIZERS blank,class FILTERS lowercase",
"Ālea IactA!",
&[
Token::String {
chars: (0, 0, 4),
bytes: (0, 5),
term: "ālea".to_string(),
len: 4,
},
Token::String {
chars: (5, 5, 10),
bytes: (6, 11),
term: "iacta".to_string(),
len: 5,
},
Token::Ref {
chars: (10, 10, 11),
bytes: (11, 12),
len: 1,
},
],
)
.await;
}
}


@ -1,6 +1,7 @@
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::ft::analyzer::filter::FilteringStage;
use crate::idx::ft::analyzer::tokenizer::{Tokenizer, Tokens};
use crate::idx::ft::doclength::DocLength;
use crate::idx::ft::offsets::{Offset, OffsetRecords};
@ -42,7 +43,8 @@ impl Analyzer {
t: &Terms,
query_string: String,
) -> Result<Vec<Option<(TermId, u32)>>, Error> {
let tokens = self.generate_tokens(ctx, opt, txn, query_string).await?;
let tokens =
self.generate_tokens(ctx, opt, txn, FilteringStage::Querying, query_string).await?;
// We first collect every unique term
// as it can contain duplicates
let mut terms = HashSet::new();
@ -73,7 +75,8 @@ impl Analyzer {
// Let's first collect all the inputs, and collect the tokens.
// We need to store them because everything after is zero-copy
let mut inputs = vec![];
self.analyze_content(ctx, opt, txn, field_content, &mut inputs).await?;
self.analyze_content(ctx, opt, txn, field_content, FilteringStage::Indexing, &mut inputs)
.await?;
// We then collect every unique term and count the frequency
let mut tf: HashMap<&str, TermFrequency> = HashMap::new();
for tks in &inputs {
@ -113,7 +116,7 @@ impl Analyzer {
// Let's first collect all the inputs, and collect the tokens.
// We need to store them because everything after is zero-copy
let mut inputs = Vec::with_capacity(content.len());
self.analyze_content(ctx, opt, txn, content, &mut inputs).await?;
self.analyze_content(ctx, opt, txn, content, FilteringStage::Indexing, &mut inputs).await?;
// We then collect every unique term, count the frequency and extract the offsets
let mut tfos: HashMap<&str, Vec<Offset>> = HashMap::new();
for (i, tks) in inputs.iter().enumerate() {
@ -150,10 +153,11 @@ impl Analyzer {
opt: &Options,
txn: &Transaction,
content: Vec<Value>,
stage: FilteringStage,
tks: &mut Vec<Tokens>,
) -> Result<(), Error> {
for v in content {
self.analyze_value(ctx, opt, txn, v, tks).await?;
self.analyze_value(ctx, opt, txn, v, stage, tks).await?;
}
Ok(())
}
@ -166,20 +170,25 @@ impl Analyzer {
opt: &Options,
txn: &Transaction,
val: Value,
stage: FilteringStage,
tks: &mut Vec<Tokens>,
) -> Result<(), Error> {
match val {
Value::Strand(s) => tks.push(self.generate_tokens(ctx, opt, txn, s.0).await?),
Value::Number(n) => tks.push(self.generate_tokens(ctx, opt, txn, n.to_string()).await?),
Value::Bool(b) => tks.push(self.generate_tokens(ctx, opt, txn, b.to_string()).await?),
Value::Strand(s) => tks.push(self.generate_tokens(ctx, opt, txn, stage, s.0).await?),
Value::Number(n) => {
tks.push(self.generate_tokens(ctx, opt, txn, stage, n.to_string()).await?)
}
Value::Bool(b) => {
tks.push(self.generate_tokens(ctx, opt, txn, stage, b.to_string()).await?)
}
Value::Array(a) => {
for v in a.0 {
self.analyze_value(ctx, opt, txn, v, tks).await?;
self.analyze_value(ctx, opt, txn, v, stage, tks).await?;
}
}
Value::Object(o) => {
for (_, v) in o.0 {
self.analyze_value(ctx, opt, txn, v, tks).await?;
self.analyze_value(ctx, opt, txn, v, stage, tks).await?;
}
}
_ => {}
@ -187,12 +196,12 @@ impl Analyzer {
Ok(())
}
#[allow(unused_variables, unused_mut)]
async fn generate_tokens(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
stage: FilteringStage,
mut input: String,
) -> Result<Tokens, Error> {
if let Some(function_name) = self.function.clone() {
@ -210,7 +219,7 @@ impl Analyzer {
if let Some(t) = &self.tokenizers {
if !input.is_empty() {
let t = Tokenizer::tokenize(t, input);
return Filter::apply_filters(t, &self.filters);
return Filter::apply_filters(t, &self.filters, stage);
}
}
Ok(Tokens::new(input))
@ -224,7 +233,7 @@ impl Analyzer {
txn: &Transaction,
input: String,
) -> Result<Value, Error> {
self.generate_tokens(ctx, opt, txn, input).await?.try_into()
self.generate_tokens(ctx, opt, txn, FilteringStage::Indexing, input).await?.try_into()
}
}
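
In the analyzer itself the stage is simply threaded through: the method that resolves a query_string against the Terms store now passes FilteringStage::Querying, while both document-indexing paths and the Value-returning helper above keep FilteringStage::Indexing. A hypothetical end-to-end illustration of why the two stages must differ, using a plain HashMap in place of the real term index:

use std::collections::HashMap;

fn main() {
    let doc = "hello";
    let chars: Vec<char> = doc.chars().collect();
    let mut index: HashMap<String, (usize, usize)> = HashMap::new();

    // Indexing stage: ngram(1,32) expansion of the document token,
    // remembering (start, end) character offsets for highlighting.
    for s in 0..chars.len() {
        for e in (s + 1)..=chars.len() {
            index.insert(chars[s..e].iter().collect(), (s, e));
        }
    }

    // Querying stage: the term "el" is looked up as-is; running ngram on it
    // again would split it into "e", "l" and "el" instead of treating it as
    // one term.
    assert_eq!(index.get("el"), Some(&(1, 3)));
}
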
@ -233,6 +242,7 @@ mod tests {
use super::Analyzer;
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::idx::ft::analyzer::filter::FilteringStage;
use crate::idx::ft::analyzer::tokenizer::{Token, Tokens};
use crate::kvs::{Datastore, LockType, TransactionType};
use crate::{
@ -253,7 +263,13 @@ mod tests {
};
let a: Analyzer = az.into();
let tokens = a
.generate_tokens(&Context::default(), &Options::default(), &txn, input.to_string())
.generate_tokens(
&Context::default(),
&Options::default(),
&txn,
FilteringStage::Indexing,
input.to_string(),
)
.await
.unwrap();
tokens
@ -267,4 +283,9 @@ mod tests {
}
assert_eq!(&res, expected);
}
pub(super) async fn test_analyzer_tokens(def: &str, input: &str, expected: &[Token]) {
let tokens = get_analyzer_tokens(def, input).await;
assert_eq!(tokens.list(), expected);
}
}


@ -39,7 +39,7 @@ impl Tokens {
match fr {
FilterResult::Term(t) => match t {
Term::Unchanged => tks.push(tk),
Term::NewTerm(s) => tks.push(tk.new_token(s)),
Term::NewTerm(t, s) => tks.push(tk.new_token(t, s)),
},
FilterResult::Terms(ts) => {
let mut already_pushed = false;
@ -51,7 +51,7 @@ impl Tokens {
already_pushed = true;
}
}
Term::NewTerm(s) => tks.push(tk.new_token(s)),
Term::NewTerm(t, s) => tks.push(tk.new_token(t, s)),
}
}
}
@ -97,7 +97,7 @@ pub(super) enum Token {
}
impl Token {
fn new_token(&self, term: String) -> Self {
fn new_token(&self, term: String, start: Position) -> Self {
let len = term.chars().count() as u32;
match self {
Token::Ref {
@ -105,7 +105,7 @@ impl Token {
bytes,
..
} => Token::String {
chars: *chars,
chars: (chars.0, chars.1 + start, chars.2),
bytes: *bytes,
term,
len,
@ -115,7 +115,7 @@ impl Token {
bytes,
..
} => Token::String {
chars: *chars,
chars: (chars.0, chars.1 + start, chars.2),
bytes: *bytes,
term,
len,
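
new_token now receives the character start of the derived term (the Position carried by Term::NewTerm) and shifts only the middle element of the chars triple, so an n-gram points at its own slice of the source token instead of the whole token. A stand-in sketch of that arithmetic (plain u32 triples, not the real Token enum):

fn shift(chars: (u32, u32, u32), start: u32) -> (u32, u32, u32) {
    // Keep the token boundaries, move only the start of the derived term.
    (chars.0, chars.1 + start, chars.2)
}

fn main() {
    // The token "Hello" sits at chars (0, 0, 5); the n-gram "el" starts one
    // character in, so its token becomes (0, 1, 5). With a term length of 2,
    // this matches the { s: 1, e: 3 } offsets asserted in the integration
    // test below.
    assert_eq!(shift((0, 0, 5), 1), (0, 1, 5));
}
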


@ -276,6 +276,93 @@ async fn select_where_matches_partial_highlight() -> Result<(), Error> {
Ok(())
}
#[tokio::test]
async fn select_where_matches_partial_highlight_ngram() -> Result<(), Error> {
let sql = r"
CREATE blog:1 SET content = 'Hello World!';
DEFINE ANALYZER simple TOKENIZERS blank,class FILTERS lowercase,ngram(1,32);
DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25 HIGHLIGHTS;
SELECT id, search::highlight('<em>', '</em>', 1) AS content FROM blog WHERE content @1@ 'Hello';
SELECT id, search::highlight('<em>', '</em>', 1) AS content FROM blog WHERE content @1@ 'el';
SELECT id, search::highlight('<em>', '</em>', 1, false) AS content FROM blog WHERE content @1@ 'el';
SELECT id, search::highlight('<em>', '</em>', 1, true) AS content FROM blog WHERE content @1@ 'el';
SELECT id, search::offsets(1) AS content FROM blog WHERE content @1@ 'el';
SELECT id, search::offsets(1, false) AS content FROM blog WHERE content @1@ 'el';
SELECT id, search::offsets(1, true) AS content FROM blog WHERE content @1@ 'el';
";
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(&sql, &ses, None).await?;
assert_eq!(res.len(), 10);
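// How the ten results map onto the checks below: the first three (CREATE plus
// the two DEFINE statements) are discarded; the 'Hello' highlight and the two
// 'el' highlights without a trailing `true` cover the whole matched token
// ('<em>Hello</em> World!'); highlight(..., 1, true) covers only the matched
// n-gram ('H<em>el</em>lo World!'); offsets(1) and offsets(1, false) return
// { s: 0, e: 5 }, while offsets(1, true) returns { s: 1, e: 3 }.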
//
for _ in 0..3 {
let _ = res.remove(0).result?;
}
//
for i in 0..3 {
let tmp = res.remove(0).result?;
let val = Value::parse(
"[
{
id: blog:1,
content: '<em>Hello</em> World!'
}
]",
);
assert_eq!(format!("{:#}", tmp), format!("{:#}", val), "{i}");
}
//
let tmp = res.remove(0).result?;
let val = Value::parse(
"[
{
id: blog:1,
content: 'H<em>el</em>lo World!'
}
]",
);
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
//
for i in 0..2 {
let tmp = res.remove(0).result?;
let val = Value::parse(
"[
{
content: {
0: [
{
e: 5,
s: 0
}
]
},
id: blog:1
}
]",
);
assert_eq!(format!("{:#}", tmp), format!("{:#}", val), "{i}");
}
//
let tmp = res.remove(0).result?;
let val = Value::parse(
"[
{
content: {
0: [
{
e: 3,
s: 1
}
]
},
id: blog:1
}
]",
);
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
Ok(())
}
async fn select_where_matches_using_index_and_objects(parallel: bool) -> Result<(), Error> {
let p = if parallel {
"PARALLEL"