bug fix: partial highlight with ngram and other cleanups (#3725)
Parent: ec3bb1f659 · Commit: d9eb60f2a5
4 changed files with 276 additions and 38 deletions
@@ -1,10 +1,17 @@
 use crate::err::Error;
 use crate::idx::ft::analyzer::tokenizer::Tokens;
+use crate::idx::ft::offsets::Position;
 use crate::sql::filter::Filter as SqlFilter;
 use crate::sql::language::Language;
 use deunicode::deunicode;
 use rust_stemmers::{Algorithm, Stemmer};
 
+#[derive(Clone, Copy)]
+pub(super) enum FilteringStage {
+    Indexing,
+    Querying,
+}
+
 pub(super) enum Filter {
     Stemmer(Stemmer),
     Ascii,
@@ -61,10 +68,24 @@ impl Filter {
         }
     }
 
-    pub(super) fn apply_filters(mut t: Tokens, f: &Option<Vec<Filter>>) -> Result<Tokens, Error> {
-        if let Some(f) = f {
-            for f in f {
-                t = t.filter(f)?;
+    fn is_stage(&self, stage: FilteringStage) -> bool {
+        if let FilteringStage::Querying = stage {
+            !matches!(self, Filter::EdgeNgram(_, _) | Filter::Ngram(_, _))
+        } else {
+            true
+        }
+    }
+
+    pub(super) fn apply_filters(
+        mut t: Tokens,
+        f: &Option<Vec<Filter>>,
+        stage: FilteringStage,
+    ) -> Result<Tokens, Error> {
+        if let Some(filters) = f {
+            for filter in filters {
+                if filter.is_stage(stage) {
+                    t = t.filter(filter)?;
+                }
             }
         }
         Ok(t)
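A note on the hunk above: expansion filters such as ngram and edge-ngram turn one token into many sub-terms. That is desirable while building the index, but applying the same expansion to a query string would explode the search terms, so the new FilteringStage lets apply_filters skip those filters when a query is analyzed. The following is a minimal standalone sketch of that gating idea; Stage and Kind are illustrative stand-ins, not the crate's actual types.

// Sketch: expansion filters run only at indexing time; query analysis keeps
// the term intact. `Stage`/`Kind` are illustrative, not SurrealDB types.
#[derive(Clone, Copy)]
enum Stage {
    Indexing,
    Querying,
}

enum Kind {
    Lowercase,
    Ngram(usize, usize),
}

impl Kind {
    fn is_stage(&self, stage: Stage) -> bool {
        match stage {
            // Ngram expansion is an indexing-only concern.
            Stage::Querying => !matches!(self, Kind::Ngram(_, _)),
            Stage::Indexing => true,
        }
    }
}

fn main() {
    let filters = [Kind::Lowercase, Kind::Ngram(2, 3)];
    let at_indexing = filters.iter().filter(|f| f.is_stage(Stage::Indexing)).count();
    let at_querying = filters.iter().filter(|f| f.is_stage(Stage::Querying)).count();
    assert_eq!(at_indexing, 2); // both filters run while indexing
    assert_eq!(at_querying, 1); // only `lowercase` still applies to a query string
}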
@@ -88,7 +109,7 @@ impl Filter {
         } else if s.eq(c) {
             FilterResult::Term(Term::Unchanged)
         } else {
-            FilterResult::Term(Term::NewTerm(s))
+            FilterResult::Term(Term::NewTerm(s, 0))
         }
     }
 
@@ -121,20 +142,17 @@ impl Filter {
             return FilterResult::Ignore;
         }
         let mut ng = vec![];
-        let r1 = 0..(l - min);
+        let r1 = 0..=(l - min);
         let max = max as usize;
         for s in r1 {
-            let mut e = s + max;
-            if e > l {
-                e = l;
-            }
-            let r2 = (s + min)..(e + 1);
+            let e = (s + max).min(l);
+            let r2 = (s + min)..=e;
             for p in r2 {
                 let n = &c[s..p];
                 if c.eq(n) {
                     ng.push(Term::Unchanged);
                 } else {
-                    ng.push(Term::NewTerm(n.iter().collect()));
+                    ng.push(Term::NewTerm(n.iter().collect(), s as Position));
                 }
             }
         }
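Two things change in the ngram hunk above: the start range 0..(l - min) stopped one position short, so the trailing grams of a term (for example "ea" of "lea" or "ta" of "iacta") were never emitted, and each generated gram now records its start position inside the original term, which is what later lets a highlight cover just the matched slice. Below is a standalone re-implementation of the corrected enumeration for illustration only; the function name and return type are not the crate's API.

// Illustrative re-implementation of the fixed ngram enumeration over chars.
// Returns (start offset, gram) pairs; not SurrealDB's actual API.
fn ngrams(term: &str, min: usize, max: usize) -> Vec<(usize, String)> {
    let c: Vec<char> = term.chars().collect();
    let l = c.len();
    if l < min {
        return vec![];
    }
    let mut out = vec![];
    // Inclusive range: the last valid start offset is now included.
    for s in 0..=(l - min) {
        let e = (s + max).min(l);
        // Grams of length `min..=max`, clamped to the end of the term.
        for p in (s + min)..=e {
            out.push((s, c[s..p].iter().collect()));
        }
    }
    out
}

fn main() {
    let grams: Vec<String> = ngrams("lea", 2, 3).into_iter().map(|(_, g)| g).collect();
    // With the old exclusive range the trailing "ea" was missing.
    assert_eq!(grams, ["le", "lea", "ea"]);
}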
@@ -149,10 +167,7 @@ impl Filter {
         if l < min {
             return FilterResult::Ignore;
         }
-        let mut max = max as usize;
-        if max > l {
-            max = l;
-        }
+        let max = (max as usize).min(l);
         let mut ng = vec![];
         let r = min..(max + 1);
         for p in r {
@@ -160,7 +175,7 @@ impl Filter {
             if c.eq(n) {
                 ng.push(Term::Unchanged);
             } else {
-                ng.push(Term::NewTerm(n.iter().collect()));
+                ng.push(Term::NewTerm(n.iter().collect(), 0));
             }
         }
         FilterResult::Terms(ng)
@@ -175,12 +190,13 @@ pub(super) enum FilterResult {
 
 pub(super) enum Term {
     Unchanged,
-    NewTerm(String),
+    NewTerm(String, Position),
 }
 
 #[cfg(test)]
 mod tests {
-    use crate::idx::ft::analyzer::tests::test_analyzer;
+    use crate::idx::ft::analyzer::tests::{test_analyzer, test_analyzer_tokens};
+    use crate::idx::ft::analyzer::tokenizer::Token;
 
     #[tokio::test]
     async fn test_arabic_stemmer() {
@@ -702,7 +718,93 @@ mod tests {
         test_analyzer(
             "ANALYZER test TOKENIZERS blank,class FILTERS lowercase,ngram(2,3);",
             "Ālea iacta est",
-            &["āl", "āle", "le", "lea", "ia", "iac", "ac", "act", "ct", "cta", "es", "est"],
+            &[
+                "āl", "āle", "le", "lea", "ea", "ia", "iac", "ac", "act", "ct", "cta", "ta", "es",
+                "est", "st",
+            ],
         )
         .await;
     }
+
+    #[tokio::test]
+    async fn test_ngram_tokens() {
+        test_analyzer_tokens(
+            "ANALYZER test TOKENIZERS blank,class FILTERS lowercase,ngram(2,3);",
+            "Ālea iacta",
+            &vec![
+                Token::String {
+                    chars: (0, 0, 4),
+                    bytes: (0, 5),
+                    term: "āl".to_string(),
+                    len: 2,
+                },
+                Token::String {
+                    chars: (0, 0, 4),
+                    bytes: (0, 5),
+                    term: "āle".to_string(),
+                    len: 3,
+                },
+                Token::String {
+                    chars: (0, 1, 4),
+                    bytes: (0, 5),
+                    term: "le".to_string(),
+                    len: 2,
+                },
+                Token::String {
+                    chars: (0, 1, 4),
+                    bytes: (0, 5),
+                    term: "lea".to_string(),
+                    len: 3,
+                },
+                Token::String {
+                    chars: (0, 2, 4),
+                    bytes: (0, 5),
+                    term: "ea".to_string(),
+                    len: 2,
+                },
+                Token::String {
+                    chars: (5, 5, 10),
+                    bytes: (6, 11),
+                    term: "ia".to_string(),
+                    len: 2,
+                },
+                Token::String {
+                    chars: (5, 5, 10),
+                    bytes: (6, 11),
+                    term: "iac".to_string(),
+                    len: 3,
+                },
+                Token::String {
+                    chars: (5, 6, 10),
+                    bytes: (6, 11),
+                    term: "ac".to_string(),
+                    len: 2,
+                },
+                Token::String {
+                    chars: (5, 6, 10),
+                    bytes: (6, 11),
+                    term: "act".to_string(),
+                    len: 3,
+                },
+                Token::String {
+                    chars: (5, 7, 10),
+                    bytes: (6, 11),
+                    term: "ct".to_string(),
+                    len: 2,
+                },
+                Token::String {
+                    chars: (5, 7, 10),
+                    bytes: (6, 11),
+                    term: "cta".to_string(),
+                    len: 3,
+                },
+                Token::String {
+                    chars: (5, 8, 10),
+                    bytes: (6, 11),
+                    term: "ta".to_string(),
+                    len: 2,
+                },
+            ],
+        )
+        .await;
+    }
@@ -716,4 +818,32 @@ mod tests {
         )
         .await;
     }
+
+    #[tokio::test]
+    async fn test_lowercase_tokens() {
+        test_analyzer_tokens(
+            "ANALYZER test TOKENIZERS blank,class FILTERS lowercase",
+            "Ālea IactA!",
+            &[
+                Token::String {
+                    chars: (0, 0, 4),
+                    bytes: (0, 5),
+                    term: "ālea".to_string(),
+                    len: 4,
+                },
+                Token::String {
+                    chars: (5, 5, 10),
+                    bytes: (6, 11),
+                    term: "iacta".to_string(),
+                    len: 5,
+                },
+                Token::Ref {
+                    chars: (10, 10, 11),
+                    bytes: (11, 12),
+                    len: 1,
+                },
+            ],
+        )
+        .await;
+    }
 }

@@ -1,6 +1,7 @@
 use crate::ctx::Context;
 use crate::dbs::{Options, Transaction};
 use crate::err::Error;
+use crate::idx::ft::analyzer::filter::FilteringStage;
 use crate::idx::ft::analyzer::tokenizer::{Tokenizer, Tokens};
 use crate::idx::ft::doclength::DocLength;
 use crate::idx::ft::offsets::{Offset, OffsetRecords};
@@ -42,7 +43,8 @@ impl Analyzer {
         t: &Terms,
         query_string: String,
     ) -> Result<Vec<Option<(TermId, u32)>>, Error> {
-        let tokens = self.generate_tokens(ctx, opt, txn, query_string).await?;
+        let tokens =
+            self.generate_tokens(ctx, opt, txn, FilteringStage::Querying, query_string).await?;
         // We first collect every unique terms
         // as it can contains duplicates
         let mut terms = HashSet::new();
@@ -73,7 +75,8 @@ impl Analyzer {
         // Let's first collect all the inputs, and collect the tokens.
         // We need to store them because everything after is zero-copy
         let mut inputs = vec![];
-        self.analyze_content(ctx, opt, txn, field_content, &mut inputs).await?;
+        self.analyze_content(ctx, opt, txn, field_content, FilteringStage::Indexing, &mut inputs)
+            .await?;
         // We then collect every unique terms and count the frequency
         let mut tf: HashMap<&str, TermFrequency> = HashMap::new();
         for tks in &inputs {
@@ -113,7 +116,7 @@ impl Analyzer {
         // Let's first collect all the inputs, and collect the tokens.
         // We need to store them because everything after is zero-copy
         let mut inputs = Vec::with_capacity(content.len());
-        self.analyze_content(ctx, opt, txn, content, &mut inputs).await?;
+        self.analyze_content(ctx, opt, txn, content, FilteringStage::Indexing, &mut inputs).await?;
         // We then collect every unique terms and count the frequency and extract the offsets
         let mut tfos: HashMap<&str, Vec<Offset>> = HashMap::new();
         for (i, tks) in inputs.iter().enumerate() {
@@ -150,10 +153,11 @@ impl Analyzer {
         opt: &Options,
         txn: &Transaction,
         content: Vec<Value>,
+        stage: FilteringStage,
         tks: &mut Vec<Tokens>,
     ) -> Result<(), Error> {
         for v in content {
-            self.analyze_value(ctx, opt, txn, v, tks).await?;
+            self.analyze_value(ctx, opt, txn, v, stage, tks).await?;
         }
         Ok(())
     }
@@ -166,20 +170,25 @@ impl Analyzer {
         opt: &Options,
         txn: &Transaction,
         val: Value,
+        stage: FilteringStage,
         tks: &mut Vec<Tokens>,
     ) -> Result<(), Error> {
         match val {
-            Value::Strand(s) => tks.push(self.generate_tokens(ctx, opt, txn, s.0).await?),
-            Value::Number(n) => tks.push(self.generate_tokens(ctx, opt, txn, n.to_string()).await?),
-            Value::Bool(b) => tks.push(self.generate_tokens(ctx, opt, txn, b.to_string()).await?),
+            Value::Strand(s) => tks.push(self.generate_tokens(ctx, opt, txn, stage, s.0).await?),
+            Value::Number(n) => {
+                tks.push(self.generate_tokens(ctx, opt, txn, stage, n.to_string()).await?)
+            }
+            Value::Bool(b) => {
+                tks.push(self.generate_tokens(ctx, opt, txn, stage, b.to_string()).await?)
+            }
             Value::Array(a) => {
                 for v in a.0 {
-                    self.analyze_value(ctx, opt, txn, v, tks).await?;
+                    self.analyze_value(ctx, opt, txn, v, stage, tks).await?;
                 }
             }
             Value::Object(o) => {
                 for (_, v) in o.0 {
-                    self.analyze_value(ctx, opt, txn, v, tks).await?;
+                    self.analyze_value(ctx, opt, txn, v, stage, tks).await?;
                 }
             }
             _ => {}
@@ -187,12 +196,12 @@ impl Analyzer {
         Ok(())
     }
 
     #[allow(unused_variables, unused_mut)]
     async fn generate_tokens(
         &self,
         ctx: &Context<'_>,
         opt: &Options,
         txn: &Transaction,
+        stage: FilteringStage,
         mut input: String,
     ) -> Result<Tokens, Error> {
         if let Some(function_name) = self.function.clone() {
@@ -210,7 +219,7 @@ impl Analyzer {
         if let Some(t) = &self.tokenizers {
             if !input.is_empty() {
                 let t = Tokenizer::tokenize(t, input);
-                return Filter::apply_filters(t, &self.filters);
+                return Filter::apply_filters(t, &self.filters, stage);
             }
         }
         Ok(Tokens::new(input))
@@ -224,7 +233,7 @@ impl Analyzer {
         txn: &Transaction,
         input: String,
     ) -> Result<Value, Error> {
-        self.generate_tokens(ctx, opt, txn, input).await?.try_into()
+        self.generate_tokens(ctx, opt, txn, FilteringStage::Indexing, input).await?.try_into()
     }
 }
 
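With the stage threaded through analyze_content, analyze_value and generate_tokens, document content is always analyzed under FilteringStage::Indexing, while the query-string path (the @@ -42,7 hunk above) analyzes under FilteringStage::Querying. The same analyzer definition therefore produces expanded ngram terms in the index but leaves the query term intact when it is looked up.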
@@ -233,6 +242,7 @@ mod tests {
     use super::Analyzer;
     use crate::ctx::Context;
     use crate::dbs::{Options, Transaction};
+    use crate::idx::ft::analyzer::filter::FilteringStage;
     use crate::idx::ft::analyzer::tokenizer::{Token, Tokens};
     use crate::kvs::{Datastore, LockType, TransactionType};
     use crate::{
@@ -253,7 +263,13 @@ mod tests {
         };
         let a: Analyzer = az.into();
         let tokens = a
-            .generate_tokens(&Context::default(), &Options::default(), &txn, input.to_string())
+            .generate_tokens(
+                &Context::default(),
+                &Options::default(),
+                &txn,
+                FilteringStage::Indexing,
+                input.to_string(),
+            )
             .await
             .unwrap();
         tokens
@@ -267,4 +283,9 @@ mod tests {
         }
         assert_eq!(&res, expected);
     }
+
+    pub(super) async fn test_analyzer_tokens(def: &str, input: &str, expected: &[Token]) {
+        let tokens = get_analyzer_tokens(def, input).await;
+        assert_eq!(tokens.list(), expected);
+    }
 }

@@ -39,7 +39,7 @@ impl Tokens {
             match fr {
                 FilterResult::Term(t) => match t {
                     Term::Unchanged => tks.push(tk),
-                    Term::NewTerm(s) => tks.push(tk.new_token(s)),
+                    Term::NewTerm(t, s) => tks.push(tk.new_token(t, s)),
                 },
                 FilterResult::Terms(ts) => {
                     let mut already_pushed = false;
@@ -51,7 +51,7 @@ impl Tokens {
                                 already_pushed = true;
                             }
                         }
-                        Term::NewTerm(s) => tks.push(tk.new_token(s)),
+                        Term::NewTerm(t, s) => tks.push(tk.new_token(t, s)),
                     }
                 }
             }
@@ -97,7 +97,7 @@ pub(super) enum Token {
 }
 
 impl Token {
-    fn new_token(&self, term: String) -> Self {
+    fn new_token(&self, term: String, start: Position) -> Self {
         let len = term.chars().count() as u32;
         match self {
             Token::Ref {
@@ -105,7 +105,7 @@ impl Token {
                 bytes,
                 ..
             } => Token::String {
-                chars: *chars,
+                chars: (chars.0, chars.1 + start, chars.2),
                 bytes: *bytes,
                 term,
                 len,
@@ -115,7 +115,7 @@ impl Token {
                 bytes,
                 ..
             } => Token::String {
-                chars: *chars,
+                chars: (chars.0, chars.1 + start, chars.2),
                 bytes: *bytes,
                 term,
                 len,

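In the tokenizer hunks above, new_token now receives the sub-term's start offset and shifts the middle element of the chars tuple, so the token emitted for an ngram points at the matched slice of the original term rather than at the whole token. A small illustration of that arithmetic follows; the tuple layout matches the diff, but the function and type names are simplified stand-ins, not the crate's.

// Simplified stand-in for shifting the (token start, term start, token end)
// `chars` tuple by the ngram's offset; not SurrealDB's actual Token type.
type Position = u32;

fn shifted_chars(
    chars: (Position, Position, Position),
    start: Position,
) -> (Position, Position, Position) {
    // Only the term start moves: the sub-term begins `start` chars into the token.
    (chars.0, chars.1 + start, chars.2)
}

fn main() {
    // The token "hello" spans chars 0..5; the ngram "el" starts 1 char in,
    // so a highlight for "el" begins at char 1 instead of char 0.
    assert_eq!(shifted_chars((0, 0, 5), 1), (0, 1, 5));
}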
@@ -276,6 +276,93 @@ async fn select_where_matches_partial_highlight() -> Result<(), Error> {
     Ok(())
 }
 
+#[tokio::test]
+async fn select_where_matches_partial_highlight_ngram() -> Result<(), Error> {
+    let sql = r"
+        CREATE blog:1 SET content = 'Hello World!';
+        DEFINE ANALYZER simple TOKENIZERS blank,class FILTERS lowercase,ngram(1,32);
+        DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25 HIGHLIGHTS;
+        SELECT id, search::highlight('<em>', '</em>', 1) AS content FROM blog WHERE content @1@ 'Hello';
+        SELECT id, search::highlight('<em>', '</em>', 1) AS content FROM blog WHERE content @1@ 'el';
+        SELECT id, search::highlight('<em>', '</em>', 1, false) AS content FROM blog WHERE content @1@ 'el';
+        SELECT id, search::highlight('<em>', '</em>', 1, true) AS content FROM blog WHERE content @1@ 'el';
+        SELECT id, search::offsets(1) AS content FROM blog WHERE content @1@ 'el';
+        SELECT id, search::offsets(1, false) AS content FROM blog WHERE content @1@ 'el';
+        SELECT id, search::offsets(1, true) AS content FROM blog WHERE content @1@ 'el';
+    ";
+    let dbs = new_ds().await?;
+    let ses = Session::owner().with_ns("test").with_db("test");
+    let res = &mut dbs.execute(&sql, &ses, None).await?;
+    assert_eq!(res.len(), 10);
+    //
+    for _ in 0..3 {
+        let _ = res.remove(0).result?;
+    }
+    //
+    for i in 0..3 {
+        let tmp = res.remove(0).result?;
+        let val = Value::parse(
+            "[
+                {
+                    id: blog:1,
+                    content: '<em>Hello</em> World!'
+                }
+            ]",
+        );
+        assert_eq!(format!("{:#}", tmp), format!("{:#}", val), "{i}");
+    }
+    //
+    let tmp = res.remove(0).result?;
+    let val = Value::parse(
+        "[
+            {
+                id: blog:1,
+                content: 'H<em>el</em>lo World!'
+            }
+        ]",
+    );
+    assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
+    //
+    for i in 0..2 {
+        let tmp = res.remove(0).result?;
+        let val = Value::parse(
+            "[
+                {
+                    content: {
+                        0: [
+                            {
+                                e: 5,
+                                s: 0
+                            }
+                        ]
+                    },
+                    id: blog:1
+                }
+            ]",
+        );
+        assert_eq!(format!("{:#}", tmp), format!("{:#}", val), "{i}");
+    }
+    //
+    let tmp = res.remove(0).result?;
+    let val = Value::parse(
+        "[
+            {
+                content: {
+                    0: [
+                        {
+                            e: 3,
+                            s: 1
+                        }
+                    ]
+                },
+                id: blog:1
+            }
+        ]",
+    );
+    assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
+    Ok(())
+}
+
 async fn select_where_matches_using_index_and_objects(parallel: bool) -> Result<(), Error> {
     let p = if parallel {
         "PARALLEL"
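The new SQL test ties the pieces together: with an ngram(1,32) analyzer the query term 'el' matches blog:1, and the trailing boolean passed to search::highlight and search::offsets selects the partial behaviour the commit title refers to. Without it, or with false, the whole matched token is reported ('<em>Hello</em> World!' and offset { s: 0, e: 5 }); with true the result narrows to the matched slice ('H<em>el</em>lo World!' and offset { s: 1, e: 3 }), which is only possible because the ngram filter now carries each gram's start position through to the stored offsets.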