From 73a4e54ad5f784420f9bc3ce2246e13920a55788 Mon Sep 17 00:00:00 2001 From: Emmanuel Keller Date: Mon, 26 Jun 2023 19:23:05 +0100 Subject: [PATCH] Bugfix - matches operator returns results on non-matching terms (#2185) --- lib/src/idx/ft/analyzer/mod.rs | 12 +++------ lib/src/idx/ft/mod.rs | 47 ++++++++++++++++++++------------- lib/src/idx/ft/scorer.rs | 6 ++--- lib/src/idx/ft/termdocs.rs | 3 +++ lib/src/idx/planner/executor.rs | 21 +++++++++++---- lib/src/idx/planner/plan.rs | 5 ++-- lib/tests/matches.rs | 11 ++++++-- 7 files changed, 66 insertions(+), 39 deletions(-) diff --git a/lib/src/idx/ft/analyzer/mod.rs b/lib/src/idx/ft/analyzer/mod.rs index 2327cd54..582f9cc8 100644 --- a/lib/src/idx/ft/analyzer/mod.rs +++ b/lib/src/idx/ft/analyzer/mod.rs @@ -41,7 +41,7 @@ impl Analyzer { t: &Terms, tx: &mut Transaction, query_string: String, - ) -> Result<(Vec, bool), Error> { + ) -> Result>, Error> { let tokens = self.analyze(query_string)?; // We first collect every unique terms // as it can contains duplicates @@ -49,17 +49,13 @@ impl Analyzer { for token in tokens.list() { terms.insert(token); } - let mut missing = false; // Now we can extract the term ids let mut res = Vec::with_capacity(terms.len()); for term in terms { - if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)?).await? { - res.push(term_id); - } else { - missing = false; - } + let opt_term_id = t.get_term_id(tx, tokens.get_token_string(term)?).await?; + res.push(opt_term_id); } - Ok((res, missing)) + Ok(res) } /// This method is used for indexing. diff --git a/lib/src/idx/ft/mod.rs b/lib/src/idx/ft/mod.rs index 2f61802e..2a51880e 100644 --- a/lib/src/idx/ft/mod.rs +++ b/lib/src/idx/ft/mod.rs @@ -5,7 +5,7 @@ mod highlighter; mod offsets; mod postings; pub(super) mod scorer; -mod termdocs; +pub(super) mod termdocs; pub(crate) mod terms; use crate::err::Error; @@ -299,22 +299,29 @@ impl FtIndex { &self, tx: &mut Transaction, query_string: String, - ) -> Result, Error> { + ) -> Result>, Error> { let t = self.terms(tx).await?; - let (terms, _) = self.analyzer.extract_terms(&t, tx, query_string).await?; + let terms = self.analyzer.extract_terms(&t, tx, query_string).await?; Ok(terms) } pub(super) async fn get_terms_docs( &self, tx: &mut Transaction, - terms: &Vec, - ) -> Result, Error> { + terms: &Vec>, + ) -> Result>, Error> { let mut terms_docs = Vec::with_capacity(terms.len()); let td = self.term_docs(); - for term_id in terms { - if let Some(term_docs) = td.get_docs(tx, *term_id).await? { - terms_docs.push((*term_id, term_docs)); + for opt_term_id in terms { + if let Some(term_id) = opt_term_id { + let docs = td.get_docs(tx, *term_id).await?; + if let Some(docs) = docs { + terms_docs.push(Some((*term_id, docs))); + } else { + terms_docs.push(Some((*term_id, RoaringTreemap::new()))); + } + } else { + terms_docs.push(None); } } Ok(terms_docs) @@ -323,14 +330,18 @@ impl FtIndex { pub(super) async fn new_hits_iterator( &self, tx: &mut Transaction, - terms_docs: Arc>, + terms_docs: Arc>>, ) -> Result, Error> { let mut hits: Option = None; - for (_, term_docs) in terms_docs.iter() { - if let Some(h) = hits { - hits = Some(h.bitand(term_docs)); + for opt_term_docs in terms_docs.iter() { + if let Some((_, term_docs)) = opt_term_docs { + if let Some(h) = hits { + hits = Some(h.bitand(term_docs)); + } else { + hits = Some(term_docs.clone()); + } } else { - hits = Some(term_docs.clone()); + return Ok(None); } } if let Some(hits) = hits { @@ -345,7 +356,7 @@ impl FtIndex { pub(super) async fn new_scorer( &self, tx: &mut Transaction, - terms_docs: Arc>, + terms_docs: Arc>>, ) -> Result, Error> { if let Some(bm25) = &self.bm25 { return Ok(Some(BM25Scorer::new( @@ -365,7 +376,7 @@ impl FtIndex { &self, tx: &mut Transaction, thg: &Thing, - terms: &Vec, + terms: &[Option], prefix: Value, suffix: Value, idiom: &Idiom, @@ -376,7 +387,7 @@ impl FtIndex { if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? { let o = self.offsets(); let mut hl = Highlighter::new(prefix, suffix, idiom, doc); - for term_id in terms { + for term_id in terms.iter().flatten() { let o = o.get_offsets(tx, doc_id, *term_id).await?; if let Some(o) = o { hl.highlight(o.0); @@ -391,14 +402,14 @@ impl FtIndex { &self, tx: &mut Transaction, thg: &Thing, - terms: &Vec, + terms: &[Option], ) -> Result { let doc_key: Key = thg.into(); let doc_ids = self.doc_ids(tx).await?; if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? { let o = self.offsets(); let mut or = Offseter::default(); - for term_id in terms { + for term_id in terms.iter().flatten() { let o = o.get_offsets(tx, doc_id, *term_id).await?; if let Some(o) = o { or.highlight(o.0); diff --git a/lib/src/idx/ft/scorer.rs b/lib/src/idx/ft/scorer.rs index 86a64537..73a42509 100644 --- a/lib/src/idx/ft/scorer.rs +++ b/lib/src/idx/ft/scorer.rs @@ -12,7 +12,7 @@ pub(super) type Score = f32; pub(crate) struct BM25Scorer { postings: Postings, - terms_docs: Arc>, + terms_docs: Arc>>, doc_lengths: DocLengths, average_doc_length: f32, doc_count: f32, @@ -22,7 +22,7 @@ pub(crate) struct BM25Scorer { impl BM25Scorer { pub(super) fn new( postings: Postings, - terms_docs: Arc>, + terms_docs: Arc>>, doc_lengths: DocLengths, total_docs_length: u128, doc_count: u64, @@ -55,7 +55,7 @@ impl BM25Scorer { doc_id: DocId, ) -> Result, Error> { let mut sc = 0.0; - for (term_id, docs) in self.terms_docs.iter() { + for (term_id, docs) in self.terms_docs.iter().flatten() { if docs.contains(doc_id) { if let Some(term_freq) = self.postings.get_term_frequency(tx, *term_id, doc_id).await? diff --git a/lib/src/idx/ft/termdocs.rs b/lib/src/idx/ft/termdocs.rs index 5e9c8caa..d96969b3 100644 --- a/lib/src/idx/ft/termdocs.rs +++ b/lib/src/idx/ft/termdocs.rs @@ -5,6 +5,9 @@ use crate::idx::ft::terms::TermId; use crate::idx::{IndexKeyBase, SerdeState}; use crate::kvs::Transaction; use roaring::RoaringTreemap; +use std::sync::Arc; + +pub(in crate::idx) type TermsDocs = Arc>>; pub(super) struct TermDocs { index_key_base: IndexKeyBase, diff --git a/lib/src/idx/planner/executor.rs b/lib/src/idx/planner/executor.rs index 2ee9d769..f2ccd164 100644 --- a/lib/src/idx/planner/executor.rs +++ b/lib/src/idx/planner/executor.rs @@ -2,6 +2,7 @@ use crate::dbs::{Options, Transaction}; use crate::err::Error; use crate::idx::ft::docids::{DocId, DocIds}; use crate::idx::ft::scorer::BM25Scorer; +use crate::idx::ft::termdocs::TermsDocs; use crate::idx::ft::terms::TermId; use crate::idx::ft::{FtIndex, MatchRef}; use crate::idx::planner::plan::IndexOption; @@ -92,7 +93,7 @@ impl QueryExecutor { }) } - pub(super) fn pre_match_terms_docs(&self) -> Option>> { + pub(super) fn pre_match_terms_docs(&self) -> Option { if let Some(entry) = &self.pre_match_entry { return Some(entry.0.terms_docs.clone()); } @@ -130,8 +131,18 @@ impl QueryExecutor { let mut run = txn.lock().await; let doc_key: Key = thg.into(); if let Some(doc_id) = ft.0.doc_ids.get_doc_id(&mut run, doc_key).await? { - for (_, docs) in ft.0.terms_docs.iter() { - if !docs.contains(doc_id) { + let term_goals = ft.0.terms_docs.len(); + // If there is no terms, it can't be a match + if term_goals == 0 { + return Ok(Value::Bool(false)); + } + for opt_td in ft.0.terms_docs.iter() { + if let Some((_, docs)) = opt_td { + if !docs.contains(doc_id) { + return Ok(Value::Bool(false)); + } + } else { + // If one of the term is missing, it can't be a match return Ok(Value::Bool(false)); } } @@ -227,8 +238,8 @@ struct FtEntry(Arc); struct Inner { index_option: IndexOption, doc_ids: DocIds, - terms: Vec, - terms_docs: Arc>, + terms: Vec>, + terms_docs: Arc>>, scorer: Option, } diff --git a/lib/src/idx/planner/plan.rs b/lib/src/idx/planner/plan.rs index ed28ebce..990cf476 100644 --- a/lib/src/idx/planner/plan.rs +++ b/lib/src/idx/planner/plan.rs @@ -1,7 +1,7 @@ use crate::dbs::{Options, Transaction}; use crate::err::Error; use crate::idx::ft::docids::{DocId, NO_DOC_ID}; -use crate::idx::ft::terms::TermId; +use crate::idx::ft::termdocs::TermsDocs; use crate::idx::ft::{FtIndex, HitsIterator, MatchRef}; use crate::idx::planner::executor::QueryExecutor; use crate::idx::IndexKeyBase; @@ -12,7 +12,6 @@ use crate::sql::scoring::Scoring; use crate::sql::statements::DefineIndexStatement; use crate::sql::{Array, Expression, Ident, Idiom, Object, Operator, Thing, Value}; use async_trait::async_trait; -use roaring::RoaringTreemap; use std::collections::HashMap; use std::hash::Hash; use std::sync::Arc; @@ -260,7 +259,7 @@ impl MatchesThingIterator { hl: bool, sc: &Scoring, order: u32, - terms_docs: Option>>, + terms_docs: Option, ) -> Result { let ikb = IndexKeyBase::new(opt, ix); if let Scoring::Bm { diff --git a/lib/tests/matches.rs b/lib/tests/matches.rs index f8001e67..6ca02ae2 100644 --- a/lib/tests/matches.rs +++ b/lib/tests/matches.rs @@ -234,16 +234,18 @@ async fn select_where_matches_without_using_index_and_score() -> Result<(), Erro CREATE blog:4 SET title = 'the dog sat there and did nothing'; DEFINE ANALYZER simple TOKENIZERS blank,class; DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; - SELECT id,search::score(1) AS score FROM blog WHERE (title @1@ 'animals' AND id>0) OR (title @1@ 'animals' AND id<99); + SELECT id,search::score(1) AS score FROM blog WHERE (title @1@ 'animals' AND id>0) OR (title @1@ 'animals' AND id<99); + SELECT id,search::score(1) + search::score(2) AS score FROM blog WHERE title @1@ 'dummy1' OR title @2@ 'dummy2'; "; let dbs = Datastore::new("memory").await?; let ses = Session::for_kv().with_ns("test").with_db("test"); let res = &mut dbs.execute(&sql, &ses, None, false).await?; - assert_eq!(res.len(), 7); + assert_eq!(res.len(), 8); // for _ in 0..6 { let _ = res.remove(0).result?; } + let tmp = res.remove(0).result?; let val = Value::parse( "[ @@ -254,5 +256,10 @@ async fn select_where_matches_without_using_index_and_score() -> Result<(), Erro ]", ); assert_eq!(tmp, val); + + // This result should be empty, as we are looking for non-existing terms (dummy1 and dummy2). + let tmp = res.remove(0).result?; + let val = Value::parse("[]"); + assert_eq!(tmp, val); Ok(()) }