Bugfix - matches operator returns results on non-matching terms (#2185)

This commit is contained in:
Emmanuel Keller 2023-06-26 19:23:05 +01:00 committed by GitHub
parent e0f885e736
commit 73a4e54ad5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 66 additions and 39 deletions

View file

@ -41,7 +41,7 @@ impl Analyzer {
t: &Terms, t: &Terms,
tx: &mut Transaction, tx: &mut Transaction,
query_string: String, query_string: String,
) -> Result<(Vec<TermId>, bool), Error> { ) -> Result<Vec<Option<TermId>>, Error> {
let tokens = self.analyze(query_string)?; let tokens = self.analyze(query_string)?;
// We first collect every unique terms // We first collect every unique terms
// as it can contains duplicates // as it can contains duplicates
@ -49,17 +49,13 @@ impl Analyzer {
for token in tokens.list() { for token in tokens.list() {
terms.insert(token); terms.insert(token);
} }
let mut missing = false;
// Now we can extract the term ids // Now we can extract the term ids
let mut res = Vec::with_capacity(terms.len()); let mut res = Vec::with_capacity(terms.len());
for term in terms { for term in terms {
if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)?).await? { let opt_term_id = t.get_term_id(tx, tokens.get_token_string(term)?).await?;
res.push(term_id); res.push(opt_term_id);
} else {
missing = false;
}
} }
Ok((res, missing)) Ok(res)
} }
/// This method is used for indexing. /// This method is used for indexing.

View file

@ -5,7 +5,7 @@ mod highlighter;
mod offsets; mod offsets;
mod postings; mod postings;
pub(super) mod scorer; pub(super) mod scorer;
mod termdocs; pub(super) mod termdocs;
pub(crate) mod terms; pub(crate) mod terms;
use crate::err::Error; use crate::err::Error;
@ -299,22 +299,29 @@ impl FtIndex {
&self, &self,
tx: &mut Transaction, tx: &mut Transaction,
query_string: String, query_string: String,
) -> Result<Vec<TermId>, Error> { ) -> Result<Vec<Option<TermId>>, Error> {
let t = self.terms(tx).await?; let t = self.terms(tx).await?;
let (terms, _) = self.analyzer.extract_terms(&t, tx, query_string).await?; let terms = self.analyzer.extract_terms(&t, tx, query_string).await?;
Ok(terms) Ok(terms)
} }
pub(super) async fn get_terms_docs( pub(super) async fn get_terms_docs(
&self, &self,
tx: &mut Transaction, tx: &mut Transaction,
terms: &Vec<TermId>, terms: &Vec<Option<TermId>>,
) -> Result<Vec<(TermId, RoaringTreemap)>, Error> { ) -> Result<Vec<Option<(TermId, RoaringTreemap)>>, Error> {
let mut terms_docs = Vec::with_capacity(terms.len()); let mut terms_docs = Vec::with_capacity(terms.len());
let td = self.term_docs(); let td = self.term_docs();
for term_id in terms { for opt_term_id in terms {
if let Some(term_docs) = td.get_docs(tx, *term_id).await? { if let Some(term_id) = opt_term_id {
terms_docs.push((*term_id, term_docs)); let docs = td.get_docs(tx, *term_id).await?;
if let Some(docs) = docs {
terms_docs.push(Some((*term_id, docs)));
} else {
terms_docs.push(Some((*term_id, RoaringTreemap::new())));
}
} else {
terms_docs.push(None);
} }
} }
Ok(terms_docs) Ok(terms_docs)
@ -323,14 +330,18 @@ impl FtIndex {
pub(super) async fn new_hits_iterator( pub(super) async fn new_hits_iterator(
&self, &self,
tx: &mut Transaction, tx: &mut Transaction,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>, terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
) -> Result<Option<HitsIterator>, Error> { ) -> Result<Option<HitsIterator>, Error> {
let mut hits: Option<RoaringTreemap> = None; let mut hits: Option<RoaringTreemap> = None;
for (_, term_docs) in terms_docs.iter() { for opt_term_docs in terms_docs.iter() {
if let Some(h) = hits { if let Some((_, term_docs)) = opt_term_docs {
hits = Some(h.bitand(term_docs)); if let Some(h) = hits {
hits = Some(h.bitand(term_docs));
} else {
hits = Some(term_docs.clone());
}
} else { } else {
hits = Some(term_docs.clone()); return Ok(None);
} }
} }
if let Some(hits) = hits { if let Some(hits) = hits {
@ -345,7 +356,7 @@ impl FtIndex {
pub(super) async fn new_scorer( pub(super) async fn new_scorer(
&self, &self,
tx: &mut Transaction, tx: &mut Transaction,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>, terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
) -> Result<Option<BM25Scorer>, Error> { ) -> Result<Option<BM25Scorer>, Error> {
if let Some(bm25) = &self.bm25 { if let Some(bm25) = &self.bm25 {
return Ok(Some(BM25Scorer::new( return Ok(Some(BM25Scorer::new(
@ -365,7 +376,7 @@ impl FtIndex {
&self, &self,
tx: &mut Transaction, tx: &mut Transaction,
thg: &Thing, thg: &Thing,
terms: &Vec<TermId>, terms: &[Option<TermId>],
prefix: Value, prefix: Value,
suffix: Value, suffix: Value,
idiom: &Idiom, idiom: &Idiom,
@ -376,7 +387,7 @@ impl FtIndex {
if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? { if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? {
let o = self.offsets(); let o = self.offsets();
let mut hl = Highlighter::new(prefix, suffix, idiom, doc); let mut hl = Highlighter::new(prefix, suffix, idiom, doc);
for term_id in terms { for term_id in terms.iter().flatten() {
let o = o.get_offsets(tx, doc_id, *term_id).await?; let o = o.get_offsets(tx, doc_id, *term_id).await?;
if let Some(o) = o { if let Some(o) = o {
hl.highlight(o.0); hl.highlight(o.0);
@ -391,14 +402,14 @@ impl FtIndex {
&self, &self,
tx: &mut Transaction, tx: &mut Transaction,
thg: &Thing, thg: &Thing,
terms: &Vec<TermId>, terms: &[Option<TermId>],
) -> Result<Value, Error> { ) -> Result<Value, Error> {
let doc_key: Key = thg.into(); let doc_key: Key = thg.into();
let doc_ids = self.doc_ids(tx).await?; let doc_ids = self.doc_ids(tx).await?;
if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? { if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? {
let o = self.offsets(); let o = self.offsets();
let mut or = Offseter::default(); let mut or = Offseter::default();
for term_id in terms { for term_id in terms.iter().flatten() {
let o = o.get_offsets(tx, doc_id, *term_id).await?; let o = o.get_offsets(tx, doc_id, *term_id).await?;
if let Some(o) = o { if let Some(o) = o {
or.highlight(o.0); or.highlight(o.0);

View file

@ -12,7 +12,7 @@ pub(super) type Score = f32;
pub(crate) struct BM25Scorer { pub(crate) struct BM25Scorer {
postings: Postings, postings: Postings,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>, terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
doc_lengths: DocLengths, doc_lengths: DocLengths,
average_doc_length: f32, average_doc_length: f32,
doc_count: f32, doc_count: f32,
@ -22,7 +22,7 @@ pub(crate) struct BM25Scorer {
impl BM25Scorer { impl BM25Scorer {
pub(super) fn new( pub(super) fn new(
postings: Postings, postings: Postings,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>, terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
doc_lengths: DocLengths, doc_lengths: DocLengths,
total_docs_length: u128, total_docs_length: u128,
doc_count: u64, doc_count: u64,
@ -55,7 +55,7 @@ impl BM25Scorer {
doc_id: DocId, doc_id: DocId,
) -> Result<Option<Score>, Error> { ) -> Result<Option<Score>, Error> {
let mut sc = 0.0; let mut sc = 0.0;
for (term_id, docs) in self.terms_docs.iter() { for (term_id, docs) in self.terms_docs.iter().flatten() {
if docs.contains(doc_id) { if docs.contains(doc_id) {
if let Some(term_freq) = if let Some(term_freq) =
self.postings.get_term_frequency(tx, *term_id, doc_id).await? self.postings.get_term_frequency(tx, *term_id, doc_id).await?

View file

@ -5,6 +5,9 @@ use crate::idx::ft::terms::TermId;
use crate::idx::{IndexKeyBase, SerdeState}; use crate::idx::{IndexKeyBase, SerdeState};
use crate::kvs::Transaction; use crate::kvs::Transaction;
use roaring::RoaringTreemap; use roaring::RoaringTreemap;
use std::sync::Arc;
pub(in crate::idx) type TermsDocs = Arc<Vec<Option<(TermId, RoaringTreemap)>>>;
pub(super) struct TermDocs { pub(super) struct TermDocs {
index_key_base: IndexKeyBase, index_key_base: IndexKeyBase,

View file

@ -2,6 +2,7 @@ use crate::dbs::{Options, Transaction};
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::{DocId, DocIds}; use crate::idx::ft::docids::{DocId, DocIds};
use crate::idx::ft::scorer::BM25Scorer; use crate::idx::ft::scorer::BM25Scorer;
use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::terms::TermId; use crate::idx::ft::terms::TermId;
use crate::idx::ft::{FtIndex, MatchRef}; use crate::idx::ft::{FtIndex, MatchRef};
use crate::idx::planner::plan::IndexOption; use crate::idx::planner::plan::IndexOption;
@ -92,7 +93,7 @@ impl QueryExecutor {
}) })
} }
pub(super) fn pre_match_terms_docs(&self) -> Option<Arc<Vec<(TermId, RoaringTreemap)>>> { pub(super) fn pre_match_terms_docs(&self) -> Option<TermsDocs> {
if let Some(entry) = &self.pre_match_entry { if let Some(entry) = &self.pre_match_entry {
return Some(entry.0.terms_docs.clone()); return Some(entry.0.terms_docs.clone());
} }
@ -130,8 +131,18 @@ impl QueryExecutor {
let mut run = txn.lock().await; let mut run = txn.lock().await;
let doc_key: Key = thg.into(); let doc_key: Key = thg.into();
if let Some(doc_id) = ft.0.doc_ids.get_doc_id(&mut run, doc_key).await? { if let Some(doc_id) = ft.0.doc_ids.get_doc_id(&mut run, doc_key).await? {
for (_, docs) in ft.0.terms_docs.iter() { let term_goals = ft.0.terms_docs.len();
if !docs.contains(doc_id) { // If there is no terms, it can't be a match
if term_goals == 0 {
return Ok(Value::Bool(false));
}
for opt_td in ft.0.terms_docs.iter() {
if let Some((_, docs)) = opt_td {
if !docs.contains(doc_id) {
return Ok(Value::Bool(false));
}
} else {
// If one of the term is missing, it can't be a match
return Ok(Value::Bool(false)); return Ok(Value::Bool(false));
} }
} }
@ -227,8 +238,8 @@ struct FtEntry(Arc<Inner>);
struct Inner { struct Inner {
index_option: IndexOption, index_option: IndexOption,
doc_ids: DocIds, doc_ids: DocIds,
terms: Vec<TermId>, terms: Vec<Option<TermId>>,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>, terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
scorer: Option<BM25Scorer>, scorer: Option<BM25Scorer>,
} }

View file

@ -1,7 +1,7 @@
use crate::dbs::{Options, Transaction}; use crate::dbs::{Options, Transaction};
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::{DocId, NO_DOC_ID}; use crate::idx::ft::docids::{DocId, NO_DOC_ID};
use crate::idx::ft::terms::TermId; use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::{FtIndex, HitsIterator, MatchRef}; use crate::idx::ft::{FtIndex, HitsIterator, MatchRef};
use crate::idx::planner::executor::QueryExecutor; use crate::idx::planner::executor::QueryExecutor;
use crate::idx::IndexKeyBase; use crate::idx::IndexKeyBase;
@ -12,7 +12,6 @@ use crate::sql::scoring::Scoring;
use crate::sql::statements::DefineIndexStatement; use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Array, Expression, Ident, Idiom, Object, Operator, Thing, Value}; use crate::sql::{Array, Expression, Ident, Idiom, Object, Operator, Thing, Value};
use async_trait::async_trait; use async_trait::async_trait;
use roaring::RoaringTreemap;
use std::collections::HashMap; use std::collections::HashMap;
use std::hash::Hash; use std::hash::Hash;
use std::sync::Arc; use std::sync::Arc;
@ -260,7 +259,7 @@ impl MatchesThingIterator {
hl: bool, hl: bool,
sc: &Scoring, sc: &Scoring,
order: u32, order: u32,
terms_docs: Option<Arc<Vec<(TermId, RoaringTreemap)>>>, terms_docs: Option<TermsDocs>,
) -> Result<Self, Error> { ) -> Result<Self, Error> {
let ikb = IndexKeyBase::new(opt, ix); let ikb = IndexKeyBase::new(opt, ix);
if let Scoring::Bm { if let Scoring::Bm {

View file

@ -234,16 +234,18 @@ async fn select_where_matches_without_using_index_and_score() -> Result<(), Erro
CREATE blog:4 SET title = 'the dog sat there and did nothing'; CREATE blog:4 SET title = 'the dog sat there and did nothing';
DEFINE ANALYZER simple TOKENIZERS blank,class; DEFINE ANALYZER simple TOKENIZERS blank,class;
DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
SELECT id,search::score(1) AS score FROM blog WHERE (title @1@ 'animals' AND id>0) OR (title @1@ 'animals' AND id<99); SELECT id,search::score(1) AS score FROM blog WHERE (title @1@ 'animals' AND id>0) OR (title @1@ 'animals' AND id<99);
SELECT id,search::score(1) + search::score(2) AS score FROM blog WHERE title @1@ 'dummy1' OR title @2@ 'dummy2';
"; ";
let dbs = Datastore::new("memory").await?; let dbs = Datastore::new("memory").await?;
let ses = Session::for_kv().with_ns("test").with_db("test"); let ses = Session::for_kv().with_ns("test").with_db("test");
let res = &mut dbs.execute(&sql, &ses, None, false).await?; let res = &mut dbs.execute(&sql, &ses, None, false).await?;
assert_eq!(res.len(), 7); assert_eq!(res.len(), 8);
// //
for _ in 0..6 { for _ in 0..6 {
let _ = res.remove(0).result?; let _ = res.remove(0).result?;
} }
let tmp = res.remove(0).result?; let tmp = res.remove(0).result?;
let val = Value::parse( let val = Value::parse(
"[ "[
@ -254,5 +256,10 @@ async fn select_where_matches_without_using_index_and_score() -> Result<(), Erro
]", ]",
); );
assert_eq!(tmp, val); assert_eq!(tmp, val);
// This result should be empty, as we are looking for non-existing terms (dummy1 and dummy2).
let tmp = res.remove(0).result?;
let val = Value::parse("[]");
assert_eq!(tmp, val);
Ok(()) Ok(())
} }