Bugfix - matches operator returns results on non-matching terms (#2185)

Emmanuel Keller 2023-06-26 19:23:05 +01:00 committed by GitHub
parent e0f885e736
commit 73a4e54ad5
7 changed files with 66 additions and 39 deletions
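The root cause: query terms that were not found in the index were silently dropped by extract_terms (the old code only reported them through a missing flag that was set to false instead of true in the else branch, and discarded by the caller anyway), so a matches predicate containing unknown terms degenerated into a search over the remaining terms and could still return rows. The diffs below keep a slot for every term as an Option<TermId>, with None marking an unknown term, and propagate that through term-docs lookup, hit iteration, scoring and the executor check, so a single unknown term now makes the whole predicate fail. The sketch below is illustrative only: simplified types, hypothetical names (resolve_terms, hits) and a plain BTreeSet standing in for RoaringTreemap, not the SurrealDB API.

use std::collections::BTreeSet;

type TermId = u64;
type DocSet = BTreeSet<u64>;

// Resolve each query term against a toy dictionary; None marks a term
// that is not indexed at all.
fn resolve_terms(dictionary: &[(&str, TermId)], query: &[&str]) -> Vec<Option<TermId>> {
    query
        .iter()
        .map(|t| dictionary.iter().find(|(w, _)| w == t).map(|(_, id)| *id))
        .collect()
}

// Intersect the posting sets of every term. A single unresolved term
// short-circuits to None ("no hits"), mirroring new_hits_iterator
// returning Ok(None) when it meets a missing term.
fn hits(terms: &[Option<TermId>], postings: impl Fn(TermId) -> DocSet) -> Option<DocSet> {
    let mut acc: Option<DocSet> = None;
    for term in terms {
        let id = (*term)?; // unknown term => the conjunction can never hold
        let docs = postings(id);
        acc = Some(match acc {
            Some(prev) => prev.intersection(&docs).copied().collect(),
            None => docs,
        });
    }
    acc
}

fn main() {
    let dict = [("dog", 1), ("sat", 2)];
    let postings = |id: TermId| -> DocSet {
        match id {
            1 => DocSet::from([10, 11]),
            2 => DocSet::from([11]),
            _ => DocSet::new(),
        }
    };
    // Both terms are indexed: only documents containing "dog" AND "sat" match.
    assert_eq!(hits(&resolve_terms(&dict, &["dog", "sat"]), &postings), Some(DocSet::from([11])));
    // "dummy" is not indexed: previously it was dropped and the query matched
    // on "dog" alone; with the fix it yields no hits at all.
    assert_eq!(hits(&resolve_terms(&dict, &["dog", "dummy"]), &postings), None);
}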

@@ -41,7 +41,7 @@ impl Analyzer {
t: &Terms,
tx: &mut Transaction,
query_string: String,
) -> Result<(Vec<TermId>, bool), Error> {
) -> Result<Vec<Option<TermId>>, Error> {
let tokens = self.analyze(query_string)?;
// We first collect every unique term
// as it can contain duplicates
@@ -49,17 +49,13 @@ impl Analyzer {
for token in tokens.list() {
terms.insert(token);
}
let mut missing = false;
// Now we can extract the term ids
let mut res = Vec::with_capacity(terms.len());
for term in terms {
if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)?).await? {
res.push(term_id);
} else {
missing = false;
}
let opt_term_id = t.get_term_id(tx, tokens.get_token_string(term)?).await?;
res.push(opt_term_id);
}
Ok((res, missing))
Ok(res)
}
/// This method is used for indexing.

@@ -5,7 +5,7 @@ mod highlighter;
mod offsets;
mod postings;
pub(super) mod scorer;
mod termdocs;
pub(super) mod termdocs;
pub(crate) mod terms;
use crate::err::Error;
@@ -299,22 +299,29 @@ impl FtIndex {
&self,
tx: &mut Transaction,
query_string: String,
) -> Result<Vec<TermId>, Error> {
) -> Result<Vec<Option<TermId>>, Error> {
let t = self.terms(tx).await?;
let (terms, _) = self.analyzer.extract_terms(&t, tx, query_string).await?;
let terms = self.analyzer.extract_terms(&t, tx, query_string).await?;
Ok(terms)
}
pub(super) async fn get_terms_docs(
&self,
tx: &mut Transaction,
terms: &Vec<TermId>,
) -> Result<Vec<(TermId, RoaringTreemap)>, Error> {
terms: &Vec<Option<TermId>>,
) -> Result<Vec<Option<(TermId, RoaringTreemap)>>, Error> {
let mut terms_docs = Vec::with_capacity(terms.len());
let td = self.term_docs();
for term_id in terms {
if let Some(term_docs) = td.get_docs(tx, *term_id).await? {
terms_docs.push((*term_id, term_docs));
for opt_term_id in terms {
if let Some(term_id) = opt_term_id {
let docs = td.get_docs(tx, *term_id).await?;
if let Some(docs) = docs {
terms_docs.push(Some((*term_id, docs)));
} else {
terms_docs.push(Some((*term_id, RoaringTreemap::new())));
}
} else {
terms_docs.push(None);
}
}
Ok(terms_docs)
@@ -323,14 +330,18 @@ impl FtIndex {
pub(super) async fn new_hits_iterator(
&self,
tx: &mut Transaction,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
) -> Result<Option<HitsIterator>, Error> {
let mut hits: Option<RoaringTreemap> = None;
for (_, term_docs) in terms_docs.iter() {
if let Some(h) = hits {
hits = Some(h.bitand(term_docs));
for opt_term_docs in terms_docs.iter() {
if let Some((_, term_docs)) = opt_term_docs {
if let Some(h) = hits {
hits = Some(h.bitand(term_docs));
} else {
hits = Some(term_docs.clone());
}
} else {
hits = Some(term_docs.clone());
return Ok(None);
}
}
if let Some(hits) = hits {
@@ -345,7 +356,7 @@ impl FtIndex {
pub(super) async fn new_scorer(
&self,
tx: &mut Transaction,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
) -> Result<Option<BM25Scorer>, Error> {
if let Some(bm25) = &self.bm25 {
return Ok(Some(BM25Scorer::new(
@@ -365,7 +376,7 @@ impl FtIndex {
&self,
tx: &mut Transaction,
thg: &Thing,
terms: &Vec<TermId>,
terms: &[Option<TermId>],
prefix: Value,
suffix: Value,
idiom: &Idiom,
@@ -376,7 +387,7 @@ impl FtIndex {
if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? {
let o = self.offsets();
let mut hl = Highlighter::new(prefix, suffix, idiom, doc);
for term_id in terms {
for term_id in terms.iter().flatten() {
let o = o.get_offsets(tx, doc_id, *term_id).await?;
if let Some(o) = o {
hl.highlight(o.0);
@@ -391,14 +402,14 @@ impl FtIndex {
&self,
tx: &mut Transaction,
thg: &Thing,
terms: &Vec<TermId>,
terms: &[Option<TermId>],
) -> Result<Value, Error> {
let doc_key: Key = thg.into();
let doc_ids = self.doc_ids(tx).await?;
if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? {
let o = self.offsets();
let mut or = Offseter::default();
for term_id in terms {
for term_id in terms.iter().flatten() {
let o = o.get_offsets(tx, doc_id, *term_id).await?;
if let Some(o) = o {
or.highlight(o.0);

@@ -12,7 +12,7 @@ pub(super) type Score = f32;
pub(crate) struct BM25Scorer {
postings: Postings,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
doc_lengths: DocLengths,
average_doc_length: f32,
doc_count: f32,
@@ -22,7 +22,7 @@ pub(crate) struct BM25Scorer {
impl BM25Scorer {
pub(super) fn new(
postings: Postings,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
doc_lengths: DocLengths,
total_docs_length: u128,
doc_count: u64,
@@ -55,7 +55,7 @@ impl BM25Scorer {
doc_id: DocId,
) -> Result<Option<Score>, Error> {
let mut sc = 0.0;
for (term_id, docs) in self.terms_docs.iter() {
for (term_id, docs) in self.terms_docs.iter().flatten() {
if docs.contains(doc_id) {
if let Some(term_freq) =
self.postings.get_term_frequency(tx, *term_id, doc_id).await?

@@ -5,6 +5,9 @@ use crate::idx::ft::terms::TermId;
use crate::idx::{IndexKeyBase, SerdeState};
use crate::kvs::Transaction;
use roaring::RoaringTreemap;
use std::sync::Arc;
pub(in crate::idx) type TermsDocs = Arc<Vec<Option<(TermId, RoaringTreemap)>>>;
pub(super) struct TermDocs {
index_key_base: IndexKeyBase,

@@ -2,6 +2,7 @@ use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::ft::docids::{DocId, DocIds};
use crate::idx::ft::scorer::BM25Scorer;
use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::terms::TermId;
use crate::idx::ft::{FtIndex, MatchRef};
use crate::idx::planner::plan::IndexOption;
@@ -92,7 +93,7 @@ impl QueryExecutor {
})
}
pub(super) fn pre_match_terms_docs(&self) -> Option<Arc<Vec<(TermId, RoaringTreemap)>>> {
pub(super) fn pre_match_terms_docs(&self) -> Option<TermsDocs> {
if let Some(entry) = &self.pre_match_entry {
return Some(entry.0.terms_docs.clone());
}
@@ -130,8 +131,18 @@ impl QueryExecutor {
let mut run = txn.lock().await;
let doc_key: Key = thg.into();
if let Some(doc_id) = ft.0.doc_ids.get_doc_id(&mut run, doc_key).await? {
for (_, docs) in ft.0.terms_docs.iter() {
if !docs.contains(doc_id) {
let term_goals = ft.0.terms_docs.len();
// If there are no terms, it can't be a match
if term_goals == 0 {
return Ok(Value::Bool(false));
}
for opt_td in ft.0.terms_docs.iter() {
if let Some((_, docs)) = opt_td {
if !docs.contains(doc_id) {
return Ok(Value::Bool(false));
}
} else {
// If one of the terms is missing, it can't be a match
return Ok(Value::Bool(false));
}
}
@@ -227,8 +238,8 @@ struct FtEntry(Arc<Inner>);
struct Inner {
index_option: IndexOption,
doc_ids: DocIds,
terms: Vec<TermId>,
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
terms: Vec<Option<TermId>>,
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
scorer: Option<BM25Scorer>,
}
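The executor change above implements a strict conjunction: an empty term list never matches, a term that is absent from the index (None) never matches, and otherwise every term's document bitmap must contain the candidate document. A minimal sketch of that rule, assuming simplified types and a hypothetical doc_matches helper rather than the SurrealDB API, using the roaring crate the index code already depends on:

use roaring::RoaringTreemap;

type TermId = u64;

// Sketch only: true when the query produced at least one term and every
// term both exists in the index (Some) and lists the document in its bitmap.
fn doc_matches(terms_docs: &[Option<(TermId, RoaringTreemap)>], doc_id: u64) -> bool {
    if terms_docs.is_empty() {
        // No terms were extracted from the query string: nothing can match.
        return false;
    }
    terms_docs.iter().all(|entry| match entry {
        Some((_, docs)) => docs.contains(doc_id),
        // One of the terms is not indexed at all: the conjunction fails.
        None => false,
    })
}

fn main() {
    let docs: RoaringTreemap = [1u64, 2, 3].into_iter().collect();
    let tds = vec![Some((7u64, docs))];
    assert!(doc_matches(&tds, 2));
    assert!(!doc_matches(&tds, 9)); // document not in the bitmap
    assert!(!doc_matches(&[None], 2)); // unknown term
    assert!(!doc_matches(&[], 2)); // empty term list
}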

@@ -1,7 +1,7 @@
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::ft::docids::{DocId, NO_DOC_ID};
use crate::idx::ft::terms::TermId;
use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::{FtIndex, HitsIterator, MatchRef};
use crate::idx::planner::executor::QueryExecutor;
use crate::idx::IndexKeyBase;
@@ -12,7 +12,6 @@ use crate::sql::scoring::Scoring;
use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Array, Expression, Ident, Idiom, Object, Operator, Thing, Value};
use async_trait::async_trait;
use roaring::RoaringTreemap;
use std::collections::HashMap;
use std::hash::Hash;
use std::sync::Arc;
@@ -260,7 +259,7 @@ impl MatchesThingIterator {
hl: bool,
sc: &Scoring,
order: u32,
terms_docs: Option<Arc<Vec<(TermId, RoaringTreemap)>>>,
terms_docs: Option<TermsDocs>,
) -> Result<Self, Error> {
let ikb = IndexKeyBase::new(opt, ix);
if let Scoring::Bm {

@@ -234,16 +234,18 @@ async fn select_where_matches_without_using_index_and_score() -> Result<(), Erro
CREATE blog:4 SET title = 'the dog sat there and did nothing';
DEFINE ANALYZER simple TOKENIZERS blank,class;
DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
SELECT id,search::score(1) AS score FROM blog WHERE (title @1@ 'animals' AND id>0) OR (title @1@ 'animals' AND id<99);
SELECT id,search::score(1) AS score FROM blog WHERE (title @1@ 'animals' AND id>0) OR (title @1@ 'animals' AND id<99);
SELECT id,search::score(1) + search::score(2) AS score FROM blog WHERE title @1@ 'dummy1' OR title @2@ 'dummy2';
";
let dbs = Datastore::new("memory").await?;
let ses = Session::for_kv().with_ns("test").with_db("test");
let res = &mut dbs.execute(&sql, &ses, None, false).await?;
assert_eq!(res.len(), 7);
assert_eq!(res.len(), 8);
//
for _ in 0..6 {
let _ = res.remove(0).result?;
}
let tmp = res.remove(0).result?;
let val = Value::parse(
"[
@@ -254,5 +256,10 @@ async fn select_where_matches_without_using_index_and_score() -> Result<(), Erro
]",
);
assert_eq!(tmp, val);
// This result should be empty, as we are looking for non-existing terms (dummy1 and dummy2).
let tmp = res.remove(0).result?;
let val = Value::parse("[]");
assert_eq!(tmp, val);
Ok(())
}