Bugfix - matches operator returns results on non-matching terms (#2185)
This commit is contained in:
parent
e0f885e736
commit
73a4e54ad5
7 changed files with 66 additions and 39 deletions
|
@ -41,7 +41,7 @@ impl Analyzer {
|
|||
t: &Terms,
|
||||
tx: &mut Transaction,
|
||||
query_string: String,
|
||||
) -> Result<(Vec<TermId>, bool), Error> {
|
||||
) -> Result<Vec<Option<TermId>>, Error> {
|
||||
let tokens = self.analyze(query_string)?;
|
||||
// We first collect every unique terms
|
||||
// as it can contains duplicates
|
||||
|
@ -49,17 +49,13 @@ impl Analyzer {
|
|||
for token in tokens.list() {
|
||||
terms.insert(token);
|
||||
}
|
||||
let mut missing = false;
|
||||
// Now we can extract the term ids
|
||||
let mut res = Vec::with_capacity(terms.len());
|
||||
for term in terms {
|
||||
if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)?).await? {
|
||||
res.push(term_id);
|
||||
} else {
|
||||
missing = false;
|
||||
}
|
||||
let opt_term_id = t.get_term_id(tx, tokens.get_token_string(term)?).await?;
|
||||
res.push(opt_term_id);
|
||||
}
|
||||
Ok((res, missing))
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// This method is used for indexing.
|
||||
|
|
|
@ -5,7 +5,7 @@ mod highlighter;
|
|||
mod offsets;
|
||||
mod postings;
|
||||
pub(super) mod scorer;
|
||||
mod termdocs;
|
||||
pub(super) mod termdocs;
|
||||
pub(crate) mod terms;
|
||||
|
||||
use crate::err::Error;
|
||||
|
@ -299,22 +299,29 @@ impl FtIndex {
|
|||
&self,
|
||||
tx: &mut Transaction,
|
||||
query_string: String,
|
||||
) -> Result<Vec<TermId>, Error> {
|
||||
) -> Result<Vec<Option<TermId>>, Error> {
|
||||
let t = self.terms(tx).await?;
|
||||
let (terms, _) = self.analyzer.extract_terms(&t, tx, query_string).await?;
|
||||
let terms = self.analyzer.extract_terms(&t, tx, query_string).await?;
|
||||
Ok(terms)
|
||||
}
|
||||
|
||||
pub(super) async fn get_terms_docs(
|
||||
&self,
|
||||
tx: &mut Transaction,
|
||||
terms: &Vec<TermId>,
|
||||
) -> Result<Vec<(TermId, RoaringTreemap)>, Error> {
|
||||
terms: &Vec<Option<TermId>>,
|
||||
) -> Result<Vec<Option<(TermId, RoaringTreemap)>>, Error> {
|
||||
let mut terms_docs = Vec::with_capacity(terms.len());
|
||||
let td = self.term_docs();
|
||||
for term_id in terms {
|
||||
if let Some(term_docs) = td.get_docs(tx, *term_id).await? {
|
||||
terms_docs.push((*term_id, term_docs));
|
||||
for opt_term_id in terms {
|
||||
if let Some(term_id) = opt_term_id {
|
||||
let docs = td.get_docs(tx, *term_id).await?;
|
||||
if let Some(docs) = docs {
|
||||
terms_docs.push(Some((*term_id, docs)));
|
||||
} else {
|
||||
terms_docs.push(Some((*term_id, RoaringTreemap::new())));
|
||||
}
|
||||
} else {
|
||||
terms_docs.push(None);
|
||||
}
|
||||
}
|
||||
Ok(terms_docs)
|
||||
|
@ -323,14 +330,18 @@ impl FtIndex {
|
|||
pub(super) async fn new_hits_iterator(
|
||||
&self,
|
||||
tx: &mut Transaction,
|
||||
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
|
||||
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
|
||||
) -> Result<Option<HitsIterator>, Error> {
|
||||
let mut hits: Option<RoaringTreemap> = None;
|
||||
for (_, term_docs) in terms_docs.iter() {
|
||||
if let Some(h) = hits {
|
||||
hits = Some(h.bitand(term_docs));
|
||||
for opt_term_docs in terms_docs.iter() {
|
||||
if let Some((_, term_docs)) = opt_term_docs {
|
||||
if let Some(h) = hits {
|
||||
hits = Some(h.bitand(term_docs));
|
||||
} else {
|
||||
hits = Some(term_docs.clone());
|
||||
}
|
||||
} else {
|
||||
hits = Some(term_docs.clone());
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
if let Some(hits) = hits {
|
||||
|
@ -345,7 +356,7 @@ impl FtIndex {
|
|||
pub(super) async fn new_scorer(
|
||||
&self,
|
||||
tx: &mut Transaction,
|
||||
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
|
||||
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
|
||||
) -> Result<Option<BM25Scorer>, Error> {
|
||||
if let Some(bm25) = &self.bm25 {
|
||||
return Ok(Some(BM25Scorer::new(
|
||||
|
@ -365,7 +376,7 @@ impl FtIndex {
|
|||
&self,
|
||||
tx: &mut Transaction,
|
||||
thg: &Thing,
|
||||
terms: &Vec<TermId>,
|
||||
terms: &[Option<TermId>],
|
||||
prefix: Value,
|
||||
suffix: Value,
|
||||
idiom: &Idiom,
|
||||
|
@ -376,7 +387,7 @@ impl FtIndex {
|
|||
if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? {
|
||||
let o = self.offsets();
|
||||
let mut hl = Highlighter::new(prefix, suffix, idiom, doc);
|
||||
for term_id in terms {
|
||||
for term_id in terms.iter().flatten() {
|
||||
let o = o.get_offsets(tx, doc_id, *term_id).await?;
|
||||
if let Some(o) = o {
|
||||
hl.highlight(o.0);
|
||||
|
@ -391,14 +402,14 @@ impl FtIndex {
|
|||
&self,
|
||||
tx: &mut Transaction,
|
||||
thg: &Thing,
|
||||
terms: &Vec<TermId>,
|
||||
terms: &[Option<TermId>],
|
||||
) -> Result<Value, Error> {
|
||||
let doc_key: Key = thg.into();
|
||||
let doc_ids = self.doc_ids(tx).await?;
|
||||
if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? {
|
||||
let o = self.offsets();
|
||||
let mut or = Offseter::default();
|
||||
for term_id in terms {
|
||||
for term_id in terms.iter().flatten() {
|
||||
let o = o.get_offsets(tx, doc_id, *term_id).await?;
|
||||
if let Some(o) = o {
|
||||
or.highlight(o.0);
|
||||
|
|
|
@ -12,7 +12,7 @@ pub(super) type Score = f32;
|
|||
|
||||
pub(crate) struct BM25Scorer {
|
||||
postings: Postings,
|
||||
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
|
||||
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
|
||||
doc_lengths: DocLengths,
|
||||
average_doc_length: f32,
|
||||
doc_count: f32,
|
||||
|
@ -22,7 +22,7 @@ pub(crate) struct BM25Scorer {
|
|||
impl BM25Scorer {
|
||||
pub(super) fn new(
|
||||
postings: Postings,
|
||||
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
|
||||
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
|
||||
doc_lengths: DocLengths,
|
||||
total_docs_length: u128,
|
||||
doc_count: u64,
|
||||
|
@ -55,7 +55,7 @@ impl BM25Scorer {
|
|||
doc_id: DocId,
|
||||
) -> Result<Option<Score>, Error> {
|
||||
let mut sc = 0.0;
|
||||
for (term_id, docs) in self.terms_docs.iter() {
|
||||
for (term_id, docs) in self.terms_docs.iter().flatten() {
|
||||
if docs.contains(doc_id) {
|
||||
if let Some(term_freq) =
|
||||
self.postings.get_term_frequency(tx, *term_id, doc_id).await?
|
||||
|
|
|
@ -5,6 +5,9 @@ use crate::idx::ft::terms::TermId;
|
|||
use crate::idx::{IndexKeyBase, SerdeState};
|
||||
use crate::kvs::Transaction;
|
||||
use roaring::RoaringTreemap;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub(in crate::idx) type TermsDocs = Arc<Vec<Option<(TermId, RoaringTreemap)>>>;
|
||||
|
||||
pub(super) struct TermDocs {
|
||||
index_key_base: IndexKeyBase,
|
||||
|
|
|
@ -2,6 +2,7 @@ use crate::dbs::{Options, Transaction};
|
|||
use crate::err::Error;
|
||||
use crate::idx::ft::docids::{DocId, DocIds};
|
||||
use crate::idx::ft::scorer::BM25Scorer;
|
||||
use crate::idx::ft::termdocs::TermsDocs;
|
||||
use crate::idx::ft::terms::TermId;
|
||||
use crate::idx::ft::{FtIndex, MatchRef};
|
||||
use crate::idx::planner::plan::IndexOption;
|
||||
|
@ -92,7 +93,7 @@ impl QueryExecutor {
|
|||
})
|
||||
}
|
||||
|
||||
pub(super) fn pre_match_terms_docs(&self) -> Option<Arc<Vec<(TermId, RoaringTreemap)>>> {
|
||||
pub(super) fn pre_match_terms_docs(&self) -> Option<TermsDocs> {
|
||||
if let Some(entry) = &self.pre_match_entry {
|
||||
return Some(entry.0.terms_docs.clone());
|
||||
}
|
||||
|
@ -130,8 +131,18 @@ impl QueryExecutor {
|
|||
let mut run = txn.lock().await;
|
||||
let doc_key: Key = thg.into();
|
||||
if let Some(doc_id) = ft.0.doc_ids.get_doc_id(&mut run, doc_key).await? {
|
||||
for (_, docs) in ft.0.terms_docs.iter() {
|
||||
if !docs.contains(doc_id) {
|
||||
let term_goals = ft.0.terms_docs.len();
|
||||
// If there is no terms, it can't be a match
|
||||
if term_goals == 0 {
|
||||
return Ok(Value::Bool(false));
|
||||
}
|
||||
for opt_td in ft.0.terms_docs.iter() {
|
||||
if let Some((_, docs)) = opt_td {
|
||||
if !docs.contains(doc_id) {
|
||||
return Ok(Value::Bool(false));
|
||||
}
|
||||
} else {
|
||||
// If one of the term is missing, it can't be a match
|
||||
return Ok(Value::Bool(false));
|
||||
}
|
||||
}
|
||||
|
@ -227,8 +238,8 @@ struct FtEntry(Arc<Inner>);
|
|||
struct Inner {
|
||||
index_option: IndexOption,
|
||||
doc_ids: DocIds,
|
||||
terms: Vec<TermId>,
|
||||
terms_docs: Arc<Vec<(TermId, RoaringTreemap)>>,
|
||||
terms: Vec<Option<TermId>>,
|
||||
terms_docs: Arc<Vec<Option<(TermId, RoaringTreemap)>>>,
|
||||
scorer: Option<BM25Scorer>,
|
||||
}
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
use crate::dbs::{Options, Transaction};
|
||||
use crate::err::Error;
|
||||
use crate::idx::ft::docids::{DocId, NO_DOC_ID};
|
||||
use crate::idx::ft::terms::TermId;
|
||||
use crate::idx::ft::termdocs::TermsDocs;
|
||||
use crate::idx::ft::{FtIndex, HitsIterator, MatchRef};
|
||||
use crate::idx::planner::executor::QueryExecutor;
|
||||
use crate::idx::IndexKeyBase;
|
||||
|
@ -12,7 +12,6 @@ use crate::sql::scoring::Scoring;
|
|||
use crate::sql::statements::DefineIndexStatement;
|
||||
use crate::sql::{Array, Expression, Ident, Idiom, Object, Operator, Thing, Value};
|
||||
use async_trait::async_trait;
|
||||
use roaring::RoaringTreemap;
|
||||
use std::collections::HashMap;
|
||||
use std::hash::Hash;
|
||||
use std::sync::Arc;
|
||||
|
@ -260,7 +259,7 @@ impl MatchesThingIterator {
|
|||
hl: bool,
|
||||
sc: &Scoring,
|
||||
order: u32,
|
||||
terms_docs: Option<Arc<Vec<(TermId, RoaringTreemap)>>>,
|
||||
terms_docs: Option<TermsDocs>,
|
||||
) -> Result<Self, Error> {
|
||||
let ikb = IndexKeyBase::new(opt, ix);
|
||||
if let Scoring::Bm {
|
||||
|
|
|
@ -234,16 +234,18 @@ async fn select_where_matches_without_using_index_and_score() -> Result<(), Erro
|
|||
CREATE blog:4 SET title = 'the dog sat there and did nothing';
|
||||
DEFINE ANALYZER simple TOKENIZERS blank,class;
|
||||
DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS;
|
||||
SELECT id,search::score(1) AS score FROM blog WHERE (title @1@ 'animals' AND id>0) OR (title @1@ 'animals' AND id<99);
|
||||
SELECT id,search::score(1) AS score FROM blog WHERE (title @1@ 'animals' AND id>0) OR (title @1@ 'animals' AND id<99);
|
||||
SELECT id,search::score(1) + search::score(2) AS score FROM blog WHERE title @1@ 'dummy1' OR title @2@ 'dummy2';
|
||||
";
|
||||
let dbs = Datastore::new("memory").await?;
|
||||
let ses = Session::for_kv().with_ns("test").with_db("test");
|
||||
let res = &mut dbs.execute(&sql, &ses, None, false).await?;
|
||||
assert_eq!(res.len(), 7);
|
||||
assert_eq!(res.len(), 8);
|
||||
//
|
||||
for _ in 0..6 {
|
||||
let _ = res.remove(0).result?;
|
||||
}
|
||||
|
||||
let tmp = res.remove(0).result?;
|
||||
let val = Value::parse(
|
||||
"[
|
||||
|
@ -254,5 +256,10 @@ async fn select_where_matches_without_using_index_and_score() -> Result<(), Erro
|
|||
]",
|
||||
);
|
||||
assert_eq!(tmp, val);
|
||||
|
||||
// This result should be empty, as we are looking for non-existing terms (dummy1 and dummy2).
|
||||
let tmp = res.remove(0).result?;
|
||||
let val = Value::parse("[]");
|
||||
assert_eq!(tmp, val);
|
||||
Ok(())
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue