diff --git a/lib/src/ctx/context.rs b/lib/src/ctx/context.rs index b60387d9..339bd1a5 100644 --- a/lib/src/ctx/context.rs +++ b/lib/src/ctx/context.rs @@ -2,6 +2,7 @@ use crate::ctx::canceller::Canceller; use crate::ctx::reason::Reason; use crate::dbs::Transaction; use crate::err::Error; +use crate::idx::ft::docids::DocId; use crate::idx::planner::executor::QueryExecutor; use crate::sql::value::Value; use crate::sql::Thing; @@ -39,6 +40,8 @@ pub struct Context<'a> { query_executors: Option>>, // An optional record id thing: Option<&'a Thing>, + // An optional doc id + doc_id: Option, // An optional cursor document cursor_doc: Option<&'a Value>, } @@ -73,6 +76,7 @@ impl<'a> Context<'a> { transaction: None, query_executors: None, thing: None, + doc_id: None, cursor_doc: None, } } @@ -87,6 +91,7 @@ impl<'a> Context<'a> { transaction: parent.transaction.clone(), query_executors: parent.query_executors.clone(), thing: parent.thing, + doc_id: parent.doc_id, cursor_doc: parent.cursor_doc, } } @@ -123,6 +128,10 @@ impl<'a> Context<'a> { self.thing = Some(thing); } + pub fn add_doc_id(&mut self, doc_id: DocId) { + self.doc_id = Some(doc_id); + } + /// Add a cursor document to this context. /// Usage: A new child context is created by an iterator for each document. /// The iterator sets the value of the current document (known as cursor document). @@ -165,6 +174,10 @@ impl<'a> Context<'a> { self.thing } + pub fn doc_id(&self) -> Option { + self.doc_id + } + pub fn doc(&self) -> Option<&Value> { self.cursor_doc } diff --git a/lib/src/dbs/iterate.rs b/lib/src/dbs/iterate.rs index 7155a51e..00f0c21d 100644 --- a/lib/src/dbs/iterate.rs +++ b/lib/src/dbs/iterate.rs @@ -449,43 +449,51 @@ impl Iterable { let txn = ctx.try_clone_transaction()?; // Check that the table exists txn.lock().await.check_ns_db_tb(opt.ns(), opt.db(), &table.0, opt.strict).await?; - let mut iterator = plan.new_iterator(opt, &txn).await?; - let mut things = iterator.next_batch(&txn, 1000).await?; - while !things.is_empty() { - // Check if the context is finished - if ctx.is_done() { - break; - } - - for thing in things { - // Check the context + let exe = ctx.get_query_executor(&table.0); + if let Some(exe) = exe { + let mut iterator = plan.new_iterator(opt, &txn, exe).await?; + let mut things = iterator.next_batch(&txn, 1000).await?; + while !things.is_empty() { + // Check if the context is finished if ctx.is_done() { break; } - // If the record is from another table we can skip - if !thing.tb.eq(table.as_str()) { - continue; + for (thing, doc_id) in things { + // Check the context + if ctx.is_done() { + break; + } + + // If the record is from another table we can skip + if !thing.tb.eq(table.as_str()) { + continue; + } + + // Fetch the data from the store + let key = thing::new(opt.ns(), opt.db(), &table.0, &thing.id); + let val = txn.lock().await.get(key.clone()).await?; + let rid = Thing::from((key.tb, key.id)); + let mut ctx = Context::new(ctx); + ctx.add_thing(&rid); + ctx.add_doc_id(doc_id); + // Parse the data from the store + let val = Operable::Value(match val { + Some(v) => Value::from(v), + None => Value::None, + }); + // Process the document record + ite.process(&ctx, opt, stm, val).await; } - // Fetch the data from the store - let key = thing::new(opt.ns(), opt.db(), &table.0, &thing.id); - let val = txn.lock().await.get(key.clone()).await?; - let rid = Thing::from((key.tb, key.id)); - let mut ctx = Context::new(ctx); - ctx.add_thing(&rid); - // Parse the data from the store - let val = Operable::Value(match val { - Some(v) => Value::from(v), - None => Value::None, - }); - // Process the document record - ite.process(&ctx, opt, stm, val).await; + // Collect the next batch of ids + things = iterator.next_batch(&txn, 1000).await?; } - - // Collect the next batch of ids - things = iterator.next_batch(&txn, 1000).await?; + Ok(()) + } else { + Err(Error::QueryNotExecutedDetail { + message: "The QueryExecutor has not been found.".to_string(), + }) } - Ok(()) } } diff --git a/lib/src/err/mod.rs b/lib/src/err/mod.rs index 44c117b3..4971c5e6 100644 --- a/lib/src/err/mod.rs +++ b/lib/src/err/mod.rs @@ -1,3 +1,4 @@ +use crate::idx::ft::MatchRef; use crate::sql::idiom::Idiom; use crate::sql::value::Value; use base64_lib::DecodeError as Base64Error; @@ -497,6 +498,12 @@ pub enum Error { #[doc(hidden)] #[error("Bypass the query planner")] BypassQueryPlanner, + + /// Duplicated match references are not allowed + #[error("Duplicated Match reference: {mr}")] + DuplicatedMatchRef { + mr: MatchRef, + }, } impl From for String { diff --git a/lib/src/fnc/mod.rs b/lib/src/fnc/mod.rs index cb265ac1..2b8f97ad 100644 --- a/lib/src/fnc/mod.rs +++ b/lib/src/fnc/mod.rs @@ -302,6 +302,7 @@ pub async fn asynchronous(ctx: &Context<'_>, name: &str, args: Vec) -> Re "http::patch" => http::patch(ctx).await, "http::delete" => http::delete(ctx).await, // + "search::score" => search::score(ctx).await, "search::highlight" => search::highlight(ctx).await, "search::offsets" => search::offsets(ctx).await, // diff --git a/lib/src/fnc/script/modules/surrealdb/functions/search.rs b/lib/src/fnc/script/modules/surrealdb/functions/search.rs index 7a05167e..17c88f53 100644 --- a/lib/src/fnc/script/modules/surrealdb/functions/search.rs +++ b/lib/src/fnc/script/modules/surrealdb/functions/search.rs @@ -8,5 +8,6 @@ impl_module_def!( Package, "search", "highlight" => fut Async, - "offsets" => fut Async + "offsets" => fut Async, + "score" => fut Async ); diff --git a/lib/src/fnc/search.rs b/lib/src/fnc/search.rs index ebeee72b..d5736619 100644 --- a/lib/src/fnc/search.rs +++ b/lib/src/fnc/search.rs @@ -2,6 +2,16 @@ use crate::ctx::Context; use crate::err::Error; use crate::sql::Value; +pub async fn score(ctx: &Context<'_>, (match_ref,): (Value,)) -> Result { + if let Some(thg) = ctx.thing() { + if let Some(exe) = ctx.get_query_executor(&thg.tb) { + let txn = ctx.try_clone_transaction()?; + return exe.score(txn, &match_ref, thg, ctx.doc_id()).await; + } + } + Ok(Value::None) +} + pub async fn highlight( ctx: &Context<'_>, (prefix, suffix, match_ref): (Value, Value, Value), @@ -10,7 +20,7 @@ pub async fn highlight( if let Some(thg) = ctx.thing() { if let Some(exe) = ctx.get_query_executor(&thg.tb) { let txn = ctx.try_clone_transaction()?; - return exe.highlight(txn, thg, prefix, suffix, match_ref.clone(), doc).await; + return exe.highlight(txn, thg, prefix, suffix, &match_ref, doc).await; } } } @@ -21,7 +31,7 @@ pub async fn offsets(ctx: &Context<'_>, (match_ref,): (Value,)) -> Result, @@ -59,7 +61,7 @@ impl DocIds { doc_id } - pub(super) async fn get_doc_id( + pub(crate) async fn get_doc_id( &self, tx: &mut Transaction, doc_key: Key, diff --git a/lib/src/idx/ft/mod.rs b/lib/src/idx/ft/mod.rs index cb50f0fd..2f61802e 100644 --- a/lib/src/idx/ft/mod.rs +++ b/lib/src/idx/ft/mod.rs @@ -4,18 +4,18 @@ mod doclength; mod highlighter; mod offsets; mod postings; -mod scorer; +pub(super) mod scorer; mod termdocs; pub(crate) mod terms; use crate::err::Error; use crate::idx::ft::analyzer::Analyzer; -use crate::idx::ft::docids::DocIds; +use crate::idx::ft::docids::{DocId, DocIds}; use crate::idx::ft::doclength::DocLengths; use crate::idx::ft::highlighter::{Highlighter, Offseter}; use crate::idx::ft::offsets::Offsets; use crate::idx::ft::postings::Postings; -use crate::idx::ft::scorer::{BM25Scorer, Score}; +use crate::idx::ft::scorer::BM25Scorer; use crate::idx::ft::termdocs::TermDocs; use crate::idx::ft::terms::{TermId, Terms}; use crate::idx::{btree, IndexKeyBase, SerdeState}; @@ -27,6 +27,7 @@ use roaring::treemap::IntoIter; use roaring::RoaringTreemap; use serde::{Deserialize, Serialize}; use std::ops::BitAnd; +use std::sync::Arc; pub(crate) type MatchRef = u8; @@ -118,7 +119,7 @@ impl FtIndex { }) } - async fn doc_ids(&self, tx: &mut Transaction) -> Result { + pub(crate) async fn doc_ids(&self, tx: &mut Transaction) -> Result { DocIds::new(tx, self.index_key_base.clone(), self.order).await } @@ -304,75 +305,59 @@ impl FtIndex { Ok(terms) } - pub(super) async fn search( + pub(super) async fn get_terms_docs( &self, tx: &mut Transaction, - query_string: String, - ) -> Result<(Vec, Option), Error> { - let t = self.terms(tx).await?; - let td = self.term_docs(); - let (terms, missing) = self.analyzer.extract_terms(&t, tx, query_string).await?; - if missing { - // If any term does not exists, as we are doing an AND query, - // we can return an empty results set - return Ok((terms, None)); - } - let mut hits: Option = None; + terms: &Vec, + ) -> Result, Error> { let mut terms_docs = Vec::with_capacity(terms.len()); - for term_id in &terms { + let td = self.term_docs(); + for term_id in terms { if let Some(term_docs) = td.get_docs(tx, *term_id).await? { - if let Some(h) = hits { - hits = Some(h.bitand(&term_docs)); - } else { - hits = Some(term_docs.clone()); - } terms_docs.push((*term_id, term_docs)); } } + Ok(terms_docs) + } + + pub(super) async fn new_hits_iterator( + &self, + tx: &mut Transaction, + terms_docs: Arc>, + ) -> Result, Error> { + let mut hits: Option = None; + for (_, term_docs) in terms_docs.iter() { + if let Some(h) = hits { + hits = Some(h.bitand(term_docs)); + } else { + hits = Some(term_docs.clone()); + } + } if let Some(hits) = hits { if !hits.is_empty() { - let postings = self.postings(tx).await?; - let doc_lengths = self.doc_lengths(tx).await?; - - let mut scorer = None; - if let Some(bm25) = &self.bm25 { - scorer = Some(BM25Scorer::new( - doc_lengths, - self.state.total_docs_lengths, - self.state.doc_count, - bm25.clone(), - )); - } let doc_ids = self.doc_ids(tx).await?; - return Ok(( - terms, - Some(HitsIterator::new(doc_ids, postings, hits, terms_docs, scorer)), - )); + return Ok(Some(HitsIterator::new(doc_ids, hits))); } } - Ok((terms, None)) + Ok(None) } - pub(super) async fn match_id_value( + pub(super) async fn new_scorer( &self, tx: &mut Transaction, - thg: &Thing, - term: &str, - ) -> Result { - let doc_key: Key = thg.into(); - let doc_ids = self.doc_ids(tx).await?; - if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? { - let terms = self.terms(tx).await?; - if let Some(term_id) = terms.get_term_id(tx, term).await? { - let postings = self.postings(tx).await?; - if let Some(term_freq) = postings.get_term_frequency(tx, term_id, doc_id).await? { - if term_freq > 0 { - return Ok(true); - } - } - } + terms_docs: Arc>, + ) -> Result, Error> { + if let Some(bm25) = &self.bm25 { + return Ok(Some(BM25Scorer::new( + self.postings(tx).await?, + terms_docs, + self.doc_lengths(tx).await?, + self.state.total_docs_lengths, + self.state.doc_count, + bm25.clone(), + ))); } - Ok(false) + Ok(None) } #[allow(clippy::too_many_arguments)] @@ -437,51 +422,24 @@ impl FtIndex { pub(crate) struct HitsIterator { doc_ids: DocIds, - postings: Postings, iter: IntoIter, - terms_docs: Vec<(TermId, RoaringTreemap)>, - scorer: Option, } impl HitsIterator { - fn new( - doc_ids: DocIds, - postings: Postings, - hits: RoaringTreemap, - terms_docs: Vec<(TermId, RoaringTreemap)>, - scorer: Option, - ) -> Self { + fn new(doc_ids: DocIds, hits: RoaringTreemap) -> Self { Self { doc_ids, - postings, iter: hits.into_iter(), - terms_docs, - scorer, } } pub(crate) async fn next( &mut self, tx: &mut Transaction, - ) -> Result)>, Error> { + ) -> Result, Error> { for doc_id in self.iter.by_ref() { if let Some(doc_key) = self.doc_ids.get_doc_key(tx, doc_id).await? { - let score = if let Some(scorer) = &self.scorer { - let mut sc = 0.0; - for (term_id, docs) in &self.terms_docs { - if docs.contains(doc_id) { - if let Some(term_freq) = - self.postings.get_term_frequency(tx, *term_id, doc_id).await? - { - sc += scorer.score(tx, doc_id, docs.len(), term_freq).await?; - } - } - } - Some(sc) - } else { - None - }; - return Ok(Some((doc_key.into(), score))); + return Ok(Some((doc_key.into(), doc_id))); } } Ok(None) @@ -490,23 +448,27 @@ impl HitsIterator { #[cfg(test)] mod tests { - use crate::idx::ft::{FtIndex, HitsIterator, Score}; + use crate::idx::ft::scorer::{BM25Scorer, Score}; + use crate::idx::ft::{FtIndex, HitsIterator}; use crate::idx::IndexKeyBase; use crate::kvs::{Datastore, Transaction}; use crate::sql::scoring::Scoring; use crate::sql::statements::define::analyzer; use crate::sql::{Array, Thing}; use std::collections::HashMap; + use std::sync::Arc; use test_log::test; async fn check_hits( - i: Option, tx: &mut Transaction, + hits: Option, + scr: BM25Scorer, e: Vec<(&Thing, Option)>, ) { - if let Some(mut i) = i { + if let Some(mut hits) = hits { let mut map = HashMap::new(); - while let Some((k, s)) = i.next(tx).await.unwrap() { + while let Some((k, d)) = hits.next(tx).await.unwrap() { + let s = scr.score(tx, d).await.unwrap(); map.insert(k, s); } assert_eq!(map.len(), e.len()); @@ -518,6 +480,18 @@ mod tests { } } + async fn search( + tx: &mut Transaction, + fti: &FtIndex, + qs: &str, + ) -> (Option, BM25Scorer) { + let t = fti.extract_terms(tx, qs.to_string()).await.unwrap(); + let td = Arc::new(fti.get_terms_docs(tx, &t).await.unwrap()); + let scr = fti.new_scorer(tx, td.clone()).await.unwrap().unwrap(); + let hits = fti.new_hits_iterator(tx, td).await.unwrap(); + (hits, scr) + } + #[test(tokio::test)] async fn test_ft_index() { let ds = Datastore::new("memory").await.unwrap(); @@ -587,23 +561,23 @@ mod tests { assert_eq!(statistics.doc_lengths.keys_count, 3); // Search & score - let (_, i) = fti.search(&mut tx, "hello".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; + let (hits, scr) = search(&mut tx, &fti, "hello").await; + check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let (_, i) = fti.search(&mut tx, "world".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc1, Some(0.4859746))]).await; + let (hits, scr) = search(&mut tx, &fti, "world").await; + check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.4859746))]).await; - let (_, i) = fti.search(&mut tx, "yellow".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc2, Some(0.4859746))]).await; + let (hits, scr) = search(&mut tx, &fti, "yellow").await; + check_hits(&mut tx, hits, scr, vec![(&doc2, Some(0.4859746))]).await; - let (_, i) = fti.search(&mut tx, "foo".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc3, Some(0.56902087))]).await; + let (hits, scr) = search(&mut tx, &fti, "foo").await; + check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await; - let (_, i) = fti.search(&mut tx, "bar".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc3, Some(0.56902087))]).await; + let (hits, scr) = search(&mut tx, &fti, "bar").await; + check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await; - let (_, i) = fti.search(&mut tx, "dummy".to_string()).await.unwrap(); - assert!(i.is_none()); + let (hits, _) = search(&mut tx, &fti, "dummy").await; + assert!(hits.is_none()); } { @@ -624,16 +598,16 @@ mod tests { // We can still find 'foo' let mut tx = ds.transaction(false, false).await.unwrap(); - let (_, i) = fti.search(&mut tx, "foo".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc3, Some(0.56902087))]).await; + let (hits, scr) = search(&mut tx, &fti, "foo").await; + check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await; // We can't anymore find 'bar' - let (_, i) = fti.search(&mut tx, "bar".to_string()).await.unwrap(); - assert!(i.is_none()); + let (hits, _) = search(&mut tx, &fti, "bar").await; + assert!(hits.is_none()); // We can now find 'nobar' - let (_, i) = fti.search(&mut tx, "nobar".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc3, Some(0.56902087))]).await; + let (hits, scr) = search(&mut tx, &fti, "nobar").await; + check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await; } { @@ -655,13 +629,11 @@ mod tests { tx.commit().await.unwrap(); let mut tx = ds.transaction(false, false).await.unwrap(); - let (v, h) = fti.search(&mut tx, "hello".to_string()).await.unwrap(); - assert!(v.is_empty()); - assert!(h.is_none()); + let (hits, _) = search(&mut tx, &fti, "hello").await; + assert!(hits.is_none()); - let (v, h) = fti.search(&mut tx, "foo".to_string()).await.unwrap(); - assert!(v.is_empty()); - assert!(h.is_none()); + let (hits, _) = search(&mut tx, &fti, "foo").await; + assert!(hits.is_none()); } } @@ -742,10 +714,11 @@ mod tests { assert_eq!(statistics.doc_ids.keys_count, 4); assert_eq!(statistics.doc_lengths.keys_count, 4); - let (_, i) = fti.search(&mut tx, "the".to_string()).await.unwrap(); + let (hits, scr) = search(&mut tx, &fti, "the").await; check_hits( - i, &mut tx, + hits, + scr, vec![ (&doc1, Some(0.0)), (&doc2, Some(0.0)), @@ -755,34 +728,35 @@ mod tests { ) .await; - let (_, i) = fti.search(&mut tx, "dog".to_string()).await.unwrap(); + let (hits, scr) = search(&mut tx, &fti, "dog").await; check_hits( - i, &mut tx, + hits, + scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0)), (&doc3, Some(0.0))], ) .await; - let (_, i) = fti.search(&mut tx, "fox".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; + let (hits, scr) = search(&mut tx, &fti, "fox").await; + check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let (_, i) = fti.search(&mut tx, "over".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; + let (hits, scr) = search(&mut tx, &fti, "over").await; + check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let (_, i) = fti.search(&mut tx, "lazy".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; + let (hits, scr) = search(&mut tx, &fti, "lazy").await; + check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let (_, i) = fti.search(&mut tx, "jumped".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; + let (hits, scr) = search(&mut tx, &fti, "jumped").await; + check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let (_, i) = fti.search(&mut tx, "nothing".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc3, Some(0.87105393))]).await; + let (hits, scr) = search(&mut tx, &fti, "nothing").await; + check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.87105393))]).await; - let (_, i) = fti.search(&mut tx, "animals".to_string()).await.unwrap(); - check_hits(i, &mut tx, vec![(&doc4, Some(0.92279965))]).await; + let (hits, scr) = search(&mut tx, &fti, "animals").await; + check_hits(&mut tx, hits, scr, vec![(&doc4, Some(0.92279965))]).await; - let (_, i) = fti.search(&mut tx, "dummy".to_string()).await.unwrap(); - assert!(i.is_none()); + let (hits, _) = search(&mut tx, &fti, "dummy").await; + assert!(hits.is_none()); } } } diff --git a/lib/src/idx/ft/scorer.rs b/lib/src/idx/ft/scorer.rs index 29c7a163..86a64537 100644 --- a/lib/src/idx/ft/scorer.rs +++ b/lib/src/idx/ft/scorer.rs @@ -1,13 +1,18 @@ use crate::err::Error; use crate::idx::ft::docids::DocId; use crate::idx::ft::doclength::{DocLength, DocLengths}; -use crate::idx::ft::postings::TermFrequency; +use crate::idx::ft::postings::{Postings, TermFrequency}; +use crate::idx::ft::terms::TermId; use crate::idx::ft::Bm25Params; use crate::kvs::Transaction; +use roaring::RoaringTreemap; +use std::sync::Arc; pub(super) type Score = f32; -pub(super) struct BM25Scorer { +pub(crate) struct BM25Scorer { + postings: Postings, + terms_docs: Arc>, doc_lengths: DocLengths, average_doc_length: f32, doc_count: f32, @@ -16,12 +21,16 @@ pub(super) struct BM25Scorer { impl BM25Scorer { pub(super) fn new( + postings: Postings, + terms_docs: Arc>, doc_lengths: DocLengths, total_docs_length: u128, doc_count: u64, bm25: Bm25Params, ) -> Self { Self { + postings, + terms_docs, doc_lengths, average_doc_length: (total_docs_length as f32) / (doc_count as f32), doc_count: doc_count as f32, @@ -29,7 +38,7 @@ impl BM25Scorer { } } - pub(super) async fn score( + async fn term_score( &self, tx: &mut Transaction, doc_id: DocId, @@ -40,6 +49,24 @@ impl BM25Scorer { Ok(self.compute_bm25_score(term_frequency as f32, term_doc_count as f32, doc_length as f32)) } + pub(crate) async fn score( + &self, + tx: &mut Transaction, + doc_id: DocId, + ) -> Result, Error> { + let mut sc = 0.0; + for (term_id, docs) in self.terms_docs.iter() { + if docs.contains(doc_id) { + if let Some(term_freq) = + self.postings.get_term_frequency(tx, *term_id, doc_id).await? + { + sc += self.term_score(tx, doc_id, docs.len(), term_freq).await?; + } + } + } + Ok(Some(sc)) + } + // https://en.wikipedia.org/wiki/Okapi_BM25 // Including the lower-bounding term frequency normalization (2011 CIKM) fn compute_bm25_score(&self, term_freq: f32, term_doc_count: f32, doc_length: f32) -> f32 { diff --git a/lib/src/idx/planner/executor.rs b/lib/src/idx/planner/executor.rs index ffb37601..2ee9d769 100644 --- a/lib/src/idx/planner/executor.rs +++ b/lib/src/idx/planner/executor.rs @@ -1,32 +1,27 @@ use crate::dbs::{Options, Transaction}; use crate::err::Error; +use crate::idx::ft::docids::{DocId, DocIds}; +use crate::idx::ft::scorer::BM25Scorer; use crate::idx::ft::terms::TermId; use crate::idx::ft::{FtIndex, MatchRef}; use crate::idx::planner::plan::IndexOption; use crate::idx::planner::tree::IndexMap; use crate::idx::IndexKeyBase; +use crate::kvs; +use crate::kvs::Key; use crate::sql::index::Index; -use crate::sql::{Expression, Idiom, Table, Thing, Value}; -use std::collections::{HashMap, HashSet}; +use crate::sql::{Expression, Table, Thing, Value}; +use roaring::RoaringTreemap; +use std::collections::HashMap; use std::sync::Arc; -#[derive(Clone)] pub(crate) struct QueryExecutor { - inner: Arc, -} - -struct Inner { table: String, - index: HashMap>, - pre_match: Option, + pre_match_expression: Option, + pre_match_entry: Option, ft_map: HashMap, - terms: HashMap, -} - -struct IndexFieldTerms { - ix: String, - id: Idiom, - t: Vec, + mr_entries: HashMap, + exp_entries: HashMap, } impl QueryExecutor { @@ -35,83 +30,114 @@ impl QueryExecutor { txn: &Transaction, table: &Table, index_map: IndexMap, - pre_match: Option, + pre_match_expression: Option, ) -> Result { let mut run = txn.lock().await; - let mut ft_map = HashMap::new(); - for ios in index_map.index.values() { - for io in ios { - if let Index::Search { - az, - order, - sc, - hl, - } = &io.ix.index - { - if !ft_map.contains_key(&io.ix.name.0) { - let ikb = IndexKeyBase::new(opt, &io.ix); - let az = run.get_az(opt.ns(), opt.db(), az.as_str()).await?; - let ft = FtIndex::new(&mut run, az, ikb, *order, sc, *hl).await?; - ft_map.insert(io.ix.name.0.clone(), ft); + + let mut mr_entries = HashMap::default(); + let mut exp_entries = HashMap::default(); + let mut ft_map = HashMap::default(); + + // Create all the instances of FtIndex + // Build the FtEntries and map them to Expressions and MatchRef + for (exp, io) in index_map.consume() { + let mut entry = None; + if let Index::Search { + az, + order, + sc, + hl, + } = &io.ix().index + { + let ixn = &io.ix().name.0; + if let Some(ft) = ft_map.get(ixn) { + if entry.is_none() { + entry = FtEntry::new(&mut run, ft, io).await?; } + } else { + let ikb = IndexKeyBase::new(opt, io.ix()); + let az = run.get_az(opt.ns(), opt.db(), az.as_str()).await?; + let ft = FtIndex::new(&mut run, az, ikb, *order, sc, *hl).await?; + let ixn = ixn.to_owned(); + if entry.is_none() { + entry = FtEntry::new(&mut run, &ft, io).await?; + } + ft_map.insert(ixn, ft); } } - } - let mut terms = HashMap::with_capacity(index_map.terms.len()); - for (mr, ifv) in index_map.terms { - if let Some(ft) = ft_map.get(&ifv.ix) { - let term_ids = ft.extract_terms(&mut run, ifv.val.clone()).await?; - terms.insert( - mr, - IndexFieldTerms { - ix: ifv.ix, - id: ifv.id, - t: term_ids, - }, - ); + + if let Some(e) = entry { + if let Some(mr) = e.0.index_option.match_ref() { + if mr_entries.insert(*mr, e.clone()).is_some() { + return Err(Error::DuplicatedMatchRef { + mr: *mr, + }); + } + } + exp_entries.insert(exp, e); } } + + let mut pre_match_entry = None; + if let Some(exp) = &pre_match_expression { + pre_match_entry = exp_entries.get(exp).cloned(); + } Ok(Self { - inner: Arc::new(Inner { - table: table.0.clone(), - index: index_map.index, - pre_match, - ft_map, - terms, - }), + table: table.0.clone(), + pre_match_expression, + pre_match_entry, + ft_map, + mr_entries, + exp_entries, }) } + pub(super) fn pre_match_terms_docs(&self) -> Option>> { + if let Some(entry) = &self.pre_match_entry { + return Some(entry.0.terms_docs.clone()); + } + None + } + + fn get_match_ref(match_ref: &Value) -> Option { + if let Value::Number(n) = match_ref { + let m = n.to_int() as u8; + Some(m) + } else { + None + } + } + pub(crate) async fn matches( &self, txn: &Transaction, thg: &Thing, exp: &Expression, ) -> Result { - // If we find the expression in `pre_match`, + // If we find the expression in `pre_match_expression`, // it means that we are using an Iterator::Index // and we are iterating over document that already matches the expression. - if let Some(pre_match) = &self.inner.pre_match { - if pre_match.eq(exp) { + if let Some(pme) = &self.pre_match_expression { + if pme.eq(exp) { return Ok(Value::Bool(true)); } } // Otherwise, we look for the first possible index options, and evaluate the expression // Does the record id match this executor's table? - // Does the record id match this executor's table? - if thg.tb.eq(&self.inner.table) { - if let Some(ios) = self.inner.index.get(exp) { - for io in ios { - if let Some(fti) = self.inner.ft_map.get(&io.ix.name.0) { - let mut run = txn.lock().await; - // TODO The query string could be extracted when IndexOptions are created - let query_string = io.v.clone().convert_to_string()?; - return Ok(Value::Bool( - fti.match_id_value(&mut run, thg, &query_string).await?, - )); + if thg.tb.eq(&self.table) { + if let Some(ft) = self.exp_entries.get(exp) { + let mut run = txn.lock().await; + let doc_key: Key = thg.into(); + if let Some(doc_id) = ft.0.doc_ids.get_doc_id(&mut run, doc_key).await? { + for (_, docs) in ft.0.terms_docs.iter() { + if !docs.contains(doc_id) { + return Ok(Value::Bool(false)); + } } + return Ok(Value::Bool(true)); } + return Ok(Value::Bool(false)); } } @@ -121,27 +147,37 @@ impl QueryExecutor { }) } + fn get_ft_entry(&self, match_ref: &Value) -> Option<&FtEntry> { + if let Some(mr) = Self::get_match_ref(match_ref) { + self.mr_entries.get(&mr) + } else { + None + } + } + + fn get_ft_entry_and_index(&self, match_ref: &Value) -> Option<(&FtEntry, &FtIndex)> { + if let Some(e) = self.get_ft_entry(match_ref) { + if let Some(ft) = self.ft_map.get(&e.0.index_option.ix().name.0) { + return Some((e, ft)); + } + } + None + } + pub(crate) async fn highlight( &self, txn: Transaction, thg: &Thing, prefix: Value, suffix: Value, - match_ref: Value, + match_ref: &Value, doc: &Value, ) -> Result { - let mut tx = txn.lock().await; - // We have to make the connection between the match ref from the highlight function... - if let Value::Number(n) = match_ref { - let m = n.as_int() as u8; - // ... and from the match operator (@{matchref}@) - if let Some(ift) = self.inner.terms.get(&m) { - // Check we have an index? - if let Some(ft) = self.inner.ft_map.get(&ift.ix) { - // All good, we can do the highlight - return ft.highlight(&mut tx, thg, &ift.t, prefix, suffix, &ift.id, doc).await; - } - } + if let Some((e, ft)) = self.get_ft_entry_and_index(match_ref) { + let mut run = txn.lock().await; + return ft + .highlight(&mut run, thg, &e.0.terms, prefix, suffix, e.0.index_option.id(), doc) + .await; } Ok(Value::None) } @@ -150,21 +186,70 @@ impl QueryExecutor { &self, txn: Transaction, thg: &Thing, - match_ref: Value, + match_ref: &Value, ) -> Result { - let mut tx = txn.lock().await; - // We have to make the connection between the match ref from the highlight function... - if let Value::Number(n) = match_ref { - let m = n.as_int() as u8; - // ... and from the match operator (@{matchref}@) - if let Some(ift) = self.inner.terms.get(&m) { - // Check we have an index? - if let Some(ft) = self.inner.ft_map.get(&ift.ix) { - // All good, we can extract the offsets - return ft.extract_offsets(&mut tx, thg, &ift.t).await; + if let Some((e, ft)) = self.get_ft_entry_and_index(match_ref) { + let mut run = txn.lock().await; + return ft.extract_offsets(&mut run, thg, &e.0.terms).await; + } + Ok(Value::None) + } + + pub(crate) async fn score( + &self, + txn: Transaction, + match_ref: &Value, + rid: &Thing, + mut doc_id: Option, + ) -> Result { + if let Some(e) = self.get_ft_entry(match_ref) { + if let Some(scorer) = &e.0.scorer { + let mut run = txn.lock().await; + if doc_id.is_none() { + let key: Key = rid.into(); + doc_id = e.0.doc_ids.get_doc_id(&mut run, key).await?; + }; + if let Some(doc_id) = doc_id { + let score = scorer.score(&mut run, doc_id).await?; + if let Some(score) = score { + return Ok(Value::from(score)); + } } } } Ok(Value::None) } } + +#[derive(Clone)] +struct FtEntry(Arc); + +struct Inner { + index_option: IndexOption, + doc_ids: DocIds, + terms: Vec, + terms_docs: Arc>, + scorer: Option, +} + +impl FtEntry { + async fn new( + tx: &mut kvs::Transaction, + ft: &FtIndex, + io: IndexOption, + ) -> Result, Error> { + if let Some(qs) = io.qs() { + let terms = ft.extract_terms(tx, qs.to_owned()).await?; + let terms_docs = Arc::new(ft.get_terms_docs(tx, &terms).await?); + Ok(Some(Self(Arc::new(Inner { + index_option: io, + doc_ids: ft.doc_ids(tx).await?, + scorer: ft.new_scorer(tx, terms_docs.clone()).await?, + terms, + terms_docs, + })))) + } else { + Ok(None) + } + } +} diff --git a/lib/src/idx/planner/mod.rs b/lib/src/idx/planner/mod.rs index 7f7e0fec..0f50b13b 100644 --- a/lib/src/idx/planner/mod.rs +++ b/lib/src/idx/planner/mod.rs @@ -37,7 +37,7 @@ impl<'a> QueryPlanner<'a> { let res = Tree::build(self.opt, &txn, &t, self.cond).await?; if let Some((node, im)) = res { if let Some(plan) = AllAndStrategy::build(&node)? { - let e = plan.i.new_query_executor(opt, &txn, &t, im).await?; + let e = QueryExecutor::new(opt, &txn, &t, im, Some(plan.e.clone())).await?; self.executors.insert(t.0.clone(), e); return Ok(Iterable::Index(t, plan)); } @@ -81,15 +81,15 @@ impl AllAndStrategy { fn eval_node(&mut self, node: &Node) -> Result<(), Error> { match node { Node::Expression { - index_option, + io: index_option, left, right, - operator, + exp: expression, } => { if let Some(io) = index_option { - self.b.add(io.clone()); + self.b.add_index_option(expression.clone(), io.clone()); } - self.eval_expression(left, right, operator) + self.eval_expression(left, right, expression.operator()) } Node::Unsupported => Err(Error::BypassQueryPlanner), _ => Ok(()), diff --git a/lib/src/idx/planner/plan.rs b/lib/src/idx/planner/plan.rs index df6d0e5b..ed28ebce 100644 --- a/lib/src/idx/planner/plan.rs +++ b/lib/src/idx/planner/plan.rs @@ -1,33 +1,36 @@ use crate::dbs::{Options, Transaction}; use crate::err::Error; +use crate::idx::ft::docids::{DocId, NO_DOC_ID}; use crate::idx::ft::terms::TermId; use crate::idx::ft::{FtIndex, HitsIterator, MatchRef}; use crate::idx::planner::executor::QueryExecutor; -use crate::idx::planner::tree::IndexMap; use crate::idx::IndexKeyBase; use crate::key; use crate::kvs::Key; use crate::sql::index::Index; use crate::sql::scoring::Scoring; use crate::sql::statements::DefineIndexStatement; -use crate::sql::{Array, Expression, Ident, Object, Operator, Table, Thing, Value}; +use crate::sql::{Array, Expression, Ident, Idiom, Object, Operator, Thing, Value}; use async_trait::async_trait; +use roaring::RoaringTreemap; use std::collections::HashMap; +use std::hash::Hash; +use std::sync::Arc; #[derive(Default)] pub(super) struct PlanBuilder { - indexes: Vec, + indexes: Vec<(Expression, IndexOption)>, } impl PlanBuilder { - pub(super) fn add(&mut self, i: IndexOption) { - self.indexes.push(i); + pub(super) fn add_index_option(&mut self, e: Expression, i: IndexOption) { + self.indexes.push((e, i)); } pub(super) fn build(mut self) -> Result { // TODO select the best option if there are several (cost based) - if let Some(index) = self.indexes.pop() { - Ok(index.into()) + if let Some((e, i)) = self.indexes.pop() { + Ok(Plan::new(e, i)) } else { Err(Error::BypassQueryPlanner) } @@ -35,108 +38,144 @@ impl PlanBuilder { } pub(crate) struct Plan { + pub(super) e: Expression, pub(super) i: IndexOption, } impl Plan { + pub(super) fn new(e: Expression, i: IndexOption) -> Self { + Self { + e, + i, + } + } + pub(crate) async fn new_iterator( &self, opt: &Options, txn: &Transaction, + exe: &QueryExecutor, ) -> Result, Error> { - self.i.new_iterator(opt, txn).await + self.i.new_iterator(opt, txn, exe).await } pub(crate) fn explain(&self) -> Value { - let IndexOption { - ix, - v, - op, - .. - } = &self.i; - Value::Object(Object::from(HashMap::from([ - ("index", Value::from(ix.name.0.to_owned())), - ("operator", Value::from(op.to_string())), - ("value", v.clone()), + ("index", Value::from(self.i.ix().name.0.to_owned())), + ("operator", Value::from(self.i.op().to_string())), + ("value", self.i.value().clone()), ]))) } } -impl From for Plan { - fn from(i: IndexOption) -> Self { - Self { - i, - } - } -} - #[derive(Debug, Clone, Eq, PartialEq, Hash)] -pub(super) struct IndexOption { - pub(super) ix: DefineIndexStatement, - pub(super) v: Value, - pub(super) op: Operator, - ep: Expression, +pub(super) struct IndexOption(Arc); + +#[derive(Debug, Eq, PartialEq, Hash)] +pub(super) struct Inner { + ix: DefineIndexStatement, + id: Idiom, + v: Value, + qs: Option, + op: Operator, + mr: Option, } impl IndexOption { - pub(super) fn new(ix: DefineIndexStatement, op: Operator, v: Value, ep: Expression) -> Self { - Self { + pub(super) fn new( + ix: DefineIndexStatement, + id: Idiom, + op: Operator, + v: Value, + qs: Option, + mr: Option, + ) -> Self { + Self(Arc::new(Inner { ix, + id, op, v, - ep, - } + qs, + mr, + })) } - pub(super) async fn new_query_executor( - &self, - opt: &Options, - txn: &Transaction, - t: &Table, - i: IndexMap, - ) -> Result { - QueryExecutor::new(opt, txn, t, i, Some(self.ep.clone())).await + pub(super) fn ix(&self) -> &DefineIndexStatement { + &self.0.ix + } + + pub(super) fn op(&self) -> &Operator { + &self.0.op + } + + pub(super) fn value(&self) -> &Value { + &self.0.v + } + + pub(super) fn qs(&self) -> Option<&String> { + self.0.qs.as_ref() + } + + pub(super) fn id(&self) -> &Idiom { + &self.0.id + } + + pub(super) fn match_ref(&self) -> Option<&MatchRef> { + self.0.mr.as_ref() } async fn new_iterator( &self, opt: &Options, txn: &Transaction, + exe: &QueryExecutor, ) -> Result, Error> { - match &self.ix.index { - Index::Idx => match self.op { - Operator::Equal => { - Ok(Box::new(NonUniqueEqualThingIterator::new(opt, &self.ix, &self.v)?)) + match &self.ix().index { + Index::Idx => { + if self.op() == &Operator::Equal { + return Ok(Box::new(NonUniqueEqualThingIterator::new( + opt, + self.ix(), + self.value(), + )?)); } - _ => Err(Error::BypassQueryPlanner), - }, - Index::Uniq => match self.op { - Operator::Equal => { - Ok(Box::new(UniqueEqualThingIterator::new(opt, &self.ix, &self.v)?)) + } + Index::Uniq => { + if self.op() == &Operator::Equal { + return Ok(Box::new(UniqueEqualThingIterator::new( + opt, + self.ix(), + self.value(), + )?)); } - _ => Err(Error::BypassQueryPlanner), - }, + } Index::Search { az, hl, sc, order, - } => match self.op { - Operator::Matches(mr) => Ok(Box::new( - MatchesThingIterator::new(opt, txn, &self.ix, az, *hl, sc, *order, mr, &self.v) - .await?, - )), - _ => Err(Error::BypassQueryPlanner), - }, + } => { + if let Operator::Matches(_) = self.op() { + let td = exe.pre_match_terms_docs(); + return Ok(Box::new( + MatchesThingIterator::new(opt, txn, self.ix(), az, *hl, sc, *order, td) + .await?, + )); + } + } } + Err(Error::BypassQueryPlanner) } } #[cfg_attr(not(target_arch = "wasm32"), async_trait)] #[cfg_attr(target_arch = "wasm32", async_trait(?Send))] pub(crate) trait ThingIterator: Send { - async fn next_batch(&mut self, tx: &Transaction, size: u32) -> Result, Error>; + async fn next_batch( + &mut self, + tx: &Transaction, + size: u32, + ) -> Result, Error>; } struct NonUniqueEqualThingIterator { @@ -159,7 +198,11 @@ impl NonUniqueEqualThingIterator { #[cfg_attr(not(target_arch = "wasm32"), async_trait)] #[cfg_attr(target_arch = "wasm32", async_trait(?Send))] impl ThingIterator for NonUniqueEqualThingIterator { - async fn next_batch(&mut self, txn: &Transaction, limit: u32) -> Result, Error> { + async fn next_batch( + &mut self, + txn: &Transaction, + limit: u32, + ) -> Result, Error> { let min = self.beg.clone(); let max = self.end.clone(); let res = txn.lock().await.scan(min..max, limit).await?; @@ -167,7 +210,7 @@ impl ThingIterator for NonUniqueEqualThingIterator { self.beg = key.clone(); self.beg.push(0x00); } - let res = res.iter().map(|(_, val)| val.into()).collect(); + let res = res.iter().map(|(_, val)| (val.into(), NO_DOC_ID)).collect(); Ok(res) } } @@ -189,10 +232,14 @@ impl UniqueEqualThingIterator { #[cfg_attr(not(target_arch = "wasm32"), async_trait)] #[cfg_attr(target_arch = "wasm32", async_trait(?Send))] impl ThingIterator for UniqueEqualThingIterator { - async fn next_batch(&mut self, txn: &Transaction, _limit: u32) -> Result, Error> { + async fn next_batch( + &mut self, + txn: &Transaction, + _limit: u32, + ) -> Result, Error> { if let Some(key) = self.key.take() { if let Some(val) = txn.lock().await.get(key).await? { - return Ok(vec![val.into()]); + return Ok(vec![(val.into(), NO_DOC_ID)]); } } Ok(vec![]) @@ -200,7 +247,6 @@ impl ThingIterator for UniqueEqualThingIterator { } struct MatchesThingIterator { - _terms: Option<(MatchRef, Vec)>, hits: Option, } @@ -214,23 +260,26 @@ impl MatchesThingIterator { hl: bool, sc: &Scoring, order: u32, - mr: Option, - v: &Value, + terms_docs: Option>>, ) -> Result { let ikb = IndexKeyBase::new(opt, ix); - let mut run = txn.lock().await; if let Scoring::Bm { .. } = sc { - let query_string = v.clone().convert_to_string()?; + let mut run = txn.lock().await; let az = run.get_az(opt.ns(), opt.db(), az.as_str()).await?; let fti = FtIndex::new(&mut run, az, ikb, order, sc, hl).await?; - let (terms, hits) = fti.search(&mut run, query_string).await?; - Ok(Self { - hits, - _terms: mr.map(|mr| (mr, terms)), - }) + if let Some(terms_docs) = terms_docs { + let hits = fti.new_hits_iterator(&mut run, terms_docs).await?; + Ok(Self { + hits, + }) + } else { + Ok(Self { + hits: None, + }) + } } else { Err(Error::FeatureNotYetImplemented { feature: "Vector Search", @@ -242,12 +291,16 @@ impl MatchesThingIterator { #[cfg_attr(not(target_arch = "wasm32"), async_trait)] #[cfg_attr(target_arch = "wasm32", async_trait(?Send))] impl ThingIterator for MatchesThingIterator { - async fn next_batch(&mut self, txn: &Transaction, mut limit: u32) -> Result, Error> { + async fn next_batch( + &mut self, + txn: &Transaction, + mut limit: u32, + ) -> Result, Error> { let mut res = vec![]; if let Some(hits) = &mut self.hits { let mut run = txn.lock().await; while limit > 0 { - if let Some((hit, _)) = hits.next(&mut run).await? { + if let Some(hit) = hits.next(&mut run).await? { res.push(hit); } else { break; @@ -258,3 +311,39 @@ impl ThingIterator for MatchesThingIterator { Ok(res) } } + +#[cfg(test)] +mod tests { + use crate::idx::planner::plan::IndexOption; + use crate::sql::statements::DefineIndexStatement; + use crate::sql::{Idiom, Operator, Value}; + use std::collections::HashSet; + + #[test] + fn test_hash_index_option() { + let mut set = HashSet::new(); + let io1 = IndexOption::new( + DefineIndexStatement::default(), + Idiom::from("a.b".to_string()), + Operator::Equal, + Value::from("test"), + None, + None, + ); + + let io2 = IndexOption::new( + DefineIndexStatement::default(), + Idiom::from("a.b".to_string()), + Operator::Equal, + Value::from("test"), + None, + None, + ); + + set.insert(io1); + set.insert(io2.clone()); + set.insert(io2); + + assert_eq!(set.len(), 1); + } +} diff --git a/lib/src/idx/planner/tree.rs b/lib/src/idx/planner/tree.rs index 50222939..533aa374 100644 --- a/lib/src/idx/planner/tree.rs +++ b/lib/src/idx/planner/tree.rs @@ -1,30 +1,18 @@ use crate::dbs::{Options, Transaction}; use crate::err::Error; -use crate::idx::ft::MatchRef; use crate::idx::planner::plan::IndexOption; use crate::sql::index::Index; use crate::sql::statements::DefineIndexStatement; use crate::sql::{Cond, Expression, Idiom, Operator, Subquery, Table, Value}; use async_recursion::async_recursion; -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::sync::Arc; -#[derive(Default)] -pub(super) struct IndexMap { - pub(super) index: HashMap>, - pub(super) terms: HashMap, -} - -pub(super) struct IndexFieldValue { - pub(super) ix: String, - pub(super) id: Idiom, - pub(super) val: String, -} - pub(super) struct Tree {} impl Tree { + /// Traverse the all the conditions and extract every expression + /// that can be resolved by an index. pub(super) async fn build<'a>( opt: &'a Options, txn: &'a Transaction, @@ -112,17 +100,25 @@ impl<'a> TreeBuilder<'a> { } => { let left = self.eval_value(l).await?; let right = self.eval_value(r).await?; - let mut index_option = None; + if let Some(io) = self.index_map.0.get(e) { + return Ok(Node::Expression { + io: Some(io.clone()), + left: Box::new(left), + right: Box::new(right), + exp: e.clone(), + }); + } + let mut io = None; if let Some((id, ix)) = left.is_indexed_field() { - index_option = self.lookup_index_option(ix, o, id, &right, e); + io = self.lookup_index_option(ix, o, id, &right, e); } else if let Some((id, ix)) = right.is_indexed_field() { - index_option = self.lookup_index_option(ix, o, id, &left, e); + io = self.lookup_index_option(ix, o, id, &left, e); }; Ok(Node::Expression { - index_option, + io, left: Box::new(left), right: Box::new(right), - operator: o.to_owned(), + exp: e.clone(), }) } } @@ -134,51 +130,31 @@ impl<'a> TreeBuilder<'a> { op: &Operator, id: &Idiom, v: &Node, - ep: &Expression, + e: &Expression, ) -> Option { if let Some(v) = v.is_scalar() { - if match &ix.index { - Index::Idx => Operator::Equal.eq(op), - Index::Uniq => Operator::Equal.eq(op), + let (found, mr, qs) = match &ix.index { + Index::Idx => (Operator::Equal.eq(op), None, None), + Index::Uniq => (Operator::Equal.eq(op), None, None), Index::Search { .. } => { if let Operator::Matches(mr) = op { - if let Some(mr) = mr { - self.index_map.terms.insert( - *mr, - IndexFieldValue { - ix: ix.name.0.to_owned(), - id: id.to_owned(), - val: v.to_raw_string(), - }, - ); - } - true + (true, *mr, Some(v.clone().to_raw_string())) } else { - false + (false, None, None) } } - } { - let io = IndexOption::new(ix.clone(), op.to_owned(), v.clone(), ep.clone()); - self.add_index(ep, io.clone()); + }; + if found { + let io = IndexOption::new(ix.clone(), id.clone(), op.to_owned(), v.clone(), qs, mr); + self.index_map.0.insert(e.clone(), io.clone()); return Some(io); } } None } - fn add_index(&mut self, e: &Expression, io: IndexOption) { - match self.index_map.index.entry(e.clone()) { - Entry::Occupied(mut e) => { - e.get_mut().insert(io); - } - Entry::Vacant(e) => { - e.insert(HashSet::from([io])); - } - } - } - async fn eval_subquery(&mut self, s: &Subquery) -> Result { Ok(match s { Subquery::Value(v) => self.eval_value(v).await?, @@ -187,13 +163,23 @@ impl<'a> TreeBuilder<'a> { } } +/// For each expression the a possible index option +#[derive(Default)] +pub(super) struct IndexMap(HashMap); + +impl IndexMap { + pub(super) fn consume(self) -> HashMap { + self.0 + } +} + #[derive(Debug, Clone, Eq, PartialEq, Hash)] pub(super) enum Node { Expression { - index_option: Option, + io: Option, left: Box, right: Box, - operator: Operator, + exp: Expression, }, IndexedField(Idiom, DefineIndexStatement), NonIndexedField, diff --git a/lib/src/sql/expression.rs b/lib/src/sql/expression.rs index 6d8b3f69..701280ec 100644 --- a/lib/src/sql/expression.rs +++ b/lib/src/sql/expression.rs @@ -86,6 +86,20 @@ impl Expression { } } + /// Returns the operator + pub(crate) fn operator(&self) -> &Operator { + match self { + Expression::Unary { + o, + .. + } => o, + Expression::Binary { + o, + .. + } => o, + } + } + /// Process this type returning a computed simple Value pub(crate) async fn compute(&self, ctx: &Context<'_>, opt: &Options) -> Result { let (l, o, r) = match self { diff --git a/lib/src/sql/function.rs b/lib/src/sql/function.rs index ab9c1624..6dbce187 100644 --- a/lib/src/sql/function.rs +++ b/lib/src/sql/function.rs @@ -455,7 +455,7 @@ fn function_rand(i: &str) -> IResult<&str, &str> { } fn function_search(i: &str) -> IResult<&str, &str> { - alt((tag("highlight"), tag("offsets")))(i) + alt((tag("score"), tag("highlight"), tag("offsets")))(i) } fn function_session(i: &str) -> IResult<&str, &str> { diff --git a/lib/tests/matches.rs b/lib/tests/matches.rs index 6153a3f8..f8001e67 100644 --- a/lib/tests/matches.rs +++ b/lib/tests/matches.rs @@ -57,7 +57,7 @@ async fn select_where_matches_without_using_index_iterator() -> Result<(), Error CREATE blog:2 SET title = 'Foo Bar!'; DEFINE ANALYZER simple TOKENIZERS blank,class FILTERS lowercase; DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; - SELECT id,search::highlight('', '', 1) AS title FROM blog WHERE (title @0@ 'hello' AND id>0) OR (title @1@ 'world' AND id<99) EXPLAIN; + SELECT id,search::highlight('', '', 1) AS title FROM blog WHERE (title @0@ 'hello' AND identifier > 0) OR (title @1@ 'world' AND identifier < 99) EXPLAIN; "; let dbs = Datastore::new("memory").await?; let ses = Session::for_kv().with_ns("test").with_db("test"); @@ -192,3 +192,67 @@ async fn select_where_matches_using_index_offsets() -> Result<(), Error> { assert_eq!(tmp, val); Ok(()) } + +#[tokio::test] +async fn select_where_matches_using_index_and_score() -> Result<(), Error> { + let sql = r" + CREATE blog:1 SET title = 'the quick brown fox jumped over the lazy dog'; + CREATE blog:2 SET title = 'the fast fox jumped over the lazy dog'; + CREATE blog:3 SET title = 'the other animals sat there watching'; + CREATE blog:4 SET title = 'the dog sat there and did nothing'; + DEFINE ANALYZER simple TOKENIZERS blank,class; + DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; + SELECT id,search::score(1) AS score FROM blog WHERE title @1@ 'animals'; + "; + let dbs = Datastore::new("memory").await?; + let ses = Session::for_kv().with_ns("test").with_db("test"); + let res = &mut dbs.execute(&sql, &ses, None, false).await?; + assert_eq!(res.len(), 7); + // + for _ in 0..6 { + let _ = res.remove(0).result?; + } + let tmp = res.remove(0).result?; + let val = Value::parse( + "[ + { + id: blog:3, + score: 0.9227996468544006 + } + ]", + ); + assert_eq!(tmp, val); + Ok(()) +} + +#[tokio::test] +async fn select_where_matches_without_using_index_and_score() -> Result<(), Error> { + let sql = r" + CREATE blog:1 SET title = 'the quick brown fox jumped over the lazy dog'; + CREATE blog:2 SET title = 'the fast fox jumped over the lazy dog'; + CREATE blog:3 SET title = 'the other animals sat there watching'; + CREATE blog:4 SET title = 'the dog sat there and did nothing'; + DEFINE ANALYZER simple TOKENIZERS blank,class; + DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; + SELECT id,search::score(1) AS score FROM blog WHERE (title @1@ 'animals' AND id>0) OR (title @1@ 'animals' AND id<99); + "; + let dbs = Datastore::new("memory").await?; + let ses = Session::for_kv().with_ns("test").with_db("test"); + let res = &mut dbs.execute(&sql, &ses, None, false).await?; + assert_eq!(res.len(), 7); + // + for _ in 0..6 { + let _ = res.remove(0).result?; + } + let tmp = res.remove(0).result?; + let val = Value::parse( + "[ + { + id: blog:3, + score: 0.9227996468544006 + } + ]", + ); + assert_eq!(tmp, val); + Ok(()) +}