diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 8a66775d..997a90e1 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -58,8 +58,8 @@ async-trait = "0.1.68" async-recursion = "1.0.4" base64_lib = { version = "0.21.1", package = "base64" } bcrypt = "0.14.0" -bung = "0.1.0" bincode = "1.3.3" +bung = "0.1.0" channel = { version = "1.8.0", package = "async-channel" } chrono = { version = "0.4.24", features = ["serde"] } derive = { version = "0.8.0", package = "surrealdb-derive" } diff --git a/lib/src/ctx/context.rs b/lib/src/ctx/context.rs index 2833fe9a..b60387d9 100644 --- a/lib/src/ctx/context.rs +++ b/lib/src/ctx/context.rs @@ -152,7 +152,9 @@ impl<'a> Context<'a> { self.deadline.map(|v| v.saturating_duration_since(Instant::now())) } - pub fn clone_transaction(&self) -> Result { + /// Returns a transaction if any. + /// Otherwise it fails by returning a Error::NoTx error. + pub fn try_clone_transaction(&self) -> Result { match &self.transaction { None => Err(Error::NoTx), Some(txn) => Ok(txn.clone()), diff --git a/lib/src/dbs/channel.rs b/lib/src/dbs/channel.rs index 5826ae2e..f0cc3db4 100644 --- a/lib/src/dbs/channel.rs +++ b/lib/src/dbs/channel.rs @@ -23,7 +23,7 @@ impl Iterable { ) -> Result<(), Error> { if ctx.is_ok() { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; match self { Iterable::Value(v) => { // Pass the value through diff --git a/lib/src/dbs/iterate.rs b/lib/src/dbs/iterate.rs index 37d95d69..7155a51e 100644 --- a/lib/src/dbs/iterate.rs +++ b/lib/src/dbs/iterate.rs @@ -64,7 +64,7 @@ impl Iterable { ite: &mut Iterator, ) -> Result<(), Error> { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check that the table exists txn.lock().await.check_ns_db_tb(opt.ns(), opt.db(), &v.tb, opt.strict).await?; // Fetch the data from the store @@ -92,7 +92,7 @@ impl Iterable { ite: &mut Iterator, ) -> Result<(), Error> { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check that the table exists txn.lock().await.check_ns_db_tb(opt.ns(), opt.db(), &v.tb, opt.strict).await?; // Fetch the data from the store @@ -123,7 +123,7 @@ impl Iterable { ite: &mut Iterator, ) -> Result<(), Error> { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check that the table exists txn.lock().await.check_ns_db_tb(opt.ns(), opt.db(), &v.tb, opt.strict).await?; // Fetch the data from the store @@ -152,7 +152,7 @@ impl Iterable { ite: &mut Iterator, ) -> Result<(), Error> { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check that the table exists txn.lock().await.check_ns_db_tb(opt.ns(), opt.db(), &v, opt.strict).await?; // Prepare the start and end keys @@ -220,7 +220,7 @@ impl Iterable { ite: &mut Iterator, ) -> Result<(), Error> { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check that the table exists txn.lock().await.check_ns_db_tb(opt.ns(), opt.db(), &v.tb, opt.strict).await?; // Prepare the range start key @@ -387,13 +387,13 @@ impl Iterable { None => { let min = beg.clone(); let max = end.clone(); - ctx.clone_transaction()?.lock().await.scan(min..max, 1000).await? + ctx.try_clone_transaction()?.lock().await.scan(min..max, 1000).await? 
} Some(ref mut beg) => { beg.push(0x00); let min = beg.clone(); let max = end.clone(); - ctx.clone_transaction()?.lock().await.scan(min..max, 1000).await? + ctx.try_clone_transaction()?.lock().await.scan(min..max, 1000).await? } }; // If there are key-value entries then fetch them @@ -418,7 +418,7 @@ impl Iterable { let gra: crate::key::graph::Graph = (&k).into(); // Fetch the data from the store let key = thing::new(opt.ns(), opt.db(), gra.ft, &gra.fk); - let val = ctx.clone_transaction()?.lock().await.get(key).await?; + let val = ctx.try_clone_transaction()?.lock().await.get(key).await?; let rid = Thing::from((gra.ft, gra.fk)); let mut ctx = Context::new(ctx); ctx.add_thing(&rid); @@ -446,7 +446,7 @@ impl Iterable { plan: Plan, ite: &mut Iterator, ) -> Result<(), Error> { - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check that the table exists txn.lock().await.check_ns_db_tb(opt.ns(), opt.db(), &table.0, opt.strict).await?; let mut iterator = plan.new_iterator(opt, &txn).await?; diff --git a/lib/src/doc/allow.rs b/lib/src/doc/allow.rs index 37a3a005..238c54e1 100644 --- a/lib/src/doc/allow.rs +++ b/lib/src/doc/allow.rs @@ -17,7 +17,7 @@ impl<'a> Document<'a> { // Should we run permissions checks? if opt.perms && opt.auth.perms() { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Get the table let tb = self.tb(opt, &txn).await?; // Get the permission clause diff --git a/lib/src/doc/clean.rs b/lib/src/doc/clean.rs index fd7991fe..66f9be98 100644 --- a/lib/src/doc/clean.rs +++ b/lib/src/doc/clean.rs @@ -13,7 +13,7 @@ impl<'a> Document<'a> { _stm: &Statement<'_>, ) -> Result<(), Error> { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Get the table let tb = self.tb(opt, &txn).await?; // This table is schemafull diff --git a/lib/src/doc/edges.rs b/lib/src/doc/edges.rs index ede63472..6b01a971 100644 --- a/lib/src/doc/edges.rs +++ b/lib/src/doc/edges.rs @@ -18,7 +18,7 @@ impl<'a> Document<'a> { _stm: &Statement<'_>, ) -> Result<(), Error> { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check if the table is a view if self.tb(opt, &txn).await?.drop { return Ok(()); diff --git a/lib/src/doc/event.rs b/lib/src/doc/event.rs index ca877419..22c87805 100644 --- a/lib/src/doc/event.rs +++ b/lib/src/doc/event.rs @@ -24,7 +24,7 @@ impl<'a> Document<'a> { // Don't run permissions let opt = &opt.perms(false); // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Loop through all event statements for ev in self.ev(opt, &txn).await?.iter() { // Get the event action diff --git a/lib/src/doc/field.rs b/lib/src/doc/field.rs index 76397f90..17556940 100644 --- a/lib/src/doc/field.rs +++ b/lib/src/doc/field.rs @@ -22,7 +22,7 @@ impl<'a> Document<'a> { // Get the user applied input let inp = self.initial.changed(self.current.as_ref()); // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Loop through all field statements for fd in self.fd(opt, &txn).await?.iter() { // Loop over each field in document diff --git a/lib/src/doc/index.rs b/lib/src/doc/index.rs index d2a87c5e..cd4b42d0 100644 --- a/lib/src/doc/index.rs +++ b/lib/src/doc/index.rs @@ -28,7 +28,7 @@ impl<'a> Document<'a> { return Ok(()); } // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check if 
the table is a view if self.tb(opt, &txn).await?.drop { return Ok(()); @@ -49,7 +49,7 @@ impl<'a> Document<'a> { let mut run = txn.lock().await; // Store all the variable and parameters required by the index operation - let mut ic = IndexOperation::new(opt, ix, o, n, rid); + let ic = IndexOperation::new(opt, ix, o, n, rid); // Index operation dispatching match &ix.index { @@ -60,12 +60,7 @@ impl<'a> Document<'a> { sc, hl, order, - } => match sc { - Scoring::Bm { - .. - } => ic.index_best_matching_search(&mut run, az, *order, *hl).await?, - Scoring::Vs => ic.index_vector_search(az, *hl).await?, - }, + } => ic.index_full_text(&mut run, az, *order, sc, *hl).await?, }; } } @@ -182,16 +177,17 @@ impl<'a> IndexOperation<'a> { }) } - async fn index_best_matching_search( + async fn index_full_text( &self, run: &mut kvs::Transaction, az: &Ident, order: u32, - _hl: bool, + scoring: &Scoring, + hl: bool, ) -> Result<(), Error> { let ikb = IndexKeyBase::new(self.opt, self.ix); let az = run.get_az(self.opt.ns(), self.opt.db(), az.as_str()).await?; - let mut ft = FtIndex::new(run, az, ikb, order).await?; + let mut ft = FtIndex::new(run, az, ikb, order, scoring, hl).await?; if let Some(n) = &self.n { // TODO: Apply the analyzer ft.index_document(run, self.rid, n).await @@ -199,10 +195,4 @@ impl<'a> IndexOperation<'a> { ft.remove_document(run, self.rid).await } } - - async fn index_vector_search(&mut self, _az: &Ident, _hl: bool) -> Result<(), Error> { - Err(Error::FeatureNotYetImplemented { - feature: "VectorSearch indexing", - }) - } } diff --git a/lib/src/doc/lives.rs b/lib/src/doc/lives.rs index 576b79af..d95729fe 100644 --- a/lib/src/doc/lives.rs +++ b/lib/src/doc/lives.rs @@ -19,7 +19,7 @@ impl<'a> Document<'a> { return Ok(()); } // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Get the record id let id = self.id.as_ref().unwrap(); // Loop through all index statements diff --git a/lib/src/doc/pluck.rs b/lib/src/doc/pluck.rs index ccd26115..782df7ab 100644 --- a/lib/src/doc/pluck.rs +++ b/lib/src/doc/pluck.rs @@ -82,7 +82,7 @@ impl<'a> Document<'a> { // Should we run permissions checks? 
if opt.perms && opt.auth.perms() { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Loop through all field statements for fd in self.fd(opt, &txn).await?.iter() { // Loop over each field in document diff --git a/lib/src/doc/purge.rs b/lib/src/doc/purge.rs index c1a23734..9d8416ad 100644 --- a/lib/src/doc/purge.rs +++ b/lib/src/doc/purge.rs @@ -24,7 +24,7 @@ impl<'a> Document<'a> { return Ok(()); } // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check if the table is a view if self.tb(opt, &txn).await?.drop { return Ok(()); diff --git a/lib/src/doc/store.rs b/lib/src/doc/store.rs index 68a090df..128fb903 100644 --- a/lib/src/doc/store.rs +++ b/lib/src/doc/store.rs @@ -16,7 +16,7 @@ impl<'a> Document<'a> { return Ok(()); } // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Check if the table is a view if self.tb(opt, &txn).await?.drop { return Ok(()); diff --git a/lib/src/doc/table.rs b/lib/src/doc/table.rs index 5ce2f687..ed797c1c 100644 --- a/lib/src/doc/table.rs +++ b/lib/src/doc/table.rs @@ -56,7 +56,7 @@ impl<'a> Document<'a> { Action::Update }; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Loop through all foreign table statements for ft in self.ft(opt, &txn).await?.iter() { // Get the table definition diff --git a/lib/src/fnc/mod.rs b/lib/src/fnc/mod.rs index 5531194c..cb265ac1 100644 --- a/lib/src/fnc/mod.rs +++ b/lib/src/fnc/mod.rs @@ -20,6 +20,7 @@ pub mod operate; pub mod parse; pub mod rand; pub mod script; +pub mod search; pub mod session; pub mod sleep; pub mod string; @@ -30,6 +31,7 @@ pub mod util; /// Attempts to run any function pub async fn run(ctx: &Context<'_>, name: &str, args: Vec) -> Result { if name.eq("sleep") + || name.starts_with("search") || name.starts_with("http") || name.starts_with("crypto::argon2") || name.starts_with("crypto::bcrypt") @@ -300,6 +302,9 @@ pub async fn asynchronous(ctx: &Context<'_>, name: &str, args: Vec) -> Re "http::patch" => http::patch(ctx).await, "http::delete" => http::delete(ctx).await, // + "search::highlight" => search::highlight(ctx).await, + "search::offsets" => search::offsets(ctx).await, + // "sleep" => sleep::sleep(ctx).await, ) } diff --git a/lib/src/fnc/operate.rs b/lib/src/fnc/operate.rs index f7ac7f71..03da5645 100644 --- a/lib/src/fnc/operate.rs +++ b/lib/src/fnc/operate.rs @@ -169,7 +169,7 @@ pub(crate) async fn matches(ctx: &Context<'_>, e: &Expression) -> Result (parse::Package), "rand" => (rand::Package), "array" => (array::Package), + "search" => (search::Package), "session" => (session::Package), "sleep" => fut Async, "string" => (string::Package), diff --git a/lib/src/fnc/script/modules/surrealdb/functions/search.rs b/lib/src/fnc/script/modules/surrealdb/functions/search.rs new file mode 100644 index 00000000..7a05167e --- /dev/null +++ b/lib/src/fnc/script/modules/surrealdb/functions/search.rs @@ -0,0 +1,12 @@ +use super::fut; +use crate::fnc::script::modules::impl_module_def; +use js::prelude::Async; + +pub struct Package; + +impl_module_def!( + Package, + "search", + "highlight" => fut Async, + "offsets" => fut Async +); diff --git a/lib/src/fnc/search.rs b/lib/src/fnc/search.rs new file mode 100644 index 00000000..ebeee72b --- /dev/null +++ b/lib/src/fnc/search.rs @@ -0,0 +1,28 @@ +use crate::ctx::Context; +use crate::err::Error; +use crate::sql::Value; + +pub async fn highlight( + ctx: 
&Context<'_>, + (prefix, suffix, match_ref): (Value, Value, Value), +) -> Result { + if let Some(doc) = ctx.doc() { + if let Some(thg) = ctx.thing() { + if let Some(exe) = ctx.get_query_executor(&thg.tb) { + let txn = ctx.try_clone_transaction()?; + return exe.highlight(txn, thg, prefix, suffix, match_ref.clone(), doc).await; + } + } + } + Ok(Value::None) +} + +pub async fn offsets(ctx: &Context<'_>, (match_ref,): (Value,)) -> Result { + if let Some(thg) = ctx.thing() { + if let Some(exe) = ctx.get_query_executor(&thg.tb) { + let txn = ctx.try_clone_transaction()?; + return exe.offsets(txn, thg, match_ref.clone()).await; + } + } + Ok(Value::None) +} diff --git a/lib/src/idx/ft/analyzer/filter.rs b/lib/src/idx/ft/analyzer/filter.rs index 524888b7..9134cc18 100644 --- a/lib/src/idx/ft/analyzer/filter.rs +++ b/lib/src/idx/ft/analyzer/filter.rs @@ -179,7 +179,7 @@ pub(super) enum Term { #[cfg(test)] mod tests { - use crate::idx::ft::analyzer::tests::test_analyser; + use crate::idx::ft::analyzer::tests::test_analyzer; #[test] fn test_arabic_stemmer() { @@ -189,17 +189,17 @@ mod tests { "كلاب", "تحب", "الجر", "في", "حديق", "لكن", "كلب", "صغير", "يفضل", "نوم", "في", "سرير", "بدل", "من", "الجر", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(arabic);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(ar);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(ara);", input, &output, @@ -233,17 +233,17 @@ mod tests { "løb", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(danish);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(dan);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(da);", input, &output, @@ -257,17 +257,17 @@ mod tests { "hond", "houd", "ervan", "om", "in", "het", "park", "te", "renn", ",", "mar", "mijn", "klein", "hond", "slaapt", "liever", "in", "zijn", "mand", "dan", "te", "renn", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(dutch);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(nl);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(nld);", input, &output, @@ -281,17 +281,17 @@ mod tests { "teacher", "are", "often", "teach", ",", "but", "my", "favorit", "teacher", "prefer", "read", "in", "her", "spare", "time", "rather", "than", "teach", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(english);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(eng);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(en);", input, &output, @@ -305,17 +305,17 @@ mod tests { "le", "chien", "adorent", "cour", "dan", "le", "parc", ",", "mais", "mon", "pet", "chien", "aim", "plutôt", "se", "blott", "sur", "le", "canap", "que", "de", "cour", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(french);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS 
snowball(fr);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(fra);", input, &output, @@ -330,17 +330,17 @@ mod tests { "hund", "zieht", "es", "vor", ",", "auf", "dem", "sofa", "zu", "schlaf", ",", "statt", "zu", "lauf", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(german);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(de);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(deu);", input, &output, @@ -375,17 +375,17 @@ mod tests { "τρεχ", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(greek);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(ell);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(el);", input, &output, @@ -399,17 +399,17 @@ mod tests { "a", "kutya", "szeret", "futn", "a", "par", ",", "de", "az", "én", "kics", "kutya", "inkább", "alsz", "a", "kosar", ",", "mints", "fu", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(hungarian);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(hu);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(hun);", input, &output, @@ -423,17 +423,17 @@ mod tests { "i", "can", "aman", "corr", "nel", "parc", ",", "ma", "il", "mio", "piccol", "can", "prefer", "dorm", "nel", "suo", "cest", "piuttost", "che", "corr", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(italian);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(it);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(ita);", input, &output, @@ -467,17 +467,17 @@ mod tests { "løp", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(norwegian);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(no);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(nor);", input, &output, @@ -491,17 +491,17 @@ mod tests { "os", "cã", "ador", "corr", "no", "parqu", ",", "mas", "o", "meu", "pequen", "cã", "prefer", "dorm", "na", "sua", "cam", "em", "vez", "de", "corr", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(portuguese);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(pt);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(por);", input, &output, @@ -515,17 +515,17 @@ mod tests { "câin", "ador", "să", "alerg", "în", "parc", ",", "dar", "cățel", "meu", "prefer", "să", "doarm", "în", "coș", "lui", "decât", "să", "alerg", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(romanian);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS 
blank,class FILTERS snowball(ro);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(ron);", input, &output, @@ -557,17 +557,17 @@ mod tests { "бега", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(russian);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(ru);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(rus);", input, &output, @@ -581,17 +581,17 @@ mod tests { "los", "perr", "aman", "corr", "en", "el", "parqu", ",", "per", "mi", "pequeñ", "perr", "prefier", "dorm", "en", "su", "cam", "en", "lug", "de", "corr", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(spanish);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(es);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(spa);", input, &output, @@ -625,17 +625,17 @@ mod tests { "spring", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(swedish);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(sv);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(swe);", input, &output, @@ -672,17 +672,17 @@ mod tests { "லை", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(tamil);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(ta);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(tam);", input, &output, @@ -696,17 +696,17 @@ mod tests { "köpek", "park", "koşma", "sever", ",", "am", "be", "küçük", "köpek", "koşmak", "yatak", "uyuma", "tercih", "eder", ".", ]; - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(turkish);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(tr);", input, &output, ); - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS snowball(tur);", input, &output, @@ -715,7 +715,7 @@ mod tests { #[test] fn test_ngram() { - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS lowercase,ngram(2,3);", "Ālea iacta est", &["āl", "āle", "le", "lea", "ia", "iac", "ac", "act", "ct", "cta", "es", "est"], @@ -724,7 +724,7 @@ mod tests { #[test] fn test_edgengram() { - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS lowercase,edgengram(2,3);", "Ālea iacta est", &["āl", "āle", "ia", "iac", "es", "est"], diff --git a/lib/src/idx/ft/analyzer/mod.rs b/lib/src/idx/ft/analyzer/mod.rs index 12a3a761..e70b9da6 100644 --- a/lib/src/idx/ft/analyzer/mod.rs +++ b/lib/src/idx/ft/analyzer/mod.rs @@ -1,12 +1,13 @@ use crate::err::Error; use crate::idx::ft::analyzer::tokenizer::{Tokenizer, Tokens}; use crate::idx::ft::doclength::DocLength; +use crate::idx::ft::offsets::{Offset, OffsetRecords}; use crate::idx::ft::postings::TermFrequency; use crate::idx::ft::terms::{TermId, Terms}; use crate::kvs::Transaction; use 
crate::sql::statements::DefineAnalyzerStatement; use crate::sql::tokenizer::Tokenizer as SqlTokenizer; -use crate::sql::Array; +use crate::sql::{Array, Value}; use filter::Filter; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; @@ -40,46 +41,47 @@ impl Analyzer { t: &Terms, tx: &mut Transaction, query_string: String, - ) -> Result<Vec<Option<TermId>>, Error> { - let tokens = self.analyse(query_string); + ) -> Result<(Vec<TermId>, bool), Error> { + let tokens = self.analyze(query_string); // We first collect every unique term // as it can contain duplicates let mut terms = HashSet::new(); for token in tokens.list() { terms.insert(token); } + let mut missing = false; // Now we can extract the term ids let mut res = Vec::with_capacity(terms.len()); for term in terms { - let term_id = t.get_term_id(tx, tokens.get_token_string(term)).await?; - res.push(term_id); + if let Some(term_id) = t.get_term_id(tx, tokens.get_token_string(term)).await? { + res.push(term_id); + } else { + missing = true; + } } - Ok(res) + Ok((res, missing)) } /// This method is used for indexing. /// It will create new term ids for terms that do not already exist. pub(super) async fn extract_terms_with_frequencies( &self, - t: &mut Terms, + terms: &mut Terms, tx: &mut Transaction, field_content: &Array, ) -> Result<(DocLength, Vec<(TermId, TermFrequency)>), Error> { - let mut doc_length = 0; + let mut dl = 0; // Let's first collect all the inputs, and collect the tokens. // We need to store them because everything after is zero-copy - let mut inputs = Vec::with_capacity(field_content.0.len()); - for v in &field_content.0 { - let input = v.to_owned().convert_to_string()?; - let tks = self.analyse(input); - inputs.push(tks); - } + let mut inputs = vec![]; + self.analyze_content(field_content, &mut inputs)?; // We then collect every unique term and count the frequency - let mut terms = HashMap::new(); - for tokens in &inputs { - for token in tokens.list() { - doc_length += 1; - match terms.entry(tokens.get_token_string(token)) { + let mut tf: HashMap<&str, TermFrequency> = HashMap::new(); + for tks in &inputs { + for tk in tks.list() { + dl += 1; + let s = tks.get_token_string(tk); + match tf.entry(s) { Entry::Vacant(e) => { e.insert(1); } @@ -89,15 +91,76 @@ impl Analyzer { } } } - // Now we can extract the term ids - let mut res = Vec::with_capacity(terms.len()); - for (term, freq) in terms { - res.push((t.resolve_term_id(tx, term).await?, freq)); + // Now we can resolve the term ids + let mut tfid = Vec::with_capacity(tf.len()); + for (t, f) in tf { + tfid.push((terms.resolve_term_id(tx, t).await?, f)); } - Ok((doc_length, res)) + Ok((dl, tfid)) } - fn analyse(&self, input: String) -> Tokens { + /// This method is used for indexing. + /// It will create new term ids for terms that do not already exist. + pub(super) async fn extract_terms_with_frequencies_with_offsets( + &self, + terms: &mut Terms, + tx: &mut Transaction, + field_content: &Array, + ) -> Result<(DocLength, Vec<(TermId, TermFrequency)>, Vec<(TermId, OffsetRecords)>), Error> { + let mut dl = 0; + // Let's first collect all the inputs, and collect the tokens. 
+ // We need to store them because everything after is zero-copy + let mut inputs = Vec::with_capacity(field_content.len()); + self.analyze_content(field_content, &mut inputs)?; + // We then collect every unique terms and count the frequency and extract the offsets + let mut tfos: HashMap<&str, Vec> = HashMap::new(); + for (i, tks) in inputs.iter().enumerate() { + for tk in tks.list() { + dl += 1; + let s = tks.get_token_string(tk); + let o = tk.new_offset(i as u32); + match tfos.entry(s) { + Entry::Vacant(e) => { + e.insert(vec![o]); + } + Entry::Occupied(mut e) => e.get_mut().push(o), + } + } + } + + // Now we can resolve the term ids + let mut tfid = Vec::with_capacity(tfos.len()); + let mut osid = Vec::with_capacity(tfos.len()); + for (t, o) in tfos { + let id = terms.resolve_term_id(tx, t).await?; + tfid.push((id, o.len() as TermFrequency)); + osid.push((id, OffsetRecords(o))); + } + Ok((dl, tfid, osid)) + } + + fn analyze_content(&self, field_content: &Array, tks: &mut Vec) -> Result<(), Error> { + for v in &field_content.0 { + self.analyze_value(v, tks); + } + Ok(()) + } + + fn analyze_value(&self, val: &Value, tks: &mut Vec) { + match val { + Value::Strand(s) => tks.push(self.analyze(s.0.clone())), + Value::Number(n) => tks.push(self.analyze(n.to_string())), + Value::Bool(b) => tks.push(self.analyze(b.to_string())), + Value::Array(a) => { + for v in &a.0 { + self.analyze_value(v, tks); + } + } + _ => {} + } + } + + fn analyze(&self, input: String) -> Tokens { if let Some(t) = &self.t { if !input.is_empty() { let t = Tokenizer::tokenize(t, input); @@ -113,11 +176,11 @@ mod tests { use super::Analyzer; use crate::sql::statements::define::analyzer; - pub(super) fn test_analyser(def: &str, input: &str, expected: &[&str]) { + pub(super) fn test_analyzer(def: &str, input: &str, expected: &[&str]) { let (_, az) = analyzer(def).unwrap(); let a: Analyzer = az.into(); - let tokens = a.analyse(input.to_string()); + let tokens = a.analyze(input.to_string()); let mut res = vec![]; for t in tokens.list() { res.push(tokens.get_token_string(t)); diff --git a/lib/src/idx/ft/analyzer/tokenizer.rs b/lib/src/idx/ft/analyzer/tokenizer.rs index 862f8f79..586d7ad0 100644 --- a/lib/src/idx/ft/analyzer/tokenizer.rs +++ b/lib/src/idx/ft/analyzer/tokenizer.rs @@ -1,4 +1,5 @@ use crate::idx::ft::analyzer::filter::{Filter, FilterResult, Term}; +use crate::idx::ft::offsets::{Offset, Position}; use crate::sql::tokenizer::Tokenizer as SqlTokenizer; pub(super) struct Tokens { @@ -35,18 +36,19 @@ impl Tokens { match fr { FilterResult::Term(t) => match t { Term::Unchanged => tks.push(tk), - Term::NewTerm(s) => tks.push(Token::String(s)), + Term::NewTerm(s) => tks.push(tk.new_token(s)), }, FilterResult::Terms(ts) => { - let mut tk = Some(tk); + let mut already_pushed = false; for t in ts { match t { Term::Unchanged => { - if let Some(tk) = tk.take() { - tks.push(tk) + if !already_pushed { + tks.push(tk.clone()); + already_pushed = true; } } - Term::NewTerm(s) => tks.push(Token::String(s)), + Term::NewTerm(s) => tks.push(tk.new_token(s)), } } } @@ -66,22 +68,36 @@ impl Tokens { #[derive(Clone, Debug, PartialOrd, PartialEq, Eq, Ord, Hash)] pub(super) enum Token { - Ref(usize, usize), - String(String), + Ref(Position, Position), + String(Position, Position, String), } impl Token { + fn new_token(&self, t: String) -> Self { + match self { + Token::Ref(s, e) => Token::String(*s, *e, t), + Token::String(s, e, _) => Token::String(*s, *e, t), + } + } + + pub(super) fn new_offset(&self, i: u32) -> Offset { + match self { + 
Token::Ref(s, e) => Offset::new(i, *s, *e), + Token::String(s, e, _) => Offset::new(i, *s, *e), + } + } + fn is_empty(&self) -> bool { match self { Token::Ref(start, end) => start == end, - Token::String(s) => s.is_empty(), + Token::String(_, _, s) => s.is_empty(), } } pub(super) fn get_str<'a>(&'a self, i: &'a str) -> &str { match self { - Token::Ref(s, e) => &i[*s..*e], - Token::String(s) => s, + Token::Ref(s, e) => &i[(*s as usize)..(*e as usize)], + Token::String(_, _, s) => s, } } } @@ -128,10 +144,10 @@ impl Tokenizer { // If the character is not valid for indexing (space, control...) // Then we increase the last position to the next character if !is_valid { - last_pos += c.len_utf8(); + last_pos += c.len_utf8() as Position; } } - current_pos += c.len_utf8(); + current_pos += c.len_utf8() as Position; } if current_pos != last_pos { t.push(Token::Ref(last_pos, current_pos)); @@ -229,11 +245,11 @@ impl Splitter { #[cfg(test)] mod tests { - use crate::idx::ft::analyzer::tests::test_analyser; + use crate::idx::ft::analyzer::tests::test_analyzer; #[test] fn test_tokenize_blank_class() { - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class FILTERS lowercase", "Abc12345xYZ DL1809 item123456 978-3-16-148410-0 1HGCM82633A123456", &[ @@ -245,7 +261,7 @@ mod tests { #[test] fn test_tokenize_source_code() { - test_analyser( + test_analyzer( "DEFINE ANALYZER test TOKENIZERS blank,class,camel,punct FILTERS lowercase", r#"struct MyRectangle { // specified by corners diff --git a/lib/src/idx/ft/highlighter.rs b/lib/src/idx/ft/highlighter.rs new file mode 100644 index 00000000..6cac00de --- /dev/null +++ b/lib/src/idx/ft/highlighter.rs @@ -0,0 +1,141 @@ +use crate::err::Error; +use crate::idx::ft::offsets::{Offset, Position}; +use crate::sql::{Array, Idiom, Object, Value}; +use std::collections::btree_map::Entry as BEntry; +use std::collections::hash_map::Entry as HEntry; +use std::collections::BTreeMap; +use std::collections::HashMap; +use std::default::Default; + +pub(super) struct Highlighter { + prefix: Vec, + suffix: Vec, + fields: Vec<(Idiom, Value)>, + offseter: Offseter, +} + +impl Highlighter { + pub(super) fn new(prefix: Value, suffix: Value, idiom: &Idiom, doc: &Value) -> Self { + let prefix = prefix.to_raw_string().chars().collect(); + let suffix = suffix.to_raw_string().chars().collect(); + // Extract the fields we want to highlight + let fields = doc.walk(idiom); + Self { + fields, + prefix, + suffix, + offseter: Offseter::default(), + } + } + + pub(super) fn highlight(&mut self, os: Vec) { + self.offseter.highlight(os); + } + + fn extract(val: Value, vals: &mut Vec>) { + match val { + Value::Strand(s) => vals.push(Some(s.0)), + Value::Number(n) => vals.push(Some(n.to_string())), + Value::Bool(b) => vals.push(Some(b.to_string())), + Value::Array(a) => { + for v in a.0 { + Self::extract(v, vals); + } + } + _ => vals.push(None), + } + } +} + +impl TryFrom for Value { + type Error = Error; + + fn try_from(hl: Highlighter) -> Result { + if hl.fields.is_empty() { + return Ok(Self::None); + } + let mut vals = vec![]; + for (_, f) in hl.fields { + Highlighter::extract(f, &mut vals); + } + let mut res = Vec::with_capacity(vals.len()); + for (idx, val) in vals.into_iter().enumerate() { + let idx = idx as u32; + if let Some(v) = val { + if let Some(m) = hl.offseter.offsets.get(&idx) { + let mut v: Vec = v.chars().collect(); + let mut d = 0; + for (s, e) in m { + let p = (*s as usize) + d; + v.splice(p..p, hl.prefix.clone()); + d += hl.prefix.len(); + let p = (*e as 
usize) + d; + v.splice(p..p, hl.suffix.clone()); + d += hl.suffix.len(); + } + let s: String = v.iter().collect(); + res.push(Value::from(s)); + } else { + res.push(Value::from(v)); + } + } + } + Ok(match res.len() { + 0 => Value::None, + 1 => res.remove(0), + _ => Value::from(res), + }) + } +} + +#[derive(Default)] +pub(super) struct Offseter { + offsets: HashMap>, +} + +impl Offseter { + pub(super) fn highlight(&mut self, os: Vec) { + for o in os { + match self.offsets.entry(o.index) { + HEntry::Occupied(mut e) => match e.get_mut().entry(o.start) { + BEntry::Vacant(e) => { + e.insert(o.end); + } + BEntry::Occupied(mut e) => { + if o.end.gt(e.get()) { + e.insert(o.end); + } + } + }, + HEntry::Vacant(e) => { + e.insert(BTreeMap::from([(o.start, o.end)])); + } + } + } + } +} + +impl TryFrom for Value { + type Error = Error; + + fn try_from(or: Offseter) -> Result { + if or.offsets.is_empty() { + return Ok(Self::None); + } + let mut res = BTreeMap::default(); + for (idx, offsets) in or.offsets { + let mut r = Vec::with_capacity(offsets.len()); + for (s, e) in offsets { + let mut o = BTreeMap::default(); + o.insert("s".to_string(), Value::from(s)); + o.insert("e".to_string(), Value::from(e)); + r.push(Value::Object(Object::from(o))); + } + res.insert(idx.to_string(), Value::Array(Array::from(r))); + } + Ok(match res.len() { + 0 => Value::None, + _ => Value::from(Object::from(res)), + }) + } +} diff --git a/lib/src/idx/ft/mod.rs b/lib/src/idx/ft/mod.rs index d98de5d6..cb50f0fd 100644 --- a/lib/src/idx/ft/mod.rs +++ b/lib/src/idx/ft/mod.rs @@ -1,6 +1,8 @@ pub(crate) mod analyzer; pub(crate) mod docids; mod doclength; +mod highlighter; +mod offsets; mod postings; mod scorer; mod termdocs; @@ -10,26 +12,32 @@ use crate::err::Error; use crate::idx::ft::analyzer::Analyzer; use crate::idx::ft::docids::DocIds; use crate::idx::ft::doclength::DocLengths; +use crate::idx::ft::highlighter::{Highlighter, Offseter}; +use crate::idx::ft::offsets::Offsets; use crate::idx::ft::postings::Postings; use crate::idx::ft::scorer::{BM25Scorer, Score}; use crate::idx::ft::termdocs::TermDocs; use crate::idx::ft::terms::{TermId, Terms}; use crate::idx::{btree, IndexKeyBase, SerdeState}; use crate::kvs::{Key, Transaction}; +use crate::sql::scoring::Scoring; use crate::sql::statements::DefineAnalyzerStatement; -use crate::sql::{Array, Object, Thing, Value}; +use crate::sql::{Array, Idiom, Object, Thing, Value}; use roaring::treemap::IntoIter; use roaring::RoaringTreemap; use serde::{Deserialize, Serialize}; use std::ops::BitAnd; +pub(crate) type MatchRef = u8; + pub(crate) struct FtIndex { analyzer: Analyzer, state_key: Key, index_key_base: IndexKeyBase, state: State, - bm25: Bm25Params, - btree_default_order: u32, + bm25: Option, + order: u32, + highlighting: bool, } #[derive(Clone)] @@ -78,7 +86,9 @@ impl FtIndex { tx: &mut Transaction, az: DefineAnalyzerStatement, index_key_base: IndexKeyBase, - btree_default_order: u32, + order: u32, + scoring: &Scoring, + hl: bool, ) -> Result { let state_key: Key = index_key_base.new_bs_key(); let state: State = if let Some(val) = tx.get(state_key.clone()).await? 
{ @@ -86,22 +96,34 @@ impl FtIndex { } else { State::default() }; + let mut bm25 = None; + if let Scoring::Bm { + k1, + b, + } = scoring + { + bm25 = Some(Bm25Params { + k1: *k1, + b: *b, + }); + } Ok(Self { state, state_key, index_key_base, - bm25: Bm25Params::default(), - btree_default_order, + bm25, + order, + highlighting: hl, analyzer: az.into(), }) } async fn doc_ids(&self, tx: &mut Transaction) -> Result { - DocIds::new(tx, self.index_key_base.clone(), self.btree_default_order).await + DocIds::new(tx, self.index_key_base.clone(), self.order).await } async fn terms(&self, tx: &mut Transaction) -> Result { - Terms::new(tx, self.index_key_base.clone(), self.btree_default_order).await + Terms::new(tx, self.index_key_base.clone(), self.order).await } fn term_docs(&self) -> TermDocs { @@ -109,11 +131,15 @@ impl FtIndex { } async fn doc_lengths(&self, tx: &mut Transaction) -> Result { - DocLengths::new(tx, self.index_key_base.clone(), self.btree_default_order).await + DocLengths::new(tx, self.index_key_base.clone(), self.order).await } async fn postings(&self, tx: &mut Transaction) -> Result { - Postings::new(tx, self.index_key_base.clone(), self.btree_default_order).await + Postings::new(tx, self.index_key_base.clone(), self.order).await + } + + fn offsets(&self) -> Offsets { + Offsets::new(self.index_key_base.clone()) } pub(crate) async fn remove_document( @@ -140,7 +166,7 @@ impl FtIndex { let mut p = self.postings(tx).await?; let mut t = self.terms(tx).await?; let td = self.term_docs(); - for term_id in term_list { + for term_id in &term_list { p.remove_posting(tx, term_id, doc_id).await?; // if the term is not present in any document in the index, we can remove it let doc_count = td.remove_doc(tx, term_id, doc_id).await?; @@ -148,6 +174,14 @@ impl FtIndex { t.remove_term_id(tx, term_id).await?; } } + // Remove the offsets if any + if self.highlighting { + let o = self.offsets(); + for term_id in term_list { + // TODO?: Removal can be done with a prefix on doc_id + o.remove_offsets(tx, doc_id, term_id).await?; + } + } t.finish(tx).await?; p.finish(tx).await?; } @@ -168,10 +202,19 @@ impl FtIndex { let resolved = d.resolve_doc_id(tx, rid.into()).await?; let doc_id = *resolved.doc_id(); - // Extract the doc_lengths, terms en frequencies + // Extract the doc_lengths, terms en frequencies (and offset) let mut t = self.terms(tx).await?; - let (doc_length, terms_and_frequencies) = - self.analyzer.extract_terms_with_frequencies(&mut t, tx, field_content).await?; + let (doc_length, terms_and_frequencies, offsets) = if self.highlighting { + let (dl, tf, ofs) = self + .analyzer + .extract_terms_with_frequencies_with_offsets(&mut t, tx, field_content) + .await?; + (dl, tf, Some(ofs)) + } else { + let (dl, tf) = + self.analyzer.extract_terms_with_frequencies(&mut t, tx, field_content).await?; + (dl, tf, None) + }; // Set the doc length let mut l = self.doc_lengths(tx).await?; @@ -204,7 +247,7 @@ impl FtIndex { } // Remove any remaining postings - if let Some(old_term_ids) = old_term_ids { + if let Some(old_term_ids) = &old_term_ids { for old_term_id in old_term_ids { p.remove_posting(tx, old_term_id, doc_id).await?; let doc_count = term_docs.remove_doc(tx, old_term_id, doc_id).await?; @@ -215,6 +258,24 @@ impl FtIndex { } } + if self.highlighting { + let o = self.offsets(); + // Set the offset if any + if let Some(ofs) = offsets { + if !ofs.is_empty() { + for (tid, or) in ofs { + o.set_offsets(tx, doc_id, tid, or).await?; + } + } + } + // In case of an update, w remove the offset for the terms 
that does not exist anymore + if let Some(old_term_ids) = old_term_ids { + for old_term_id in old_term_ids { + o.remove_offsets(tx, doc_id, old_term_id).await?; + } + } + } + // Stores the term list for this doc_id tx.set(term_ids_key, terms_ids.try_to_val()?).await?; @@ -233,52 +294,63 @@ impl FtIndex { Ok(()) } + pub(super) async fn extract_terms( + &self, + tx: &mut Transaction, + query_string: String, + ) -> Result, Error> { + let t = self.terms(tx).await?; + let (terms, _) = self.analyzer.extract_terms(&t, tx, query_string).await?; + Ok(terms) + } + pub(super) async fn search( &self, tx: &mut Transaction, query_string: String, - ) -> Result, Error> { + ) -> Result<(Vec, Option), Error> { let t = self.terms(tx).await?; let td = self.term_docs(); - let terms = self.analyzer.extract_terms(&t, tx, query_string).await?; + let (terms, missing) = self.analyzer.extract_terms(&t, tx, query_string).await?; + if missing { + // If any term does not exists, as we are doing an AND query, + // we can return an empty results set + return Ok((terms, None)); + } let mut hits: Option = None; let mut terms_docs = Vec::with_capacity(terms.len()); - for term_id in terms { - if let Some(term_id) = term_id { - if let Some(term_docs) = td.get_docs(tx, term_id).await? { - if let Some(h) = hits { - hits = Some(h.bitand(&term_docs)); - } else { - hits = Some(term_docs.clone()); - } - terms_docs.push((term_id, term_docs)); - continue; + for term_id in &terms { + if let Some(term_docs) = td.get_docs(tx, *term_id).await? { + if let Some(h) = hits { + hits = Some(h.bitand(&term_docs)); + } else { + hits = Some(term_docs.clone()); } + terms_docs.push((*term_id, term_docs)); } - return Ok(None); } if let Some(hits) = hits { if !hits.is_empty() { let postings = self.postings(tx).await?; let doc_lengths = self.doc_lengths(tx).await?; - // TODO: Scoring should be optional - let scorer = BM25Scorer::new( - doc_lengths, - self.state.total_docs_lengths, - self.state.doc_count, - self.bm25.clone(), - ); + + let mut scorer = None; + if let Some(bm25) = &self.bm25 { + scorer = Some(BM25Scorer::new( + doc_lengths, + self.state.total_docs_lengths, + self.state.doc_count, + bm25.clone(), + )); + } let doc_ids = self.doc_ids(tx).await?; - return Ok(Some(HitsIterator::new( - doc_ids, - postings, - hits, - terms_docs, - Some(scorer), - ))); + return Ok(( + terms, + Some(HitsIterator::new(doc_ids, postings, hits, terms_docs, scorer)), + )); } } - Ok(None) + Ok((terms, None)) } pub(super) async fn match_id_value( @@ -303,6 +375,55 @@ impl FtIndex { Ok(false) } + #[allow(clippy::too_many_arguments)] + pub(super) async fn highlight( + &self, + tx: &mut Transaction, + thg: &Thing, + terms: &Vec, + prefix: Value, + suffix: Value, + idiom: &Idiom, + doc: &Value, + ) -> Result { + let doc_key: Key = thg.into(); + let doc_ids = self.doc_ids(tx).await?; + if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? { + let o = self.offsets(); + let mut hl = Highlighter::new(prefix, suffix, idiom, doc); + for term_id in terms { + let o = o.get_offsets(tx, doc_id, *term_id).await?; + if let Some(o) = o { + hl.highlight(o.0); + } + } + return hl.try_into(); + } + Ok(Value::None) + } + + pub(super) async fn extract_offsets( + &self, + tx: &mut Transaction, + thg: &Thing, + terms: &Vec, + ) -> Result { + let doc_key: Key = thg.into(); + let doc_ids = self.doc_ids(tx).await?; + if let Some(doc_id) = doc_ids.get_doc_id(tx, doc_key).await? 
{ + let o = self.offsets(); + let mut or = Offseter::default(); + for term_id in terms { + let o = o.get_offsets(tx, doc_id, *term_id).await?; + if let Some(o) = o { + or.highlight(o.0); + } + } + return or.try_into(); + } + Ok(Value::None) + } + pub(crate) async fn statistics(&self, tx: &mut Transaction) -> Result { // TODO do parallel execution Ok(Statistics { @@ -372,6 +493,7 @@ mod tests { use crate::idx::ft::{FtIndex, HitsIterator, Score}; use crate::idx::IndexKeyBase; use crate::kvs::{Datastore, Transaction}; + use crate::sql::scoring::Scoring; use crate::sql::statements::define::analyzer; use crate::sql::{Array, Thing}; use std::collections::HashMap; @@ -410,10 +532,16 @@ mod tests { { // Add one document let mut tx = ds.transaction(true, false).await.unwrap(); - let mut fti = - FtIndex::new(&mut tx, az.clone(), IndexKeyBase::default(), default_btree_order) - .await - .unwrap(); + let mut fti = FtIndex::new( + &mut tx, + az.clone(), + IndexKeyBase::default(), + default_btree_order, + &Scoring::default(), + false, + ) + .await + .unwrap(); fti.index_document(&mut tx, &doc1, &Array::from(vec!["hello the world"])) .await .unwrap(); @@ -423,10 +551,16 @@ mod tests { { // Add two documents let mut tx = ds.transaction(true, false).await.unwrap(); - let mut fti = - FtIndex::new(&mut tx, az.clone(), IndexKeyBase::default(), default_btree_order) - .await - .unwrap(); + let mut fti = FtIndex::new( + &mut tx, + az.clone(), + IndexKeyBase::default(), + default_btree_order, + &Scoring::default(), + false, + ) + .await + .unwrap(); fti.index_document(&mut tx, &doc2, &Array::from(vec!["a yellow hello"])).await.unwrap(); fti.index_document(&mut tx, &doc3, &Array::from(vec!["foo bar"])).await.unwrap(); tx.commit().await.unwrap(); @@ -434,10 +568,16 @@ mod tests { { let mut tx = ds.transaction(true, false).await.unwrap(); - let fti = - FtIndex::new(&mut tx, az.clone(), IndexKeyBase::default(), default_btree_order) - .await - .unwrap(); + let fti = FtIndex::new( + &mut tx, + az.clone(), + IndexKeyBase::default(), + default_btree_order, + &Scoring::default(), + false, + ) + .await + .unwrap(); // Check the statistics let statistics = fti.statistics(&mut tx).await.unwrap(); @@ -447,72 +587,85 @@ mod tests { assert_eq!(statistics.doc_lengths.keys_count, 3); // Search & score - let i = fti.search(&mut tx, "hello".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "hello".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let i = fti.search(&mut tx, "world".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "world".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc1, Some(0.4859746))]).await; - let i = fti.search(&mut tx, "yellow".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "yellow".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc2, Some(0.4859746))]).await; - let i = fti.search(&mut tx, "foo".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "foo".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc3, Some(0.56902087))]).await; - let i = fti.search(&mut tx, "bar".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "bar".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc3, Some(0.56902087))]).await; - let i = fti.search(&mut tx, "dummy".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "dummy".to_string()).await.unwrap(); assert!(i.is_none()); } { // Reindex one document let mut tx = ds.transaction(true, 
false).await.unwrap(); - let mut fti = - FtIndex::new(&mut tx, az.clone(), IndexKeyBase::default(), default_btree_order) - .await - .unwrap(); + let mut fti = FtIndex::new( + &mut tx, + az.clone(), + IndexKeyBase::default(), + default_btree_order, + &Scoring::default(), + false, + ) + .await + .unwrap(); fti.index_document(&mut tx, &doc3, &Array::from(vec!["nobar foo"])).await.unwrap(); tx.commit().await.unwrap(); // We can still find 'foo' let mut tx = ds.transaction(false, false).await.unwrap(); - let i = fti.search(&mut tx, "foo".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "foo".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc3, Some(0.56902087))]).await; // We can't anymore find 'bar' - let i = fti.search(&mut tx, "bar".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "bar".to_string()).await.unwrap(); assert!(i.is_none()); // We can now find 'nobar' - let i = fti.search(&mut tx, "nobar".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "nobar".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc3, Some(0.56902087))]).await; } { // Remove documents let mut tx = ds.transaction(true, false).await.unwrap(); - let mut fti = - FtIndex::new(&mut tx, az.clone(), IndexKeyBase::default(), default_btree_order) - .await - .unwrap(); + let mut fti = FtIndex::new( + &mut tx, + az.clone(), + IndexKeyBase::default(), + default_btree_order, + &Scoring::default(), + false, + ) + .await + .unwrap(); fti.remove_document(&mut tx, &doc1).await.unwrap(); fti.remove_document(&mut tx, &doc2).await.unwrap(); fti.remove_document(&mut tx, &doc3).await.unwrap(); tx.commit().await.unwrap(); let mut tx = ds.transaction(false, false).await.unwrap(); - let i = fti.search(&mut tx, "hello".to_string()).await.unwrap(); - assert!(i.is_none()); + let (v, h) = fti.search(&mut tx, "hello".to_string()).await.unwrap(); + assert!(v.is_empty()); + assert!(h.is_none()); - let i = fti.search(&mut tx, "foo".to_string()).await.unwrap(); - assert!(i.is_none()); + let (v, h) = fti.search(&mut tx, "foo".to_string()).await.unwrap(); + assert!(v.is_empty()); + assert!(h.is_none()); } } - #[test(tokio::test)] - async fn test_ft_index_bm_25() { + async fn test_ft_index_bm_25(hl: bool) { // The function `extract_sorted_terms_with_frequencies` is non-deterministic. // the inner structures (BTrees) are built with the same terms and frequencies, // but the insertion order is different, ending up in different BTree structures. 
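For context, the user-facing surface of this change is the pair of `search::highlight` and `search::offsets` functions added in `lib/src/fnc/search.rs`, together with the match-reference form of the matches operator (`@{match_ref}@`) that the planner records in `IndexMap.terms`. A sketch of a query exercising them follows; the `article` table, the `body` field, and the assumption that a full-text search index with highlighting enabled is already defined on that field are illustrative only and not part of this diff:

SELECT id,
	search::highlight('<em>', '</em>', 1) AS excerpt,
	search::offsets(1) AS positions
FROM article
WHERE body @1@ 'hello world';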
@@ -529,10 +682,16 @@ mod tests { let default_btree_order = 5; { let mut tx = ds.transaction(true, false).await.unwrap(); - let mut fti = - FtIndex::new(&mut tx, az.clone(), IndexKeyBase::default(), default_btree_order) - .await - .unwrap(); + let mut fti = FtIndex::new( + &mut tx, + az.clone(), + IndexKeyBase::default(), + default_btree_order, + &Scoring::default(), + hl, + ) + .await + .unwrap(); fti.index_document( &mut tx, &doc1, @@ -566,10 +725,16 @@ mod tests { { let mut tx = ds.transaction(true, false).await.unwrap(); - let fti = - FtIndex::new(&mut tx, az.clone(), IndexKeyBase::default(), default_btree_order) - .await - .unwrap(); + let fti = FtIndex::new( + &mut tx, + az.clone(), + IndexKeyBase::default(), + default_btree_order, + &Scoring::default(), + hl, + ) + .await + .unwrap(); let statistics = fti.statistics(&mut tx).await.unwrap(); assert_eq!(statistics.terms.keys_count, 17); @@ -577,7 +742,7 @@ mod tests { assert_eq!(statistics.doc_ids.keys_count, 4); assert_eq!(statistics.doc_lengths.keys_count, 4); - let i = fti.search(&mut tx, "the".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "the".to_string()).await.unwrap(); check_hits( i, &mut tx, @@ -590,7 +755,7 @@ mod tests { ) .await; - let i = fti.search(&mut tx, "dog".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "dog".to_string()).await.unwrap(); check_hits( i, &mut tx, @@ -598,27 +763,37 @@ mod tests { ) .await; - let i = fti.search(&mut tx, "fox".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "fox".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let i = fti.search(&mut tx, "over".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "over".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let i = fti.search(&mut tx, "lazy".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "lazy".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let i = fti.search(&mut tx, "jumped".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "jumped".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; - let i = fti.search(&mut tx, "nothing".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "nothing".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc3, Some(0.87105393))]).await; - let i = fti.search(&mut tx, "animals".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "animals".to_string()).await.unwrap(); check_hits(i, &mut tx, vec![(&doc4, Some(0.92279965))]).await; - let i = fti.search(&mut tx, "dummy".to_string()).await.unwrap(); + let (_, i) = fti.search(&mut tx, "dummy".to_string()).await.unwrap(); assert!(i.is_none()); } } } + + #[test(tokio::test)] + async fn test_ft_index_bm_25_without_highlighting() { + test_ft_index_bm_25(false).await; + } + + #[test(tokio::test)] + async fn test_ft_index_bm_25_with_highlighting() { + test_ft_index_bm_25(true).await; + } } diff --git a/lib/src/idx/ft/offsets.rs b/lib/src/idx/ft/offsets.rs new file mode 100644 index 00000000..59cce4f4 --- /dev/null +++ b/lib/src/idx/ft/offsets.rs @@ -0,0 +1,156 @@ +use crate::err::Error; +use crate::idx::ft::docids::DocId; +use crate::idx::ft::terms::TermId; +use crate::idx::IndexKeyBase; +use crate::kvs::{Transaction, Val}; + +pub(super) type Position = u32; + +pub(super) struct Offsets { + index_key_base: IndexKeyBase, +} + 
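+// The `Offsets` store keeps one `OffsetRecords` entry per (doc_id, term_id) pair under the new
+// `bo` index key, so that highlighting and `search::offsets` can later recover where each matched
+// term occurred in the indexed field values.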
+impl Offsets { + pub(super) fn new(index_key_base: IndexKeyBase) -> Self { + Self { + index_key_base, + } + } + + pub(super) async fn set_offsets( + &self, + tx: &mut Transaction, + doc_id: DocId, + term_id: TermId, + offsets: OffsetRecords, + ) -> Result<(), Error> { + let key = self.index_key_base.new_bo_key(doc_id, term_id); + let val: Val = offsets.try_into()?; + tx.set(key, val).await?; + Ok(()) + } + + pub(super) async fn get_offsets( + &self, + tx: &mut Transaction, + doc_id: DocId, + term_id: TermId, + ) -> Result, Error> { + let key = self.index_key_base.new_bo_key(doc_id, term_id); + if let Some(val) = tx.get(key).await? { + let offsets = val.try_into()?; + Ok(Some(offsets)) + } else { + Ok(None) + } + } + + pub(super) async fn remove_offsets( + &self, + tx: &mut Transaction, + doc_id: DocId, + term_id: TermId, + ) -> Result<(), Error> { + let key = self.index_key_base.new_bo_key(doc_id, term_id); + tx.del(key).await + } +} + +#[derive(Clone, Debug, PartialEq)] +pub(super) struct Offset { + pub(super) index: u32, + pub(super) start: Position, + pub(super) end: Position, +} + +impl Offset { + pub(super) fn new(index: u32, start: Position, end: Position) -> Self { + Self { + index, + start, + end, + } + } +} + +#[derive(Clone, Debug, PartialEq)] +pub(super) struct OffsetRecords(pub(super) Vec); + +impl TryFrom for Val { + type Error = Error; + + fn try_from(offsets: OffsetRecords) -> Result { + // We build a unique vector with every values (start and offset). + let mut decompressed = Vec::new(); + // The first push the size of the index, + // so we can rebuild the OffsetsRecord on deserialization. + decompressed.push(offsets.0.len() as u32); + // We want the value to be more or less sorted so the RLE compression + // will be more effective + // Indexes are likely to be very small + for o in &offsets.0 { + decompressed.push(o.index); + } + // `starts` and `offsets` are likely to be ascending + for o in &offsets.0 { + decompressed.push(o.start); + decompressed.push(o.end); + } + Ok(bincode::serialize(&decompressed)?) 
+ } +} + +impl TryFrom for OffsetRecords { + type Error = Error; + + fn try_from(val: Val) -> Result { + if val.is_empty() { + return Ok(Self(vec![])); + } + let decompressed: Vec = bincode::deserialize(&val)?; + let mut iter = decompressed.iter(); + let s = *iter.next().ok_or(Error::CorruptedIndex)?; + let mut indexes = Vec::with_capacity(s as usize); + for _ in 0..s { + let index = *iter.next().ok_or(Error::CorruptedIndex)?; + indexes.push(index); + } + let mut res = Vec::with_capacity(s as usize); + for index in indexes { + let start = *iter.next().ok_or(Error::CorruptedIndex)?; + let end = *iter.next().ok_or(Error::CorruptedIndex)?; + res.push(Offset::new(index, start, end)); + } + Ok(OffsetRecords(res)) + } +} + +#[cfg(test)] +mod tests { + use crate::idx::ft::offsets::{Offset, OffsetRecords}; + use crate::kvs::Val; + + #[test] + fn test_offset_records() { + let o = OffsetRecords(vec![ + Offset { + index: 0, + start: 1, + end: 2, + }, + Offset { + index: 0, + start: 11, + end: 22, + }, + Offset { + index: 1, + start: 3, + end: 4, + }, + ]); + let v: Val = o.clone().try_into().unwrap(); + let o2 = v.try_into().unwrap(); + assert_eq!(o, o2) + } +} diff --git a/lib/src/idx/ft/postings.rs b/lib/src/idx/ft/postings.rs index 6cad9455..17470eb0 100644 --- a/lib/src/idx/ft/postings.rs +++ b/lib/src/idx/ft/postings.rs @@ -18,7 +18,7 @@ impl Postings { pub(super) async fn new( tx: &mut Transaction, index_key_base: IndexKeyBase, - default_btree_order: u32, + order: u32, ) -> Result { let keys = PostingsKeyProvider { index_key_base: index_key_base.clone(), @@ -27,7 +27,7 @@ impl Postings { let state: btree::State = if let Some(val) = tx.get(state_key.clone()).await? { btree::State::try_from_val(val)? } else { - btree::State::new(default_btree_order) + btree::State::new(order) }; Ok(Self { state_key, diff --git a/lib/src/idx/mod.rs b/lib/src/idx/mod.rs index a2ca39b5..5d0fd23c 100644 --- a/lib/src/idx/mod.rs +++ b/lib/src/idx/mod.rs @@ -14,6 +14,7 @@ use crate::key::bf::Bf; use crate::key::bi::Bi; use crate::key::bk::Bk; use crate::key::bl::Bl; +use crate::key::bo::Bo; use crate::key::bp::Bp; use crate::key::bs::Bs; use crate::key::bt::Bt; @@ -105,6 +106,18 @@ impl IndexKeyBase { .into() } + fn new_bo_key(&self, doc_id: DocId, term_id: TermId) -> Key { + Bo::new( + self.inner.ns.as_str(), + self.inner.db.as_str(), + self.inner.tb.as_str(), + self.inner.ix.as_str(), + doc_id, + term_id, + ) + .into() + } + fn new_bp_key(&self, node_id: Option) -> Key { Bp::new( self.inner.ns.as_str(), diff --git a/lib/src/idx/planner/executor.rs b/lib/src/idx/planner/executor.rs index 8404a522..ffb37601 100644 --- a/lib/src/idx/planner/executor.rs +++ b/lib/src/idx/planner/executor.rs @@ -1,11 +1,13 @@ use crate::dbs::{Options, Transaction}; use crate::err::Error; -use crate::idx::ft::FtIndex; +use crate::idx::ft::terms::TermId; +use crate::idx::ft::{FtIndex, MatchRef}; +use crate::idx::planner::plan::IndexOption; use crate::idx::planner::tree::IndexMap; use crate::idx::IndexKeyBase; use crate::sql::index::Index; -use crate::sql::{Expression, Table, Thing, Value}; -use std::collections::HashMap; +use crate::sql::{Expression, Idiom, Table, Thing, Value}; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; #[derive(Clone)] @@ -15,9 +17,16 @@ pub(crate) struct QueryExecutor { struct Inner { table: String, - index_map: IndexMap, + index: HashMap>, pre_match: Option, ft_map: HashMap, + terms: HashMap, +} + +struct IndexFieldTerms { + ix: String, + id: Idiom, + t: Vec, } impl QueryExecutor { @@ -30,29 +39,45 
 	) -> Result<Self, Error> {
 		let mut run = txn.lock().await;
 		let mut ft_map = HashMap::new();
-		for ios in index_map.values() {
+		for ios in index_map.index.values() {
 			for io in ios {
 				if let Index::Search {
 					az,
 					order,
-					..
+					sc,
+					hl,
 				} = &io.ix.index
 				{
 					if !ft_map.contains_key(&io.ix.name.0) {
 						let ikb = IndexKeyBase::new(opt, &io.ix);
 						let az = run.get_az(opt.ns(), opt.db(), az.as_str()).await?;
-						let ft = FtIndex::new(&mut run, az, ikb, *order).await?;
+						let ft = FtIndex::new(&mut run, az, ikb, *order, sc, *hl).await?;
 						ft_map.insert(io.ix.name.0.clone(), ft);
 					}
 				}
 			}
 		}
+		let mut terms = HashMap::with_capacity(index_map.terms.len());
+		for (mr, ifv) in index_map.terms {
+			if let Some(ft) = ft_map.get(&ifv.ix) {
+				let term_ids = ft.extract_terms(&mut run, ifv.val.clone()).await?;
+				terms.insert(
+					mr,
+					IndexFieldTerms {
+						ix: ifv.ix,
+						id: ifv.id,
+						t: term_ids,
+					},
+				);
+			}
+		}
 		Ok(Self {
 			inner: Arc::new(Inner {
 				table: table.0.clone(),
-				index_map,
+				index: index_map.index,
 				pre_match,
 				ft_map,
+				terms,
 			}),
 		})
 	}
@@ -74,8 +99,9 @@ impl QueryExecutor {
 		// Otherwise, we look for the first possible index options, and evaluate the expression
 		// Does the record id match this executor's table?
+		// Does the record id match this executor's table?
 		if thg.tb.eq(&self.inner.table) {
-			if let Some(ios) = self.inner.index_map.get(exp) {
+			if let Some(ios) = self.inner.index.get(exp) {
 				for io in ios {
 					if let Some(fti) = self.inner.ft_map.get(&io.ix.name.0) {
 						let mut run = txn.lock().await;
@@ -94,4 +120,51 @@ impl QueryExecutor {
 			value: exp.to_string(),
 		})
 	}
+
+	pub(crate) async fn highlight(
+		&self,
+		txn: Transaction,
+		thg: &Thing,
+		prefix: Value,
+		suffix: Value,
+		match_ref: Value,
+		doc: &Value,
+	) -> Result<Value, Error> {
+		let mut tx = txn.lock().await;
+		// We have to connect the match ref passed to the highlight function...
+		if let Value::Number(n) = match_ref {
+			let m = n.as_int() as u8;
+			// ...with the match ref declared on the matches operator (@{matchref}@)
+			if let Some(ift) = self.inner.terms.get(&m) {
+				// Check that a full-text index exists for this match ref
+				if let Some(ft) = self.inner.ft_map.get(&ift.ix) {
+					// All good, we can do the highlighting
+					return ft.highlight(&mut tx, thg, &ift.t, prefix, suffix, &ift.id, doc).await;
+				}
+			}
+		}
+		Ok(Value::None)
+	}
+
+	pub(crate) async fn offsets(
+		&self,
+		txn: Transaction,
+		thg: &Thing,
+		match_ref: Value,
+	) -> Result<Value, Error> {
+		let mut tx = txn.lock().await;
+		// We have to connect the match ref passed to the offsets function...
+		if let Value::Number(n) = match_ref {
+			let m = n.as_int() as u8;
+			// ...with the match ref declared on the matches operator (@{matchref}@)
+			if let Some(ift) = self.inner.terms.get(&m) {
+				// Check that a full-text index exists for this match ref
+				if let Some(ft) = self.inner.ft_map.get(&ift.ix) {
+					// All good, we can extract the offsets
+					return ft.extract_offsets(&mut tx, thg, &ift.t).await;
+				}
+			}
+		}
+		Ok(Value::None)
+	}
 }
diff --git a/lib/src/idx/planner/mod.rs b/lib/src/idx/planner/mod.rs
index b2b3cab8..7f7e0fec 100644
--- a/lib/src/idx/planner/mod.rs
+++ b/lib/src/idx/planner/mod.rs
@@ -14,6 +14,7 @@ use std::collections::HashMap;
 
 pub(crate) struct QueryPlanner<'a> {
 	opt: &'a Options,
 	cond: &'a Option<Cond>,
+	/// There is one executor per table
 	executors: HashMap<String, QueryExecutor>,
 }
 
@@ -32,7 +33,7 @@ impl<'a> QueryPlanner<'a> {
 		opt: &Options,
 		t: Table,
 	) -> Result<Iterable, Error> {
-		let txn = ctx.clone_transaction()?;
+		let txn = ctx.try_clone_transaction()?;
 		let res = Tree::build(self.opt, &txn, &t, self.cond).await?;
 		if let Some((node, im)) = res {
 			if let Some(plan) = AllAndStrategy::build(&node)?
{ diff --git a/lib/src/idx/planner/plan.rs b/lib/src/idx/planner/plan.rs index 3545fd63..df6d0e5b 100644 --- a/lib/src/idx/planner/plan.rs +++ b/lib/src/idx/planner/plan.rs @@ -1,8 +1,9 @@ use crate::dbs::{Options, Transaction}; use crate::err::Error; -use crate::idx::ft::{FtIndex, HitsIterator}; +use crate::idx::ft::terms::TermId; +use crate::idx::ft::{FtIndex, HitsIterator, MatchRef}; use crate::idx::planner::executor::QueryExecutor; -use crate::idx::planner::tree::{IndexMap, Node}; +use crate::idx::planner::tree::IndexMap; use crate::idx::IndexKeyBase; use crate::key; use crate::kvs::Key; @@ -79,7 +80,7 @@ pub(super) struct IndexOption { } impl IndexOption { - fn new(ix: DefineIndexStatement, op: Operator, v: Value, ep: Expression) -> Self { + pub(super) fn new(ix: DefineIndexStatement, op: Operator, v: Value, ep: Expression) -> Self { Self { ix, op, @@ -98,28 +99,6 @@ impl IndexOption { QueryExecutor::new(opt, txn, t, i, Some(self.ep.clone())).await } - pub(super) fn found( - ix: &DefineIndexStatement, - op: &Operator, - v: &Node, - ep: &Expression, - ) -> Option { - if let Some(v) = v.is_scalar() { - if match ix.index { - Index::Idx => Operator::Equal.eq(op), - Index::Uniq => Operator::Equal.eq(op), - Index::Search { - .. - } => { - matches!(op, Operator::Matches(_)) - } - } { - return Some(IndexOption::new(ix.clone(), op.to_owned(), v.clone(), ep.clone())); - } - } - None - } - async fn new_iterator( &self, opt: &Options, @@ -144,8 +123,8 @@ impl IndexOption { sc, order, } => match self.op { - Operator::Matches(_) => Ok(Box::new( - MatchesThingIterator::new(opt, txn, &self.ix, az, *hl, sc, *order, &self.v) + Operator::Matches(mr) => Ok(Box::new( + MatchesThingIterator::new(opt, txn, &self.ix, az, *hl, sc, *order, mr, &self.v) .await?, )), _ => Err(Error::BypassQueryPlanner), @@ -221,6 +200,7 @@ impl ThingIterator for UniqueEqualThingIterator { } struct MatchesThingIterator { + _terms: Option<(MatchRef, Vec)>, hits: Option, } @@ -231,9 +211,10 @@ impl MatchesThingIterator { txn: &Transaction, ix: &DefineIndexStatement, az: &Ident, - _hl: bool, + hl: bool, sc: &Scoring, order: u32, + mr: Option, v: &Value, ) -> Result { let ikb = IndexKeyBase::new(opt, ix); @@ -244,10 +225,11 @@ impl MatchesThingIterator { { let query_string = v.clone().convert_to_string()?; let az = run.get_az(opt.ns(), opt.db(), az.as_str()).await?; - let fti = FtIndex::new(&mut run, az, ikb, order).await?; - let hits = fti.search(&mut run, query_string).await?; + let fti = FtIndex::new(&mut run, az, ikb, order, sc, hl).await?; + let (terms, hits) = fti.search(&mut run, query_string).await?; Ok(Self { hits, + _terms: mr.map(|mr| (mr, terms)), }) } else { Err(Error::FeatureNotYetImplemented { diff --git a/lib/src/idx/planner/tree.rs b/lib/src/idx/planner/tree.rs index 9af89edb..50222939 100644 --- a/lib/src/idx/planner/tree.rs +++ b/lib/src/idx/planner/tree.rs @@ -1,6 +1,8 @@ use crate::dbs::{Options, Transaction}; use crate::err::Error; +use crate::idx::ft::MatchRef; use crate::idx::planner::plan::IndexOption; +use crate::sql::index::Index; use crate::sql::statements::DefineIndexStatement; use crate::sql::{Cond, Expression, Idiom, Operator, Subquery, Table, Value}; use async_recursion::async_recursion; @@ -8,9 +10,19 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -pub(super) struct Tree {} +#[derive(Default)] +pub(super) struct IndexMap { + pub(super) index: HashMap>, + pub(super) terms: HashMap, +} -pub(super) type IndexMap = HashMap>; +pub(super) struct 
IndexFieldValue { + pub(super) ix: String, + pub(super) id: Idiom, + pub(super) val: String, +} + +pub(super) struct Tree {} impl Tree { pub(super) async fn build<'a>( @@ -80,7 +92,7 @@ impl<'a> TreeBuilder<'a> { async fn eval_idiom(&mut self, i: &Idiom) -> Result { Ok(if let Some(ix) = self.find_index(i).await? { - Node::IndexedField(ix) + Node::IndexedField(i.to_owned(), ix) } else { Node::NonIndexedField }) @@ -101,18 +113,11 @@ impl<'a> TreeBuilder<'a> { let left = self.eval_value(l).await?; let right = self.eval_value(r).await?; let mut index_option = None; - if let Some(ix) = left.is_indexed_field() { - if let Some(io) = IndexOption::found(ix, o, &right, e) { - index_option = Some(io.clone()); - self.add_index(e, io); - } - } - if let Some(ix) = right.is_indexed_field() { - if let Some(io) = IndexOption::found(ix, o, &left, e) { - index_option = Some(io.clone()); - self.add_index(e, io); - } - } + if let Some((id, ix)) = left.is_indexed_field() { + index_option = self.lookup_index_option(ix, o, id, &right, e); + } else if let Some((id, ix)) = right.is_indexed_field() { + index_option = self.lookup_index_option(ix, o, id, &left, e); + }; Ok(Node::Expression { index_option, left: Box::new(left), @@ -123,8 +128,48 @@ impl<'a> TreeBuilder<'a> { } } + fn lookup_index_option( + &mut self, + ix: &DefineIndexStatement, + op: &Operator, + id: &Idiom, + v: &Node, + ep: &Expression, + ) -> Option { + if let Some(v) = v.is_scalar() { + if match &ix.index { + Index::Idx => Operator::Equal.eq(op), + Index::Uniq => Operator::Equal.eq(op), + Index::Search { + .. + } => { + if let Operator::Matches(mr) = op { + if let Some(mr) = mr { + self.index_map.terms.insert( + *mr, + IndexFieldValue { + ix: ix.name.0.to_owned(), + id: id.to_owned(), + val: v.to_raw_string(), + }, + ); + } + true + } else { + false + } + } + } { + let io = IndexOption::new(ix.clone(), op.to_owned(), v.clone(), ep.clone()); + self.add_index(ep, io.clone()); + return Some(io); + } + } + None + } + fn add_index(&mut self, e: &Expression, io: IndexOption) { - match self.index_map.entry(e.clone()) { + match self.index_map.index.entry(e.clone()) { Entry::Occupied(mut e) => { e.get_mut().insert(io); } @@ -150,7 +195,7 @@ pub(super) enum Node { right: Box, operator: Operator, }, - IndexedField(DefineIndexStatement), + IndexedField(Idiom, DefineIndexStatement), NonIndexedField, Scalar(Value), Unsupported, @@ -165,9 +210,9 @@ impl Node { } } - pub(super) fn is_indexed_field(&self) -> Option<&DefineIndexStatement> { - if let Node::IndexedField(ix) = self { - Some(ix) + pub(super) fn is_indexed_field(&self) -> Option<(&Idiom, &DefineIndexStatement)> { + if let Node::IndexedField(id, ix) = self { + Some((id, ix)) } else { None } diff --git a/lib/src/key/bo.rs b/lib/src/key/bo.rs new file mode 100644 index 00000000..9953e06c --- /dev/null +++ b/lib/src/key/bo.rs @@ -0,0 +1,69 @@ +use crate::idx::ft::docids::DocId; +use crate::idx::ft::terms::TermId; +use derive::Key; +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Key)] +pub struct Bo<'a> { + __: u8, + _a: u8, + pub ns: &'a str, + _b: u8, + pub db: &'a str, + _c: u8, + pub tb: &'a str, + _d: u8, + _e: u8, + _f: u8, + pub ix: &'a str, + _g: u8, + pub doc_id: DocId, + pub term_id: TermId, +} + +impl<'a> Bo<'a> { + pub fn new( + ns: &'a str, + db: &'a str, + tb: &'a str, + ix: &'a str, + doc_id: DocId, + term_id: TermId, + ) -> Self { + Self { + __: b'/', + _a: b'*', + ns, + _b: b'*', + db, + _c: b'*', + tb, + _d: b'!', + _e: 
b'b', + _f: b'o', + ix, + _g: b'*', + doc_id, + term_id, + } + } +} + +#[cfg(test)] +mod tests { + #[test] + fn key() { + use super::*; + #[rustfmt::skip] + let val = Bo::new( + "test", + "test", + "test", + "test", + 1,2 + ); + let enc = Bo::encode(&val).unwrap(); + let dec = Bo::decode(&enc).unwrap(); + assert_eq!(val, dec); + } +} diff --git a/lib/src/key/mod.rs b/lib/src/key/mod.rs index 4410fc0c..9650f336 100644 --- a/lib/src/key/mod.rs +++ b/lib/src/key/mod.rs @@ -57,6 +57,7 @@ pub mod bf; // Stores Term/Doc frequency pub mod bi; // Stores doc keys for doc_ids pub mod bk; // Stores the term list for doc_ids pub mod bl; // Stores BTree nodes for doc lengths +pub mod bo; // Stores the offsets pub mod bp; // Stores BTree nodes for postings pub mod bs; // Stores FullText index states pub mod bt; // Stores BTree nodes for terms diff --git a/lib/src/sql/function.rs b/lib/src/sql/function.rs index 41190444..ab9c1624 100644 --- a/lib/src/sql/function.rs +++ b/lib/src/sql/function.rs @@ -148,7 +148,7 @@ impl Function { // Get the function definition let val = { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Get the function definition @@ -255,6 +255,7 @@ pub(crate) fn function_names(i: &str) -> IResult<&str, &str> { preceded(tag("meta::"), function_meta), preceded(tag("parse::"), function_parse), preceded(tag("rand::"), function_rand), + preceded(tag("search::"), function_search), preceded(tag("session::"), function_session), preceded(tag("string::"), function_string), preceded(tag("time::"), function_time), @@ -453,6 +454,10 @@ fn function_rand(i: &str) -> IResult<&str, &str> { ))(i) } +fn function_search(i: &str) -> IResult<&str, &str> { + alt((tag("highlight"), tag("offsets")))(i) +} + fn function_session(i: &str) -> IResult<&str, &str> { alt(( tag("db"), diff --git a/lib/src/sql/operator.rs b/lib/src/sql/operator.rs index 0b6815ca..de829c66 100644 --- a/lib/src/sql/operator.rs +++ b/lib/src/sql/operator.rs @@ -1,3 +1,4 @@ +use crate::idx::ft::MatchRef; use crate::sql::comment::mightbespace; use crate::sql::comment::shouldbespace; use crate::sql::error::IResult; @@ -38,11 +39,11 @@ pub enum Operator { AllEqual, // *= AnyEqual, // ?= // - Like, // ~ - NotLike, // !~ - AllLike, // *~ - AnyLike, // ?~ - Matches(Option), // @{ref}@ + Like, // ~ + NotLike, // !~ + AllLike, // *~ + AnyLike, // ?~ + Matches(Option), // @{ref}@ // LessThan, // < LessThanOrEqual, // <= diff --git a/lib/src/sql/param.rs b/lib/src/sql/param.rs index 868f5854..cba95317 100644 --- a/lib/src/sql/param.rs +++ b/lib/src/sql/param.rs @@ -60,7 +60,7 @@ impl Param { // The param has not been set locally None => { // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Get the param definition diff --git a/lib/src/sql/statements/analyze.rs b/lib/src/sql/statements/analyze.rs index 657234fb..3abf36ef 100644 --- a/lib/src/sql/statements/analyze.rs +++ b/lib/src/sql/statements/analyze.rs @@ -30,7 +30,7 @@ impl AnalyzeStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Read the index @@ -42,10 +42,11 @@ impl AnalyzeStatement { Index::Search { az, order, - .. 
+ sc, + hl, } => { let az = run.get_az(opt.ns(), opt.db(), az.as_str()).await?; - let ft = FtIndex::new(&mut run, az, ikb, *order).await?; + let ft = FtIndex::new(&mut run, az, ikb, *order, sc, *hl).await?; ft.statistics(&mut run).await? } _ => { diff --git a/lib/src/sql/statements/define.rs b/lib/src/sql/statements/define.rs index 919ee4ea..f4627ed5 100644 --- a/lib/src/sql/statements/define.rs +++ b/lib/src/sql/statements/define.rs @@ -135,7 +135,7 @@ impl DefineNamespaceStatement { // Process the statement let key = crate::key::ns::new(&self.name); // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; run.set(key, self).await?; @@ -182,7 +182,7 @@ impl DefineDatabaseStatement { // Allowed to run? opt.check(Level::Ns)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -234,7 +234,7 @@ impl DefineFunctionStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -312,7 +312,7 @@ impl DefineAnalyzerStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -385,7 +385,7 @@ impl DefineLoginStatement { // Allowed to run? opt.check(Level::Kv)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -401,7 +401,7 @@ impl DefineLoginStatement { // Allowed to run? opt.check(Level::Ns)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -504,7 +504,7 @@ impl DefineTokenStatement { // Allowed to run? opt.check(Level::Kv)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -520,7 +520,7 @@ impl DefineTokenStatement { // Allowed to run? opt.check(Level::Ns)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -537,7 +537,7 @@ impl DefineTokenStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -618,7 +618,7 @@ impl DefineScopeStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -733,7 +733,7 @@ impl DefineParamStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -793,7 +793,7 @@ impl DefineTableStatement { // Allowed to run? 
opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -972,7 +972,7 @@ impl DefineEventStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -1053,7 +1053,7 @@ impl DefineFieldStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -1210,7 +1210,7 @@ impl DefineIndexStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement diff --git a/lib/src/sql/statements/info.rs b/lib/src/sql/statements/info.rs index 437027f0..77f8c0a7 100644 --- a/lib/src/sql/statements/info.rs +++ b/lib/src/sql/statements/info.rs @@ -35,7 +35,7 @@ impl InfoStatement { // Create the result set let mut res = Object::default(); // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the statement @@ -53,7 +53,7 @@ impl InfoStatement { // Allowed to run? opt.check(Level::Ns)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Create the result set @@ -85,7 +85,7 @@ impl InfoStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Create the result set @@ -141,7 +141,7 @@ impl InfoStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Create the result set @@ -161,7 +161,7 @@ impl InfoStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Create the result set diff --git a/lib/src/sql/statements/kill.rs b/lib/src/sql/statements/kill.rs index 8c672ba7..292ed6ce 100644 --- a/lib/src/sql/statements/kill.rs +++ b/lib/src/sql/statements/kill.rs @@ -26,7 +26,7 @@ impl KillStatement { // Allowed to run? opt.check(Level::No)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Fetch the live query key diff --git a/lib/src/sql/statements/live.rs b/lib/src/sql/statements/live.rs index 215fb2f2..d690dcac 100644 --- a/lib/src/sql/statements/live.rs +++ b/lib/src/sql/statements/live.rs @@ -40,7 +40,7 @@ impl LiveStatement { // Allowed to run? 
opt.check(Level::No)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Process the live query table diff --git a/lib/src/sql/statements/remove.rs b/lib/src/sql/statements/remove.rs index 748db96f..d5b250d2 100644 --- a/lib/src/sql/statements/remove.rs +++ b/lib/src/sql/statements/remove.rs @@ -111,7 +111,7 @@ impl RemoveNamespaceStatement { // Allowed to run? opt.check(Level::Kv)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -163,7 +163,7 @@ impl RemoveDatabaseStatement { // Allowed to run? opt.check(Level::Ns)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -215,7 +215,7 @@ impl RemoveFunctionStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -270,7 +270,7 @@ impl RemoveAnalyzerStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -323,7 +323,7 @@ impl RemoveLoginStatement { // Allowed to run? opt.check(Level::Kv)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -338,7 +338,7 @@ impl RemoveLoginStatement { // Allowed to run? opt.check(Level::Ns)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -398,7 +398,7 @@ impl RemoveTokenStatement { // Allowed to run? opt.check(Level::Kv)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -413,7 +413,7 @@ impl RemoveTokenStatement { // Allowed to run? opt.check(Level::Ns)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -428,7 +428,7 @@ impl RemoveTokenStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -485,7 +485,7 @@ impl RemoveScopeStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -537,7 +537,7 @@ impl RemoveParamStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -587,7 +587,7 @@ impl RemoveTableStatement { // Allowed to run? 
opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -640,7 +640,7 @@ impl RemoveEventStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -699,7 +699,7 @@ impl RemoveFieldStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition @@ -759,7 +759,7 @@ impl RemoveIndexStatement { // Allowed to run? opt.check(Level::Db)?; // Clone transaction - let txn = ctx.clone_transaction()?; + let txn = ctx.try_clone_transaction()?; // Claim transaction let mut run = txn.lock().await; // Delete the definition diff --git a/lib/tests/matches.rs b/lib/tests/matches.rs index a76deb08..820aeeb9 100644 --- a/lib/tests/matches.rs +++ b/lib/tests/matches.rs @@ -10,8 +10,8 @@ async fn select_where_matches_using_index() -> Result<(), Error> { let sql = r" CREATE blog:1 SET title = 'Hello World!'; DEFINE ANALYZER simple TOKENIZERS blank,class; - DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75); - SELECT id,title FROM blog WHERE title @@ 'Hello' EXPLAIN; + DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; + SELECT id, search::highlight('', '', 1) AS title FROM blog WHERE title @1@ 'Hello' EXPLAIN; "; let dbs = Datastore::new("memory").await?; let ses = Session::for_kv().with_ns("test").with_db("test"); @@ -26,7 +26,7 @@ async fn select_where_matches_using_index() -> Result<(), Error> { "[ { id: blog:1, - title: 'Hello World!' + title: 'Hello World!' }, { explain: @@ -35,7 +35,7 @@ async fn select_where_matches_using_index() -> Result<(), Error> { detail: { plan: { index: 'blog_title', - operator: '@@', + operator: '@1@', value: 'Hello' }, table: 'blog', @@ -56,8 +56,8 @@ async fn select_where_matches_without_using_index_iterator() -> Result<(), Error CREATE blog:1 SET title = 'Hello World!'; CREATE blog:2 SET title = 'Foo Bar!'; DEFINE ANALYZER simple TOKENIZERS blank,class FILTERS lowercase; - DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75); - SELECT id,title FROM blog WHERE (title @@ 'hello' AND id>0) OR (title @@ 'world' AND id<99) EXPLAIN; + DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; + SELECT id,search::highlight('', '', 1) AS title FROM blog WHERE (title @0@ 'hello' AND id>0) OR (title @1@ 'world' AND id<99) EXPLAIN; "; let dbs = Datastore::new("memory").await?; let ses = Session::for_kv().with_ns("test").with_db("test"); @@ -73,7 +73,7 @@ async fn select_where_matches_without_using_index_iterator() -> Result<(), Error "[ { id: blog:1, - title: 'Hello World!' + title: 'Hello World!' 
}, { explain: @@ -91,3 +91,104 @@ async fn select_where_matches_without_using_index_iterator() -> Result<(), Error assert_eq!(tmp, val); Ok(()) } + +#[tokio::test] +async fn select_where_matches_using_index_and_arrays() -> Result<(), Error> { + let sql = r" + CREATE blog:1 SET content = ['Hello World!', 'Be Bop', 'Foo Bar']; + DEFINE ANALYZER simple TOKENIZERS blank,class; + DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; + SELECT id, search::highlight('', '', 1) AS content FROM blog WHERE content @1@ 'Hello Bar' EXPLAIN; + "; + let dbs = Datastore::new("memory").await?; + let ses = Session::for_kv().with_ns("test").with_db("test"); + let res = &mut dbs.execute(&sql, &ses, None, false).await?; + assert_eq!(res.len(), 4); + // + let _ = res.remove(0).result?; + let _ = res.remove(0).result?; + let _ = res.remove(0).result?; + let tmp = res.remove(0).result?; + let val = Value::parse( + "[ + { + id: blog:1, + content: [ + 'Hello World!', + 'Be Bop', + 'Foo Bar' + ] + }, + { + explain: + [ + { + detail: { + plan: { + index: 'blog_content', + operator: '@1@', + value: 'Hello Bar' + }, + table: 'blog', + }, + operation: 'Iterate Index' + } + ] + } + ]", + ); + assert_eq!(tmp, val); + Ok(()) +} + +#[tokio::test] +async fn select_where_matches_using_index_offsets() -> Result<(), Error> { + let sql = r" + CREATE blog:1 SET title = 'Blog title!', content = ['Hello World!', 'Be Bop', 'Foo Bar']; + DEFINE ANALYZER simple TOKENIZERS blank,class; + DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; + DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25(1.2,0.75) HIGHLIGHTS; + SELECT id, search::offsets(0) AS title, search::offsets(1) AS content FROM blog WHERE title @0@ 'title' AND content @1@ 'Hello Bar' EXPLAIN; + "; + let dbs = Datastore::new("memory").await?; + let ses = Session::for_kv().with_ns("test").with_db("test"); + let res = &mut dbs.execute(&sql, &ses, None, false).await?; + assert_eq!(res.len(), 5); + // + for _ in 0..4 { + let _ = res.remove(0).result?; + } + let tmp = res.remove(0).result?; + let val = Value::parse( + "[ + { + id: blog:1, + title: { + 0: [{s:5, e:10}], + }, + content: { + 0: [{s:0, e:5}], + 2: [{s:4, e:7}] + } + }, + { + explain: + [ + { + detail: { + plan: { + index: 'blog_content', + operator: '@1@', + value: 'Hello Bar' + }, + table: 'blog', + }, + operation: 'Iterate Index' + } + ] + } + ]", + ); + assert_eq!(tmp, val); + Ok(()) +}
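
For readers skimming the patch, here is a minimal, self-contained sketch of the offset encoding that the new offsets.rs module writes into the Bo keyspace: every offset is flattened into a single Vec<u32> laid out as [count, index..., (start, end)...] so the values stay roughly ascending, and that vector is then serialised with bincode. The struct and the encode/decode names below are illustrative only and are not part of the SurrealDB API; the sketch assumes a bincode 1.x dependency with serde, as already used by the crate.

	#[derive(Clone, Debug, PartialEq)]
	struct Offset {
		index: u32, // position of the value inside an array field (0 for scalar fields)
		start: u32, // start position of the matched term
		end: u32,   // end position of the matched term
	}

	// Flatten to [count, index..., (start, end)...] and serialise with bincode.
	fn encode(offsets: &[Offset]) -> Result<Vec<u8>, bincode::Error> {
		let mut flat = Vec::with_capacity(1 + offsets.len() * 3);
		flat.push(offsets.len() as u32);
		flat.extend(offsets.iter().map(|o| o.index));
		for o in offsets {
			flat.push(o.start);
			flat.push(o.end);
		}
		bincode::serialize(&flat)
	}

	// Rebuild the offsets; the input is trusted here, so slicing is unchecked for brevity.
	fn decode(val: &[u8]) -> Result<Vec<Offset>, bincode::Error> {
		if val.is_empty() {
			return Ok(vec![]);
		}
		let flat: Vec<u32> = bincode::deserialize(val)?;
		let n = flat[0] as usize;
		let indexes = &flat[1..1 + n];
		let positions = &flat[1 + n..];
		Ok((0..n)
			.map(|i| Offset {
				index: indexes[i],
				start: positions[2 * i],
				end: positions[2 * i + 1],
			})
			.collect())
	}

	fn main() -> Result<(), bincode::Error> {
		let offsets = vec![
			Offset { index: 0, start: 1, end: 2 },
			Offset { index: 0, start: 11, end: 22 },
			Offset { index: 1, start: 3, end: 4 },
		];
		let bytes = encode(&offsets)?;
		assert_eq!(decode(&bytes)?, offsets);
		Ok(())
	}

On the SurrealQL side, the tests above show how these offsets surface: an index defined with SEARCH ... HIGHLIGHTS and queried with a match reference (for example title @1@ 'Hello') can then be read back with search::highlight(prefix, suffix, ref) or search::offsets(ref).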