Feature: Vector Search: mtree index + knn operator (#2546)

Co-authored-by: Tobie Morgan Hitchcock <tobie@surrealdb.com>
This commit is contained in:
Emmanuel Keller 2023-09-12 21:26:03 +01:00 committed by GitHub
parent 1a85f4967a
commit 0772a8c592
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
41 changed files with 2541 additions and 235 deletions

View file

@ -8,7 +8,7 @@ use crate::dbs::Statement;
use crate::dbs::{Options, Transaction}; use crate::dbs::{Options, Transaction};
use crate::doc::Document; use crate::doc::Document;
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::planner::executor::IteratorRef; use crate::idx::planner::executor::IteratorRef;
use crate::sql::array::Array; use crate::sql::array::Array;
use crate::sql::edges::Edges; use crate::sql::edges::Edges;

View file

@ -594,7 +594,7 @@ impl<'a> Processor<'a> {
} }
} }
Err(Error::QueryNotExecutedDetail { Err(Error::QueryNotExecutedDetail {
message: "No QueryExecutor has not been found.".to_string(), message: "No QueryExecutor has been found.".to_string(),
}) })
} }
} }

View file

@ -4,7 +4,7 @@ use crate::dbs::Workable;
use crate::err::Error; use crate::err::Error;
use crate::iam::Action; use crate::iam::Action;
use crate::iam::ResourceKind; use crate::iam::ResourceKind;
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::planner::executor::IteratorRef; use crate::idx::planner::executor::IteratorRef;
use crate::sql::statements::define::DefineEventStatement; use crate::sql::statements::define::DefineEventStatement;
use crate::sql::statements::define::DefineFieldStatement; use crate::sql::statements::define::DefineFieldStatement;

View file

@ -4,10 +4,11 @@ use crate::dbs::{Options, Transaction};
use crate::doc::{CursorDoc, Document}; use crate::doc::{CursorDoc, Document};
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::FtIndex; use crate::idx::ft::FtIndex;
use crate::idx::trees::mtree::MTreeIndex;
use crate::idx::trees::store::TreeStoreType; use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase; use crate::idx::IndexKeyBase;
use crate::sql::array::Array; use crate::sql::array::Array;
use crate::sql::index::{Index, SearchParams}; use crate::sql::index::{Index, MTreeParams, SearchParams};
use crate::sql::statements::DefineIndexStatement; use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Part, Thing, Value}; use crate::sql::{Part, Thing, Value};
use crate::{key, kvs}; use crate::{key, kvs};
@ -55,11 +56,7 @@ impl<'a> Document<'a> {
Index::Uniq => ic.index_unique(&mut run).await?, Index::Uniq => ic.index_unique(&mut run).await?,
Index::Idx => ic.index_non_unique(&mut run).await?, Index::Idx => ic.index_non_unique(&mut run).await?,
Index::Search(p) => ic.index_full_text(&mut run, p).await?, Index::Search(p) => ic.index_full_text(&mut run, p).await?,
Index::MTree(_) => { Index::MTree(p) => ic.index_mtree(&mut run, p).await?,
return Err(Error::FeatureNotYetImplemented {
feature: "MTree indexing".to_string(),
})
}
}; };
} }
} }
@ -332,18 +329,36 @@ impl<'a> IndexOperation<'a> {
} }
async fn index_full_text( async fn index_full_text(
&self, &mut self,
run: &mut kvs::Transaction, run: &mut kvs::Transaction,
p: &SearchParams, p: &SearchParams,
) -> Result<(), Error> { ) -> Result<(), Error> {
let ikb = IndexKeyBase::new(self.opt, self.ix); let ikb = IndexKeyBase::new(self.opt, self.ix);
let az = run.get_db_analyzer(self.opt.ns(), self.opt.db(), p.az.as_str()).await?; let az = run.get_db_analyzer(self.opt.ns(), self.opt.db(), p.az.as_str()).await?;
let mut ft = FtIndex::new(run, az, ikb, p, TreeStoreType::Write).await?; let mut ft = FtIndex::new(run, az, ikb, p, TreeStoreType::Write).await?;
if let Some(n) = &self.n { if let Some(n) = self.n.take() {
ft.index_document(run, self.rid, n).await?; ft.index_document(run, self.rid, n).await?;
} else { } else {
ft.remove_document(run, self.rid).await?; ft.remove_document(run, self.rid).await?;
} }
ft.finish(run).await ft.finish(run).await
} }
/// Keeps the MTree index in sync with the record currently being processed.
///
/// The index described by `p` is opened in write mode. The index data built
/// from the old values (`self.o`) is removed first, then index data for the
/// new values (`self.n`) is created, and finally the index is flushed with
/// `finish`. Both values are moved out of `self`, which leaves `None` behind.
///
/// # Errors
///
/// Any error raised while opening, updating or finishing the index is
/// returned unchanged.
async fn index_mtree(
    &mut self,
    run: &mut kvs::Transaction,
    p: &MTreeParams,
) -> Result<(), Error> {
    let key_base = IndexKeyBase::new(self.opt, self.ix);
    let mut index = MTreeIndex::new(run, key_base, p, TreeStoreType::Write).await?;
    // Drop what was indexed for the previous state of the record, if anything
    if let Some(previous) = self.o.take() {
        index.remove_document(run, self.rid, previous).await?;
    }
    // Index the new state of the record, if there is one
    if let Some(current) = self.n.take() {
        index.index_document(run, self.rid, current).await?;
    }
    index.finish(run).await
}
} }

View file

@ -208,6 +208,26 @@ pub enum Error {
#[error("The URL `{0}` is invalid")] #[error("The URL `{0}` is invalid")]
InvalidUrl(String), InvalidUrl(String),
/// The size of the vector is incorrect
#[error("Incorrect vector dimension ({current}). Expected a vector of {expected} dimension.")]
InvalidVectorDimension {
current: usize,
expected: usize,
},
/// An element of the vector is not a number
// NOTE(review): `expected` is carried by this variant but is not interpolated in the message below - confirm whether it should be.
#[error("The vector element ({current}) is not a number.")]
InvalidVectorType {
current: String,
expected: &'static str,
},
/// The value is not a vector
#[error("The value '{current}' is not a vector.")]
InvalidVectorValue {
current: String,
},
/// The query timedout /// The query timedout
#[error("The query was not executed because it exceeded the timeout")] #[error("The query was not executed because it exceeded the timeout")]
QueryTimedout, QueryTimedout,

View file

@ -2,6 +2,7 @@ use crate::ctx::Context;
use crate::dbs::Transaction; use crate::dbs::Transaction;
use crate::doc::CursorDoc; use crate::doc::CursorDoc;
use crate::err::Error; use crate::err::Error;
use crate::idx::planner::executor::QueryExecutor;
use crate::sql::value::TryAdd; use crate::sql::value::TryAdd;
use crate::sql::value::TryDiv; use crate::sql::value::TryDiv;
use crate::sql::value::TryMul; use crate::sql::value::TryMul;
@ -9,7 +10,7 @@ use crate::sql::value::TryNeg;
use crate::sql::value::TryPow; use crate::sql::value::TryPow;
use crate::sql::value::TrySub; use crate::sql::value::TrySub;
use crate::sql::value::Value; use crate::sql::value::Value;
use crate::sql::Expression; use crate::sql::{Expression, Thing};
pub fn neg(a: Value) -> Result<Value, Error> { pub fn neg(a: Value) -> Result<Value, Error> {
a.try_neg() a.try_neg()
@ -167,31 +168,58 @@ pub fn intersects(a: &Value, b: &Value) -> Result<Value, Error> {
Ok(a.intersects(b).into()) Ok(a.intersects(b).into())
} }
/// Result of `get_index_option`: tells an index-backed operator how the
/// expression has to be evaluated for the current document.
enum IndexOption<'a> {
/// The executor reports the expression as the iterator expression of the
/// document's iterator; `matches` and `knn` answer `true` without evaluating.
PreMatch,
/// No document, record id, query planner or query executor is available;
/// `matches` and `knn` answer `false`.
None,
/// The expression must be evaluated by this executor against this record.
Execute(&'a QueryExecutor, &'a Thing),
}
/// Works out how an index-backed expression should be evaluated for `doc`.
///
/// The lookup needs, in this order: a current document, its record id, the
/// query planner of the context and the query executor registered for the
/// record's table. If any of them is missing the answer is
/// `IndexOption::None`.
///
/// When the document carries an iterator reference and the executor confirms
/// that `exp` is the expression of that iterator, the documents being iterated
/// already match the expression, so `IndexOption::PreMatch` is returned.
/// Otherwise the caller gets `IndexOption::Execute` with the executor and the
/// record id to evaluate the expression itself.
fn get_index_option<'a>(
    ctx: &'a Context<'_>,
    doc: Option<&'a CursorDoc<'_>>,
    exp: &'a Expression,
) -> IndexOption<'a> {
    // Gather everything required to reach an executor, bailing out on the first gap
    let resolved = doc.and_then(|cursor| {
        let thg = cursor.rid?;
        let exe = ctx.get_query_planner()?.get_query_executor(&thg.tb)?;
        Some((cursor, thg, exe))
    });
    match resolved {
        None => IndexOption::None,
        Some((cursor, thg, exe)) => match cursor.ir {
            // The iterator feeding this document was built from this very expression
            Some(ir) if exe.is_iterator_expression(ir, exp) => IndexOption::PreMatch,
            _ => IndexOption::Execute(exe, thg),
        },
    }
}
pub(crate) async fn matches( pub(crate) async fn matches(
ctx: &Context<'_>, ctx: &Context<'_>,
txn: &Transaction, txn: &Transaction,
doc: Option<&CursorDoc<'_>>, doc: Option<&CursorDoc<'_>>,
exp: &Expression, exp: &Expression,
) -> Result<Value, Error> { ) -> Result<Value, Error> {
if let Some(doc) = doc { match get_index_option(ctx, doc, exp) {
if let Some(thg) = doc.rid { IndexOption::PreMatch => Ok(Value::Bool(true)),
if let Some(pla) = ctx.get_query_planner() { IndexOption::None => Ok(Value::Bool(false)),
if let Some(exe) = pla.get_query_executor(&thg.tb) { IndexOption::Execute(exe, thg) => exe.matches(txn, thg, exp).await,
// If we find the expression in `pre_match`, }
// it means that we are using an Iterator::Index }
// and we are iterating over documents that already matches the expression.
if let Some(ir) = doc.ir { pub(crate) async fn knn(
if exe.is_iterator_expression(ir, exp) { ctx: &Context<'_>,
return Ok(Value::Bool(true)); txn: &Transaction,
} doc: Option<&CursorDoc<'_>>,
} exp: &Expression,
// Evaluate the matches ) -> Result<Value, Error> {
return exe.matches(txn, thg, exp).await; match get_index_option(ctx, doc, exp) {
} IndexOption::PreMatch => Ok(Value::Bool(true)),
} IndexOption::None => Ok(Value::Bool(false)),
} IndexOption::Execute(exe, thg) => exe.knn(txn, thg, exp).await,
} }
Ok(Value::Bool(false))
} }
#[cfg(test)] #[cfg(test)]

View file

@ -30,6 +30,7 @@ impl_module_def!(
"insert" => run, "insert" => run,
"intersect" => run, "intersect" => run,
"join" => run, "join" => run,
"knn" => run,
"last" => run, "last" => run,
"len" => run, "len" => run,
"logical_and" => run, "logical_and" => run,

View file

@ -132,11 +132,11 @@ impl ManhattanDistance for Vec<Number> {
} }
pub trait MinkowskiDistance { pub trait MinkowskiDistance {
fn minkowski_distance(&self, other: &Self, order: Number) -> Result<Number, Error>; fn minkowski_distance(&self, other: &Self, order: &Number) -> Result<Number, Error>;
} }
impl MinkowskiDistance for Vec<Number> { impl MinkowskiDistance for Vec<Number> {
fn minkowski_distance(&self, other: &Self, order: Number) -> Result<Number, Error> { fn minkowski_distance(&self, other: &Self, order: &Number) -> Result<Number, Error> {
check_same_dimension("vector::distance::minkowski", self, other)?; check_same_dimension("vector::distance::minkowski", self, other)?;
let p = order.to_float(); let p = order.to_float();
let dist: f64 = self let dist: f64 = self

View file

@ -75,7 +75,7 @@ pub mod distance {
} }
pub fn minkowski((a, b, o): (Vec<Number>, Vec<Number>, Number)) -> Result<Value, Error> { pub fn minkowski((a, b, o): (Vec<Number>, Vec<Number>, Number)) -> Result<Value, Error> {
Ok(a.minkowski_distance(&b, o)?.into()) Ok(a.minkowski_distance(&b, &o)?.into())
} }
} }

View file

@ -25,7 +25,7 @@ pub(crate) struct DocIds {
} }
impl DocIds { impl DocIds {
pub(super) async fn new( pub(in crate::idx) async fn new(
tx: &mut Transaction, tx: &mut Transaction,
index_key_base: IndexKeyBase, index_key_base: IndexKeyBase,
default_btree_order: u32, default_btree_order: u32,
@ -78,7 +78,7 @@ impl DocIds {
/// Returns the doc_id for the given doc_key. /// Returns the doc_id for the given doc_key.
/// If the doc_id does not exists, a new one is created, and associated to the given key. /// If the doc_id does not exists, a new one is created, and associated to the given key.
pub(super) async fn resolve_doc_id( pub(in crate::idx) async fn resolve_doc_id(
&mut self, &mut self,
tx: &mut Transaction, tx: &mut Transaction,
doc_key: Key, doc_key: Key,
@ -97,7 +97,7 @@ impl DocIds {
Ok(Resolved::New(doc_id)) Ok(Resolved::New(doc_id))
} }
pub(super) async fn remove_doc( pub(in crate::idx) async fn remove_doc(
&mut self, &mut self,
tx: &mut Transaction, tx: &mut Transaction,
doc_key: Key, doc_key: Key,
@ -119,7 +119,7 @@ impl DocIds {
} }
} }
pub(super) async fn get_doc_key( pub(in crate::idx) async fn get_doc_key(
&self, &self,
tx: &mut Transaction, tx: &mut Transaction,
doc_id: DocId, doc_id: DocId,
@ -132,12 +132,15 @@ impl DocIds {
} }
} }
pub(super) async fn statistics(&self, tx: &mut Transaction) -> Result<BStatistics, Error> { pub(in crate::idx) async fn statistics(
&self,
tx: &mut Transaction,
) -> Result<BStatistics, Error> {
let mut store = self.store.lock().await; let mut store = self.store.lock().await;
self.btree.statistics(tx, &mut store).await self.btree.statistics(tx, &mut store).await
} }
pub(super) async fn finish(&mut self, tx: &mut Transaction) -> Result<(), Error> { pub(in crate::idx) async fn finish(&mut self, tx: &mut Transaction) -> Result<(), Error> {
let updated = self.store.lock().await.finish(tx).await?; let updated = self.store.lock().await.finish(tx).await?;
if self.updated || updated { if self.updated || updated {
let state = State { let state = State {
@ -172,20 +175,20 @@ impl State {
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub(super) enum Resolved { pub(in crate::idx) enum Resolved {
New(DocId), New(DocId),
Existing(DocId), Existing(DocId),
} }
impl Resolved { impl Resolved {
pub(super) fn doc_id(&self) -> &DocId { pub(in crate::idx) fn doc_id(&self) -> &DocId {
match self { match self {
Resolved::New(doc_id) => doc_id, Resolved::New(doc_id) => doc_id,
Resolved::Existing(doc_id) => doc_id, Resolved::Existing(doc_id) => doc_id,
} }
} }
pub(super) fn was_existing(&self) -> bool { pub(in crate::idx) fn was_existing(&self) -> bool {
match self { match self {
Resolved::New(_) => false, Resolved::New(_) => false,
Resolved::Existing(_) => true, Resolved::Existing(_) => true,
@ -195,7 +198,7 @@ impl Resolved {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::idx::ft::docids::{DocIds, Resolved}; use crate::idx::docids::{DocIds, Resolved};
use crate::idx::trees::store::TreeStoreType; use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase; use crate::idx::IndexKeyBase;
use crate::kvs::{Datastore, Transaction}; use crate::kvs::{Datastore, Transaction};

View file

@ -64,7 +64,7 @@ impl Analyzer {
&self, &self,
terms: &mut Terms, terms: &mut Terms,
tx: &mut Transaction, tx: &mut Transaction,
field_content: &[Value], field_content: Vec<Value>,
) -> Result<(DocLength, Vec<(TermId, TermFrequency)>), Error> { ) -> Result<(DocLength, Vec<(TermId, TermFrequency)>), Error> {
let mut dl = 0; let mut dl = 0;
// Let's first collect all the inputs, and collect the tokens. // Let's first collect all the inputs, and collect the tokens.
@ -101,7 +101,7 @@ impl Analyzer {
&self, &self,
terms: &mut Terms, terms: &mut Terms,
tx: &mut Transaction, tx: &mut Transaction,
content: &[Value], content: Vec<Value>,
) -> Result<(DocLength, Vec<(TermId, TermFrequency)>, Vec<(TermId, OffsetRecords)>), Error> { ) -> Result<(DocLength, Vec<(TermId, TermFrequency)>, Vec<(TermId, OffsetRecords)>), Error> {
let mut dl = 0; let mut dl = 0;
// Let's first collect all the inputs, and collect the tokens. // Let's first collect all the inputs, and collect the tokens.
@ -135,25 +135,25 @@ impl Analyzer {
Ok((dl, tfid, osid)) Ok((dl, tfid, osid))
} }
fn analyze_content(&self, content: &[Value], tks: &mut Vec<Tokens>) -> Result<(), Error> { fn analyze_content(&self, content: Vec<Value>, tks: &mut Vec<Tokens>) -> Result<(), Error> {
for v in content { for v in content {
self.analyze_value(v, tks)?; self.analyze_value(v, tks)?;
} }
Ok(()) Ok(())
} }
fn analyze_value(&self, val: &Value, tks: &mut Vec<Tokens>) -> Result<(), Error> { fn analyze_value(&self, val: Value, tks: &mut Vec<Tokens>) -> Result<(), Error> {
match val { match val {
Value::Strand(s) => tks.push(self.analyze(s.0.clone())?), Value::Strand(s) => tks.push(self.analyze(s.0)?),
Value::Number(n) => tks.push(self.analyze(n.to_string())?), Value::Number(n) => tks.push(self.analyze(n.to_string())?),
Value::Bool(b) => tks.push(self.analyze(b.to_string())?), Value::Bool(b) => tks.push(self.analyze(b.to_string())?),
Value::Array(a) => { Value::Array(a) => {
for v in &a.0 { for v in a.0 {
self.analyze_value(v, tks)?; self.analyze_value(v, tks)?;
} }
} }
Value::Object(o) => { Value::Object(o) => {
for v in o.0.values() { for (_, v) in o.0 {
self.analyze_value(v, tks)?; self.analyze_value(v, tks)?;
} }
} }

View file

@ -1,5 +1,5 @@
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::trees::bkeys::TrieKeys; use crate::idx::trees::bkeys::TrieKeys;
use crate::idx::trees::btree::{BState, BStatistics, BTree, BTreeNodeStore, Payload}; use crate::idx::trees::btree::{BState, BStatistics, BTree, BTreeNodeStore, Payload};
use crate::idx::trees::store::{TreeNodeProvider, TreeNodeStore, TreeStoreType}; use crate::idx::trees::store::{TreeNodeProvider, TreeNodeStore, TreeStoreType};
@ -72,9 +72,8 @@ impl DocLengths {
} }
pub(super) async fn finish(&self, tx: &mut Transaction) -> Result<(), Error> { pub(super) async fn finish(&self, tx: &mut Transaction) -> Result<(), Error> {
if self.store.lock().await.finish(tx).await? { self.store.lock().await.finish(tx).await?;
tx.set(self.state_key.clone(), self.btree.get_state().try_to_val()?).await?; self.btree.get_state().finish(tx, &self.state_key).await?;
}
Ok(()) Ok(())
} }
} }

View file

@ -1,5 +1,4 @@
pub(crate) mod analyzer; pub(crate) mod analyzer;
pub(crate) mod docids;
mod doclength; mod doclength;
mod highlighter; mod highlighter;
mod offsets; mod offsets;
@ -9,8 +8,8 @@ pub(super) mod termdocs;
pub(crate) mod terms; pub(crate) mod terms;
use crate::err::Error; use crate::err::Error;
use crate::idx::docids::{DocId, DocIds};
use crate::idx::ft::analyzer::Analyzer; use crate::idx::ft::analyzer::Analyzer;
use crate::idx::ft::docids::{DocId, DocIds};
use crate::idx::ft::doclength::DocLengths; use crate::idx::ft::doclength::DocLengths;
use crate::idx::ft::highlighter::{Highlighter, Offseter}; use crate::idx::ft::highlighter::{Highlighter, Offseter};
use crate::idx::ft::offsets::Offsets; use crate::idx::ft::offsets::Offsets;
@ -198,7 +197,7 @@ impl FtIndex {
&mut self, &mut self,
tx: &mut Transaction, tx: &mut Transaction,
rid: &Thing, rid: &Thing,
content: &[Value], content: Vec<Value>,
) -> Result<(), Error> { ) -> Result<(), Error> {
// Resolve the doc_id // Resolve the doc_id
let resolved = self.doc_ids.write().await.resolve_doc_id(tx, rid.into()).await?; let resolved = self.doc_ids.write().await.resolve_doc_id(tx, rid.into()).await?;
@ -481,7 +480,7 @@ mod tests {
} }
assert_eq!(map.len(), e.len()); assert_eq!(map.len(), e.len());
for (k, p) in e { for (k, p) in e {
assert_eq!(map.get(k), Some(&p)); assert_eq!(map.get(k), Some(&p), "{}", k);
} }
} else { } else {
panic!("hits is none"); panic!("hits is none");
@ -549,9 +548,7 @@ mod tests {
// Add one document // Add one document
let (mut tx, mut fti) = let (mut tx, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await; tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc1, &vec![Value::from("hello the world")]) fti.index_document(&mut tx, &doc1, vec![Value::from("hello the world")]).await.unwrap();
.await
.unwrap();
finish(tx, fti).await; finish(tx, fti).await;
} }
@ -559,8 +556,8 @@ mod tests {
// Add two documents // Add two documents
let (mut tx, mut fti) = let (mut tx, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await; tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc2, &vec![Value::from("a yellow hello")]).await.unwrap(); fti.index_document(&mut tx, &doc2, vec![Value::from("a yellow hello")]).await.unwrap();
fti.index_document(&mut tx, &doc3, &vec![Value::from("foo bar")]).await.unwrap(); fti.index_document(&mut tx, &doc3, vec![Value::from("foo bar")]).await.unwrap();
finish(tx, fti).await; finish(tx, fti).await;
} }
@ -575,7 +572,13 @@ mod tests {
// Search & score // Search & score
let (hits, scr) = search(&mut tx, &fti, "hello").await; let (hits, scr) = search(&mut tx, &fti, "hello").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await; check_hits(
&mut tx,
hits,
scr,
vec![(&doc1, Some(-0.4859746)), (&doc2, Some(-0.4859746))],
)
.await;
let (hits, scr) = search(&mut tx, &fti, "world").await; let (hits, scr) = search(&mut tx, &fti, "world").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.4859746))]).await; check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.4859746))]).await;
@ -597,7 +600,7 @@ mod tests {
// Reindex one document // Reindex one document
let (mut tx, mut fti) = let (mut tx, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await; tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc3, &vec![Value::from("nobar foo")]).await.unwrap(); fti.index_document(&mut tx, &doc3, vec![Value::from("nobar foo")]).await.unwrap();
finish(tx, fti).await; finish(tx, fti).await;
let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await; let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
@ -655,28 +658,28 @@ mod tests {
fti.index_document( fti.index_document(
&mut tx, &mut tx,
&doc1, &doc1,
&vec![Value::from("the quick brown fox jumped over the lazy dog")], vec![Value::from("the quick brown fox jumped over the lazy dog")],
) )
.await .await
.unwrap(); .unwrap();
fti.index_document( fti.index_document(
&mut tx, &mut tx,
&doc2, &doc2,
&vec![Value::from("the fast fox jumped over the lazy dog")], vec![Value::from("the fast fox jumped over the lazy dog")],
) )
.await .await
.unwrap(); .unwrap();
fti.index_document( fti.index_document(
&mut tx, &mut tx,
&doc3, &doc3,
&vec![Value::from("the dog sat there and did nothing")], vec![Value::from("the dog sat there and did nothing")],
) )
.await .await
.unwrap(); .unwrap();
fti.index_document( fti.index_document(
&mut tx, &mut tx,
&doc4, &doc4,
&vec![Value::from("the other animals sat there watching")], vec![Value::from("the other animals sat there watching")],
) )
.await .await
.unwrap(); .unwrap();
@ -698,10 +701,10 @@ mod tests {
hits, hits,
scr, scr,
vec![ vec![
(&doc1, Some(0.0)), (&doc1, Some(-3.4388628)),
(&doc2, Some(0.0)), (&doc2, Some(-3.621457)),
(&doc3, Some(0.0)), (&doc3, Some(-2.258829)),
(&doc4, Some(0.0)), (&doc4, Some(-2.393017)),
], ],
) )
.await; .await;
@ -711,7 +714,11 @@ mod tests {
&mut tx, &mut tx,
hits, hits,
scr, scr,
vec![(&doc1, Some(0.0)), (&doc2, Some(0.0)), (&doc3, Some(0.0))], vec![
(&doc1, Some(-0.7832165)),
(&doc2, Some(-0.8248031)),
(&doc3, Some(-0.87105393)),
],
) )
.await; .await;

View file

@ -1,5 +1,5 @@
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId; use crate::idx::ft::terms::TermId;
use crate::idx::IndexKeyBase; use crate::idx::IndexKeyBase;
use crate::kvs::{Transaction, Val}; use crate::kvs::{Transaction, Val};

View file

@ -1,5 +1,5 @@
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId; use crate::idx::ft::terms::TermId;
use crate::idx::trees::bkeys::TrieKeys; use crate::idx::trees::bkeys::TrieKeys;
use crate::idx::trees::btree::{BState, BStatistics, BTree, BTreeNodeStore}; use crate::idx::trees::btree::{BState, BStatistics, BTree, BTreeNodeStore};
@ -81,10 +81,8 @@ impl Postings {
} }
pub(super) async fn finish(&self, tx: &mut Transaction) -> Result<(), Error> { pub(super) async fn finish(&self, tx: &mut Transaction) -> Result<(), Error> {
let updated = self.store.lock().await.finish(tx).await?; self.store.lock().await.finish(tx).await?;
if self.btree.is_updated() || updated { self.btree.get_state().finish(tx, &self.state_key).await?;
tx.set(self.state_key.clone(), self.btree.get_state().try_to_val()?).await?;
}
Ok(()) Ok(())
} }
} }

View file

@ -1,5 +1,5 @@
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::ft::doclength::{DocLength, DocLengths}; use crate::idx::ft::doclength::{DocLength, DocLengths};
use crate::idx::ft::postings::{Postings, TermFrequency}; use crate::idx::ft::postings::{Postings, TermFrequency};
use crate::idx::ft::termdocs::TermsDocs; use crate::idx::ft::termdocs::TermsDocs;
@ -76,8 +76,8 @@ impl BM25Scorer {
// (N - n(qi) + 0.5) // (N - n(qi) + 0.5)
let numerator = self.doc_count - term_doc_count + 0.5; let numerator = self.doc_count - term_doc_count + 0.5;
let idf = (numerator / denominator).ln(); let idf = (numerator / denominator).ln();
if idf.is_nan() || idf <= 0.0 { if idf.is_nan() {
return 0.0; return f32::NAN;
} }
let tf_prim = 1.0 + term_freq.ln(); let tf_prim = 1.0 + term_freq.ln();
// idf * (k1 + 1) // idf * (k1 + 1)

View file

@ -1,5 +1,5 @@
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::ft::doclength::DocLength; use crate::idx::ft::doclength::DocLength;
use crate::idx::ft::terms::TermId; use crate::idx::ft::terms::TermId;
use crate::idx::IndexKeyBase; use crate::idx::IndexKeyBase;

View file

@ -1,10 +1,11 @@
pub(crate) mod docids;
pub(crate) mod ft; pub(crate) mod ft;
pub(crate) mod planner; pub(crate) mod planner;
pub mod trees; pub mod trees;
use crate::dbs::Options; use crate::dbs::Options;
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId; use crate::idx::ft::terms::TermId;
use crate::idx::trees::store::NodeId; use crate::idx::trees::store::NodeId;
use crate::key::index::bc::Bc; use crate::key::index::bc::Bc;
@ -18,6 +19,7 @@ use crate::key::index::bp::Bp;
use crate::key::index::bs::Bs; use crate::key::index::bs::Bs;
use crate::key::index::bt::Bt; use crate::key::index::bt::Bt;
use crate::key::index::bu::Bu; use crate::key::index::bu::Bu;
use crate::key::index::vm::Vm;
use crate::kvs::{Key, Val}; use crate::kvs::{Key, Val};
use crate::sql::statements::DefineIndexStatement; use crate::sql::statements::DefineIndexStatement;
use revision::Revisioned; use revision::Revisioned;
@ -171,6 +173,17 @@ impl IndexKeyBase {
) )
.into() .into()
} }
fn new_vm_key(&self, node_id: Option<NodeId>) -> Key {
Vm::new(
self.inner.ns.as_str(),
self.inner.db.as_str(),
self.inner.tb.as_str(),
self.inner.ix.as_str(),
node_id,
)
.into()
}
} }
/// This trait provides `Revision` based default implementations for serialization/deserialization /// This trait provides `Revision` based default implementations for serialization/deserialization

View file

@ -1,25 +1,27 @@
use crate::dbs::{Options, Transaction}; use crate::dbs::{Options, Transaction};
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::{DocId, DocIds}; use crate::idx::docids::{DocId, DocIds};
use crate::idx::ft::scorer::BM25Scorer; use crate::idx::ft::scorer::BM25Scorer;
use crate::idx::ft::termdocs::TermsDocs; use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::terms::TermId; use crate::idx::ft::terms::TermId;
use crate::idx::ft::{FtIndex, MatchRef}; use crate::idx::ft::{FtIndex, MatchRef};
use crate::idx::planner::iterators::{ use crate::idx::planner::iterators::{
IndexEqualThingIterator, IndexRangeThingIterator, MatchesThingIterator, ThingIterator, IndexEqualThingIterator, IndexRangeThingIterator, KnnThingIterator, MatchesThingIterator,
UniqueEqualThingIterator, UniqueRangeThingIterator, ThingIterator, UniqueEqualThingIterator, UniqueRangeThingIterator,
}; };
use crate::idx::planner::plan::IndexOperator::Matches; use crate::idx::planner::plan::IndexOperator::Matches;
use crate::idx::planner::plan::{IndexOperator, IndexOption, RangeValue}; use crate::idx::planner::plan::{IndexOperator, IndexOption, RangeValue};
use crate::idx::planner::tree::{IndexMap, IndexRef}; use crate::idx::planner::tree::{IndexMap, IndexRef};
use crate::idx::trees::mtree::MTreeIndex;
use crate::idx::trees::store::TreeStoreType; use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase; use crate::idx::IndexKeyBase;
use crate::kvs; use crate::kvs;
use crate::kvs::Key; use crate::kvs::Key;
use crate::sql::index::Index; use crate::sql::index::Index;
use crate::sql::statements::DefineIndexStatement; use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Expression, Object, Table, Thing, Value}; use crate::sql::{Array, Expression, Object, Table, Thing, Value};
use std::collections::{HashMap, HashSet}; use roaring::RoaringTreemap;
use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::Arc; use std::sync::Arc;
use tokio::sync::RwLock; use tokio::sync::RwLock;
@ -30,6 +32,7 @@ pub(crate) struct QueryExecutor {
exp_entries: HashMap<Arc<Expression>, FtEntry>, exp_entries: HashMap<Arc<Expression>, FtEntry>,
it_entries: Vec<IteratorEntry>, it_entries: Vec<IteratorEntry>,
index_definitions: HashMap<IndexRef, DefineIndexStatement>, index_definitions: HashMap<IndexRef, DefineIndexStatement>,
mt_exp: HashMap<Arc<Expression>, MtEntry>,
} }
pub(crate) type IteratorRef = u16; pub(crate) type IteratorRef = u16;
@ -66,39 +69,59 @@ impl QueryExecutor {
let mut mr_entries = HashMap::default(); let mut mr_entries = HashMap::default();
let mut exp_entries = HashMap::default(); let mut exp_entries = HashMap::default();
let mut ft_map = HashMap::default(); let mut ft_map = HashMap::default();
let mut mt_map: HashMap<IndexRef, MTreeIndex> = HashMap::default();
let mut mt_exp = HashMap::default();
// Create all the instances of FtIndex // Create all the instances of FtIndex
// Build the FtEntries and map them to Expressions and MatchRef // Build the FtEntries and map them to Expressions and MatchRef
for (exp, io) in im.options { for (exp, io) in im.options {
let mut entry = None;
let ir = io.ir(); let ir = io.ir();
if let Some(idx_def) = im.definitions.get(&ir) { if let Some(idx_def) = im.definitions.get(&ir) {
if let Index::Search(p) = &idx_def.index { match &idx_def.index {
if let Some(ft) = ft_map.get(&ir) { Index::Search(p) => {
if entry.is_none() { let mut ft_entry = None;
entry = FtEntry::new(&mut run, ft, io).await?; if let Some(ft) = ft_map.get(&ir) {
if ft_entry.is_none() {
ft_entry = FtEntry::new(&mut run, ft, io).await?;
}
} else {
let ikb = IndexKeyBase::new(opt, idx_def);
let az = run.get_db_analyzer(opt.ns(), opt.db(), p.az.as_str()).await?;
let ft =
FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Read).await?;
if ft_entry.is_none() {
ft_entry = FtEntry::new(&mut run, &ft, io).await?;
}
ft_map.insert(ir, ft);
} }
} else { if let Some(e) = ft_entry {
let ikb = IndexKeyBase::new(opt, idx_def); if let Matches(_, Some(mr)) = e.0.index_option.op() {
let az = run.get_db_analyzer(opt.ns(), opt.db(), p.az.as_str()).await?; if mr_entries.insert(*mr, e.clone()).is_some() {
let ft = FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Read).await?; return Err(Error::DuplicatedMatchRef {
if entry.is_none() { mr: *mr,
entry = FtEntry::new(&mut run, &ft, io).await?; });
}
}
exp_entries.insert(exp, e);
} }
ft_map.insert(ir, ft);
} }
} Index::MTree(p) => {
} if let IndexOperator::Knn(a, k) = io.op() {
let entry = if let Some(mt) = mt_map.get(&ir) {
if let Some(e) = entry { MtEntry::new(&mut run, mt, a.clone(), *k).await?
if let Matches(_, Some(mr)) = e.0.index_option.op() { } else {
if mr_entries.insert(*mr, e.clone()).is_some() { let ikb = IndexKeyBase::new(opt, idx_def);
return Err(Error::DuplicatedMatchRef { let mt =
mr: *mr, MTreeIndex::new(&mut run, ikb, p, TreeStoreType::Read).await?;
}); let entry = MtEntry::new(&mut run, &mt, a.clone(), *k).await?;
mt_map.insert(ir, mt);
entry
};
mt_exp.insert(exp, entry);
}
} }
_ => {}
} }
exp_entries.insert(exp, e);
} }
} }
@ -109,6 +132,19 @@ impl QueryExecutor {
exp_entries, exp_entries,
it_entries: Vec::new(), it_entries: Vec::new(),
index_definitions: im.definitions, index_definitions: im.definitions,
mt_exp,
})
}
pub(crate) async fn knn(
&self,
_txn: &Transaction,
_thg: &Thing,
exp: &Expression,
) -> Result<Value, Error> {
// Per-record KNN evaluation is not implemented: every call ends up with a user error
Err(Error::NoIndexFoundForMatch {
value: exp.to_string(),
}) })
} }
@ -168,9 +204,7 @@ impl QueryExecutor {
Index::Search { Index::Search {
.. ..
} => self.new_search_index_iterator(ir, io.clone()).await, } => self.new_search_index_iterator(ir, io.clone()).await,
Index::MTree(_) => Err(Error::FeatureNotYetImplemented { Index::MTree(_) => Ok(self.new_mtree_index_knn_iterator(ir)),
feature: "VectorSearch iterator".to_string(),
}),
} }
} else { } else {
Ok(None) Ok(None)
@ -258,6 +292,16 @@ impl QueryExecutor {
Ok(None) Ok(None)
} }
/// Builds the KNN iterator attached to the iterator reference `ir`.
///
/// Returns `None` when `ir` has no iterator entry, when the entry is not an
/// `IteratorEntry::Single`, or when no MTree entry was recorded for the
/// expression of that entry. Otherwise the iterator is created from clones of
/// the entry's document ids handle and search result.
fn new_mtree_index_knn_iterator(&self, ir: IteratorRef) -> Option<ThingIterator> {
    if let IteratorEntry::Single(exp, ..) = self.it_entries.get(ir as usize)? {
        self.mt_exp.get(exp.as_ref()).map(|entry| {
            ThingIterator::Knn(KnnThingIterator::new(entry.doc_ids.clone(), entry.res.clone()))
        })
    } else {
        None
    }
}
pub(crate) async fn matches( pub(crate) async fn matches(
&self, &self,
txn: &Transaction, txn: &Transaction,
@ -406,3 +450,24 @@ impl FtEntry {
} }
} }
} }
/// Outcome of a k-nearest-neighbours search run against an MTree index,
/// kept per expression by the query executor.
#[derive(Clone)]
pub(super) struct MtEntry {
    /// Shared handle returned by `MTreeIndex::doc_ids`.
    doc_ids: Arc<RwLock<DocIds>>,
    /// Value returned by `MTreeIndex::knn_search`.
    // NOTE(review): presumably bitmaps of document ids ordered by distance - confirm against `knn_search`.
    res: VecDeque<RoaringTreemap>,
}

impl MtEntry {
    /// Searches `mt` for the `k` nearest neighbours of the vector `a` and
    /// stores the result together with the index's document ids handle.
    ///
    /// # Errors
    ///
    /// Returns the error of `knn_search` if the search fails.
    async fn new(
        tx: &mut kvs::Transaction,
        mt: &MTreeIndex,
        a: Array,
        k: u32,
    ) -> Result<Self, Error> {
        // The search runs first; the handle is only fetched once it succeeded
        let nearest = mt.knn_search(tx, a, k as usize).await?;
        let doc_ids = mt.doc_ids();
        Ok(Self {
            doc_ids,
            res: nearest,
        })
    }
}

View file

@ -1,6 +1,6 @@
use crate::dbs::{Options, Transaction}; use crate::dbs::{Options, Transaction};
use crate::err::Error; use crate::err::Error;
use crate::idx::ft::docids::{DocId, NO_DOC_ID}; use crate::idx::docids::{DocId, DocIds, NO_DOC_ID};
use crate::idx::ft::termdocs::TermsDocs; use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::{FtIndex, HitsIterator}; use crate::idx::ft::{FtIndex, HitsIterator};
use crate::idx::planner::plan::RangeValue; use crate::idx::planner::plan::RangeValue;
@ -8,6 +8,10 @@ use crate::key::index::Index;
use crate::kvs::Key; use crate::kvs::Key;
use crate::sql::statements::DefineIndexStatement; use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Array, Thing, Value}; use crate::sql::{Array, Thing, Value};
use roaring::RoaringTreemap;
use std::collections::VecDeque;
use std::sync::Arc;
use tokio::sync::RwLock;
pub(crate) enum ThingIterator { pub(crate) enum ThingIterator {
IndexEqual(IndexEqualThingIterator), IndexEqual(IndexEqualThingIterator),
@ -15,6 +19,7 @@ pub(crate) enum ThingIterator {
UniqueEqual(UniqueEqualThingIterator), UniqueEqual(UniqueEqualThingIterator),
UniqueRange(UniqueRangeThingIterator), UniqueRange(UniqueRangeThingIterator),
Matches(MatchesThingIterator), Matches(MatchesThingIterator),
Knn(KnnThingIterator),
} }
impl ThingIterator { impl ThingIterator {
@ -29,6 +34,7 @@ impl ThingIterator {
ThingIterator::IndexRange(i) => i.next_batch(tx, size).await, ThingIterator::IndexRange(i) => i.next_batch(tx, size).await,
ThingIterator::UniqueRange(i) => i.next_batch(tx, size).await, ThingIterator::UniqueRange(i) => i.next_batch(tx, size).await,
ThingIterator::Matches(i) => i.next_batch(tx, size).await, ThingIterator::Matches(i) => i.next_batch(tx, size).await,
ThingIterator::Knn(i) => i.next_batch(tx, size).await,
} }
} }
} }
@ -307,3 +313,52 @@ impl MatchesThingIterator {
Ok(res) Ok(res)
} }
} }
/// Iterator over the records returned by a KNN search.
///
/// The search result is a queue of doc id sets. The sets are consumed one
/// after the other, and a given doc id is never yielded twice.
pub(crate) struct KnnThingIterator {
    /// Used to resolve a doc id into its document key.
    doc_ids: Arc<RwLock<DocIds>>,
    /// The sets that have not been started yet.
    res: VecDeque<RoaringTreemap>,
    /// The set currently being consumed, `None` once everything is exhausted.
    current: Option<RoaringTreemap>,
    /// The doc ids that have already been visited.
    skip: RoaringTreemap,
}

impl KnnThingIterator {
    /// Creates an iterator over `res`; the first set becomes the current one.
    pub(super) fn new(doc_ids: Arc<RwLock<DocIds>>, mut res: VecDeque<RoaringTreemap>) -> Self {
        let current = res.pop_front();
        Self {
            doc_ids,
            res,
            current,
            skip: RoaringTreemap::new(),
        }
    }

    /// Returns at most `limit` `(record, doc id)` pairs.
    ///
    /// Doc ids that were already visited, or that have no document key, are
    /// consumed without being returned. An empty vector is returned when the
    /// iterator is exhausted or when `limit` is `0`.
    ///
    /// # Errors
    /// Propagates any error raised while resolving a document key.
    async fn next_batch(
        &mut self,
        txn: &Transaction,
        mut limit: u32,
    ) -> Result<Vec<(Thing, DocId)>, Error> {
        let mut res = vec![];
        let mut tx = txn.lock().await;
        while limit > 0 {
            let docs = match &mut self.current {
                Some(docs) => docs,
                // Every set has been consumed
                None => break,
            };
            if let Some(doc_id) = docs.iter().next() {
                docs.remove(doc_id);
                // `insert` returns false if the doc id has already been visited
                if self.skip.insert(doc_id) {
                    if let Some(doc_key) =
                        self.doc_ids.read().await.get_doc_key(&mut tx, doc_id).await?
                    {
                        res.push((doc_key.into(), doc_id));
                        limit -= 1;
                    }
                }
            }
            // This check must run on every iteration: an empty current set
            // (empty from the start, or drained just before an earlier error)
            // would otherwise never be replaced, and the loop would spin
            // forever without yielding.
            if docs.is_empty() {
                self.current = self.res.pop_front();
            }
        }
        Ok(res)
    }
}

View file

@ -149,6 +149,7 @@ pub(super) enum IndexOperator {
Equality(Array), Equality(Array),
RangePart(Operator, Value), RangePart(Operator, Value),
Matches(String, Option<MatchRef>), Matches(String, Option<MatchRef>),
Knn(Array, u32),
} }
impl IndexOption { impl IndexOption {
@ -191,6 +192,10 @@ impl IndexOption {
e.insert("operator", Value::from(op.to_string())); e.insert("operator", Value::from(op.to_string()));
e.insert("value", v.to_owned()); e.insert("value", v.to_owned());
} }
IndexOperator::Knn(a, k) => {
e.insert("operator", Value::from(format!("<{}>", k)));
e.insert("value", Value::Array(a.clone()));
}
}; };
} }
} }

View file

@ -102,10 +102,10 @@ impl<'a> TreeBuilder<'a> {
match v { match v {
Value::Expression(e) => self.eval_expression(e).await, Value::Expression(e) => self.eval_expression(e).await,
Value::Idiom(i) => self.eval_idiom(i).await, Value::Idiom(i) => self.eval_idiom(i).await,
Value::Strand(_) => Ok(Node::Scalar(v.to_owned())), Value::Strand(_) | Value::Number(_) | Value::Bool(_) | Value::Thing(_) => {
Value::Number(_) => Ok(Node::Scalar(v.to_owned())), Ok(Node::Scalar(v.to_owned()))
Value::Bool(_) => Ok(Node::Scalar(v.to_owned())), }
Value::Thing(_) => Ok(Node::Scalar(v.to_owned())), Value::Array(a) => Ok(self.eval_array(a)),
Value::Subquery(s) => self.eval_subquery(s).await, Value::Subquery(s) => self.eval_subquery(s).await,
Value::Param(p) => { Value::Param(p) => {
let v = p.compute(self.ctx, self.opt, self.txn, None).await?; let v = p.compute(self.ctx, self.opt, self.txn, None).await?;
@ -115,6 +115,16 @@ impl<'a> TreeBuilder<'a> {
} }
} }
/// Converts an array into a plan node.
///
/// An array whose items are all numbers becomes a `Node::Vector` (this
/// includes the empty array); any other array is reported as
/// `Node::Unsupported`.
fn eval_array(&mut self, a: &Array) -> Node {
    let is_numeric_vector = a.0.iter().all(|item| item.is_number());
    if is_numeric_vector {
        Node::Vector(a.to_owned())
    } else {
        Node::Unsupported(format!("Unsupported array: {}", a))
    }
}
async fn eval_idiom(&mut self, i: &Idiom) -> Result<Node, Error> { async fn eval_idiom(&mut self, i: &Idiom) -> Result<Node, Error> {
if let Some(irs) = self.find_indexes(i).await? { if let Some(irs) = self.find_indexes(i).await? {
if !irs.is_empty() { if !irs.is_empty() {
@ -165,45 +175,61 @@ impl<'a> TreeBuilder<'a> {
irs: &[IndexRef], irs: &[IndexRef],
op: &Operator, op: &Operator,
id: &Idiom, id: &Idiom,
v: &Node, n: &Node,
e: &Expression, e: &Expression,
) -> Option<IndexOption> { ) -> Option<IndexOption> {
if let Some(v) = v.is_scalar() { for ir in irs {
for ir in irs { if let Some(ix) = self.index_map.definitions.get(ir) {
if let Some(ix) = self.index_map.definitions.get(ir) { let op = match &ix.index {
let op = match &ix.index { Index::Idx => Self::eval_index_operator(op, n),
Index::Idx => Self::eval_index_operator(op, v), Index::Uniq => Self::eval_index_operator(op, n),
Index::Uniq => Self::eval_index_operator(op, v), Index::Search {
Index::Search { ..
.. } => {
} => { if let Some(v) = n.is_scalar() {
if let Operator::Matches(mr) = op { if let Operator::Matches(mr) = op {
Some(IndexOperator::Matches(v.clone().to_raw_string(), *mr)) Some(IndexOperator::Matches(v.clone().to_raw_string(), *mr))
} else { } else {
None None
} }
} else {
None
} }
Index::MTree(_) => None,
};
if let Some(op) = op {
let io = IndexOption::new(*ir, id.clone(), op);
self.index_map.options.insert(Arc::new(e.clone()), io.clone());
return Some(io);
} }
Index::MTree(_) => {
if let Operator::Knn(k) = op {
if let Node::Vector(a) = n {
Some(IndexOperator::Knn(a.clone(), *k))
} else {
None
}
} else {
None
}
}
};
if let Some(op) = op {
let io = IndexOption::new(*ir, id.clone(), op);
self.index_map.options.insert(Arc::new(e.clone()), io.clone());
return Some(io);
} }
} }
} }
None None
} }
fn eval_index_operator(op: &Operator, v: &Value) -> Option<IndexOperator> { fn eval_index_operator(op: &Operator, n: &Node) -> Option<IndexOperator> {
match op { if let Some(v) = n.is_scalar() {
Operator::Equal => Some(IndexOperator::Equality(Array::from(v.clone()))), match op {
Operator::LessThan Operator::Equal => Some(IndexOperator::Equality(Array::from(v.clone()))),
| Operator::LessThanOrEqual Operator::LessThan
| Operator::MoreThan | Operator::LessThanOrEqual
| Operator::MoreThanOrEqual => Some(IndexOperator::RangePart(op.clone(), v.clone())), | Operator::MoreThan
_ => None, | Operator::MoreThanOrEqual => Some(IndexOperator::RangePart(op.clone(), v.clone())),
_ => None,
}
} else {
None
} }
} }
@ -235,6 +261,7 @@ pub(super) enum Node {
IndexedField(Idiom, Arc<Vec<IndexRef>>), IndexedField(Idiom, Arc<Vec<IndexRef>>),
NonIndexedField, NonIndexedField,
Scalar(Value), Scalar(Value),
Vector(Array),
Unsupported(String), Unsupported(String),
} }

View file

@ -21,7 +21,6 @@ where
{ {
state: BState, state: BState,
full_size: u32, full_size: u32,
updated: bool,
bk: PhantomData<BK>, bk: PhantomData<BK>,
} }
@ -31,6 +30,8 @@ pub struct BState {
minimum_degree: u32, minimum_degree: u32,
root: Option<NodeId>, root: Option<NodeId>,
next_node_id: NodeId, next_node_id: NodeId,
#[serde(skip)]
updated: bool,
} }
impl VersionedSerdeState for BState {} impl VersionedSerdeState for BState {}
@ -42,8 +43,34 @@ impl BState {
minimum_degree, minimum_degree,
root: None, root: None,
next_node_id: 0, next_node_id: 0,
updated: false,
} }
} }
/// Replaces the root node id.
///
/// The state is flagged as updated only if the root actually changes.
fn set_root(&mut self, node_id: Option<NodeId>) {
    if self.root == node_id {
        // Same root: nothing to record
        return;
    }
    self.root = node_id;
    self.updated = true;
}
/// Returns the next available node id and advances the counter.
///
/// The state is always flagged as updated.
fn new_node_id(&mut self) -> NodeId {
    let allocated = self.next_node_id;
    self.next_node_id = allocated + 1;
    self.updated = true;
    allocated
}
/// Stores the state under `key` if it has been flagged as updated.
///
/// When the state is not flagged as updated, nothing is written.
///
/// # Errors
/// Propagates the errors raised by `try_to_val` or by the transaction.
pub(in crate::idx) async fn finish(
    &self,
    tx: &mut Transaction,
    key: &Key,
) -> Result<(), Error> {
    if !self.updated {
        return Ok(());
    }
    let val = self.try_to_val()?;
    tx.set(key.clone(), val).await?;
    Ok(())
}
} }
#[derive(Debug, Default, PartialEq)] #[derive(Debug, Default, PartialEq)]
@ -166,7 +193,6 @@ where
Self { Self {
full_size: state.minimum_degree * 2 - 1, full_size: state.minimum_degree * 2 - 1,
state, state,
updated: false,
bk: PhantomData, bk: PhantomData,
} }
} }
@ -180,11 +206,11 @@ where
let mut next_node = self.state.root; let mut next_node = self.state.root;
while let Some(node_id) = next_node.take() { while let Some(node_id) = next_node.take() {
let current = store.get_node(tx, node_id).await?; let current = store.get_node(tx, node_id).await?;
if let Some(payload) = current.node.keys().get(searched_key) { if let Some(payload) = current.n.keys().get(searched_key) {
store.set_node(current, false)?; store.set_node(current, false)?;
return Ok(Some(payload)); return Ok(Some(payload));
} }
if let BTreeNode::Internal(keys, children) = &current.node { if let BTreeNode::Internal(keys, children) = &current.n {
let child_idx = keys.get_child_idx(searched_key); let child_idx = keys.get_child_idx(searched_key);
next_node.replace(children[child_idx]); next_node.replace(children[child_idx]);
} }
@ -201,27 +227,30 @@ where
payload: Payload, payload: Payload,
) -> Result<(), Error> { ) -> Result<(), Error> {
if let Some(root_id) = self.state.root { if let Some(root_id) = self.state.root {
// We already have a root node
let root = store.get_node(tx, root_id).await?; let root = store.get_node(tx, root_id).await?;
if root.node.keys().len() == self.full_size { if root.n.keys().len() == self.full_size {
let new_root_id = self.new_node_id(); // The root node is full, let's split it
let new_root_id = self.state.new_node_id();
let new_root = store let new_root = store
.new_node(new_root_id, BTreeNode::Internal(BK::default(), vec![root_id]))?; .new_node(new_root_id, BTreeNode::Internal(BK::default(), vec![root_id]))?;
self.state.root = Some(new_root.id); self.state.set_root(Some(new_root.id));
self.split_child(store, new_root, 0, root).await?; self.split_child(store, new_root, 0, root).await?;
self.insert_non_full(tx, store, new_root_id, key, payload).await?; self.insert_non_full(tx, store, new_root_id, key, payload).await?;
} else { } else {
// The root node has place, let's insert the value
let root_id = root.id; let root_id = root.id;
store.set_node(root, false)?; store.set_node(root, false)?;
self.insert_non_full(tx, store, root_id, key, payload).await?; self.insert_non_full(tx, store, root_id, key, payload).await?;
} }
} else { } else {
let new_root_id = self.new_node_id(); // We don't have a root node, let's create id
let new_root_id = self.state.new_node_id();
let new_root_node = let new_root_node =
store.new_node(new_root_id, BTreeNode::Leaf(BK::with_key_val(key, payload)?))?; store.new_node(new_root_id, BTreeNode::Leaf(BK::with_key_val(key, payload)?))?;
store.set_node(new_root_node, true)?; store.set_node(new_root_node, true)?;
self.state.root = Some(new_root_id); self.state.set_root(Some(new_root_id));
} }
self.updated = true;
Ok(()) Ok(())
} }
@ -237,7 +266,7 @@ where
while let Some(node_id) = next_node_id.take() { while let Some(node_id) = next_node_id.take() {
let mut node = store.get_node(tx, node_id).await?; let mut node = store.get_node(tx, node_id).await?;
let key: Key = key.clone(); let key: Key = key.clone();
match &mut node.node { match &mut node.n {
BTreeNode::Leaf(keys) => { BTreeNode::Leaf(keys) => {
keys.insert(key, payload); keys.insert(key, payload);
store.set_node(node, true)?; store.set_node(node, true)?;
@ -250,7 +279,7 @@ where
} }
let child_idx = keys.get_child_idx(&key); let child_idx = keys.get_child_idx(&key);
let child = store.get_node(tx, children[child_idx]).await?; let child = store.get_node(tx, children[child_idx]).await?;
let next_id = if child.node.keys().len() == self.full_size { let next_id = if child.n.keys().len() == self.full_size {
let split_result = self.split_child(store, node, child_idx, child).await?; let split_result = self.split_child(store, node, child_idx, child).await?;
if key.gt(&split_result.median_key) { if key.gt(&split_result.median_key) {
split_result.right_node_id split_result.right_node_id
@ -277,12 +306,12 @@ where
idx: usize, idx: usize,
child_node: BStoredNode<BK>, child_node: BStoredNode<BK>,
) -> Result<SplitResult, Error> { ) -> Result<SplitResult, Error> {
let (left_node, right_node, median_key, median_payload) = match child_node.node { let (left_node, right_node, median_key, median_payload) = match child_node.n {
BTreeNode::Internal(keys, children) => self.split_internal_node(keys, children)?, BTreeNode::Internal(keys, children) => self.split_internal_node(keys, children)?,
BTreeNode::Leaf(keys) => self.split_leaf_node(keys)?, BTreeNode::Leaf(keys) => self.split_leaf_node(keys)?,
}; };
let right_node_id = self.new_node_id(); let right_node_id = self.state.new_node_id();
match parent_node.node { match parent_node.n {
BTreeNode::Internal(ref mut keys, ref mut children) => { BTreeNode::Internal(ref mut keys, ref mut children) => {
keys.insert(median_key.clone(), median_payload); keys.insert(median_key.clone(), median_payload);
children.insert(idx + 1, right_node_id); children.insert(idx + 1, right_node_id);
@ -329,12 +358,6 @@ where
Ok((left_node, right_node, r.median_key, r.median_payload)) Ok((left_node, right_node, r.median_key, r.median_payload))
} }
fn new_node_id(&mut self) -> NodeId {
let new_node_id = self.state.next_node_id;
self.state.next_node_id += 1;
new_node_id
}
pub(in crate::idx) async fn delete( pub(in crate::idx) async fn delete(
&mut self, &mut self,
tx: &mut Transaction, tx: &mut Transaction,
@ -348,7 +371,7 @@ where
while let Some((is_main_key, key_to_delete, node_id)) = next_node.take() { while let Some((is_main_key, key_to_delete, node_id)) = next_node.take() {
let mut node = store.get_node(tx, node_id).await?; let mut node = store.get_node(tx, node_id).await?;
match &mut node.node { match &mut node.n {
BTreeNode::Leaf(keys) => { BTreeNode::Leaf(keys) => {
// CLRS: 1 // CLRS: 1
if let Some(payload) = keys.get(&key_to_delete) { if let Some(payload) = keys.get(&key_to_delete) {
@ -361,12 +384,11 @@ where
store.remove_node(node.id, node.key)?; store.remove_node(node.id, node.key)?;
// Check if this was the root node // Check if this was the root node
if Some(node_id) == self.state.root { if Some(node_id) == self.state.root {
self.state.root = None; self.state.set_root(None);
} }
} else { } else {
store.set_node(node, true)?; store.set_node(node, true)?;
} }
self.updated = true;
} else { } else {
store.set_node(node, false)?; store.set_node(node, false)?;
} }
@ -388,7 +410,6 @@ where
.await?, .await?,
); );
store.set_node(node, true)?; store.set_node(node, true)?;
self.updated = true;
} else { } else {
// CLRS: 3 // CLRS: 3
let (node_update, is_main_key, key_to_delete, next_stored_node) = self let (node_update, is_main_key, key_to_delete, next_stored_node) = self
@ -409,11 +430,9 @@ where
} }
} }
store.remove_node(node_id, node.key)?; store.remove_node(node_id, node.key)?;
self.state.root = Some(next_stored_node); self.state.set_root(Some(next_stored_node));
self.updated = true;
} else if node_update { } else if node_update {
store.set_node(node, true)?; store.set_node(node, true)?;
self.updated = true;
} else { } else {
store.set_node(node, false)?; store.set_node(node, false)?;
} }
@ -437,9 +456,9 @@ where
let left_idx = keys.get_child_idx(&key_to_delete); let left_idx = keys.get_child_idx(&key_to_delete);
let left_id = children[left_idx]; let left_id = children[left_idx];
let mut left_node = store.get_node(tx, left_id).await?; let mut left_node = store.get_node(tx, left_id).await?;
if left_node.node.keys().len() >= self.state.minimum_degree { if left_node.n.keys().len() >= self.state.minimum_degree {
// CLRS: 2a -> left_node is named `y` in the book // CLRS: 2a -> left_node is named `y` in the book
if let Some((key_prim, payload_prim)) = left_node.node.keys().get_last_key() { if let Some((key_prim, payload_prim)) = left_node.n.keys().get_last_key() {
keys.remove(&key_to_delete); keys.remove(&key_to_delete);
keys.insert(key_prim.clone(), payload_prim); keys.insert(key_prim.clone(), payload_prim);
store.set_node(left_node, true)?; store.set_node(left_node, true)?;
@ -450,9 +469,9 @@ where
let right_idx = left_idx + 1; let right_idx = left_idx + 1;
let right_id = children[right_idx]; let right_id = children[right_idx];
let right_node = store.get_node(tx, right_id).await?; let right_node = store.get_node(tx, right_id).await?;
if right_node.node.keys().len() >= self.state.minimum_degree { if right_node.n.keys().len() >= self.state.minimum_degree {
// CLRS: 2b -> right_node is named `z` in the book // CLRS: 2b -> right_node is named `z` in the book
if let Some((key_prim, payload_prim)) = right_node.node.keys().get_first_key() { if let Some((key_prim, payload_prim)) = right_node.n.keys().get_first_key() {
keys.remove(&key_to_delete); keys.remove(&key_to_delete);
keys.insert(key_prim.clone(), payload_prim); keys.insert(key_prim.clone(), payload_prim);
store.set_node(left_node, false)?; store.set_node(left_node, false)?;
@ -464,7 +483,7 @@ where
// CLRS: 2c // CLRS: 2c
// Merge children // Merge children
// The payload is set to 0. The value does not matter, as the key will be deleted after anyway. // The payload is set to 0. The value does not matter, as the key will be deleted after anyway.
left_node.node.append(key_to_delete.clone(), 0, right_node.node)?; left_node.n.append(key_to_delete.clone(), 0, right_node.n)?;
store.set_node(left_node, true)?; store.set_node(left_node, true)?;
store.remove_node(right_id, right_node.key)?; store.remove_node(right_id, right_node.key)?;
keys.remove(&key_to_delete); keys.remove(&key_to_delete);
@ -485,11 +504,11 @@ where
let child_idx = keys.get_child_idx(&key_to_delete); let child_idx = keys.get_child_idx(&key_to_delete);
let child_id = children[child_idx]; let child_id = children[child_idx];
let child_stored_node = store.get_node(tx, child_id).await?; let child_stored_node = store.get_node(tx, child_id).await?;
if child_stored_node.node.keys().len() < self.state.minimum_degree { if child_stored_node.n.keys().len() < self.state.minimum_degree {
// right child (successor) // right child (successor)
if child_idx < children.len() - 1 { if child_idx < children.len() - 1 {
let right_child_stored_node = store.get_node(tx, children[child_idx + 1]).await?; let right_child_stored_node = store.get_node(tx, children[child_idx + 1]).await?;
return if right_child_stored_node.node.keys().len() >= self.state.minimum_degree { return if right_child_stored_node.n.keys().len() >= self.state.minimum_degree {
Self::delete_adjust_successor( Self::delete_adjust_successor(
store, store,
keys, keys,
@ -520,7 +539,7 @@ where
if child_idx > 0 { if child_idx > 0 {
let child_idx = child_idx - 1; let child_idx = child_idx - 1;
let left_child_stored_node = store.get_node(tx, children[child_idx]).await?; let left_child_stored_node = store.get_node(tx, children[child_idx]).await?;
return if left_child_stored_node.node.keys().len() >= self.state.minimum_degree { return if left_child_stored_node.n.keys().len() >= self.state.minimum_degree {
Self::delete_adjust_predecessor( Self::delete_adjust_predecessor(
store, store,
keys, keys,
@ -562,12 +581,12 @@ where
mut right_child_stored_node: BStoredNode<BK>, mut right_child_stored_node: BStoredNode<BK>,
) -> Result<(bool, bool, Key, NodeId), Error> { ) -> Result<(bool, bool, Key, NodeId), Error> {
if let Some((ascending_key, ascending_payload)) = if let Some((ascending_key, ascending_payload)) =
right_child_stored_node.node.keys().get_first_key() right_child_stored_node.n.keys().get_first_key()
{ {
right_child_stored_node.node.keys_mut().remove(&ascending_key); right_child_stored_node.n.keys_mut().remove(&ascending_key);
if let Some(descending_key) = keys.get_key(child_idx) { if let Some(descending_key) = keys.get_key(child_idx) {
if let Some(descending_payload) = keys.remove(&descending_key) { if let Some(descending_payload) = keys.remove(&descending_key) {
child_stored_node.node.keys_mut().insert(descending_key, descending_payload); child_stored_node.n.keys_mut().insert(descending_key, descending_payload);
keys.insert(ascending_key, ascending_payload); keys.insert(ascending_key, ascending_payload);
let child_id = child_stored_node.id; let child_id = child_stored_node.id;
store.set_node(child_stored_node, true)?; store.set_node(child_stored_node, true)?;
@ -590,12 +609,12 @@ where
mut left_child_stored_node: BStoredNode<BK>, mut left_child_stored_node: BStoredNode<BK>,
) -> Result<(bool, bool, Key, NodeId), Error> { ) -> Result<(bool, bool, Key, NodeId), Error> {
if let Some((ascending_key, ascending_payload)) = if let Some((ascending_key, ascending_payload)) =
left_child_stored_node.node.keys().get_last_key() left_child_stored_node.n.keys().get_last_key()
{ {
left_child_stored_node.node.keys_mut().remove(&ascending_key); left_child_stored_node.n.keys_mut().remove(&ascending_key);
if let Some(descending_key) = keys.get_key(child_idx) { if let Some(descending_key) = keys.get_key(child_idx) {
if let Some(descending_payload) = keys.remove(&descending_key) { if let Some(descending_payload) = keys.remove(&descending_key) {
child_stored_node.node.keys_mut().insert(descending_key, descending_payload); child_stored_node.n.keys_mut().insert(descending_key, descending_payload);
keys.insert(ascending_key, ascending_payload); keys.insert(ascending_key, ascending_payload);
let child_id = child_stored_node.id; let child_id = child_stored_node.id;
store.set_node(child_stored_node, true)?; store.set_node(child_stored_node, true)?;
@ -623,7 +642,7 @@ where
if let Some(descending_payload) = keys.remove(&descending_key) { if let Some(descending_payload) = keys.remove(&descending_key) {
children.remove(child_idx + 1); children.remove(child_idx + 1);
let left_id = left_child.id; let left_id = left_child.id;
left_child.node.append(descending_key, descending_payload, right_child.node)?; left_child.n.append(descending_key, descending_payload, right_child.n)?;
store.set_node(left_child, true)?; store.set_node(left_child, true)?;
store.remove_node(right_child.id, right_child.key)?; store.remove_node(right_child.id, right_child.key)?;
return Ok((true, is_main_key, key_to_delete, left_id)); return Ok((true, is_main_key, key_to_delete, left_id));
@ -645,13 +664,13 @@ where
} }
while let Some((node_id, depth)) = node_queue.pop_front() { while let Some((node_id, depth)) = node_queue.pop_front() {
let stored = store.get_node(tx, node_id).await?; let stored = store.get_node(tx, node_id).await?;
stats.keys_count += stored.node.keys().len() as u64; stats.keys_count += stored.n.keys().len() as u64;
if depth > stats.max_depth { if depth > stats.max_depth {
stats.max_depth = depth; stats.max_depth = depth;
} }
stats.nodes_count += 1; stats.nodes_count += 1;
stats.total_size += stored.size as u64; stats.total_size += stored.size as u64;
if let BTreeNode::Internal(_, children) = &stored.node { if let BTreeNode::Internal(_, children) = &stored.n {
let depth = depth + 1; let depth = depth + 1;
for child_id in children.iter() { for child_id in children.iter() {
node_queue.push_front((*child_id, depth)); node_queue.push_front((*child_id, depth));
@ -665,10 +684,6 @@ where
pub(in crate::idx) fn get_state(&self) -> &BState { pub(in crate::idx) fn get_state(&self) -> &BState {
&self.state &self.state
} }
pub(in crate::idx) fn is_updated(&self) -> bool {
self.updated
}
} }
#[cfg(test)] #[cfg(test)]
@ -1032,13 +1047,13 @@ mod tests {
0 => { 0 => {
assert_eq!(depth, 1); assert_eq!(depth, 1);
assert_eq!(node_id, 7); assert_eq!(node_id, 7);
check_is_internal_node(node.node, vec![("p", 16)], vec![1, 8]); check_is_internal_node(node.n, vec![("p", 16)], vec![1, 8]);
} }
1 => { 1 => {
assert_eq!(depth, 2); assert_eq!(depth, 2);
assert_eq!(node_id, 1); assert_eq!(node_id, 1);
check_is_internal_node( check_is_internal_node(
node.node, node.n,
vec![("c", 3), ("g", 7), ("m", 13)], vec![("c", 3), ("g", 7), ("m", 13)],
vec![0, 9, 2, 3], vec![0, 9, 2, 3],
); );
@ -1046,42 +1061,42 @@ mod tests {
2 => { 2 => {
assert_eq!(depth, 2); assert_eq!(depth, 2);
assert_eq!(node_id, 8); assert_eq!(node_id, 8);
check_is_internal_node(node.node, vec![("t", 20), ("x", 24)], vec![4, 6, 5]); check_is_internal_node(node.n, vec![("t", 20), ("x", 24)], vec![4, 6, 5]);
} }
3 => { 3 => {
assert_eq!(depth, 3); assert_eq!(depth, 3);
assert_eq!(node_id, 0); assert_eq!(node_id, 0);
check_is_leaf_node(node.node, vec![("a", 1), ("b", 2)]); check_is_leaf_node(node.n, vec![("a", 1), ("b", 2)]);
} }
4 => { 4 => {
assert_eq!(depth, 3); assert_eq!(depth, 3);
assert_eq!(node_id, 9); assert_eq!(node_id, 9);
check_is_leaf_node(node.node, vec![("d", 4), ("e", 5), ("f", 6)]); check_is_leaf_node(node.n, vec![("d", 4), ("e", 5), ("f", 6)]);
} }
5 => { 5 => {
assert_eq!(depth, 3); assert_eq!(depth, 3);
assert_eq!(node_id, 2); assert_eq!(node_id, 2);
check_is_leaf_node(node.node, vec![("j", 10), ("k", 11), ("l", 12)]); check_is_leaf_node(node.n, vec![("j", 10), ("k", 11), ("l", 12)]);
} }
6 => { 6 => {
assert_eq!(depth, 3); assert_eq!(depth, 3);
assert_eq!(node_id, 3); assert_eq!(node_id, 3);
check_is_leaf_node(node.node, vec![("n", 14), ("o", 15)]); check_is_leaf_node(node.n, vec![("n", 14), ("o", 15)]);
} }
7 => { 7 => {
assert_eq!(depth, 3); assert_eq!(depth, 3);
assert_eq!(node_id, 4); assert_eq!(node_id, 4);
check_is_leaf_node(node.node, vec![("q", 17), ("r", 18), ("s", 19)]); check_is_leaf_node(node.n, vec![("q", 17), ("r", 18), ("s", 19)]);
} }
8 => { 8 => {
assert_eq!(depth, 3); assert_eq!(depth, 3);
assert_eq!(node_id, 6); assert_eq!(node_id, 6);
check_is_leaf_node(node.node, vec![("u", 21), ("v", 22)]); check_is_leaf_node(node.n, vec![("u", 21), ("v", 22)]);
} }
9 => { 9 => {
assert_eq!(depth, 3); assert_eq!(depth, 3);
assert_eq!(node_id, 5); assert_eq!(node_id, 5);
check_is_leaf_node(node.node, vec![("y", 25), ("z", 26)]); check_is_leaf_node(node.n, vec![("y", 25), ("z", 26)]);
} }
_ => panic!("This node should not exist {}", count), _ => panic!("This node should not exist {}", count),
}) })
@ -1135,13 +1150,13 @@ mod tests {
let nodes_count = t let nodes_count = t
.inspect_nodes(&mut tx, |count, depth, node_id, node| { .inspect_nodes(&mut tx, |count, depth, node_id, node| {
debug!("{} -> {}", depth, node_id); debug!("{} -> {}", depth, node_id);
node.node.debug(|k| Ok(String::from_utf8(k)?)).unwrap(); node.n.debug(|k| Ok(String::from_utf8(k)?)).unwrap();
match count { match count {
0 => { 0 => {
assert_eq!(depth, 1); assert_eq!(depth, 1);
assert_eq!(node_id, 1); assert_eq!(node_id, 1);
check_is_internal_node( check_is_internal_node(
node.node, node.n,
vec![("e", 5), ("l", 12), ("p", 16), ("t", 20), ("x", 24)], vec![("e", 5), ("l", 12), ("p", 16), ("t", 20), ("x", 24)],
vec![0, 9, 3, 4, 6, 5], vec![0, 9, 3, 4, 6, 5],
); );
@ -1149,32 +1164,32 @@ mod tests {
1 => { 1 => {
assert_eq!(depth, 2); assert_eq!(depth, 2);
assert_eq!(node_id, 0); assert_eq!(node_id, 0);
check_is_leaf_node(node.node, vec![("a", 1), ("c", 3)]); check_is_leaf_node(node.n, vec![("a", 1), ("c", 3)]);
} }
2 => { 2 => {
assert_eq!(depth, 2); assert_eq!(depth, 2);
assert_eq!(node_id, 9); assert_eq!(node_id, 9);
check_is_leaf_node(node.node, vec![("j", 10), ("k", 11)]); check_is_leaf_node(node.n, vec![("j", 10), ("k", 11)]);
} }
3 => { 3 => {
assert_eq!(depth, 2); assert_eq!(depth, 2);
assert_eq!(node_id, 3); assert_eq!(node_id, 3);
check_is_leaf_node(node.node, vec![("n", 14), ("o", 15)]); check_is_leaf_node(node.n, vec![("n", 14), ("o", 15)]);
} }
4 => { 4 => {
assert_eq!(depth, 2); assert_eq!(depth, 2);
assert_eq!(node_id, 4); assert_eq!(node_id, 4);
check_is_leaf_node(node.node, vec![("q", 17), ("r", 18), ("s", 19)]); check_is_leaf_node(node.n, vec![("q", 17), ("r", 18), ("s", 19)]);
} }
5 => { 5 => {
assert_eq!(depth, 2); assert_eq!(depth, 2);
assert_eq!(node_id, 6); assert_eq!(node_id, 6);
check_is_leaf_node(node.node, vec![("u", 21), ("v", 22)]); check_is_leaf_node(node.n, vec![("u", 21), ("v", 22)]);
} }
6 => { 6 => {
assert_eq!(depth, 2); assert_eq!(depth, 2);
assert_eq!(node_id, 5); assert_eq!(node_id, 5);
check_is_leaf_node(node.node, vec![("y", 25), ("z", 26)]); check_is_leaf_node(node.n, vec![("y", 25), ("z", 26)]);
} }
_ => panic!("This node should not exist {}", count), _ => panic!("This node should not exist {}", count),
} }
@ -1316,7 +1331,7 @@ mod tests {
debug!("----------------------------------"); debug!("----------------------------------");
t.inspect_nodes(tx, |_count, depth, node_id, node| { t.inspect_nodes(tx, |_count, depth, node_id, node| {
debug!("{} -> {}", depth, node_id); debug!("{} -> {}", depth, node_id);
node.node.debug(|k| Ok(String::from_utf8(k)?)).unwrap(); node.n.debug(|k| Ok(String::from_utf8(k)?)).unwrap();
}) })
.await .await
.unwrap(); .unwrap();
@ -1359,7 +1374,7 @@ mod tests {
let mut s = TreeNodeStore::Traversal(TreeNodeProvider::Debug); let mut s = TreeNodeStore::Traversal(TreeNodeProvider::Debug);
while let Some((node_id, depth)) = node_queue.pop_front() { while let Some((node_id, depth)) = node_queue.pop_front() {
let stored_node = s.get_node(tx, node_id).await?; let stored_node = s.get_node(tx, node_id).await?;
if let BTreeNode::Internal(_, children) = &stored_node.node { if let BTreeNode::Internal(_, children) = &stored_node.n {
let depth = depth + 1; let depth = depth + 1;
for child_id in children { for child_id in children {
node_queue.push_back((*child_id, depth)); node_queue.push_back((*child_id, depth));

View file

@ -1,3 +1,4 @@
pub mod bkeys; pub mod bkeys;
pub mod btree; pub mod btree;
pub mod mtree;
pub mod store; pub mod store;

1792
lib/src/idx/trees/mtree.rs Normal file

File diff suppressed because it is too large Load diff

View file

@ -9,7 +9,7 @@ use tokio::sync::Mutex;
pub type NodeId = u64; pub type NodeId = u64;
#[derive(Clone, Copy)] #[derive(Clone, Copy, PartialEq)]
pub enum TreeStoreType { pub enum TreeStoreType {
Write, Write,
Read, Read,
@ -151,7 +151,7 @@ where
#[cfg(debug_assertions)] #[cfg(debug_assertions)]
self.out.insert(id); self.out.insert(id);
StoredNode { StoredNode {
node, n: node,
id, id,
key: self.np.get_key(id), key: self.np.get_key(id),
size: 0, size: 0,
@ -238,6 +238,7 @@ pub enum TreeNodeProvider {
DocLengths(IndexKeyBase), DocLengths(IndexKeyBase),
Postings(IndexKeyBase), Postings(IndexKeyBase),
Terms(IndexKeyBase), Terms(IndexKeyBase),
Vector(IndexKeyBase),
Debug, Debug,
} }
@ -248,6 +249,7 @@ impl TreeNodeProvider {
TreeNodeProvider::DocLengths(ikb) => ikb.new_bl_key(Some(node_id)), TreeNodeProvider::DocLengths(ikb) => ikb.new_bl_key(Some(node_id)),
TreeNodeProvider::Postings(ikb) => ikb.new_bp_key(Some(node_id)), TreeNodeProvider::Postings(ikb) => ikb.new_bp_key(Some(node_id)),
TreeNodeProvider::Terms(ikb) => ikb.new_bt_key(Some(node_id)), TreeNodeProvider::Terms(ikb) => ikb.new_bt_key(Some(node_id)),
TreeNodeProvider::Vector(ikb) => ikb.new_vm_key(Some(node_id)),
TreeNodeProvider::Debug => node_id.to_be_bytes().to_vec(), TreeNodeProvider::Debug => node_id.to_be_bytes().to_vec(),
} }
} }
@ -261,7 +263,7 @@ impl TreeNodeProvider {
let size = val.len() as u32; let size = val.len() as u32;
let node = N::try_from_val(val)?; let node = N::try_from_val(val)?;
Ok(StoredNode { Ok(StoredNode {
node, n: node,
id, id,
key, key,
size, size,
@ -275,19 +277,30 @@ impl TreeNodeProvider {
where where
N: TreeNode, N: TreeNode,
{ {
let val = node.node.try_into_val()?; let val = node.n.try_into_val()?;
tx.set(node.key, val).await?; tx.set(node.key, val).await?;
Ok(()) Ok(())
} }
} }
pub(super) struct StoredNode<N> { pub(super) struct StoredNode<N> {
pub(super) node: N, pub(super) n: N,
pub(super) id: NodeId, pub(super) id: NodeId,
pub(super) key: Key, pub(super) key: Key,
pub(super) size: u32, pub(super) size: u32,
} }
impl<N> StoredNode<N> {
    /// Builds a `StoredNode` from its four parts.
    ///
    /// * `n` - the node payload.
    /// * `id` - the identifier of the node.
    /// * `key` - the key the node is stored under.
    /// * `size` - the size recorded for the node (other code in this file sets it to the
    ///   length of the serialized value, or to `0` for a freshly created node).
    pub(super) fn new(n: N, id: NodeId, key: Key, size: u32) -> Self {
        Self { n, id, key, size }
    }
}
pub trait TreeNode pub trait TreeNode
where where
Self: Sized, Self: Sized,

View file

@ -1,5 +1,5 @@
//! Stores Term/Doc frequency //! Stores Term/Doc frequency
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId; use crate::idx::ft::terms::TermId;
use derive::Key; use derive::Key;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};

View file

@ -1,5 +1,5 @@
//! Stores the term list for doc_ids //! Stores the term list for doc_ids
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use derive::Key; use derive::Key;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};

View file

@ -1,5 +1,5 @@
//! Stores the offsets //! Stores the offsets
use crate::idx::ft::docids::DocId; use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId; use crate::idx::ft::terms::TermId;
use derive::Key; use derive::Key;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};

View file

@ -11,6 +11,7 @@ pub mod bp;
pub mod bs; pub mod bs;
pub mod bt; pub mod bt;
pub mod bu; pub mod bu;
pub mod vm;
use crate::sql::array::Array; use crate::sql::array::Array;
use crate::sql::id::Id; use crate::sql::id::Id;

68
lib/src/key/index/vm.rs Normal file
View file

@ -0,0 +1,68 @@
//! Stores MTree state and nodes
use crate::idx::trees::store::NodeId;
use derive::Key;
use serde::{Deserialize, Serialize};
/// Key under which the MTree state and nodes of an index are stored.
///
/// The key is made of the namespace, database, table and index names, each introduced by a
/// one-byte separator, followed by the three bytes `!vm` and an optional node id.
/// Field order is significant: the unit test in this file pins the exact encoded byte layout.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Key)]
pub struct Vm<'a> {
// Leading byte of the key; `new` sets it to `/`.
__: u8,
// Separator before the namespace; `new` sets it to `*`.
_a: u8,
pub ns: &'a str,
// Separator before the database; `new` sets it to `*`.
_b: u8,
pub db: &'a str,
// Separator before the table; `new` sets it to `*`.
_c: u8,
pub tb: &'a str,
// Separator before the index name; `new` sets it to `+`.
_d: u8,
pub ix: &'a str,
// The three bytes `!`, `v`, `m` identify this key type.
_e: u8,
_f: u8,
_g: u8,
// NOTE(review): presumably `None` addresses the tree state and `Some(id)` a single node;
// confirm against the callers of `new_vm_key`.
pub node_id: Option<NodeId>,
}
impl<'a> Vm<'a> {
/// Creates the key for the given namespace, database, table and index, with an optional
/// node id. All separator and marker bytes are filled in with their fixed values.
pub fn new(
ns: &'a str,
db: &'a str,
tb: &'a str,
ix: &'a str,
node_id: Option<NodeId>,
) -> Self {
Self {
__: b'/',
_a: b'*',
ns,
_b: b'*',
db,
_c: b'*',
tb,
_d: b'+',
ix,
_e: b'!',
_f: b'v',
_g: b'm',
node_id,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes a key with a node id, checks the exact byte layout, then checks that decoding
    /// the bytes gives back an equal key.
    #[test]
    fn key() {
        #[rustfmt::skip]
        let original = Vm::new(
            "testns",
            "testdb",
            "testtb",
            "testix",
            Some(8)
        );
        let encoded = Vm::encode(&original).unwrap();
        assert_eq!(encoded, b"/*testns\0*testdb\0*testtb\0+testix\0!vm\x01\0\0\0\0\0\0\0\x08");
        let decoded = Vm::decode(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}

View file

@ -316,6 +316,9 @@ impl Datastore {
} }
/// Setup the initial credentials /// Setup the initial credentials
/// The `#[allow]` below silences the `unreachable definition` compilation error that this
/// function triggers, probably due to this issue: https://github.com/rust-lang/rust/issues/111370
#[allow(unreachable_code, unused_variables)]
pub async fn setup_initial_creds(&self, creds: Root<'_>) -> Result<(), Error> { pub async fn setup_initial_creds(&self, creds: Root<'_>) -> Result<(), Error> {
// Start a new writeable transaction // Start a new writeable transaction
let txn = self.transaction(true, false).await?.rollback_with_panic().enclose(); let txn = self.transaction(true, false).await?.rollback_with_panic().enclose();

View file

@ -191,6 +191,7 @@ impl Expression {
Operator::Outside => fnc::operate::outside(&l, &r), Operator::Outside => fnc::operate::outside(&l, &r),
Operator::Intersects => fnc::operate::intersects(&l, &r), Operator::Intersects => fnc::operate::intersects(&l, &r),
Operator::Matches(_) => fnc::operate::matches(ctx, txn, doc, self).await, Operator::Matches(_) => fnc::operate::matches(ctx, txn, doc, self).await,
Operator::Knn(_) => fnc::operate::knn(ctx, txn, doc, self).await,
_ => unreachable!(), _ => unreachable!(),
} }
} }

View file

@ -49,7 +49,7 @@ pub struct MTreeParams {
pub doc_ids_order: u32, pub doc_ids_order: u32,
} }
#[derive(Default, Clone, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)] #[derive(Clone, Default, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)]
#[revisioned(revision = 1)] #[revisioned(revision = 1)]
pub enum Distance { pub enum Distance {
#[default] #[default]
@ -182,6 +182,7 @@ pub fn search(i: &str) -> IResult<&str, Index> {
pub fn distance(i: &str) -> IResult<&str, Distance> { pub fn distance(i: &str) -> IResult<&str, Distance> {
let (i, _) = mightbespace(i)?; let (i, _) = mightbespace(i)?;
let (i, _) = tag_no_case("DIST")(i)?; let (i, _) = tag_no_case("DIST")(i)?;
let (i, _) = shouldbespace(i)?;
alt(( alt((
map(tag_no_case("EUCLIDEAN"), |_| Distance::Euclidean), map(tag_no_case("EUCLIDEAN"), |_| Distance::Euclidean),
map(tag_no_case("MANHATTAN"), |_| Distance::Manhattan), map(tag_no_case("MANHATTAN"), |_| Distance::Manhattan),
@ -200,7 +201,7 @@ pub fn minkowski(i: &str) -> IResult<&str, Distance> {
} }
pub fn dimension(i: &str) -> IResult<&str, u16> { pub fn dimension(i: &str) -> IResult<&str, u16> {
let (i, _) = shouldbespace(i)?; let (i, _) = mightbespace(i)?;
let (i, _) = tag_no_case("DIMENSION")(i)?; let (i, _) = tag_no_case("DIMENSION")(i)?;
let (i, _) = shouldbespace(i)?; let (i, _) = shouldbespace(i)?;
let (i, dim) = uint16(i)?; let (i, dim) = uint16(i)?;

View file

@ -6,6 +6,7 @@ use nom::branch::alt;
use nom::bytes::complete::tag; use nom::bytes::complete::tag;
use nom::bytes::complete::tag_no_case; use nom::bytes::complete::tag_no_case;
use nom::character::complete::char; use nom::character::complete::char;
use nom::character::complete::u32 as uint32;
use nom::character::complete::u8 as uint8; use nom::character::complete::u8 as uint8;
use nom::combinator::cut; use nom::combinator::cut;
use nom::combinator::opt; use nom::combinator::opt;
@ -67,6 +68,8 @@ pub enum Operator {
// //
Outside, Outside,
Intersects, Intersects,
//
Knn(u32), // <{k}>
} }
impl Default for Operator { impl Default for Operator {
@ -141,6 +144,7 @@ impl fmt::Display for Operator {
f.write_str("@@") f.write_str("@@")
} }
} }
Self::Knn(k) => write!(f, "<{}>", k),
} }
} }
} }
@ -191,12 +195,14 @@ pub fn binary_symbols(i: &str) -> IResult<&str, Operator> {
value(Operator::AnyLike, tag("?~")), value(Operator::AnyLike, tag("?~")),
value(Operator::Like, char('~')), value(Operator::Like, char('~')),
matches, matches,
knn,
)), )),
alt(( alt((
value(Operator::LessThanOrEqual, tag("<=")), value(Operator::LessThanOrEqual, tag("<=")),
value(Operator::LessThan, char('<')), value(Operator::LessThan, char('<')),
value(Operator::MoreThanOrEqual, tag(">=")), value(Operator::MoreThanOrEqual, tag(">=")),
value(Operator::MoreThan, char('>')), value(Operator::MoreThan, char('>')),
knn,
)), )),
alt(( alt((
value(Operator::Pow, tag("**")), value(Operator::Pow, tag("**")),
@ -257,7 +263,6 @@ pub fn binary_phrases(i: &str) -> IResult<&str, Operator> {
pub fn matches(i: &str) -> IResult<&str, Operator> { pub fn matches(i: &str) -> IResult<&str, Operator> {
let (i, _) = char('@')(i)?; let (i, _) = char('@')(i)?;
// let (i, reference) = opt(|i| uint8(i))(i)?;
cut(|i| { cut(|i| {
let (i, reference) = opt(uint8)(i)?; let (i, reference) = opt(uint8)(i)?;
let (i, _) = char('@')(i)?; let (i, _) = char('@')(i)?;
@ -265,6 +270,13 @@ pub fn matches(i: &str) -> IResult<&str, Operator> {
})(i) })(i)
} }
pub fn knn(i: &str) -> IResult<&str, Operator> {
let (i, _) = char('<')(i)?;
let (i, k) = uint32(i)?;
let (i, _) = char('>')(i)?;
Ok((i, Operator::Knn(k)))
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -290,4 +302,13 @@ mod tests {
let res = matches("@256@"); let res = matches("@256@");
res.unwrap_err(); res.unwrap_err();
} }
/// `<5>` must parse to `Operator::Knn(5)` and display back as `<5>`.
#[test]
fn test_knn() {
    let (_, parsed) = knn("<5>").unwrap();
    assert_eq!("<5>", format!("{}", parsed));
    assert_eq!(parsed, Operator::Knn(5));
}
} }

View file

@ -5,6 +5,7 @@ use crate::doc::CursorDoc;
use crate::err::Error; use crate::err::Error;
use crate::iam::{Action, ResourceKind}; use crate::iam::{Action, ResourceKind};
use crate::idx::ft::FtIndex; use crate::idx::ft::FtIndex;
use crate::idx::trees::mtree::MTreeIndex;
use crate::idx::trees::store::TreeStoreType; use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase; use crate::idx::IndexKeyBase;
use crate::sql::comment::shouldbespace; use crate::sql::comment::shouldbespace;
@ -56,6 +57,11 @@ impl AnalyzeStatement {
FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Traversal).await?; FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Traversal).await?;
ft.statistics(&mut run).await?.into() ft.statistics(&mut run).await?.into()
} }
Index::MTree(p) => {
let mt =
MTreeIndex::new(&mut run, ikb, p, TreeStoreType::Traversal).await?;
mt.statistics(&mut run).await?.into()
}
_ => { _ => {
return Err(Error::FeatureNotYetImplemented { return Err(Error::FeatureNotYetImplemented {
feature: "Statistics on unique and non-unique indexes.".to_string(), feature: "Statistics on unique and non-unique indexes.".to_string(),

View file

@ -178,7 +178,7 @@ fn index_comment(i: &str) -> IResult<&str, DefineIndexOption> {
mod tests { mod tests {
use super::*; use super::*;
use crate::sql::index::SearchParams; use crate::sql::index::{Distance, MTreeParams, SearchParams};
use crate::sql::Ident; use crate::sql::Ident;
use crate::sql::Idiom; use crate::sql::Idiom;
use crate::sql::Idioms; use crate::sql::Idioms;
@ -275,4 +275,29 @@ mod tests {
"DEFINE INDEX my_index ON my_table FIELDS my_col SEARCH ANALYZER my_analyzer VS DOC_IDS_ORDER 100 DOC_LENGTHS_ORDER 100 POSTINGS_ORDER 100 TERMS_ORDER 100" "DEFINE INDEX my_index ON my_table FIELDS my_col SEARCH ANALYZER my_analyzer VS DOC_IDS_ORDER 100 DOC_LENGTHS_ORDER 100 POSTINGS_ORDER 100 TERMS_ORDER 100"
); );
} }
/// An MTREE index definition that only gives the dimension must parse with the parameter
/// values asserted below, and display all of them explicitly.
#[test]
fn check_create_mtree_index() {
    let (_, idx) =
        index("INDEX my_index ON TABLE my_table COLUMNS my_col MTREE DIMENSION 4").unwrap();

    let params = MTreeParams {
        dimension: 4,
        distance: Distance::Euclidean,
        capacity: 40,
        doc_ids_order: 100,
    };
    let expected = DefineIndexStatement {
        name: Ident("my_index".to_string()),
        what: Ident("my_table".to_string()),
        cols: Idioms(vec![Idiom(vec![Part::Field(Ident("my_col".to_string()))])]),
        index: Index::MTree(params),
        comment: None,
    };
    assert_eq!(idx, expected);

    assert_eq!(
        idx.to_string(),
        "DEFINE INDEX my_index ON my_table FIELDS my_col MTREE DIMENSION 4 DIST EUCLIDEAN CAPACITY 40 DOC_IDS_ORDER 100"
    );
}
} }

View file

@ -3,6 +3,7 @@ use crate::sql::index::Distance;
use crate::sql::value::serde::ser; use crate::sql::value::serde::ser;
use serde::ser::Error as _; use serde::ser::Error as _;
use serde::ser::Impossible; use serde::ser::Impossible;
use serde::Serialize;
pub(super) struct Serializer; pub(super) struct Serializer;
@ -29,9 +30,34 @@ impl ser::Serializer for Serializer {
) -> Result<Self::Ok, Error> { ) -> Result<Self::Ok, Error> {
match variant { match variant {
"Euclidean" => Ok(Distance::Euclidean), "Euclidean" => Ok(Distance::Euclidean),
"Manhattan" => Ok(Distance::Manhattan),
"Cosine" => Ok(Distance::Cosine),
"Hamming" => Ok(Distance::Hamming),
"Mahalanobis" => Ok(Distance::Mahalanobis),
variant => Err(Error::custom(format!("unexpected unit variant `{name}::{variant}`"))), variant => Err(Error::custom(format!("unexpected unit variant `{name}::{variant}`"))),
} }
} }
#[inline]
fn serialize_newtype_variant<T>(
self,
name: &'static str,
_variant_index: u32,
variant: &'static str,
value: &T,
) -> Result<Self::Ok, Error>
where
T: ?Sized + Serialize,
{
match variant {
"Minkowski" => {
Ok(Distance::Minkowski(value.serialize(ser::number::Serializer.wrap())?))
}
variant => {
Err(Error::custom(format!("unexpected newtype variant `{name}::{variant}`")))
}
}
}
} }
#[cfg(test)] #[cfg(test)]
@ -41,9 +67,44 @@ mod tests {
use serde::Serialize; use serde::Serialize;
#[test] #[test]
fn euclidean() { fn distance_euclidean() {
let dist = Distance::Euclidean; let dist = Distance::Euclidean;
let serialized = dist.serialize(Serializer.wrap()).unwrap(); let serialized = dist.serialize(Serializer.wrap()).unwrap();
assert_eq!(dist, serialized); assert_eq!(dist, serialized);
} }
/// `Distance::Manhattan` must serialize to an equal value.
#[test]
fn distance_manhattan() {
    let expected = Distance::Manhattan;
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
/// `Distance::Mahalanobis` must serialize to an equal value.
#[test]
fn distance_mahalanobis() {
    let expected = Distance::Mahalanobis;
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
/// `Distance::Hamming` must serialize to an equal value.
#[test]
fn distance_hamming() {
    let expected = Distance::Hamming;
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
/// `Distance::Cosine` must serialize to an equal value.
#[test]
fn distance_cosine() {
    let expected = Distance::Cosine;
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
/// `Distance::Minkowski` carries a number payload and must serialize to an equal value.
#[test]
fn distance_minkowski() {
    let expected = Distance::Minkowski(7.into());
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
} }

View file

@ -366,7 +366,7 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Object(a) = a else { let Value::Object(a) = a else {
unreachable!() unreachable!()
}; };
let Value::Number(versionstamp1) = a.get("versionstamp").unwrap() else { let Value::Number(versionstamp2) = a.get("versionstamp").unwrap() else {
unreachable!() unreachable!()
}; };
let changes = a.get("changes").unwrap().to_owned(); let changes = a.get("changes").unwrap().to_owned();
@ -389,10 +389,10 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Object(a) = a else { let Value::Object(a) = a else {
unreachable!() unreachable!()
}; };
let Value::Number(versionstamp2) = a.get("versionstamp").unwrap() else { let Value::Number(versionstamp3) = a.get("versionstamp").unwrap() else {
unreachable!() unreachable!()
}; };
assert!(versionstamp1 < versionstamp2); assert!(versionstamp2 < versionstamp3);
let changes = a.get("changes").unwrap().to_owned(); let changes = a.get("changes").unwrap().to_owned();
assert_eq!( assert_eq!(
changes, changes,
@ -413,10 +413,10 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Object(a) = a else { let Value::Object(a) = a else {
unreachable!() unreachable!()
}; };
let Value::Number(versionstamp3) = a.get("versionstamp").unwrap() else { let Value::Number(versionstamp4) = a.get("versionstamp").unwrap() else {
unreachable!() unreachable!()
}; };
assert!(versionstamp2 < versionstamp3); assert!(versionstamp3 < versionstamp4);
let changes = a.get("changes").unwrap().to_owned(); let changes = a.get("changes").unwrap().to_owned();
assert_eq!( assert_eq!(
changes, changes,
@ -437,10 +437,10 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Object(a) = a else { let Value::Object(a) = a else {
unreachable!() unreachable!()
}; };
let Value::Number(versionstamp4) = a.get("versionstamp").unwrap() else { let Value::Number(versionstamp5) = a.get("versionstamp").unwrap() else {
unreachable!() unreachable!()
}; };
assert!(versionstamp3 < versionstamp4); assert!(versionstamp4 < versionstamp5);
let changes = a.get("changes").unwrap().to_owned(); let changes = a.get("changes").unwrap().to_owned();
assert_eq!( assert_eq!(
changes, changes,
@ -487,7 +487,7 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Number(versionstamp1b) = a.get("versionstamp").unwrap() else { let Value::Number(versionstamp1b) = a.get("versionstamp").unwrap() else {
unreachable!() unreachable!()
}; };
assert!(versionstamp1 == versionstamp1b); assert!(versionstamp2 == versionstamp1b);
let changes = a.get("changes").unwrap().to_owned(); let changes = a.get("changes").unwrap().to_owned();
assert_eq!( assert_eq!(
changes, changes,

View file

@ -1211,7 +1211,9 @@ async fn define_statement_search_index() -> Result<(), Error> {
events: {}, events: {},
fields: {}, fields: {},
tables: {}, tables: {},
indexes: { blog_title: 'DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) DOC_IDS_ORDER 100 DOC_LENGTHS_ORDER 100 POSTINGS_ORDER 100 TERMS_ORDER 100 HIGHLIGHTS' }, indexes: { blog_title: 'DEFINE INDEX blog_title ON blog FIELDS title \
SEARCH ANALYZER simple BM25(1.2,0.75) \
DOC_IDS_ORDER 100 DOC_LENGTHS_ORDER 100 POSTINGS_ORDER 100 TERMS_ORDER 100 HIGHLIGHTS' },
lives: {}, lives: {},
}", }",
); );

60
lib/tests/vector.rs Normal file
View file

@ -0,0 +1,60 @@
mod helpers;
mod parse;
use crate::helpers::new_ds;
use parse::Parse;
use surrealdb::dbs::Session;
use surrealdb::err::Error;
use surrealdb::sql::Value;
#[tokio::test]
async fn select_where_mtree_knn() -> Result<(), Error> {
let sql = r"
CREATE pts:1 SET point = [1,2,3,4];
CREATE pts:2 SET point = [4,5,6,7];
CREATE pts:3 SET point = [8,9,10,11];
DEFINE INDEX mt_pts ON pts FIELDS point MTREE DIMENSION 4;
LET $pt = [2,3,4,5];
SELECT id, vector::distance::euclidean(point, $pt) AS dist FROM pts WHERE point <2> $pt;
SELECT id FROM pts WHERE point <2> $pt EXPLAIN;
";
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 7);
//
for _ in 0..5 {
let _ = res.remove(0).result?;
}
let tmp = res.remove(0).result?;
let val = Value::parse(
"[
{
id: pts:1,
dist: 2f
},
{
id: pts:2,
dist: 4f
}
]",
);
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
let tmp = res.remove(0).result?;
let val = Value::parse(
"[
{
detail: {
plan: {
index: 'mt_pts',
operator: '<2>',
value: [2,3,4,5]
},
table: 'pts',
},
operation: 'Iterate Index'
}
]",
);
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
Ok(())
}