Feature: Vector Search: mtree index + knn operator (#2546)

Co-authored-by: Tobie Morgan Hitchcock <tobie@surrealdb.com>
This commit is contained in:
Emmanuel Keller 2023-09-12 21:26:03 +01:00 committed by GitHub
parent 1a85f4967a
commit 0772a8c592
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
41 changed files with 2541 additions and 235 deletions

View file

@ -8,7 +8,7 @@ use crate::dbs::Statement;
use crate::dbs::{Options, Transaction};
use crate::doc::Document;
use crate::err::Error;
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::planner::executor::IteratorRef;
use crate::sql::array::Array;
use crate::sql::edges::Edges;

View file

@ -594,7 +594,7 @@ impl<'a> Processor<'a> {
}
}
Err(Error::QueryNotExecutedDetail {
message: "No QueryExecutor has not been found.".to_string(),
message: "No QueryExecutor has been found.".to_string(),
})
}
}

View file

@ -4,7 +4,7 @@ use crate::dbs::Workable;
use crate::err::Error;
use crate::iam::Action;
use crate::iam::ResourceKind;
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::planner::executor::IteratorRef;
use crate::sql::statements::define::DefineEventStatement;
use crate::sql::statements::define::DefineFieldStatement;

View file

@ -4,10 +4,11 @@ use crate::dbs::{Options, Transaction};
use crate::doc::{CursorDoc, Document};
use crate::err::Error;
use crate::idx::ft::FtIndex;
use crate::idx::trees::mtree::MTreeIndex;
use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase;
use crate::sql::array::Array;
use crate::sql::index::{Index, SearchParams};
use crate::sql::index::{Index, MTreeParams, SearchParams};
use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Part, Thing, Value};
use crate::{key, kvs};
@ -55,11 +56,7 @@ impl<'a> Document<'a> {
Index::Uniq => ic.index_unique(&mut run).await?,
Index::Idx => ic.index_non_unique(&mut run).await?,
Index::Search(p) => ic.index_full_text(&mut run, p).await?,
Index::MTree(_) => {
return Err(Error::FeatureNotYetImplemented {
feature: "MTree indexing".to_string(),
})
}
Index::MTree(p) => ic.index_mtree(&mut run, p).await?,
};
}
}
@ -332,18 +329,36 @@ impl<'a> IndexOperation<'a> {
}
async fn index_full_text(
&self,
&mut self,
run: &mut kvs::Transaction,
p: &SearchParams,
) -> Result<(), Error> {
let ikb = IndexKeyBase::new(self.opt, self.ix);
let az = run.get_db_analyzer(self.opt.ns(), self.opt.db(), p.az.as_str()).await?;
let mut ft = FtIndex::new(run, az, ikb, p, TreeStoreType::Write).await?;
if let Some(n) = &self.n {
if let Some(n) = self.n.take() {
ft.index_document(run, self.rid, n).await?;
} else {
ft.remove_document(run, self.rid).await?;
}
ft.finish(run).await
}
/// Applies the pending document change to an M-Tree index.
///
/// The old value (if any) is removed from the index first, then the new
/// value (if any) is indexed, and finally `finish` is called on the index.
/// Both `self.o` and `self.n` are taken, so they are left empty afterwards.
async fn index_mtree(
    &mut self,
    run: &mut kvs::Transaction,
    p: &MTreeParams,
) -> Result<(), Error> {
    let key_base = IndexKeyBase::new(self.opt, self.ix);
    let mut index = MTreeIndex::new(run, key_base, p, TreeStoreType::Write).await?;
    // Remove what was previously indexed for this record
    match self.o.take() {
        Some(old) => index.remove_document(run, self.rid, old).await?,
        None => {}
    }
    // Index the current content of the record
    match self.n.take() {
        Some(new) => index.index_document(run, self.rid, new).await?,
        None => {}
    }
    index.finish(run).await
}
}

View file

@ -208,6 +208,26 @@ pub enum Error {
#[error("The URL `{0}` is invalid")]
InvalidUrl(String),
/// The size of the vector is incorrect
#[error("Incorrect vector dimension ({current}). Expected a vector of {expected} dimension.")]
InvalidVectorDimension {
current: usize,
expected: usize,
},
/// An element of the vector is not a number
#[error("The vector element ({current}) is not a number.")]
InvalidVectorType {
current: String,
expected: &'static str,
},
/// The value is not a vector
#[error("The value '{current}' is not a vector.")]
InvalidVectorValue {
current: String,
},
/// The query timed out
#[error("The query was not executed because it exceeded the timeout")]
QueryTimedout,

View file

@ -2,6 +2,7 @@ use crate::ctx::Context;
use crate::dbs::Transaction;
use crate::doc::CursorDoc;
use crate::err::Error;
use crate::idx::planner::executor::QueryExecutor;
use crate::sql::value::TryAdd;
use crate::sql::value::TryDiv;
use crate::sql::value::TryMul;
@ -9,7 +10,7 @@ use crate::sql::value::TryNeg;
use crate::sql::value::TryPow;
use crate::sql::value::TrySub;
use crate::sql::value::Value;
use crate::sql::Expression;
use crate::sql::{Expression, Thing};
pub fn neg(a: Value) -> Result<Value, Error> {
a.try_neg()
@ -167,32 +168,59 @@ pub fn intersects(a: &Value, b: &Value) -> Result<Value, Error> {
Ok(a.intersects(b).into())
}
/// Describes how an index-backed operator should be evaluated for a document,
/// as decided by `get_index_option`.
enum IndexOption<'a> {
// The document comes from an iterator whose expression is the one being
// evaluated; callers answer `true` without evaluating anything.
PreMatch,
// No document, record id, query planner or query executor is available;
// callers answer `false`.
None,
// The expression has to be evaluated by the given executor for the given record.
Execute(&'a QueryExecutor, &'a Thing),
}
/// Determines how an index-backed operator should be evaluated for `doc`.
///
/// Returns `IndexOption::None` when there is no document, no record id on the
/// document, no query planner in the context, or no query executor for the
/// record's table. Returns `IndexOption::PreMatch` when the document carries an
/// iterator reference and the executor reports that `exp` is that iterator's
/// expression. Otherwise returns `IndexOption::Execute` with the executor and
/// the record id.
fn get_index_option<'a>(
    ctx: &'a Context<'_>,
    doc: Option<&'a CursorDoc<'_>>,
    exp: &'a Expression,
) -> IndexOption<'a> {
    let doc = match doc {
        Some(doc) => doc,
        None => return IndexOption::None,
    };
    let thg = match doc.rid {
        Some(thg) => thg,
        None => return IndexOption::None,
    };
    let pla = match ctx.get_query_planner() {
        Some(pla) => pla,
        None => return IndexOption::None,
    };
    let exe = match pla.get_query_executor(&thg.tb) {
        Some(exe) => exe,
        None => return IndexOption::None,
    };
    // The document was produced by an iterator built for this very expression
    match doc.ir {
        Some(ir) if exe.is_iterator_expression(ir, exp) => IndexOption::PreMatch,
        _ => IndexOption::Execute(exe, thg),
    }
}
pub(crate) async fn matches(
ctx: &Context<'_>,
txn: &Transaction,
doc: Option<&CursorDoc<'_>>,
exp: &Expression,
) -> Result<Value, Error> {
if let Some(doc) = doc {
if let Some(thg) = doc.rid {
if let Some(pla) = ctx.get_query_planner() {
if let Some(exe) = pla.get_query_executor(&thg.tb) {
// If we find the expression in `pre_match`,
// it means that we are using an Iterator::Index
// and we are iterating over documents that already matches the expression.
if let Some(ir) = doc.ir {
if exe.is_iterator_expression(ir, exp) {
return Ok(Value::Bool(true));
match get_index_option(ctx, doc, exp) {
IndexOption::PreMatch => Ok(Value::Bool(true)),
IndexOption::None => Ok(Value::Bool(false)),
IndexOption::Execute(exe, thg) => exe.matches(txn, thg, exp).await,
}
}
// Evaluate the matches
return exe.matches(txn, thg, exp).await;
/// Evaluates the KNN operator for the current document.
///
/// Yields `true` when the document was pre-matched by an index iterator,
/// `false` when no index option is available, and otherwise delegates to the
/// query executor's `knn` for the document's record id.
pub(crate) async fn knn(
    ctx: &Context<'_>,
    txn: &Transaction,
    doc: Option<&CursorDoc<'_>>,
    exp: &Expression,
) -> Result<Value, Error> {
    let (exe, thg) = match get_index_option(ctx, doc, exp) {
        IndexOption::Execute(exe, thg) => (exe, thg),
        IndexOption::PreMatch => return Ok(Value::Bool(true)),
        IndexOption::None => return Ok(Value::Bool(false)),
    };
    exe.knn(txn, thg, exp).await
}
}
}
Ok(Value::Bool(false))
}
#[cfg(test)]
mod tests {

View file

@ -30,6 +30,7 @@ impl_module_def!(
"insert" => run,
"intersect" => run,
"join" => run,
"knn" => run,
"last" => run,
"len" => run,
"logical_and" => run,

View file

@ -132,11 +132,11 @@ impl ManhattanDistance for Vec<Number> {
}
pub trait MinkowskiDistance {
fn minkowski_distance(&self, other: &Self, order: Number) -> Result<Number, Error>;
fn minkowski_distance(&self, other: &Self, order: &Number) -> Result<Number, Error>;
}
impl MinkowskiDistance for Vec<Number> {
fn minkowski_distance(&self, other: &Self, order: Number) -> Result<Number, Error> {
fn minkowski_distance(&self, other: &Self, order: &Number) -> Result<Number, Error> {
check_same_dimension("vector::distance::minkowski", self, other)?;
let p = order.to_float();
let dist: f64 = self

View file

@ -75,7 +75,7 @@ pub mod distance {
}
pub fn minkowski((a, b, o): (Vec<Number>, Vec<Number>, Number)) -> Result<Value, Error> {
Ok(a.minkowski_distance(&b, o)?.into())
Ok(a.minkowski_distance(&b, &o)?.into())
}
}

View file

@ -25,7 +25,7 @@ pub(crate) struct DocIds {
}
impl DocIds {
pub(super) async fn new(
pub(in crate::idx) async fn new(
tx: &mut Transaction,
index_key_base: IndexKeyBase,
default_btree_order: u32,
@ -78,7 +78,7 @@ impl DocIds {
/// Returns the doc_id for the given doc_key.
/// If the doc_id does not exists, a new one is created, and associated to the given key.
pub(super) async fn resolve_doc_id(
pub(in crate::idx) async fn resolve_doc_id(
&mut self,
tx: &mut Transaction,
doc_key: Key,
@ -97,7 +97,7 @@ impl DocIds {
Ok(Resolved::New(doc_id))
}
pub(super) async fn remove_doc(
pub(in crate::idx) async fn remove_doc(
&mut self,
tx: &mut Transaction,
doc_key: Key,
@ -119,7 +119,7 @@ impl DocIds {
}
}
pub(super) async fn get_doc_key(
pub(in crate::idx) async fn get_doc_key(
&self,
tx: &mut Transaction,
doc_id: DocId,
@ -132,12 +132,15 @@ impl DocIds {
}
}
pub(super) async fn statistics(&self, tx: &mut Transaction) -> Result<BStatistics, Error> {
pub(in crate::idx) async fn statistics(
&self,
tx: &mut Transaction,
) -> Result<BStatistics, Error> {
let mut store = self.store.lock().await;
self.btree.statistics(tx, &mut store).await
}
pub(super) async fn finish(&mut self, tx: &mut Transaction) -> Result<(), Error> {
pub(in crate::idx) async fn finish(&mut self, tx: &mut Transaction) -> Result<(), Error> {
let updated = self.store.lock().await.finish(tx).await?;
if self.updated || updated {
let state = State {
@ -172,20 +175,20 @@ impl State {
}
#[derive(Debug, PartialEq)]
pub(super) enum Resolved {
pub(in crate::idx) enum Resolved {
New(DocId),
Existing(DocId),
}
impl Resolved {
pub(super) fn doc_id(&self) -> &DocId {
pub(in crate::idx) fn doc_id(&self) -> &DocId {
match self {
Resolved::New(doc_id) => doc_id,
Resolved::Existing(doc_id) => doc_id,
}
}
pub(super) fn was_existing(&self) -> bool {
pub(in crate::idx) fn was_existing(&self) -> bool {
match self {
Resolved::New(_) => false,
Resolved::Existing(_) => true,
@ -195,7 +198,7 @@ impl Resolved {
#[cfg(test)]
mod tests {
use crate::idx::ft::docids::{DocIds, Resolved};
use crate::idx::docids::{DocIds, Resolved};
use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase;
use crate::kvs::{Datastore, Transaction};

View file

@ -64,7 +64,7 @@ impl Analyzer {
&self,
terms: &mut Terms,
tx: &mut Transaction,
field_content: &[Value],
field_content: Vec<Value>,
) -> Result<(DocLength, Vec<(TermId, TermFrequency)>), Error> {
let mut dl = 0;
// Let's first collect all the inputs, and collect the tokens.
@ -101,7 +101,7 @@ impl Analyzer {
&self,
terms: &mut Terms,
tx: &mut Transaction,
content: &[Value],
content: Vec<Value>,
) -> Result<(DocLength, Vec<(TermId, TermFrequency)>, Vec<(TermId, OffsetRecords)>), Error> {
let mut dl = 0;
// Let's first collect all the inputs, and collect the tokens.
@ -135,25 +135,25 @@ impl Analyzer {
Ok((dl, tfid, osid))
}
fn analyze_content(&self, content: &[Value], tks: &mut Vec<Tokens>) -> Result<(), Error> {
fn analyze_content(&self, content: Vec<Value>, tks: &mut Vec<Tokens>) -> Result<(), Error> {
for v in content {
self.analyze_value(v, tks)?;
}
Ok(())
}
fn analyze_value(&self, val: &Value, tks: &mut Vec<Tokens>) -> Result<(), Error> {
fn analyze_value(&self, val: Value, tks: &mut Vec<Tokens>) -> Result<(), Error> {
match val {
Value::Strand(s) => tks.push(self.analyze(s.0.clone())?),
Value::Strand(s) => tks.push(self.analyze(s.0)?),
Value::Number(n) => tks.push(self.analyze(n.to_string())?),
Value::Bool(b) => tks.push(self.analyze(b.to_string())?),
Value::Array(a) => {
for v in &a.0 {
for v in a.0 {
self.analyze_value(v, tks)?;
}
}
Value::Object(o) => {
for v in o.0.values() {
for (_, v) in o.0 {
self.analyze_value(v, tks)?;
}
}

View file

@ -1,5 +1,5 @@
use crate::err::Error;
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::trees::bkeys::TrieKeys;
use crate::idx::trees::btree::{BState, BStatistics, BTree, BTreeNodeStore, Payload};
use crate::idx::trees::store::{TreeNodeProvider, TreeNodeStore, TreeStoreType};
@ -72,9 +72,8 @@ impl DocLengths {
}
pub(super) async fn finish(&self, tx: &mut Transaction) -> Result<(), Error> {
if self.store.lock().await.finish(tx).await? {
tx.set(self.state_key.clone(), self.btree.get_state().try_to_val()?).await?;
}
self.store.lock().await.finish(tx).await?;
self.btree.get_state().finish(tx, &self.state_key).await?;
Ok(())
}
}

View file

@ -1,5 +1,4 @@
pub(crate) mod analyzer;
pub(crate) mod docids;
mod doclength;
mod highlighter;
mod offsets;
@ -9,8 +8,8 @@ pub(super) mod termdocs;
pub(crate) mod terms;
use crate::err::Error;
use crate::idx::docids::{DocId, DocIds};
use crate::idx::ft::analyzer::Analyzer;
use crate::idx::ft::docids::{DocId, DocIds};
use crate::idx::ft::doclength::DocLengths;
use crate::idx::ft::highlighter::{Highlighter, Offseter};
use crate::idx::ft::offsets::Offsets;
@ -198,7 +197,7 @@ impl FtIndex {
&mut self,
tx: &mut Transaction,
rid: &Thing,
content: &[Value],
content: Vec<Value>,
) -> Result<(), Error> {
// Resolve the doc_id
let resolved = self.doc_ids.write().await.resolve_doc_id(tx, rid.into()).await?;
@ -481,7 +480,7 @@ mod tests {
}
assert_eq!(map.len(), e.len());
for (k, p) in e {
assert_eq!(map.get(k), Some(&p));
assert_eq!(map.get(k), Some(&p), "{}", k);
}
} else {
panic!("hits is none");
@ -549,9 +548,7 @@ mod tests {
// Add one document
let (mut tx, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc1, &vec![Value::from("hello the world")])
.await
.unwrap();
fti.index_document(&mut tx, &doc1, vec![Value::from("hello the world")]).await.unwrap();
finish(tx, fti).await;
}
@ -559,8 +556,8 @@ mod tests {
// Add two documents
let (mut tx, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc2, &vec![Value::from("a yellow hello")]).await.unwrap();
fti.index_document(&mut tx, &doc3, &vec![Value::from("foo bar")]).await.unwrap();
fti.index_document(&mut tx, &doc2, vec![Value::from("a yellow hello")]).await.unwrap();
fti.index_document(&mut tx, &doc3, vec![Value::from("foo bar")]).await.unwrap();
finish(tx, fti).await;
}
@ -575,7 +572,13 @@ mod tests {
// Search & score
let (hits, scr) = search(&mut tx, &fti, "hello").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
check_hits(
&mut tx,
hits,
scr,
vec![(&doc1, Some(-0.4859746)), (&doc2, Some(-0.4859746))],
)
.await;
let (hits, scr) = search(&mut tx, &fti, "world").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.4859746))]).await;
@ -597,7 +600,7 @@ mod tests {
// Reindex one document
let (mut tx, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc3, &vec![Value::from("nobar foo")]).await.unwrap();
fti.index_document(&mut tx, &doc3, vec![Value::from("nobar foo")]).await.unwrap();
finish(tx, fti).await;
let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
@ -655,28 +658,28 @@ mod tests {
fti.index_document(
&mut tx,
&doc1,
&vec![Value::from("the quick brown fox jumped over the lazy dog")],
vec![Value::from("the quick brown fox jumped over the lazy dog")],
)
.await
.unwrap();
fti.index_document(
&mut tx,
&doc2,
&vec![Value::from("the fast fox jumped over the lazy dog")],
vec![Value::from("the fast fox jumped over the lazy dog")],
)
.await
.unwrap();
fti.index_document(
&mut tx,
&doc3,
&vec![Value::from("the dog sat there and did nothing")],
vec![Value::from("the dog sat there and did nothing")],
)
.await
.unwrap();
fti.index_document(
&mut tx,
&doc4,
&vec![Value::from("the other animals sat there watching")],
vec![Value::from("the other animals sat there watching")],
)
.await
.unwrap();
@ -698,10 +701,10 @@ mod tests {
hits,
scr,
vec![
(&doc1, Some(0.0)),
(&doc2, Some(0.0)),
(&doc3, Some(0.0)),
(&doc4, Some(0.0)),
(&doc1, Some(-3.4388628)),
(&doc2, Some(-3.621457)),
(&doc3, Some(-2.258829)),
(&doc4, Some(-2.393017)),
],
)
.await;
@ -711,7 +714,11 @@ mod tests {
&mut tx,
hits,
scr,
vec![(&doc1, Some(0.0)), (&doc2, Some(0.0)), (&doc3, Some(0.0))],
vec![
(&doc1, Some(-0.7832165)),
(&doc2, Some(-0.8248031)),
(&doc3, Some(-0.87105393)),
],
)
.await;

View file

@ -1,5 +1,5 @@
use crate::err::Error;
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId;
use crate::idx::IndexKeyBase;
use crate::kvs::{Transaction, Val};

View file

@ -1,5 +1,5 @@
use crate::err::Error;
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId;
use crate::idx::trees::bkeys::TrieKeys;
use crate::idx::trees::btree::{BState, BStatistics, BTree, BTreeNodeStore};
@ -81,10 +81,8 @@ impl Postings {
}
pub(super) async fn finish(&self, tx: &mut Transaction) -> Result<(), Error> {
let updated = self.store.lock().await.finish(tx).await?;
if self.btree.is_updated() || updated {
tx.set(self.state_key.clone(), self.btree.get_state().try_to_val()?).await?;
}
self.store.lock().await.finish(tx).await?;
self.btree.get_state().finish(tx, &self.state_key).await?;
Ok(())
}
}

View file

@ -1,5 +1,5 @@
use crate::err::Error;
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::ft::doclength::{DocLength, DocLengths};
use crate::idx::ft::postings::{Postings, TermFrequency};
use crate::idx::ft::termdocs::TermsDocs;
@ -76,8 +76,8 @@ impl BM25Scorer {
// (N - n(qi) + 0.5)
let numerator = self.doc_count - term_doc_count + 0.5;
let idf = (numerator / denominator).ln();
if idf.is_nan() || idf <= 0.0 {
return 0.0;
if idf.is_nan() {
return f32::NAN;
}
let tf_prim = 1.0 + term_freq.ln();
// idf * (k1 + 1)

View file

@ -1,5 +1,5 @@
use crate::err::Error;
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::ft::doclength::DocLength;
use crate::idx::ft::terms::TermId;
use crate::idx::IndexKeyBase;

View file

@ -1,10 +1,11 @@
pub(crate) mod docids;
pub(crate) mod ft;
pub(crate) mod planner;
pub mod trees;
use crate::dbs::Options;
use crate::err::Error;
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId;
use crate::idx::trees::store::NodeId;
use crate::key::index::bc::Bc;
@ -18,6 +19,7 @@ use crate::key::index::bp::Bp;
use crate::key::index::bs::Bs;
use crate::key::index::bt::Bt;
use crate::key::index::bu::Bu;
use crate::key::index::vm::Vm;
use crate::kvs::{Key, Val};
use crate::sql::statements::DefineIndexStatement;
use revision::Revisioned;
@ -171,6 +173,17 @@ impl IndexKeyBase {
)
.into()
}
/// Builds the `Vm` key for this index (namespace, database, table and index
/// name taken from the key base) and the given optional node id.
fn new_vm_key(&self, node_id: Option<NodeId>) -> Key {
    let base = &self.inner;
    let key = Vm::new(
        base.ns.as_str(),
        base.db.as_str(),
        base.tb.as_str(),
        base.ix.as_str(),
        node_id,
    );
    key.into()
}
}
/// This trait provides `Revision` based default implementations for serialization/deserialization

View file

@ -1,25 +1,27 @@
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::ft::docids::{DocId, DocIds};
use crate::idx::docids::{DocId, DocIds};
use crate::idx::ft::scorer::BM25Scorer;
use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::terms::TermId;
use crate::idx::ft::{FtIndex, MatchRef};
use crate::idx::planner::iterators::{
IndexEqualThingIterator, IndexRangeThingIterator, MatchesThingIterator, ThingIterator,
UniqueEqualThingIterator, UniqueRangeThingIterator,
IndexEqualThingIterator, IndexRangeThingIterator, KnnThingIterator, MatchesThingIterator,
ThingIterator, UniqueEqualThingIterator, UniqueRangeThingIterator,
};
use crate::idx::planner::plan::IndexOperator::Matches;
use crate::idx::planner::plan::{IndexOperator, IndexOption, RangeValue};
use crate::idx::planner::tree::{IndexMap, IndexRef};
use crate::idx::trees::mtree::MTreeIndex;
use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase;
use crate::kvs;
use crate::kvs::Key;
use crate::sql::index::Index;
use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Expression, Object, Table, Thing, Value};
use std::collections::{HashMap, HashSet};
use crate::sql::{Array, Expression, Object, Table, Thing, Value};
use roaring::RoaringTreemap;
use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::Arc;
use tokio::sync::RwLock;
@ -30,6 +32,7 @@ pub(crate) struct QueryExecutor {
exp_entries: HashMap<Arc<Expression>, FtEntry>,
it_entries: Vec<IteratorEntry>,
index_definitions: HashMap<IndexRef, DefineIndexStatement>,
mt_exp: HashMap<Arc<Expression>, MtEntry>,
}
pub(crate) type IteratorRef = u16;
@ -66,31 +69,32 @@ impl QueryExecutor {
let mut mr_entries = HashMap::default();
let mut exp_entries = HashMap::default();
let mut ft_map = HashMap::default();
let mut mt_map: HashMap<IndexRef, MTreeIndex> = HashMap::default();
let mut mt_exp = HashMap::default();
// Create all the instances of FtIndex
// Build the FtEntries and map them to Expressions and MatchRef
for (exp, io) in im.options {
let mut entry = None;
let ir = io.ir();
if let Some(idx_def) = im.definitions.get(&ir) {
if let Index::Search(p) = &idx_def.index {
match &idx_def.index {
Index::Search(p) => {
let mut ft_entry = None;
if let Some(ft) = ft_map.get(&ir) {
if entry.is_none() {
entry = FtEntry::new(&mut run, ft, io).await?;
if ft_entry.is_none() {
ft_entry = FtEntry::new(&mut run, ft, io).await?;
}
} else {
let ikb = IndexKeyBase::new(opt, idx_def);
let az = run.get_db_analyzer(opt.ns(), opt.db(), p.az.as_str()).await?;
let ft = FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Read).await?;
if entry.is_none() {
entry = FtEntry::new(&mut run, &ft, io).await?;
let ft =
FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Read).await?;
if ft_entry.is_none() {
ft_entry = FtEntry::new(&mut run, &ft, io).await?;
}
ft_map.insert(ir, ft);
}
}
}
if let Some(e) = entry {
if let Some(e) = ft_entry {
if let Matches(_, Some(mr)) = e.0.index_option.op() {
if mr_entries.insert(*mr, e.clone()).is_some() {
return Err(Error::DuplicatedMatchRef {
@ -101,6 +105,25 @@ impl QueryExecutor {
exp_entries.insert(exp, e);
}
}
Index::MTree(p) => {
if let IndexOperator::Knn(a, k) = io.op() {
let entry = if let Some(mt) = mt_map.get(&ir) {
MtEntry::new(&mut run, mt, a.clone(), *k).await?
} else {
let ikb = IndexKeyBase::new(opt, idx_def);
let mt =
MTreeIndex::new(&mut run, ikb, p, TreeStoreType::Read).await?;
let entry = MtEntry::new(&mut run, &mt, a.clone(), *k).await?;
mt_map.insert(ir, mt);
entry
};
mt_exp.insert(exp, entry);
}
}
_ => {}
}
}
}
Ok(Self {
table: table.0.clone(),
@ -109,6 +132,19 @@ impl QueryExecutor {
exp_entries,
it_entries: Vec::new(),
index_definitions: im.definitions,
mt_exp,
})
}
/// Evaluates a KNN expression for a single record.
///
/// As written, this always fails with `Error::NoIndexFoundForMatch` carrying
/// the textual form of `exp`; the transaction and the record id are unused.
pub(crate) async fn knn(
    &self,
    _txn: &Transaction,
    _thg: &Thing,
    exp: &Expression,
) -> Result<Value, Error> {
    // Nothing can evaluate the expression here, so report a user error
    let value = exp.to_string();
    Err(Error::NoIndexFoundForMatch {
        value,
    })
}
@ -168,9 +204,7 @@ impl QueryExecutor {
Index::Search {
..
} => self.new_search_index_iterator(ir, io.clone()).await,
Index::MTree(_) => Err(Error::FeatureNotYetImplemented {
feature: "VectorSearch iterator".to_string(),
}),
Index::MTree(_) => Ok(self.new_mtree_index_knn_iterator(ir)),
}
} else {
Ok(None)
@ -258,6 +292,16 @@ impl QueryExecutor {
Ok(None)
}
/// Creates the KNN iterator for the iterator entry referenced by `ir`.
///
/// Returns `None` when the entry does not exist, is not a
/// `IteratorEntry::Single`, or when no M-Tree entry is registered for its
/// expression.
fn new_mtree_index_knn_iterator(&self, ir: IteratorRef) -> Option<ThingIterator> {
    let exp = match self.it_entries.get(ir as usize) {
        Some(IteratorEntry::Single(exp, ..)) => exp,
        _ => return None,
    };
    let mte = self.mt_exp.get(exp.as_ref())?;
    // The iterator works on its own copy of the search result
    let knn = KnnThingIterator::new(mte.doc_ids.clone(), mte.res.clone());
    Some(ThingIterator::Knn(knn))
}
pub(crate) async fn matches(
&self,
txn: &Transaction,
@ -406,3 +450,24 @@ impl FtEntry {
}
}
}
/// The outcome of a KNN search on an M-Tree index, kept together with the
/// document ids handle of that index.
#[derive(Clone)]
pub(super) struct MtEntry {
    /// Handle returned by `MTreeIndex::doc_ids`.
    doc_ids: Arc<RwLock<DocIds>>,
    /// Value returned by `MTreeIndex::knn_search`.
    res: VecDeque<RoaringTreemap>,
}

impl MtEntry {
    /// Runs a KNN search for the vector `a` with `k` on `mt` and stores the result.
    async fn new(
        tx: &mut kvs::Transaction,
        mt: &MTreeIndex,
        a: Array,
        k: u32,
    ) -> Result<Self, Error> {
        let count = k as usize;
        let res = mt.knn_search(tx, a, count).await?;
        let doc_ids = mt.doc_ids();
        Ok(Self {
            doc_ids,
            res,
        })
    }
}

View file

@ -1,6 +1,6 @@
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::ft::docids::{DocId, NO_DOC_ID};
use crate::idx::docids::{DocId, DocIds, NO_DOC_ID};
use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::{FtIndex, HitsIterator};
use crate::idx::planner::plan::RangeValue;
@ -8,6 +8,10 @@ use crate::key::index::Index;
use crate::kvs::Key;
use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Array, Thing, Value};
use roaring::RoaringTreemap;
use std::collections::VecDeque;
use std::sync::Arc;
use tokio::sync::RwLock;
pub(crate) enum ThingIterator {
IndexEqual(IndexEqualThingIterator),
@ -15,6 +19,7 @@ pub(crate) enum ThingIterator {
UniqueEqual(UniqueEqualThingIterator),
UniqueRange(UniqueRangeThingIterator),
Matches(MatchesThingIterator),
Knn(KnnThingIterator),
}
impl ThingIterator {
@ -29,6 +34,7 @@ impl ThingIterator {
ThingIterator::IndexRange(i) => i.next_batch(tx, size).await,
ThingIterator::UniqueRange(i) => i.next_batch(tx, size).await,
ThingIterator::Matches(i) => i.next_batch(tx, size).await,
ThingIterator::Knn(i) => i.next_batch(tx, size).await,
}
}
}
@ -307,3 +313,52 @@ impl MatchesThingIterator {
Ok(res)
}
}
/// Iterator over the records found by a KNN search.
///
/// `res` holds the groups of document ids that have not been started yet,
/// `current` is the group being drained, and `skip` collects the document ids
/// already visited so that each one is emitted at most once.
pub(crate) struct KnnThingIterator {
    doc_ids: Arc<RwLock<DocIds>>,
    res: VecDeque<RoaringTreemap>,
    current: Option<RoaringTreemap>,
    skip: RoaringTreemap,
}

impl KnnThingIterator {
    /// Creates the iterator; the first group of `res` (if any) becomes the current one.
    pub(super) fn new(doc_ids: Arc<RwLock<DocIds>>, mut res: VecDeque<RoaringTreemap>) -> Self {
        let current = res.pop_front();
        Self {
            doc_ids,
            res,
            current,
            skip: RoaringTreemap::new(),
        }
    }

    /// Returns up to `limit` records together with their document ids.
    ///
    /// Document ids are consumed from the current group, moving on to the next
    /// group once it is empty. Ids that were already visited, or for which no
    /// document key is found, are consumed without counting toward `limit`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned while resolving a document key.
    async fn next_batch(
        &mut self,
        txn: &Transaction,
        mut limit: u32,
    ) -> Result<Vec<(Thing, DocId)>, Error> {
        let mut res = vec![];
        let mut tx = txn.lock().await;
        while limit > 0 {
            let docs = match self.current.as_mut() {
                Some(docs) => docs,
                // Every group has been drained
                None => break,
            };
            let next = docs.iter().next();
            if let Some(doc_id) = next {
                docs.remove(doc_id);
                // Only emit a document the first time it is seen
                if self.skip.insert(doc_id) {
                    if let Some(doc_key) =
                        self.doc_ids.read().await.get_doc_key(&mut tx, doc_id).await?
                    {
                        res.push((doc_key.into(), doc_id));
                        limit -= 1;
                    }
                }
            }
            // Always move on from an empty group, including one that was empty
            // from the start; otherwise the loop would never terminate.
            if docs.is_empty() {
                self.current = self.res.pop_front();
            }
        }
        Ok(res)
    }
}

View file

@ -149,6 +149,7 @@ pub(super) enum IndexOperator {
Equality(Array),
RangePart(Operator, Value),
Matches(String, Option<MatchRef>),
Knn(Array, u32),
}
impl IndexOption {
@ -191,6 +192,10 @@ impl IndexOption {
e.insert("operator", Value::from(op.to_string()));
e.insert("value", v.to_owned());
}
IndexOperator::Knn(a, k) => {
e.insert("operator", Value::from(format!("<{}>", k)));
e.insert("value", Value::Array(a.clone()));
}
};
}
}

View file

@ -102,10 +102,10 @@ impl<'a> TreeBuilder<'a> {
match v {
Value::Expression(e) => self.eval_expression(e).await,
Value::Idiom(i) => self.eval_idiom(i).await,
Value::Strand(_) => Ok(Node::Scalar(v.to_owned())),
Value::Number(_) => Ok(Node::Scalar(v.to_owned())),
Value::Bool(_) => Ok(Node::Scalar(v.to_owned())),
Value::Thing(_) => Ok(Node::Scalar(v.to_owned())),
Value::Strand(_) | Value::Number(_) | Value::Bool(_) | Value::Thing(_) => {
Ok(Node::Scalar(v.to_owned()))
}
Value::Array(a) => Ok(self.eval_array(a)),
Value::Subquery(s) => self.eval_subquery(s).await,
Value::Param(p) => {
let v = p.compute(self.ctx, self.opt, self.txn, None).await?;
@ -115,6 +115,16 @@ impl<'a> TreeBuilder<'a> {
}
}
/// Turns an array into a planner node.
///
/// An array whose elements are all numbers (including an empty array) becomes
/// a `Node::Vector`; any other array is reported as `Node::Unsupported`.
fn eval_array(&mut self, a: &Array) -> Node {
    let is_numeric_vector = a.0.iter().all(|v| v.is_number());
    if is_numeric_vector {
        Node::Vector(a.to_owned())
    } else {
        Node::Unsupported(format!("Unsupported array: {}", a))
    }
}
async fn eval_idiom(&mut self, i: &Idiom) -> Result<Node, Error> {
if let Some(irs) = self.find_indexes(i).await? {
if !irs.is_empty() {
@ -165,25 +175,38 @@ impl<'a> TreeBuilder<'a> {
irs: &[IndexRef],
op: &Operator,
id: &Idiom,
v: &Node,
n: &Node,
e: &Expression,
) -> Option<IndexOption> {
if let Some(v) = v.is_scalar() {
for ir in irs {
if let Some(ix) = self.index_map.definitions.get(ir) {
let op = match &ix.index {
Index::Idx => Self::eval_index_operator(op, v),
Index::Uniq => Self::eval_index_operator(op, v),
Index::Idx => Self::eval_index_operator(op, n),
Index::Uniq => Self::eval_index_operator(op, n),
Index::Search {
..
} => {
if let Some(v) = n.is_scalar() {
if let Operator::Matches(mr) = op {
Some(IndexOperator::Matches(v.clone().to_raw_string(), *mr))
} else {
None
}
} else {
None
}
}
Index::MTree(_) => {
if let Operator::Knn(k) = op {
if let Node::Vector(a) = n {
Some(IndexOperator::Knn(a.clone(), *k))
} else {
None
}
} else {
None
}
}
Index::MTree(_) => None,
};
if let Some(op) = op {
let io = IndexOption::new(*ir, id.clone(), op);
@ -192,11 +215,11 @@ impl<'a> TreeBuilder<'a> {
}
}
}
}
None
}
fn eval_index_operator(op: &Operator, v: &Value) -> Option<IndexOperator> {
fn eval_index_operator(op: &Operator, n: &Node) -> Option<IndexOperator> {
if let Some(v) = n.is_scalar() {
match op {
Operator::Equal => Some(IndexOperator::Equality(Array::from(v.clone()))),
Operator::LessThan
@ -205,6 +228,9 @@ impl<'a> TreeBuilder<'a> {
| Operator::MoreThanOrEqual => Some(IndexOperator::RangePart(op.clone(), v.clone())),
_ => None,
}
} else {
None
}
}
async fn eval_subquery(&mut self, s: &Subquery) -> Result<Node, Error> {
@ -235,6 +261,7 @@ pub(super) enum Node {
IndexedField(Idiom, Arc<Vec<IndexRef>>),
NonIndexedField,
Scalar(Value),
Vector(Array),
Unsupported(String),
}

View file

@ -21,7 +21,6 @@ where
{
state: BState,
full_size: u32,
updated: bool,
bk: PhantomData<BK>,
}
@ -31,6 +30,8 @@ pub struct BState {
minimum_degree: u32,
root: Option<NodeId>,
next_node_id: NodeId,
#[serde(skip)]
updated: bool,
}
impl VersionedSerdeState for BState {}
@ -42,8 +43,34 @@ impl BState {
minimum_degree,
root: None,
next_node_id: 0,
updated: false,
}
}
/// Replaces the root node id, flagging the state as updated only when the
/// value actually changes.
fn set_root(&mut self, node_id: Option<NodeId>) {
    if self.root == node_id {
        return;
    }
    self.root = node_id;
    self.updated = true;
}
/// Hands out the next node id, advances the counter and flags the state as updated.
fn new_node_id(&mut self) -> NodeId {
    let allocated = self.next_node_id;
    self.next_node_id = allocated + 1;
    self.updated = true;
    allocated
}
/// Stores the serialized state under `key` if it has been flagged as updated;
/// does nothing otherwise.
pub(in crate::idx) async fn finish(
    &self,
    tx: &mut Transaction,
    key: &Key,
) -> Result<(), Error> {
    if !self.updated {
        return Ok(());
    }
    let val = self.try_to_val()?;
    tx.set(key.clone(), val).await?;
    Ok(())
}
}
#[derive(Debug, Default, PartialEq)]
@ -166,7 +193,6 @@ where
Self {
full_size: state.minimum_degree * 2 - 1,
state,
updated: false,
bk: PhantomData,
}
}
@ -180,11 +206,11 @@ where
let mut next_node = self.state.root;
while let Some(node_id) = next_node.take() {
let current = store.get_node(tx, node_id).await?;
if let Some(payload) = current.node.keys().get(searched_key) {
if let Some(payload) = current.n.keys().get(searched_key) {
store.set_node(current, false)?;
return Ok(Some(payload));
}
if let BTreeNode::Internal(keys, children) = &current.node {
if let BTreeNode::Internal(keys, children) = &current.n {
let child_idx = keys.get_child_idx(searched_key);
next_node.replace(children[child_idx]);
}
@ -201,27 +227,30 @@ where
payload: Payload,
) -> Result<(), Error> {
if let Some(root_id) = self.state.root {
// We already have a root node
let root = store.get_node(tx, root_id).await?;
if root.node.keys().len() == self.full_size {
let new_root_id = self.new_node_id();
if root.n.keys().len() == self.full_size {
// The root node is full, let's split it
let new_root_id = self.state.new_node_id();
let new_root = store
.new_node(new_root_id, BTreeNode::Internal(BK::default(), vec![root_id]))?;
self.state.root = Some(new_root.id);
self.state.set_root(Some(new_root.id));
self.split_child(store, new_root, 0, root).await?;
self.insert_non_full(tx, store, new_root_id, key, payload).await?;
} else {
// The root node has place, let's insert the value
let root_id = root.id;
store.set_node(root, false)?;
self.insert_non_full(tx, store, root_id, key, payload).await?;
}
} else {
let new_root_id = self.new_node_id();
// We don't have a root node, let's create it
let new_root_id = self.state.new_node_id();
let new_root_node =
store.new_node(new_root_id, BTreeNode::Leaf(BK::with_key_val(key, payload)?))?;
store.set_node(new_root_node, true)?;
self.state.root = Some(new_root_id);
self.state.set_root(Some(new_root_id));
}
self.updated = true;
Ok(())
}
@ -237,7 +266,7 @@ where
while let Some(node_id) = next_node_id.take() {
let mut node = store.get_node(tx, node_id).await?;
let key: Key = key.clone();
match &mut node.node {
match &mut node.n {
BTreeNode::Leaf(keys) => {
keys.insert(key, payload);
store.set_node(node, true)?;
@ -250,7 +279,7 @@ where
}
let child_idx = keys.get_child_idx(&key);
let child = store.get_node(tx, children[child_idx]).await?;
let next_id = if child.node.keys().len() == self.full_size {
let next_id = if child.n.keys().len() == self.full_size {
let split_result = self.split_child(store, node, child_idx, child).await?;
if key.gt(&split_result.median_key) {
split_result.right_node_id
@ -277,12 +306,12 @@ where
idx: usize,
child_node: BStoredNode<BK>,
) -> Result<SplitResult, Error> {
let (left_node, right_node, median_key, median_payload) = match child_node.node {
let (left_node, right_node, median_key, median_payload) = match child_node.n {
BTreeNode::Internal(keys, children) => self.split_internal_node(keys, children)?,
BTreeNode::Leaf(keys) => self.split_leaf_node(keys)?,
};
let right_node_id = self.new_node_id();
match parent_node.node {
let right_node_id = self.state.new_node_id();
match parent_node.n {
BTreeNode::Internal(ref mut keys, ref mut children) => {
keys.insert(median_key.clone(), median_payload);
children.insert(idx + 1, right_node_id);
@ -329,12 +358,6 @@ where
Ok((left_node, right_node, r.median_key, r.median_payload))
}
fn new_node_id(&mut self) -> NodeId {
let new_node_id = self.state.next_node_id;
self.state.next_node_id += 1;
new_node_id
}
pub(in crate::idx) async fn delete(
&mut self,
tx: &mut Transaction,
@ -348,7 +371,7 @@ where
while let Some((is_main_key, key_to_delete, node_id)) = next_node.take() {
let mut node = store.get_node(tx, node_id).await?;
match &mut node.node {
match &mut node.n {
BTreeNode::Leaf(keys) => {
// CLRS: 1
if let Some(payload) = keys.get(&key_to_delete) {
@ -361,12 +384,11 @@ where
store.remove_node(node.id, node.key)?;
// Check if this was the root node
if Some(node_id) == self.state.root {
self.state.root = None;
self.state.set_root(None);
}
} else {
store.set_node(node, true)?;
}
self.updated = true;
} else {
store.set_node(node, false)?;
}
@ -388,7 +410,6 @@ where
.await?,
);
store.set_node(node, true)?;
self.updated = true;
} else {
// CLRS: 3
let (node_update, is_main_key, key_to_delete, next_stored_node) = self
@ -409,11 +430,9 @@ where
}
}
store.remove_node(node_id, node.key)?;
self.state.root = Some(next_stored_node);
self.updated = true;
self.state.set_root(Some(next_stored_node));
} else if node_update {
store.set_node(node, true)?;
self.updated = true;
} else {
store.set_node(node, false)?;
}
@ -437,9 +456,9 @@ where
let left_idx = keys.get_child_idx(&key_to_delete);
let left_id = children[left_idx];
let mut left_node = store.get_node(tx, left_id).await?;
if left_node.node.keys().len() >= self.state.minimum_degree {
if left_node.n.keys().len() >= self.state.minimum_degree {
// CLRS: 2a -> left_node is named `y` in the book
if let Some((key_prim, payload_prim)) = left_node.node.keys().get_last_key() {
if let Some((key_prim, payload_prim)) = left_node.n.keys().get_last_key() {
keys.remove(&key_to_delete);
keys.insert(key_prim.clone(), payload_prim);
store.set_node(left_node, true)?;
@ -450,9 +469,9 @@ where
let right_idx = left_idx + 1;
let right_id = children[right_idx];
let right_node = store.get_node(tx, right_id).await?;
if right_node.node.keys().len() >= self.state.minimum_degree {
if right_node.n.keys().len() >= self.state.minimum_degree {
// CLRS: 2b -> right_node is named `z` in the book
if let Some((key_prim, payload_prim)) = right_node.node.keys().get_first_key() {
if let Some((key_prim, payload_prim)) = right_node.n.keys().get_first_key() {
keys.remove(&key_to_delete);
keys.insert(key_prim.clone(), payload_prim);
store.set_node(left_node, false)?;
@ -464,7 +483,7 @@ where
// CLRS: 2c
// Merge children
// The payload is set to 0. The value does not matter, as the key will be deleted after anyway.
left_node.node.append(key_to_delete.clone(), 0, right_node.node)?;
left_node.n.append(key_to_delete.clone(), 0, right_node.n)?;
store.set_node(left_node, true)?;
store.remove_node(right_id, right_node.key)?;
keys.remove(&key_to_delete);
@ -485,11 +504,11 @@ where
let child_idx = keys.get_child_idx(&key_to_delete);
let child_id = children[child_idx];
let child_stored_node = store.get_node(tx, child_id).await?;
if child_stored_node.node.keys().len() < self.state.minimum_degree {
if child_stored_node.n.keys().len() < self.state.minimum_degree {
// right child (successor)
if child_idx < children.len() - 1 {
let right_child_stored_node = store.get_node(tx, children[child_idx + 1]).await?;
return if right_child_stored_node.node.keys().len() >= self.state.minimum_degree {
return if right_child_stored_node.n.keys().len() >= self.state.minimum_degree {
Self::delete_adjust_successor(
store,
keys,
@ -520,7 +539,7 @@ where
if child_idx > 0 {
let child_idx = child_idx - 1;
let left_child_stored_node = store.get_node(tx, children[child_idx]).await?;
return if left_child_stored_node.node.keys().len() >= self.state.minimum_degree {
return if left_child_stored_node.n.keys().len() >= self.state.minimum_degree {
Self::delete_adjust_predecessor(
store,
keys,
@ -562,12 +581,12 @@ where
mut right_child_stored_node: BStoredNode<BK>,
) -> Result<(bool, bool, Key, NodeId), Error> {
if let Some((ascending_key, ascending_payload)) =
right_child_stored_node.node.keys().get_first_key()
right_child_stored_node.n.keys().get_first_key()
{
right_child_stored_node.node.keys_mut().remove(&ascending_key);
right_child_stored_node.n.keys_mut().remove(&ascending_key);
if let Some(descending_key) = keys.get_key(child_idx) {
if let Some(descending_payload) = keys.remove(&descending_key) {
child_stored_node.node.keys_mut().insert(descending_key, descending_payload);
child_stored_node.n.keys_mut().insert(descending_key, descending_payload);
keys.insert(ascending_key, ascending_payload);
let child_id = child_stored_node.id;
store.set_node(child_stored_node, true)?;
@ -590,12 +609,12 @@ where
mut left_child_stored_node: BStoredNode<BK>,
) -> Result<(bool, bool, Key, NodeId), Error> {
if let Some((ascending_key, ascending_payload)) =
left_child_stored_node.node.keys().get_last_key()
left_child_stored_node.n.keys().get_last_key()
{
left_child_stored_node.node.keys_mut().remove(&ascending_key);
left_child_stored_node.n.keys_mut().remove(&ascending_key);
if let Some(descending_key) = keys.get_key(child_idx) {
if let Some(descending_payload) = keys.remove(&descending_key) {
child_stored_node.node.keys_mut().insert(descending_key, descending_payload);
child_stored_node.n.keys_mut().insert(descending_key, descending_payload);
keys.insert(ascending_key, ascending_payload);
let child_id = child_stored_node.id;
store.set_node(child_stored_node, true)?;
@ -623,7 +642,7 @@ where
if let Some(descending_payload) = keys.remove(&descending_key) {
children.remove(child_idx + 1);
let left_id = left_child.id;
left_child.node.append(descending_key, descending_payload, right_child.node)?;
left_child.n.append(descending_key, descending_payload, right_child.n)?;
store.set_node(left_child, true)?;
store.remove_node(right_child.id, right_child.key)?;
return Ok((true, is_main_key, key_to_delete, left_id));
@ -645,13 +664,13 @@ where
}
while let Some((node_id, depth)) = node_queue.pop_front() {
let stored = store.get_node(tx, node_id).await?;
stats.keys_count += stored.node.keys().len() as u64;
stats.keys_count += stored.n.keys().len() as u64;
if depth > stats.max_depth {
stats.max_depth = depth;
}
stats.nodes_count += 1;
stats.total_size += stored.size as u64;
if let BTreeNode::Internal(_, children) = &stored.node {
if let BTreeNode::Internal(_, children) = &stored.n {
let depth = depth + 1;
for child_id in children.iter() {
node_queue.push_front((*child_id, depth));
@ -665,10 +684,6 @@ where
pub(in crate::idx) fn get_state(&self) -> &BState {
&self.state
}
pub(in crate::idx) fn is_updated(&self) -> bool {
self.updated
}
}
#[cfg(test)]
@ -1032,13 +1047,13 @@ mod tests {
0 => {
assert_eq!(depth, 1);
assert_eq!(node_id, 7);
check_is_internal_node(node.node, vec![("p", 16)], vec![1, 8]);
check_is_internal_node(node.n, vec![("p", 16)], vec![1, 8]);
}
1 => {
assert_eq!(depth, 2);
assert_eq!(node_id, 1);
check_is_internal_node(
node.node,
node.n,
vec![("c", 3), ("g", 7), ("m", 13)],
vec![0, 9, 2, 3],
);
@ -1046,42 +1061,42 @@ mod tests {
2 => {
assert_eq!(depth, 2);
assert_eq!(node_id, 8);
check_is_internal_node(node.node, vec![("t", 20), ("x", 24)], vec![4, 6, 5]);
check_is_internal_node(node.n, vec![("t", 20), ("x", 24)], vec![4, 6, 5]);
}
3 => {
assert_eq!(depth, 3);
assert_eq!(node_id, 0);
check_is_leaf_node(node.node, vec![("a", 1), ("b", 2)]);
check_is_leaf_node(node.n, vec![("a", 1), ("b", 2)]);
}
4 => {
assert_eq!(depth, 3);
assert_eq!(node_id, 9);
check_is_leaf_node(node.node, vec![("d", 4), ("e", 5), ("f", 6)]);
check_is_leaf_node(node.n, vec![("d", 4), ("e", 5), ("f", 6)]);
}
5 => {
assert_eq!(depth, 3);
assert_eq!(node_id, 2);
check_is_leaf_node(node.node, vec![("j", 10), ("k", 11), ("l", 12)]);
check_is_leaf_node(node.n, vec![("j", 10), ("k", 11), ("l", 12)]);
}
6 => {
assert_eq!(depth, 3);
assert_eq!(node_id, 3);
check_is_leaf_node(node.node, vec![("n", 14), ("o", 15)]);
check_is_leaf_node(node.n, vec![("n", 14), ("o", 15)]);
}
7 => {
assert_eq!(depth, 3);
assert_eq!(node_id, 4);
check_is_leaf_node(node.node, vec![("q", 17), ("r", 18), ("s", 19)]);
check_is_leaf_node(node.n, vec![("q", 17), ("r", 18), ("s", 19)]);
}
8 => {
assert_eq!(depth, 3);
assert_eq!(node_id, 6);
check_is_leaf_node(node.node, vec![("u", 21), ("v", 22)]);
check_is_leaf_node(node.n, vec![("u", 21), ("v", 22)]);
}
9 => {
assert_eq!(depth, 3);
assert_eq!(node_id, 5);
check_is_leaf_node(node.node, vec![("y", 25), ("z", 26)]);
check_is_leaf_node(node.n, vec![("y", 25), ("z", 26)]);
}
_ => panic!("This node should not exist {}", count),
})
@ -1135,13 +1150,13 @@ mod tests {
let nodes_count = t
.inspect_nodes(&mut tx, |count, depth, node_id, node| {
debug!("{} -> {}", depth, node_id);
node.node.debug(|k| Ok(String::from_utf8(k)?)).unwrap();
node.n.debug(|k| Ok(String::from_utf8(k)?)).unwrap();
match count {
0 => {
assert_eq!(depth, 1);
assert_eq!(node_id, 1);
check_is_internal_node(
node.node,
node.n,
vec![("e", 5), ("l", 12), ("p", 16), ("t", 20), ("x", 24)],
vec![0, 9, 3, 4, 6, 5],
);
@ -1149,32 +1164,32 @@ mod tests {
1 => {
assert_eq!(depth, 2);
assert_eq!(node_id, 0);
check_is_leaf_node(node.node, vec![("a", 1), ("c", 3)]);
check_is_leaf_node(node.n, vec![("a", 1), ("c", 3)]);
}
2 => {
assert_eq!(depth, 2);
assert_eq!(node_id, 9);
check_is_leaf_node(node.node, vec![("j", 10), ("k", 11)]);
check_is_leaf_node(node.n, vec![("j", 10), ("k", 11)]);
}
3 => {
assert_eq!(depth, 2);
assert_eq!(node_id, 3);
check_is_leaf_node(node.node, vec![("n", 14), ("o", 15)]);
check_is_leaf_node(node.n, vec![("n", 14), ("o", 15)]);
}
4 => {
assert_eq!(depth, 2);
assert_eq!(node_id, 4);
check_is_leaf_node(node.node, vec![("q", 17), ("r", 18), ("s", 19)]);
check_is_leaf_node(node.n, vec![("q", 17), ("r", 18), ("s", 19)]);
}
5 => {
assert_eq!(depth, 2);
assert_eq!(node_id, 6);
check_is_leaf_node(node.node, vec![("u", 21), ("v", 22)]);
check_is_leaf_node(node.n, vec![("u", 21), ("v", 22)]);
}
6 => {
assert_eq!(depth, 2);
assert_eq!(node_id, 5);
check_is_leaf_node(node.node, vec![("y", 25), ("z", 26)]);
check_is_leaf_node(node.n, vec![("y", 25), ("z", 26)]);
}
_ => panic!("This node should not exist {}", count),
}
@ -1316,7 +1331,7 @@ mod tests {
debug!("----------------------------------");
t.inspect_nodes(tx, |_count, depth, node_id, node| {
debug!("{} -> {}", depth, node_id);
node.node.debug(|k| Ok(String::from_utf8(k)?)).unwrap();
node.n.debug(|k| Ok(String::from_utf8(k)?)).unwrap();
})
.await
.unwrap();
@ -1359,7 +1374,7 @@ mod tests {
let mut s = TreeNodeStore::Traversal(TreeNodeProvider::Debug);
while let Some((node_id, depth)) = node_queue.pop_front() {
let stored_node = s.get_node(tx, node_id).await?;
if let BTreeNode::Internal(_, children) = &stored_node.node {
if let BTreeNode::Internal(_, children) = &stored_node.n {
let depth = depth + 1;
for child_id in children {
node_queue.push_back((*child_id, depth));

View file

@ -1,3 +1,4 @@
pub mod bkeys;
pub mod btree;
pub mod mtree;
pub mod store;

1792
lib/src/idx/trees/mtree.rs Normal file

File diff suppressed because it is too large Load diff

View file

@ -9,7 +9,7 @@ use tokio::sync::Mutex;
pub type NodeId = u64;
#[derive(Clone, Copy)]
#[derive(Clone, Copy, PartialEq)]
pub enum TreeStoreType {
Write,
Read,
@ -151,7 +151,7 @@ where
#[cfg(debug_assertions)]
self.out.insert(id);
StoredNode {
node,
n: node,
id,
key: self.np.get_key(id),
size: 0,
@ -238,6 +238,7 @@ pub enum TreeNodeProvider {
DocLengths(IndexKeyBase),
Postings(IndexKeyBase),
Terms(IndexKeyBase),
Vector(IndexKeyBase),
Debug,
}
@ -248,6 +249,7 @@ impl TreeNodeProvider {
TreeNodeProvider::DocLengths(ikb) => ikb.new_bl_key(Some(node_id)),
TreeNodeProvider::Postings(ikb) => ikb.new_bp_key(Some(node_id)),
TreeNodeProvider::Terms(ikb) => ikb.new_bt_key(Some(node_id)),
TreeNodeProvider::Vector(ikb) => ikb.new_vm_key(Some(node_id)),
TreeNodeProvider::Debug => node_id.to_be_bytes().to_vec(),
}
}
@ -261,7 +263,7 @@ impl TreeNodeProvider {
let size = val.len() as u32;
let node = N::try_from_val(val)?;
Ok(StoredNode {
node,
n: node,
id,
key,
size,
@ -275,19 +277,30 @@ impl TreeNodeProvider {
where
N: TreeNode,
{
let val = node.node.try_into_val()?;
let val = node.n.try_into_val()?;
tx.set(node.key, val).await?;
Ok(())
}
}
pub(super) struct StoredNode<N> {
pub(super) node: N,
pub(super) n: N,
pub(super) id: NodeId,
pub(super) key: Key,
pub(super) size: u32,
}
// Constructor for `StoredNode`; it performs no I/O and only bundles the given parts.
impl<N> StoredNode<N> {
/// Wraps the node `n` together with its identifier `id`, its storage `key`
/// and its `size`.
///
/// NOTE(review): elsewhere in this file `size` is set to the byte length of
/// the serialized value when a node is loaded, and to 0 for a newly created
/// node - confirm that callers of `new` follow the same convention.
pub(super) fn new(n: N, id: NodeId, key: Key, size: u32) -> Self {
Self {
n,
id,
key,
size,
}
}
}
pub trait TreeNode
where
Self: Sized,

View file

@ -1,5 +1,5 @@
//! Stores Term/Doc frequency
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId;
use derive::Key;
use serde::{Deserialize, Serialize};

View file

@ -1,5 +1,5 @@
//! Stores the term list for doc_ids
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use derive::Key;
use serde::{Deserialize, Serialize};

View file

@ -1,5 +1,5 @@
//! Stores the offsets
use crate::idx::ft::docids::DocId;
use crate::idx::docids::DocId;
use crate::idx::ft::terms::TermId;
use derive::Key;
use serde::{Deserialize, Serialize};

View file

@ -11,6 +11,7 @@ pub mod bp;
pub mod bs;
pub mod bt;
pub mod bu;
pub mod vm;
use crate::sql::array::Array;
use crate::sql::id::Id;

68
lib/src/key/index/vm.rs Normal file
View file

@ -0,0 +1,68 @@
//! Stores MTree state and nodes
use crate::idx::trees::store::NodeId;
use derive::Key;
use serde::{Deserialize, Serialize};
/// Key of an MTree entry belonging to one index.
///
/// The single-byte fields (`__`, `_a` .. `_g`) are fixed separators filled in
/// by [`Vm::new`]. The test at the bottom of this file pins the resulting
/// encoding: `/*{ns}\0*{db}\0*{tb}\0+{ix}\0!vm` followed by the encoded
/// `node_id`.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Key)]
pub struct Vm<'a> {
// Separator bytes; their values are assigned in `Vm::new`.
__: u8,
_a: u8,
// presumably the namespace name (the test uses "testns") - TODO confirm
pub ns: &'a str,
_b: u8,
// presumably the database name (the test uses "testdb") - TODO confirm
pub db: &'a str,
_c: u8,
// presumably the table name (the test uses "testtb") - TODO confirm
pub tb: &'a str,
_d: u8,
// presumably the index name (the test uses "testix") - TODO confirm
pub ix: &'a str,
// The three bytes below are set to `!`, `v`, `m` and form the `!vm` suffix.
_e: u8,
_f: u8,
_g: u8,
// Identifier of the addressed tree node, if any.
// NOTE(review): `None` presumably addresses the tree state rather than a
// node (the module doc says "Stores MTree state and nodes") - confirm.
pub node_id: Option<NodeId>,
}
impl<'a> Vm<'a> {
/// Builds a key from the given `ns`, `db`, `tb` and `ix` names and the
/// optional `node_id`, filling every separator field with its fixed byte.
pub fn new(
ns: &'a str,
db: &'a str,
tb: &'a str,
ix: &'a str,
node_id: Option<NodeId>,
) -> Self {
Self {
// Leading `/` followed by `*`-prefixed ns, db and tb segments.
__: b'/',
_a: b'*',
ns,
_b: b'*',
db,
_c: b'*',
tb,
// The index segment is introduced by `+` instead of `*`.
_d: b'+',
ix,
// `!vm` suffix; the test in this file checks it sits right before the node id.
_e: b'!',
_f: b'v',
_g: b'm',
node_id,
}
}
}
#[cfg(test)]
mod tests {
    /// Round-trips a `Vm` key through `encode`/`decode` and pins its byte layout.
    #[test]
    fn key() {
        use super::*;
        #[rustfmt::skip]
        let original = Vm::new("testns", "testdb", "testtb", "testix", Some(8));
        // The serialized form must match the documented key layout exactly.
        let encoded = Vm::encode(&original).unwrap();
        assert_eq!(encoded, b"/*testns\0*testdb\0*testtb\0+testix\0!vm\x01\0\0\0\0\0\0\0\x08");
        // Decoding the bytes must give back an equal key.
        let decoded = Vm::decode(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}

View file

@ -316,6 +316,9 @@ impl Datastore {
}
/// Setup the initial credentials
/// This function triggers the `unreachable definition` compilation error, probably due to this issue:
/// https://github.com/rust-lang/rust/issues/111370
#[allow(unreachable_code, unused_variables)]
pub async fn setup_initial_creds(&self, creds: Root<'_>) -> Result<(), Error> {
// Start a new writeable transaction
let txn = self.transaction(true, false).await?.rollback_with_panic().enclose();

View file

@ -191,6 +191,7 @@ impl Expression {
Operator::Outside => fnc::operate::outside(&l, &r),
Operator::Intersects => fnc::operate::intersects(&l, &r),
Operator::Matches(_) => fnc::operate::matches(ctx, txn, doc, self).await,
Operator::Knn(_) => fnc::operate::knn(ctx, txn, doc, self).await,
_ => unreachable!(),
}
}

View file

@ -49,7 +49,7 @@ pub struct MTreeParams {
pub doc_ids_order: u32,
}
#[derive(Default, Clone, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)]
#[derive(Clone, Default, Debug, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Hash)]
#[revisioned(revision = 1)]
pub enum Distance {
#[default]
@ -182,6 +182,7 @@ pub fn search(i: &str) -> IResult<&str, Index> {
pub fn distance(i: &str) -> IResult<&str, Distance> {
let (i, _) = mightbespace(i)?;
let (i, _) = tag_no_case("DIST")(i)?;
let (i, _) = shouldbespace(i)?;
alt((
map(tag_no_case("EUCLIDEAN"), |_| Distance::Euclidean),
map(tag_no_case("MANHATTAN"), |_| Distance::Manhattan),
@ -200,7 +201,7 @@ pub fn minkowski(i: &str) -> IResult<&str, Distance> {
}
pub fn dimension(i: &str) -> IResult<&str, u16> {
let (i, _) = shouldbespace(i)?;
let (i, _) = mightbespace(i)?;
let (i, _) = tag_no_case("DIMENSION")(i)?;
let (i, _) = shouldbespace(i)?;
let (i, dim) = uint16(i)?;

View file

@ -6,6 +6,7 @@ use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::bytes::complete::tag_no_case;
use nom::character::complete::char;
use nom::character::complete::u32 as uint32;
use nom::character::complete::u8 as uint8;
use nom::combinator::cut;
use nom::combinator::opt;
@ -67,6 +68,8 @@ pub enum Operator {
//
Outside,
Intersects,
//
Knn(u32), // <{k}>
}
impl Default for Operator {
@ -141,6 +144,7 @@ impl fmt::Display for Operator {
f.write_str("@@")
}
}
Self::Knn(k) => write!(f, "<{}>", k),
}
}
}
@ -191,12 +195,14 @@ pub fn binary_symbols(i: &str) -> IResult<&str, Operator> {
value(Operator::AnyLike, tag("?~")),
value(Operator::Like, char('~')),
matches,
knn,
)),
alt((
value(Operator::LessThanOrEqual, tag("<=")),
value(Operator::LessThan, char('<')),
value(Operator::MoreThanOrEqual, tag(">=")),
value(Operator::MoreThan, char('>')),
knn,
)),
alt((
value(Operator::Pow, tag("**")),
@ -257,7 +263,6 @@ pub fn binary_phrases(i: &str) -> IResult<&str, Operator> {
pub fn matches(i: &str) -> IResult<&str, Operator> {
let (i, _) = char('@')(i)?;
// let (i, reference) = opt(|i| uint8(i))(i)?;
cut(|i| {
let (i, reference) = opt(uint8)(i)?;
let (i, _) = char('@')(i)?;
@ -265,6 +270,13 @@ pub fn matches(i: &str) -> IResult<&str, Operator> {
})(i)
}
/// Parses the k-nearest-neighbours operator `<k>`, where `k` is an unsigned
/// 32-bit integer; for example `<5>` yields `Operator::Knn(5)`.
pub fn knn(i: &str) -> IResult<&str, Operator> {
    let (rest, _) = char('<')(i)?;
    let (rest, k) = uint32(rest)?;
    // The closing bracket completes the operator; keep whatever input follows it.
    char('>')(rest).map(|(rest, _)| (rest, Operator::Knn(k)))
}
#[cfg(test)]
mod tests {
use super::*;
@ -290,4 +302,13 @@ mod tests {
let res = matches("@256@");
res.unwrap_err();
}
/// `<5>` must parse to `Operator::Knn(5)` and display back as `<5>`.
#[test]
fn test_knn() {
    let parsed = knn("<5>");
    assert!(parsed.is_ok());
    let (_, op) = parsed.unwrap();
    assert_eq!("<5>", op.to_string());
    assert_eq!(op, Operator::Knn(5));
}
}

View file

@ -5,6 +5,7 @@ use crate::doc::CursorDoc;
use crate::err::Error;
use crate::iam::{Action, ResourceKind};
use crate::idx::ft::FtIndex;
use crate::idx::trees::mtree::MTreeIndex;
use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase;
use crate::sql::comment::shouldbespace;
@ -56,6 +57,11 @@ impl AnalyzeStatement {
FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Traversal).await?;
ft.statistics(&mut run).await?.into()
}
Index::MTree(p) => {
let mt =
MTreeIndex::new(&mut run, ikb, p, TreeStoreType::Traversal).await?;
mt.statistics(&mut run).await?.into()
}
_ => {
return Err(Error::FeatureNotYetImplemented {
feature: "Statistics on unique and non-unique indexes.".to_string(),

View file

@ -178,7 +178,7 @@ fn index_comment(i: &str) -> IResult<&str, DefineIndexOption> {
mod tests {
use super::*;
use crate::sql::index::SearchParams;
use crate::sql::index::{Distance, MTreeParams, SearchParams};
use crate::sql::Ident;
use crate::sql::Idiom;
use crate::sql::Idioms;
@ -275,4 +275,29 @@ mod tests {
"DEFINE INDEX my_index ON my_table FIELDS my_col SEARCH ANALYZER my_analyzer VS DOC_IDS_ORDER 100 DOC_LENGTHS_ORDER 100 POSTINGS_ORDER 100 TERMS_ORDER 100"
);
}
/// An MTREE index defined with only a dimension must parse, and must display
/// with the remaining parameters written out explicitly.
#[test]
fn check_create_mtree_index() {
    let sql = "INDEX my_index ON TABLE my_table COLUMNS my_col MTREE DIMENSION 4";
    let (_, parsed) = index(sql).unwrap();
    // Only DIMENSION is given in the statement; the other values are what the parser fills in.
    let mtree = MTreeParams {
        dimension: 4,
        distance: Distance::Euclidean,
        capacity: 40,
        doc_ids_order: 100,
    };
    let expected = DefineIndexStatement {
        name: Ident("my_index".to_string()),
        what: Ident("my_table".to_string()),
        cols: Idioms(vec![Idiom(vec![Part::Field(Ident("my_col".to_string()))])]),
        index: Index::MTree(mtree),
        comment: None,
    };
    assert_eq!(parsed, expected);
    assert_eq!(
        parsed.to_string(),
        "DEFINE INDEX my_index ON my_table FIELDS my_col MTREE DIMENSION 4 DIST EUCLIDEAN CAPACITY 40 DOC_IDS_ORDER 100"
    );
}
}

View file

@ -3,6 +3,7 @@ use crate::sql::index::Distance;
use crate::sql::value::serde::ser;
use serde::ser::Error as _;
use serde::ser::Impossible;
use serde::Serialize;
pub(super) struct Serializer;
@ -29,9 +30,34 @@ impl ser::Serializer for Serializer {
) -> Result<Self::Ok, Error> {
match variant {
"Euclidean" => Ok(Distance::Euclidean),
"Manhattan" => Ok(Distance::Manhattan),
"Cosine" => Ok(Distance::Cosine),
"Hamming" => Ok(Distance::Hamming),
"Mahalanobis" => Ok(Distance::Mahalanobis),
variant => Err(Error::custom(format!("unexpected unit variant `{name}::{variant}`"))),
}
}
/// Serializes a newtype variant of `Distance`.
///
/// Only `Minkowski` is accepted; its payload is serialized with the number
/// serializer. Any other variant name is rejected with a custom error.
#[inline]
fn serialize_newtype_variant<T>(
    self,
    name: &'static str,
    _variant_index: u32,
    variant: &'static str,
    value: &T,
) -> Result<Self::Ok, Error>
where
    T: ?Sized + Serialize,
{
    if variant == "Minkowski" {
        let order = value.serialize(ser::number::Serializer.wrap())?;
        Ok(Distance::Minkowski(order))
    } else {
        Err(Error::custom(format!("unexpected newtype variant `{name}::{variant}`")))
    }
}
}
#[cfg(test)]
@ -41,9 +67,44 @@ mod tests {
use serde::Serialize;
#[test]
fn euclidean() {
fn distance_euclidean() {
let dist = Distance::Euclidean;
let serialized = dist.serialize(Serializer.wrap()).unwrap();
assert_eq!(dist, serialized);
}
/// `Distance::Manhattan` must survive a pass through the serializer unchanged.
#[test]
fn distance_manhattan() {
    let expected = Distance::Manhattan;
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
/// `Distance::Mahalanobis` must survive a pass through the serializer unchanged.
#[test]
fn distance_mahalanobis() {
    let expected = Distance::Mahalanobis;
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
/// `Distance::Hamming` must survive a pass through the serializer unchanged.
#[test]
fn distance_hamming() {
    let expected = Distance::Hamming;
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
/// `Distance::Cosine` must survive a pass through the serializer unchanged.
#[test]
fn distance_cosine() {
    let expected = Distance::Cosine;
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
/// `Distance::Minkowski` must keep its numeric payload through the serializer.
#[test]
fn distance_minkowski() {
    let expected = Distance::Minkowski(7.into());
    let actual = expected.serialize(Serializer.wrap()).unwrap();
    assert_eq!(expected, actual);
}
}

View file

@ -366,7 +366,7 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Object(a) = a else {
unreachable!()
};
let Value::Number(versionstamp1) = a.get("versionstamp").unwrap() else {
let Value::Number(versionstamp2) = a.get("versionstamp").unwrap() else {
unreachable!()
};
let changes = a.get("changes").unwrap().to_owned();
@ -389,10 +389,10 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Object(a) = a else {
unreachable!()
};
let Value::Number(versionstamp2) = a.get("versionstamp").unwrap() else {
let Value::Number(versionstamp3) = a.get("versionstamp").unwrap() else {
unreachable!()
};
assert!(versionstamp1 < versionstamp2);
assert!(versionstamp2 < versionstamp3);
let changes = a.get("changes").unwrap().to_owned();
assert_eq!(
changes,
@ -413,10 +413,10 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Object(a) = a else {
unreachable!()
};
let Value::Number(versionstamp3) = a.get("versionstamp").unwrap() else {
let Value::Number(versionstamp4) = a.get("versionstamp").unwrap() else {
unreachable!()
};
assert!(versionstamp2 < versionstamp3);
assert!(versionstamp3 < versionstamp4);
let changes = a.get("changes").unwrap().to_owned();
assert_eq!(
changes,
@ -437,10 +437,10 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Object(a) = a else {
unreachable!()
};
let Value::Number(versionstamp4) = a.get("versionstamp").unwrap() else {
let Value::Number(versionstamp5) = a.get("versionstamp").unwrap() else {
unreachable!()
};
assert!(versionstamp3 < versionstamp4);
assert!(versionstamp4 < versionstamp5);
let changes = a.get("changes").unwrap().to_owned();
assert_eq!(
changes,
@ -487,7 +487,7 @@ async fn changefeed_with_ts() -> Result<(), Error> {
let Value::Number(versionstamp1b) = a.get("versionstamp").unwrap() else {
unreachable!()
};
assert!(versionstamp1 == versionstamp1b);
assert!(versionstamp2 == versionstamp1b);
let changes = a.get("changes").unwrap().to_owned();
assert_eq!(
changes,

View file

@ -1211,7 +1211,9 @@ async fn define_statement_search_index() -> Result<(), Error> {
events: {},
fields: {},
tables: {},
indexes: { blog_title: 'DEFINE INDEX blog_title ON blog FIELDS title SEARCH ANALYZER simple BM25(1.2,0.75) DOC_IDS_ORDER 100 DOC_LENGTHS_ORDER 100 POSTINGS_ORDER 100 TERMS_ORDER 100 HIGHLIGHTS' },
indexes: { blog_title: 'DEFINE INDEX blog_title ON blog FIELDS title \
SEARCH ANALYZER simple BM25(1.2,0.75) \
DOC_IDS_ORDER 100 DOC_LENGTHS_ORDER 100 POSTINGS_ORDER 100 TERMS_ORDER 100 HIGHLIGHTS' },
lives: {},
}",
);

60
lib/tests/vector.rs Normal file
View file

@ -0,0 +1,60 @@
mod helpers;
mod parse;
use crate::helpers::new_ds;
use parse::Parse;
use surrealdb::dbs::Session;
use surrealdb::err::Error;
use surrealdb::sql::Value;
// Checks that the `<2>` knn operator on an MTREE-indexed field returns the two
// points nearest to `$pt`, and that EXPLAIN reports the `mt_pts` index in the plan.
#[tokio::test]
async fn select_where_mtree_knn() -> Result<(), Error> {
let sql = r"
CREATE pts:1 SET point = [1,2,3,4];
CREATE pts:2 SET point = [4,5,6,7];
CREATE pts:3 SET point = [8,9,10,11];
DEFINE INDEX mt_pts ON pts FIELDS point MTREE DIMENSION 4;
LET $pt = [2,3,4,5];
SELECT id, vector::distance::euclidean(point, $pt) AS dist FROM pts WHERE point <2> $pt;
SELECT id FROM pts WHERE point <2> $pt EXPLAIN;
";
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
// One response per statement: 3 CREATE, DEFINE INDEX, LET and 2 SELECT.
assert_eq!(res.len(), 7);
// Discard the five setup responses, propagating any error they carry.
for _ in 0..5 {
let _ = res.remove(0).result?;
}
// First SELECT: only pts:1 and pts:2 are expected, with their Euclidean distance to $pt.
let tmp = res.remove(0).result?;
let val = Value::parse(
"[
{
id: pts:1,
dist: 2f
},
{
id: pts:2,
dist: 4f
}
]",
);
// Compare the pretty-printed forms of the actual and expected values.
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
// Second SELECT (EXPLAIN): the plan must iterate the `mt_pts` index with the `<2>` operator.
let tmp = res.remove(0).result?;
let val = Value::parse(
"[
{
detail: {
plan: {
index: 'mt_pts',
operator: '<2>',
value: [2,3,4,5]
},
table: 'pts',
},
operation: 'Iterate Index'
}
]",
);
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
Ok(())
}