[Feat] KNN filtering with limit and KNN distance function ()

This commit is contained in:
Emmanuel Keller 2024-05-24 14:45:21 +01:00 committed by GitHub
parent 23653e5fce
commit 7495611bc4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
47 changed files with 2889 additions and 1103 deletions

View file

@ -9,8 +9,7 @@ use crate::dbs::Statement;
use crate::dbs::{Options, Transaction};
use crate::doc::Document;
use crate::err::Error;
use crate::idx::docids::DocId;
use crate::idx::planner::executor::IteratorRef;
use crate::idx::planner::iterators::{IteratorRecord, IteratorRef};
use crate::idx::planner::IterationStage;
use crate::sql::edges::Edges;
use crate::sql::range::Range;
@ -34,9 +33,8 @@ pub(crate) enum Iterable {
}
pub(crate) struct Processed {
pub(crate) ir: Option<IteratorRef>,
pub(crate) rid: Option<Thing>,
pub(crate) doc_id: Option<DocId>,
pub(crate) ir: Option<IteratorRecord>,
pub(crate) val: Operable,
}

View file

@ -5,9 +5,10 @@ use crate::dbs::distinct::AsyncDistinct;
use crate::dbs::distinct::SyncDistinct;
use crate::dbs::{Iterable, Iterator, Operable, Options, Processed, Statement, Transaction};
use crate::err::Error;
use crate::idx::planner::executor::IteratorRef;
use crate::idx::planner::iterators::{CollectorRecord, IteratorRef, ThingIterator};
use crate::idx::planner::IterationStage;
use crate::key::{graph, thing};
use crate::kvs;
use crate::kvs::ScanPage;
use crate::sql::dir::Dir;
use crate::sql::{Edges, Range, Table, Thing, Value};
@ -15,6 +16,7 @@ use crate::sql::{Edges, Range, Table, Thing, Value};
use channel::Sender;
use reblessive::tree::Stk;
use std::ops::Bound;
use std::vec;
impl Iterable {
#[allow(clippy::too_many_arguments)]
@ -60,7 +62,7 @@ impl Iterable {
if let Some(IterationStage::BuildKnn) = ctx.get_iteration_stage() {
if let Some(qp) = ctx.get_query_planner() {
if let Some(exe) = qp.get_query_executor(tb) {
return exe.has_knn();
return exe.has_bruteforce_knn();
}
}
}
@ -71,7 +73,7 @@ impl Iterable {
}
}
enum Processor<'a> {
pub(crate) enum Processor<'a> {
Iterator(Option<&'a mut SyncDistinct>, &'a mut Iterator),
#[cfg(not(target_arch = "wasm32"))]
Channel(Option<AsyncDistinct>, Sender<Processed>),
@ -141,17 +143,17 @@ impl<'a> Processor<'a> {
}
Iterable::Range(v) => self.process_range(stk, ctx, opt, txn, stm, v).await?,
Iterable::Edges(e) => self.process_edge(stk, ctx, opt, txn, stm, e).await?,
Iterable::Index(t, ir) => {
Iterable::Index(t, irf) => {
if let Some(qp) = ctx.get_query_planner() {
if let Some(exe) = qp.get_query_executor(&t.0) {
// We set the query executor matching the current table in the Context
// Avoiding search in the hashmap of the query planner for each doc
let mut ctx = Context::new(ctx);
ctx.set_query_executor(exe.clone());
return self.process_index(stk, &ctx, opt, txn, stm, &t, ir).await;
return self.process_index(stk, &ctx, opt, txn, stm, &t, irf).await;
}
}
self.process_index(stk, ctx, opt, txn, stm, &t, ir).await?
self.process_index(stk, ctx, opt, txn, stm, &t, irf).await?
}
Iterable::Mergeable(v, o) => {
self.process_mergeable(stk, ctx, opt, txn, stm, v, o).await?
@ -175,9 +177,8 @@ impl<'a> Processor<'a> {
) -> Result<(), Error> {
// Pass the value through
let pro = Processed {
ir: None,
rid: None,
doc_id: None,
ir: None,
val: Operable::Value(v),
};
// Process the document record
@ -205,9 +206,8 @@ impl<'a> Processor<'a> {
});
// Process the document record
let pro = Processed {
ir: None,
rid: Some(v),
doc_id: None,
ir: None,
val,
};
self.process(stk, ctx, opt, txn, stm, pro).await?;
@ -228,9 +228,8 @@ impl<'a> Processor<'a> {
txn.lock().await.check_ns_db_tb(opt.ns(), opt.db(), &v.tb, opt.strict).await?;
// Process the document record
let pro = Processed {
ir: None,
rid: Some(v),
doc_id: None,
ir: None,
val: Operable::Value(Value::None),
};
self.process(stk, ctx, opt, txn, stm, pro).await?;
@ -263,9 +262,8 @@ impl<'a> Processor<'a> {
let val = Operable::Mergeable(x, o);
// Process the document record
let pro = Processed {
ir: None,
rid: Some(v),
doc_id: None,
ir: None,
val,
};
self.process(stk, ctx, opt, txn, stm, pro).await?;
@ -299,9 +297,8 @@ impl<'a> Processor<'a> {
let val = Operable::Relatable(f, x, w);
// Process the document record
let pro = Processed {
ir: None,
rid: Some(v),
doc_id: None,
ir: None,
val,
};
self.process(stk, ctx, opt, txn, stm, pro).await?;
@ -352,9 +349,8 @@ impl<'a> Processor<'a> {
let val = Operable::Value(val);
// Process the record
let pro = Processed {
ir: None,
rid: Some(rid),
doc_id: None,
ir: None,
val,
};
self.process(stk, ctx, opt, txn, stm, pro).await?;
@ -425,9 +421,8 @@ impl<'a> Processor<'a> {
let val = Operable::Value(val);
// Process the record
let pro = Processed {
ir: None,
rid: Some(rid),
doc_id: None,
ir: None,
val,
};
self.process(stk, ctx, opt, txn, stm, pro).await?;
@ -551,9 +546,8 @@ impl<'a> Processor<'a> {
});
// Process the record
let pro = Processed {
ir: None,
rid: Some(rid),
doc_id: None,
ir: None,
val,
};
self.process(stk, ctx, opt, txn, stm, pro).await?;
@ -574,53 +568,27 @@ impl<'a> Processor<'a> {
txn: &Transaction,
stm: &Statement<'_>,
table: &Table,
ir: IteratorRef,
irf: IteratorRef,
) -> Result<(), Error> {
// Check that the table exists
txn.lock().await.check_ns_db_tb(opt.ns(), opt.db(), &table.0, opt.strict).await?;
if let Some(exe) = ctx.get_query_executor() {
if let Some(mut iterator) = exe.new_iterator(opt, ir).await? {
let mut things = Vec::new();
iterator.next_batch(txn, PROCESSOR_BATCH_SIZE, &mut things).await?;
while !things.is_empty() {
if let Some(mut iterator) = exe.new_iterator(opt, irf).await? {
// Get the first batch
let mut to_process = Self::next_batch(ctx, opt, txn, &mut iterator).await?;
while !to_process.is_empty() {
// Check if the context is finished
if ctx.is_done() {
break;
}
for (thing, doc_id) in things {
// Check the context
if ctx.is_done() {
break;
}
// If the record is from another table we can skip
if !thing.tb.eq(table.as_str()) {
continue;
}
// Fetch the data from the store
let key = thing::new(opt.ns(), opt.db(), &table.0, &thing.id);
let val = txn.lock().await.get(key.clone()).await?;
let rid = Thing::from((key.tb, key.id));
// Parse the data from the store
let val = Operable::Value(match val {
Some(v) => Value::from(v),
None => Value::None,
});
// Process the document record
let pro = Processed {
ir: Some(ir),
rid: Some(rid),
doc_id,
val,
};
// Process the records
// TODO: par_iter
for pro in to_process {
self.process(stk, ctx, opt, txn, stm, pro).await?;
}
// Collect the next batch of ids
things = Vec::new();
iterator.next_batch(txn, PROCESSOR_BATCH_SIZE, &mut things).await?;
// Get the next batch
to_process = Self::next_batch(ctx, opt, txn, &mut iterator).await?;
}
// Everything ok
return Ok(());
@ -634,4 +602,48 @@ impl<'a> Processor<'a> {
message: "No QueryExecutor has been found.".to_string(),
})
}
/// Fetches the next batch of records from an index iterator and turns
/// them into `Processed` items ready for document processing.
///
/// Each `CollectorRecord` carries a record id (`r.0`), an iterator
/// record (`r.1`) and an optional pre-fetched value (`r.2`); when the
/// value is absent it is read from the store via `Iterable::fetch_thing`.
async fn next_batch(
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
iterator: &mut ThingIterator,
) -> Result<Vec<Processed>, Error> {
let mut tx = txn.lock().await;
let records: Vec<CollectorRecord> =
iterator.next_batch(ctx, &mut tx, PROCESSOR_BATCH_SIZE).await?;
let mut to_process = Vec::with_capacity(records.len());
for r in records {
let v = if let Some(v) = r.2 {
// The value may already have been fetched by the KNN iterator to evaluate the condition
v
} else {
// Otherwise we have to fetch the record
Iterable::fetch_thing(&mut tx, opt, &r.0).await?
};
let p = Processed {
rid: Some(r.0),
ir: Some(r.1),
val: Operable::Value(v),
};
to_process.push(p);
}
Ok(to_process)
}
}
impl Iterable {
/// Returns the value from the store, or Value::None if the value does not exist.
pub(crate) async fn fetch_thing(
tx: &mut kvs::Transaction,
opt: &Options,
thg: &Thing,
) -> Result<Value, Error> {
// Build the key for this record within the current namespace/database
let key = thing::new(opt.ns(), opt.db(), &thg.tb, &thg.id);
// Fetch and parse the data from the store
let val = tx.get(key).await?.map(Value::from).unwrap_or(Value::None);
// Return the result
Ok(val)
}
}

View file

@ -1,8 +1,9 @@
use crate::ctx::Context;
use crate::dbs::Statement;
use crate::dbs::{Options, Transaction};
use crate::doc::Document;
use crate::doc::{CursorDoc, Document};
use crate::err::Error;
use crate::sql::Cond;
use reblessive::tree::Stk;
impl<'a> Document<'a> {
@ -13,11 +14,22 @@ impl<'a> Document<'a> {
opt: &Options,
txn: &Transaction,
stm: &Statement<'_>,
) -> Result<(), Error> {
Self::check_cond(stk, ctx, opt, txn, stm.conds(), &self.current).await
}
pub(crate) async fn check_cond(
stk: &mut Stk,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
cond: Option<&Cond>,
doc: &CursorDoc<'_>,
) -> Result<(), Error> {
// Check where condition
if let Some(cond) = stm.conds() {
if let Some(cond) = cond {
// Check if the expression is truthy
if !cond.compute(stk, ctx, opt, txn, Some(&self.current)).await?.is_truthy() {
if !cond.compute(stk, ctx, opt, txn, Some(doc)).await?.is_truthy() {
// Ignore this document
return Err(Error::Ignore);
}

View file

@ -33,7 +33,7 @@ impl<'a> Document<'a> {
Operable::Relatable(f, v, w) => (v, Workable::Relate(f, w)),
};
// Setup a new document
let mut doc = Document::new(pro.ir, pro.rid.as_ref(), pro.doc_id, &ins.0, ins.1);
let mut doc = Document::new(pro.rid.as_ref(), pro.ir.as_ref(), &ins.0, ins.1);
// Process the statement
let res = match stm {
Statement::Select(_) => doc.select(stk, ctx, opt, txn, stm).await,
@ -59,9 +59,8 @@ impl<'a> Document<'a> {
None => Value::None,
};
pro = Processed {
ir: None,
doc_id: None,
rid: Some(v),
ir: None,
val: match doc.extras {
Workable::Normal => Operable::Value(val),
Workable::Insert(o) => Operable::Mergeable(val, o),

View file

@ -4,8 +4,7 @@ use crate::dbs::Workable;
use crate::err::Error;
use crate::iam::Action;
use crate::iam::ResourceKind;
use crate::idx::docids::DocId;
use crate::idx::planner::executor::IteratorRef;
use crate::idx::planner::iterators::IteratorRecord;
use crate::sql::statements::define::DefineEventStatement;
use crate::sql::statements::define::DefineFieldStatement;
use crate::sql::statements::define::DefineIndexStatement;
@ -28,24 +27,21 @@ pub(crate) struct Document<'a> {
#[non_exhaustive]
#[cfg_attr(debug_assertions, derive(Debug))]
pub struct CursorDoc<'a> {
pub(crate) ir: Option<IteratorRef>,
pub(crate) rid: Option<&'a Thing>,
pub(crate) ir: Option<&'a IteratorRecord>,
pub(crate) doc: Cow<'a, Value>,
pub(crate) doc_id: Option<DocId>,
}
impl<'a> CursorDoc<'a> {
pub(crate) fn new(
ir: Option<IteratorRef>,
rid: Option<&'a Thing>,
doc_id: Option<DocId>,
ir: Option<&'a IteratorRecord>,
doc: Cow<'a, Value>,
) -> Self {
Self {
ir,
rid,
ir,
doc,
doc_id,
}
}
}
@ -53,10 +49,9 @@ impl<'a> CursorDoc<'a> {
impl<'a> From<&'a Value> for CursorDoc<'a> {
fn from(doc: &'a Value) -> Self {
Self {
ir: None,
rid: None,
ir: None,
doc: Cow::Borrowed(doc),
doc_id: None,
}
}
}
@ -64,10 +59,9 @@ impl<'a> From<&'a Value> for CursorDoc<'a> {
impl<'a> From<&'a mut Value> for CursorDoc<'a> {
fn from(doc: &'a mut Value) -> Self {
Self {
ir: None,
rid: None,
ir: None,
doc: Cow::Borrowed(doc),
doc_id: None,
}
}
}
@ -86,17 +80,16 @@ impl<'a> From<&Document<'a>> for Vec<u8> {
impl<'a> Document<'a> {
pub fn new(
ir: Option<IteratorRef>,
id: Option<&'a Thing>,
doc_id: Option<DocId>,
ir: Option<&'a IteratorRecord>,
val: &'a Value,
extras: Workable,
) -> Self {
Document {
id,
extras,
current: CursorDoc::new(ir, id, doc_id, Cow::Borrowed(val)),
initial: CursorDoc::new(ir, id, doc_id, Cow::Borrowed(val)),
current: CursorDoc::new(id, ir, Cow::Borrowed(val)),
initial: CursorDoc::new(id, ir, Cow::Borrowed(val)),
}
}
@ -105,9 +98,8 @@ impl<'a> Document<'a> {
/// This allows for it to be crafted without needing statements to operate on it
#[doc(hidden)]
pub fn new_artificial(
ir: Option<IteratorRef>,
id: Option<&'a Thing>,
doc_id: Option<DocId>,
ir: Option<&'a IteratorRecord>,
val: Cow<'a, Value>,
initial: Cow<'a, Value>,
extras: Workable,
@ -115,8 +107,8 @@ impl<'a> Document<'a> {
Document {
id,
extras,
current: CursorDoc::new(ir, id, doc_id, val),
initial: CursorDoc::new(ir, id, doc_id, initial),
current: CursorDoc::new(id, ir, val),
initial: CursorDoc::new(id, ir, initial),
}
}

View file

@ -27,7 +27,7 @@ impl<'a> Document<'a> {
Operable::Relatable(f, v, w) => (v, Workable::Relate(f, w)),
};
// Setup a new document
let mut doc = Document::new(pro.ir, pro.rid.as_ref(), pro.doc_id, &ins.0, ins.1);
let mut doc = Document::new(pro.rid.as_ref(), pro.ir.as_ref(), &ins.0, ins.1);
// Process the statement
let res = match stm {
Statement::Select(_) => doc.select(stk, ctx, opt, txn, stm).await,
@ -53,9 +53,8 @@ impl<'a> Document<'a> {
None => Value::None,
};
pro = Processed {
ir: None,
doc_id: None,
rid: Some(v),
ir: None,
val: match doc.extras {
Workable::Normal => Operable::Value(val),
Workable::Insert(o) => Operable::Mergeable(val, o),

View file

@ -4,7 +4,9 @@ use crate::dbs::Options;
use crate::dbs::Transaction;
use crate::doc::CursorDoc;
use crate::err::Error;
use crate::idx::planner::executor::QueryExecutor;
use crate::sql::value::Value;
use crate::sql::Thing;
use reblessive::tree::Stk;
pub mod args;
@ -55,7 +57,7 @@ pub async fn run(
{
stk.run(|stk| asynchronous(stk, ctx, Some(opt), Some(txn), doc, name, args)).await
} else {
synchronous(ctx, name, args)
synchronous(ctx, doc, name, args)
}
}
@ -85,7 +87,12 @@ macro_rules! dispatch {
}
/// Attempts to run any synchronous function.
pub fn synchronous(ctx: &Context<'_>, name: &str, args: Vec<Value>) -> Result<Value, Error> {
pub fn synchronous(
ctx: &Context<'_>,
doc: Option<&CursorDoc<'_>>,
name: &str,
args: Vec<Value>,
) -> Result<Value, Error> {
dispatch!(
name,
args,
@ -362,6 +369,7 @@ pub fn synchronous(ctx: &Context<'_>, name: &str, args: Vec<Value>) -> Result<Va
"vector::distance::chebyshev" => vector::distance::chebyshev,
"vector::distance::euclidean" => vector::distance::euclidean,
"vector::distance::hamming" => vector::distance::hamming,
"vector::distance::knn" => vector::distance::knn((ctx, doc)),
"vector::distance::mahalanobis" => vector::distance::mahalanobis,
"vector::distance::manhattan" => vector::distance::manhattan,
"vector::distance::minkowski" => vector::distance::minkowski,
@ -509,3 +517,19 @@ mod tests {
}
}
}
/// Resolves the execution context for index-aware functions: the query
/// executor of the current record's table, the cursor document, and the
/// record id. Returns `None` when any piece is unavailable.
fn get_execution_context<'a>(
	ctx: &'a Context<'_>,
	doc: Option<&'a CursorDoc<'_>>,
) -> Option<(&'a QueryExecutor, &'a CursorDoc<'a>, &'a Thing)> {
	// Flatten the nested lookups with the `?` operator: bail out with
	// `None` as soon as any part of the context is missing.
	let doc = doc?;
	let thg = doc.rid?;
	let exe = ctx.get_query_planner()?.get_query_executor(&thg.tb)?;
	Some((exe, doc, thg))
}

View file

@ -201,7 +201,7 @@ fn get_executor_option<'a>(
if let Some(doc) = doc {
if let Some((exe, thg)) = get_executor_and_thing(ctx, doc) {
if let Some(ir) = doc.ir {
if exe.is_iterator_expression(ir, exp) {
if exe.is_iterator_expression(ir.irf(), exp) {
return ExecutorOption::PreMatch;
}
}

View file

@ -60,7 +60,7 @@ impl_module_def!(
fn run(js_ctx: js::Ctx<'_>, name: &str, args: Vec<Value>) -> Result<Value> {
let this = js_ctx.globals().get::<_, OwnedBorrow<QueryContext>>(QUERY_DATA_PROP_NAME)?;
// Process the called function
let res = fnc::synchronous(this.context, name, args);
let res = fnc::synchronous(this.context, this.doc, name, args);
// Convert any response error
res.map_err(|err| {
js::Exception::from_message(js_ctx, &err.to_string())

View file

@ -10,6 +10,7 @@ impl_module_def!(
"chebyshev" => run,
"euclidean" => run,
"hamming" => run,
"knn" => run,
"mahalanobis" => run,
"manhattan" => run,
"minkowski" => run

View file

@ -2,30 +2,11 @@ use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::doc::CursorDoc;
use crate::err::Error;
use crate::fnc::get_execution_context;
use crate::idx::ft::analyzer::Analyzer;
use crate::idx::planner::executor::QueryExecutor;
use crate::sql::{Thing, Value};
use crate::sql::Value;
use reblessive::tree::Stk;
fn get_execution_context<'a>(
ctx: &'a Context<'_>,
txn: Option<&'a Transaction>,
doc: Option<&'a CursorDoc<'_>>,
) -> Option<(&'a Transaction, &'a QueryExecutor, &'a CursorDoc<'a>, &'a Thing)> {
if let Some(txn) = txn {
if let Some(doc) = doc {
if let Some(thg) = doc.rid {
if let Some(pla) = ctx.get_query_planner() {
if let Some(exe) = pla.get_query_executor(&thg.tb) {
return Some((txn, exe, doc, thg));
}
}
}
}
}
None
}
pub async fn analyze(
(stk, ctx, txn, opt): (&mut Stk, &Context<'_>, Option<&Transaction>, Option<&Options>),
(az, val): (Value, Value),
@ -43,33 +24,38 @@ pub async fn score(
(ctx, txn, doc): (&Context<'_>, Option<&Transaction>, Option<&CursorDoc<'_>>),
(match_ref,): (Value,),
) -> Result<Value, Error> {
if let Some((txn, exe, doc, thg)) = get_execution_context(ctx, txn, doc) {
exe.score(txn, &match_ref, thg, doc.doc_id).await
} else {
Ok(Value::None)
if let Some(txn) = txn {
if let Some((exe, doc, thg)) = get_execution_context(ctx, doc) {
return exe.score(txn, &match_ref, thg, doc.ir).await;
}
}
Ok(Value::None)
}
pub async fn highlight(
(ctx, txn, doc): (&Context<'_>, Option<&Transaction>, Option<&CursorDoc<'_>>),
(prefix, suffix, match_ref, partial): (Value, Value, Value, Option<Value>),
) -> Result<Value, Error> {
if let Some((txn, exe, doc, thg)) = get_execution_context(ctx, txn, doc) {
let partial = partial.map(|p| p.convert_to_bool()).unwrap_or(Ok(false))?;
exe.highlight(txn, thg, prefix, suffix, match_ref, partial, doc.doc.as_ref()).await
} else {
Ok(Value::None)
if let Some(txn) = txn {
if let Some((exe, doc, thg)) = get_execution_context(ctx, doc) {
let partial = partial.map(|p| p.convert_to_bool()).unwrap_or(Ok(false))?;
return exe
.highlight(txn, thg, prefix, suffix, match_ref, partial, doc.doc.as_ref())
.await;
}
}
Ok(Value::None)
}
pub async fn offsets(
(ctx, txn, doc): (&Context<'_>, Option<&Transaction>, Option<&CursorDoc<'_>>),
(match_ref, partial): (Value, Option<Value>),
) -> Result<Value, Error> {
if let Some((txn, exe, _, thg)) = get_execution_context(ctx, txn, doc) {
let partial = partial.map(|p| p.convert_to_bool()).unwrap_or(Ok(false))?;
exe.offsets(txn, thg, match_ref, partial).await
} else {
Ok(Value::None)
if let Some(txn) = txn {
if let Some((exe, _, thg)) = get_execution_context(ctx, doc) {
let partial = partial.map(|p| p.convert_to_bool()).unwrap_or(Ok(false))?;
return exe.offsets(txn, thg, match_ref, partial).await;
}
}
Ok(Value::None)
}

View file

@ -45,11 +45,14 @@ pub fn subtract((a, b): (Vec<Number>, Vec<Number>)) -> Result<Value, Error> {
}
pub mod distance {
use crate::ctx::Context;
use crate::doc::CursorDoc;
use crate::err::Error;
use crate::fnc::get_execution_context;
use crate::fnc::util::math::vector::{
ChebyshevDistance, EuclideanDistance, HammingDistance, ManhattanDistance, MinkowskiDistance,
};
use crate::idx::planner::IterationStage;
use crate::sql::{Number, Value};
pub fn chebyshev((a, b): (Vec<Number>, Vec<Number>)) -> Result<Value, Error> {
@ -64,6 +67,30 @@ pub mod distance {
Ok(a.hamming_distance(&b)?.into())
}
/// Returns the KNN distance for the current document, or `Value::None`
/// when no distance is available.
///
/// The distance is taken from the iterator record when the KNN search
/// was executed by an index iterator; otherwise it is looked up in the
/// brute-force KNN results stored in the iteration stage, using the
/// optional numeric `knn_ref` to select the matching KNN expression
/// (defaulting to 0).
pub fn knn(
(ctx, doc): (&Context<'_>, Option<&CursorDoc<'_>>),
(knn_ref,): (Option<Value>,),
) -> Result<Value, Error> {
if let Some((_exe, doc, thg)) = get_execution_context(ctx, doc) {
// The index iterator may already carry the computed distance
if let Some(ir) = doc.ir {
if let Some(d) = ir.dist() {
return Ok(d.into());
}
}
// Otherwise check the brute-force KNN results collected during iteration
if let Some(IterationStage::Iterate(Some(results))) = ctx.get_iteration_stage() {
let n = if let Some(Value::Number(n)) = knn_ref {
n.as_usize()
} else {
0
};
if let Some(d) = results.get_dist(n, thg) {
return Ok(d.into());
}
}
}
Ok(Value::None)
}
pub fn mahalanobis((_, _): (Vec<Number>, Vec<Number>)) -> Result<Value, Error> {
Err(Error::FeatureNotYetImplemented {
feature: "vector::distance::mahalanobis() function".to_string(),

View file

@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
pub type DocId = u64;
pub(crate) struct DocIds {
pub struct DocIds {
ixs: IndexStores,
state_key: Key,
index_key_base: IndexKeyBase,
@ -21,7 +21,7 @@ pub(crate) struct DocIds {
}
impl DocIds {
pub(in crate::idx) async fn new(
pub async fn new(
ixs: &IndexStores,
tx: &mut Transaction,
tt: TransactionType,

View file

@ -89,6 +89,7 @@ impl Analyzer {
}
}
}
drop(tx);
Ok((
list,
TermsSet {
@ -123,6 +124,7 @@ impl Analyzer {
}
}
}
drop(tx);
Ok(TermsSet {
set,
has_unknown_terms,
@ -176,6 +178,7 @@ impl Analyzer {
for (t, f) in tf {
tfid.push((terms.resolve_term_id(&mut tx, t).await?, f));
}
drop(tx);
Ok((dl, tfid))
}
@ -221,6 +224,7 @@ impl Analyzer {
tfid.push((id, o.len() as TermFrequency));
osid.push((id, OffsetRecords(o)));
}
drop(tx);
Ok((dl, tfid, osid))
}

View file

@ -108,7 +108,9 @@ impl FtIndex {
) -> Result<Self, Error> {
let mut tx = txn.lock().await;
let az = tx.get_db_analyzer(opt.ns(), opt.db(), az).await?;
Self::with_analyzer(ixs, &mut tx, az, index_key_base, p, tt).await
let res = Self::with_analyzer(ixs, &mut tx, az, index_key_base, p, tt).await;
drop(tx);
res
}
async fn with_analyzer(
ixs: &IndexStores,
@ -194,13 +196,17 @@ impl FtIndex {
) -> Result<(), Error> {
let mut tx = txn.lock().await;
// Extract and remove the doc_id (if any)
if let Some(doc_id) = self.doc_ids.write().await.remove_doc(&mut tx, rid.into()).await? {
let mut doc_ids = self.doc_ids.write().await;
let doc_id = doc_ids.remove_doc(&mut tx, rid.into()).await?;
drop(doc_ids);
if let Some(doc_id) = doc_id {
self.state.doc_count -= 1;
// Remove the doc length
if let Some(doc_lengths) =
self.doc_lengths.write().await.remove_doc_length(&mut tx, doc_id).await?
{
let mut doc_lengths = self.doc_lengths.write().await;
let dl = doc_lengths.remove_doc_length(&mut tx, doc_id).await?;
drop(doc_lengths);
if let Some(doc_lengths) = dl {
self.state.total_docs_lengths -= doc_lengths as u128;
}
@ -218,6 +224,8 @@ impl FtIndex {
t.remove_term_id(&mut tx, term_id).await?;
}
}
drop(p);
drop(t);
// Remove the offsets if any
if self.highlighting {
for term_id in term_list {
@ -227,6 +235,7 @@ impl FtIndex {
}
}
}
drop(tx);
Ok(())
}
@ -241,9 +250,11 @@ impl FtIndex {
) -> Result<(), Error> {
// Resolve the doc_id
let mut tx = txn.lock().await;
let resolved = self.doc_ids.write().await.resolve_doc_id(&mut tx, rid.into()).await?;
let doc_id = *resolved.doc_id();
let mut doc_ids = self.doc_ids.write().await;
let resolved = doc_ids.resolve_doc_id(&mut tx, rid.into()).await?;
drop(doc_ids);
drop(tx);
let doc_id = *resolved.doc_id();
// Extract the doc_lengths, terms en frequencies (and offset)
let mut t = self.terms.write().await;
@ -270,6 +281,7 @@ impl FtIndex {
}
}
dl.set_doc_length(&mut tx, doc_id, doc_length).await?;
drop(dl);
// Retrieve the existing terms for this document (if any)
let term_ids_key = self.index_key_base.new_bk_key(doc_id);
@ -302,6 +314,8 @@ impl FtIndex {
}
}
}
drop(p);
drop(t);
if self.highlighting {
// Set the offset if any
@ -333,6 +347,7 @@ impl FtIndex {
// Update the states
tx.set(self.state_key.clone(), self.state.try_to_val()?).await?;
drop(tx);
Ok(())
}
@ -347,6 +362,7 @@ impl FtIndex {
let t = self.terms.read().await;
let res =
self.analyzer.extract_querying_terms(stk, ctx, opt, txn, &t, query_string).await?;
drop(t);
Ok(res)
}
@ -422,7 +438,10 @@ impl FtIndex {
doc: &Value,
) -> Result<Value, Error> {
let doc_key: Key = thg.into();
if let Some(doc_id) = self.doc_ids.read().await.get_doc_id(tx, doc_key).await? {
let di = self.doc_ids.read().await;
let doc_id = di.get_doc_id(tx, doc_key).await?;
drop(di);
if let Some(doc_id) = doc_id {
let mut hl = Highlighter::new(prefix, suffix, partial, idiom, doc);
for (term_id, term_len) in terms.iter().flatten() {
let o = self.offsets.get_offsets(tx, doc_id, *term_id).await?;
@ -443,7 +462,10 @@ impl FtIndex {
partial: bool,
) -> Result<Value, Error> {
let doc_key: Key = thg.into();
if let Some(doc_id) = self.doc_ids.read().await.get_doc_id(tx, doc_key).await? {
let di = self.doc_ids.read().await;
let doc_id = di.get_doc_id(tx, doc_key).await?;
drop(di);
if let Some(doc_id) = doc_id {
let mut or = Offseter::new(partial);
for (term_id, term_len) in terms.iter().flatten() {
let o = self.offsets.get_offsets(tx, doc_id, *term_id).await?;
@ -459,12 +481,14 @@ impl FtIndex {
pub(crate) async fn statistics(&self, txn: &Transaction) -> Result<FtStatistics, Error> {
// TODO do parallel execution
let mut run = txn.lock().await;
Ok(FtStatistics {
let res = FtStatistics {
doc_ids: self.doc_ids.read().await.statistics(&mut run).await?,
terms: self.terms.read().await.statistics(&mut run).await?,
doc_lengths: self.doc_lengths.read().await.statistics(&mut run).await?,
postings: self.postings.read().await.statistics(&mut run).await?,
})
};
drop(run);
Ok(res)
}
pub(crate) async fn finish(&self, tx: &Transaction) -> Result<(), Error> {
@ -473,6 +497,7 @@ impl FtIndex {
self.doc_lengths.write().await.finish(&mut run).await?;
self.postings.write().await.finish(&mut run).await?;
self.terms.write().await.finish(&mut run).await?;
drop(run);
Ok(())
}
}
@ -490,15 +515,27 @@ impl HitsIterator {
}
}
#[cfg(not(target_arch = "wasm32"))]
pub(crate) fn len(&self) -> usize {
self.iter.len()
}
#[cfg(target_arch = "wasm32")]
pub(crate) fn len(&self) -> usize {
self.iter.size_hint().0
}
pub(crate) async fn next(
&mut self,
tx: &mut kvs::Transaction,
) -> Result<Option<(Thing, DocId)>, Error> {
let di = self.doc_ids.read().await;
for doc_id in self.iter.by_ref() {
if let Some(doc_key) = self.doc_ids.read().await.get_doc_key(tx, doc_id).await? {
if let Some(doc_key) = di.get_doc_key(tx, doc_id).await? {
drop(di);
return Ok(Some((doc_key.into(), doc_id)));
}
}
drop(di);
Ok(None)
}
}
@ -541,6 +578,7 @@ mod tests {
} else {
panic!("hits is none");
}
drop(tx);
}
async fn search(
@ -600,7 +638,9 @@ mod tests {
pub(super) async fn finish(txn: &Transaction, fti: FtIndex) {
fti.finish(txn).await.unwrap();
txn.lock().await.commit().await.unwrap();
let mut tx = txn.lock().await;
tx.commit().await.unwrap();
drop(tx);
}
#[test(tokio::test)]

View file

@ -45,8 +45,9 @@ impl BM25Scorer {
term_doc_count: DocLength,
term_frequency: TermFrequency,
) -> Result<Score, Error> {
let doc_length =
self.doc_lengths.read().await.get_doc_length(tx, doc_id).await?.unwrap_or(0);
let dl = self.doc_lengths.read().await;
let doc_length = dl.get_doc_length(tx, doc_id).await?.unwrap_or(0);
drop(dl);
Ok(self.compute_bm25_score(term_frequency as f32, term_doc_count as f32, doc_length as f32))
}
@ -56,15 +57,16 @@ impl BM25Scorer {
doc_id: DocId,
) -> Result<Option<Score>, Error> {
let mut sc = 0.0;
let p = self.postings.read().await;
for (term_id, docs) in self.terms_docs.iter().flatten() {
if docs.contains(doc_id) {
if let Some(term_freq) =
self.postings.read().await.get_term_frequency(tx, *term_id, doc_id).await?
{
let tf = p.get_term_frequency(tx, *term_id, doc_id).await?;
if let Some(term_freq) = tf {
sc += self.term_score(tx, doc_id, docs.len(), term_freq).await?;
}
}
}
drop(p);
Ok(Some(sc))
}

View file

@ -1,6 +1,6 @@
pub mod docids;
pub(crate) mod ft;
pub(crate) mod planner;
pub mod planner;
pub mod trees;
use crate::dbs::Options;

View file

@ -0,0 +1,352 @@
use crate::ctx::Context;
use crate::dbs::{Iterable, Options, Transaction};
use crate::doc::CursorDoc;
use crate::err::Error;
use crate::idx::docids::{DocId, DocIds};
use crate::idx::planner::iterators::KnnIteratorResult;
use crate::idx::trees::hnsw::docs::HnswDocs;
use crate::idx::trees::knn::Ids64;
use crate::sql::{Cond, Thing, Value};
use hashbrown::hash_map::Entry;
use hashbrown::HashMap;
use reblessive::tree::Stk;
use std::borrow::Cow;
use std::collections::VecDeque;
use std::sync::Arc;
/// Checks KNN candidates coming from the HNSW index, optionally
/// evaluating a `WHERE` condition against each candidate record.
pub enum HnswConditionChecker<'a> {
Hnsw(HnswChecker),
HnswCondition(HnswCondChecker<'a>),
}
/// Checks KNN candidates coming from the MTree index, optionally
/// evaluating a `WHERE` condition against each candidate record.
pub enum MTreeConditionChecker<'a> {
MTree(MTreeChecker<'a>),
MTreeCondition(MTreeCondChecker<'a>),
}
impl<'a> Default for HnswConditionChecker<'a> {
// By default there is no condition to evaluate, so every candidate passes.
fn default() -> Self {
Self::Hnsw(HnswChecker {})
}
}
impl<'a> HnswConditionChecker<'a> {
/// Builds a checker that evaluates the given `WHERE` condition,
/// caching the per-document result.
pub(in crate::idx) fn new_cond(
ctx: &'a Context<'a>,
opt: &'a Options,
txn: &'a Transaction,
cond: Arc<Cond>,
) -> Self {
Self::HnswCondition(HnswCondChecker {
ctx,
opt,
txn,
cond,
cache: Default::default(),
})
}
/// Returns whether any of the given documents matches the condition
/// (always true when there is no condition to evaluate).
pub(in crate::idx) async fn check_truthy(
&mut self,
stk: &mut Stk,
docs: &HnswDocs,
doc_ids: &Ids64,
) -> Result<bool, Error> {
match self {
Self::HnswCondition(c) => c.check_any_truthy(stk, docs, doc_ids).await,
Self::Hnsw(_) => Ok(true),
}
}
/// Evicts one document from the condition cache (no-op without a condition).
pub(in crate::idx) fn expire(&mut self, doc_id: u64) {
if let Self::HnswCondition(c) = self {
c.expire(doc_id)
}
}
/// Evicts a set of documents from the condition cache (no-op without a condition).
pub(in crate::idx) fn expires(&mut self, doc_ids: Ids64) {
if let Self::HnswCondition(c) = self {
c.expires(doc_ids)
}
}
/// Converts raw `(DocId, distance)` pairs into KNN iterator results,
/// resolving record ids and reusing cached values where available.
pub(in crate::idx) async fn convert_result(
&mut self,
docs: &HnswDocs,
res: VecDeque<(DocId, f64)>,
) -> Result<VecDeque<KnnIteratorResult>, Error> {
match self {
Self::Hnsw(c) => c.convert_result(docs, res).await,
Self::HnswCondition(c) => Ok(c.convert_result(res)),
}
}
}
impl<'a> MTreeConditionChecker<'a> {
	/// Builds a checker for a KNN search with an optional `WHERE` condition.
	/// When the condition is the constant `true` it can never filter anything
	/// out, so we fall back to the cheaper unconditional checker.
	pub fn new_cond(
		ctx: &'a Context<'_>,
		opt: &'a Options,
		txn: &'a Transaction,
		cond: Arc<Cond>,
	) -> Self {
		// Use a plain if/else expression rather than mixing `return`
		// with an else-branch tail value (clippy: needless_return).
		if Cond(Value::Bool(true)).ne(cond.as_ref()) {
			Self::MTreeCondition(MTreeCondChecker {
				ctx,
				opt,
				txn,
				cond,
				cache: Default::default(),
			})
		} else {
			Self::new(txn)
		}
	}
	/// Builds a checker that accepts every candidate document.
	pub fn new(txn: &'a Transaction) -> Self {
		Self::MTree(MTreeChecker {
			txn,
		})
	}
	/// Returns whether the document matches the condition
	/// (always true when there is no condition to evaluate).
	pub(in crate::idx) async fn check_truthy(
		&mut self,
		stk: &mut Stk,
		doc_ids: &DocIds,
		doc_id: DocId,
	) -> Result<bool, Error> {
		match self {
			Self::MTreeCondition(c) => c.check_truthy(stk, doc_ids, doc_id).await,
			Self::MTree(_) => Ok(true),
		}
	}
	/// Evicts a set of documents from the condition cache (no-op without a condition).
	pub(in crate::idx) fn expires(&mut self, ids: Ids64) {
		if let Self::MTreeCondition(c) = self {
			c.expires(ids)
		}
	}
	/// Converts raw `(DocId, distance)` pairs into KNN iterator results,
	/// resolving record ids and reusing cached values where available.
	pub(in crate::idx) async fn convert_result(
		&mut self,
		doc_ids: &DocIds,
		res: VecDeque<(DocId, f64)>,
	) -> Result<VecDeque<KnnIteratorResult>, Error> {
		match self {
			Self::MTree(c) => c.convert_result(doc_ids, res).await,
			Self::MTreeCondition(c) => Ok(c.convert_result(res)),
		}
	}
}
/// Unconditional MTree checker: accepts every candidate and only
/// resolves record ids when converting results.
pub struct MTreeChecker<'a> {
txn: &'a Transaction,
}
impl<'a> MTreeChecker<'a> {
/// Converts raw `(DocId, distance)` pairs into KNN iterator results by
/// resolving each doc id to its record key. Doc ids that no longer
/// resolve are silently skipped; values are not fetched here (`None`).
async fn convert_result(
&self,
doc_ids: &DocIds,
res: VecDeque<(DocId, f64)>,
) -> Result<VecDeque<KnnIteratorResult>, Error> {
if res.is_empty() {
return Ok(VecDeque::from([]));
}
let mut result = VecDeque::with_capacity(res.len());
let mut tx = self.txn.lock().await;
for (doc_id, dist) in res {
if let Some(key) = doc_ids.get_doc_key(&mut tx, doc_id).await? {
result.push_back((key.into(), dist, None));
}
}
// Release the transaction lock before returning
drop(tx);
Ok(result)
}
}
/// Cached outcome of evaluating the condition against one document:
/// the fetched record (if any) and whether the condition was truthy.
struct CheckerCacheEntry {
record: Option<(Thing, Value)>,
truthy: bool,
}
impl CheckerCacheEntry {
/// Turns raw `(DocId, distance)` results into iterator results,
/// keeping only documents whose cached condition check was truthy.
/// Entries are removed from the cache as they are consumed.
fn convert_result(
res: VecDeque<(DocId, f64)>,
cache: &mut HashMap<DocId, CheckerCacheEntry>,
) -> VecDeque<KnnIteratorResult> {
let mut result = VecDeque::with_capacity(res.len());
for (doc_id, dist) in res {
if let Some(e) = cache.remove(&doc_id) {
if e.truthy {
if let Some((rid, value)) = e.record {
result.push_back((rid, dist, Some(value)))
}
}
}
}
result
}
/// Fetches the record (when a record id is known) and evaluates the
/// condition against it, producing a cacheable entry.
async fn build(
stk: &mut Stk,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
rid: Option<Thing>,
cond: &Cond,
) -> Result<Self, Error> {
if let Some(rid) = rid {
let mut tx = txn.lock().await;
let val = Iterable::fetch_thing(&mut tx, opt, &rid).await?;
drop(tx);
if !val.is_none_or_null() {
let (value, truthy) = {
// Wrap the fetched value in a cursor document so the
// condition can be computed against it
let cursor_doc = CursorDoc {
rid: Some(&rid),
ir: None,
doc: Cow::Owned(val),
};
let truthy =
cond.compute(stk, ctx, opt, txn, Some(&cursor_doc)).await?.is_truthy();
(cursor_doc.doc.into_owned(), truthy)
};
return Ok(CheckerCacheEntry {
record: Some((rid, value)),
truthy,
});
}
}
// Missing or empty records never match the condition
Ok(CheckerCacheEntry {
record: None,
truthy: false,
})
}
}
/// MTree checker that evaluates a `WHERE` condition per candidate,
/// caching the outcome by `DocId`.
pub struct MTreeCondChecker<'a> {
ctx: &'a Context<'a>,
opt: &'a Options,
txn: &'a Transaction,
cond: Arc<Cond>,
cache: HashMap<DocId, CheckerCacheEntry>,
}
impl<'a> MTreeCondChecker<'a> {
/// Returns whether the given document matches the condition, reusing
/// the cached result or building (and caching) it on first sight.
async fn check_truthy(
&mut self,
stk: &mut Stk,
doc_ids: &DocIds,
doc_id: u64,
) -> Result<bool, Error> {
match self.cache.entry(doc_id) {
Entry::Occupied(e) => Ok(e.get().truthy),
Entry::Vacant(e) => {
// Resolve the record id for this doc id
let mut tx = self.txn.lock().await;
let rid = doc_ids.get_doc_key(&mut tx, doc_id).await?.map(|k| k.into());
drop(tx);
// Fetch the record and evaluate the condition
let ent = CheckerCacheEntry::build(
stk,
self.ctx,
self.opt,
self.txn,
rid,
self.cond.as_ref(),
)
.await?;
let truthy = ent.truthy;
e.insert(ent);
Ok(truthy)
}
}
}
// Evicts a single document from the condition cache
fn expire(&mut self, doc_id: DocId) {
self.cache.remove(&doc_id);
}
// Evicts a set of documents from the condition cache
fn expires(&mut self, doc_ids: Ids64) {
for doc_id in doc_ids.iter() {
self.expire(doc_id);
}
}
// Converts results, keeping only documents cached as truthy
fn convert_result(&mut self, res: VecDeque<(DocId, f64)>) -> VecDeque<KnnIteratorResult> {
CheckerCacheEntry::convert_result(res, &mut self.cache)
}
}
/// Checker used for HNSW searches without a `WHERE` condition:
/// results are returned as-is, no per-document filtering is performed.
pub struct HnswChecker {}
// NOTE: the previous `impl<'a>` declared a lifetime parameter that was never
// used by the impl or any of its methods (clippy::extra_unused_lifetimes).
impl HnswChecker {
	/// Maps raw `(doc_id, distance)` KNN hits onto `(Thing, distance, None)`
	/// iterator results.
	///
	/// Doc ids that can no longer be resolved through `docs` are silently
	/// skipped, so the returned deque may be shorter than `res`.
	async fn convert_result(
		&self,
		docs: &HnswDocs,
		res: VecDeque<(DocId, f64)>,
	) -> Result<VecDeque<KnnIteratorResult>, Error> {
		// Fast path: nothing to convert.
		if res.is_empty() {
			return Ok(VecDeque::new());
		}
		let mut result = VecDeque::with_capacity(res.len());
		for (doc_id, dist) in res {
			if let Some(rid) = docs.get_thing(doc_id) {
				result.push_back((rid.clone(), dist, None));
			}
		}
		Ok(result)
	}
}
/// Checks the KNN `WHERE` condition against candidate documents during an
/// HNSW search, memoising per-document results.
pub struct HnswCondChecker<'a> {
	ctx: &'a Context<'a>,
	opt: &'a Options,
	txn: &'a Transaction,
	/// The condition to evaluate for each candidate document.
	cond: Arc<Cond>,
	/// Cached condition outcomes, keyed by doc id.
	cache: HashMap<DocId, CheckerCacheEntry>,
}
impl<'a> HnswCondChecker<'a> {
	/// Converts raw `(doc_id, distance)` results into iterator results,
	/// consuming the cached records (see `CheckerCacheEntry::convert_result`).
	fn convert_result(&mut self, res: VecDeque<(DocId, f64)>) -> VecDeque<KnnIteratorResult> {
		CheckerCacheEntry::convert_result(res, &mut self.cache)
	}

	/// Returns `true` if the condition is truthy for at least one doc id.
	///
	/// Note: the loop does not short-circuit — every doc id is evaluated so
	/// that each one ends up with a cache entry.
	async fn check_any_truthy(
		&mut self,
		stk: &mut Stk,
		docs: &HnswDocs,
		doc_ids: &Ids64,
	) -> Result<bool, Error> {
		let mut res = false;
		for doc_id in doc_ids.iter() {
			if match self.cache.entry(doc_id) {
				Entry::Occupied(e) => e.get().truthy,
				Entry::Vacant(e) => {
					// The doc id is resolved synchronously (no transaction
					// lock needed here, unlike the MTree checker).
					let rid: Option<Thing> = docs.get_thing(doc_id).cloned();
					let ent = CheckerCacheEntry::build(
						stk,
						self.ctx,
						self.opt,
						self.txn,
						rid,
						self.cond.as_ref(),
					)
					.await?;
					let truthy = ent.truthy;
					e.insert(ent);
					truthy
				}
			} {
				res = true;
			}
		}
		Ok(res)
	}

	/// Drops the cached entry for a single doc id.
	fn expire(&mut self, doc_id: DocId) {
		self.cache.remove(&doc_id);
	}

	/// Drops the cached entries for every doc id in the set.
	fn expires(&mut self, doc_ids: Ids64) {
		for doc_id in doc_ids.iter() {
			self.expire(doc_id);
		}
	}
}

View file

@ -2,40 +2,60 @@ use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::doc::CursorDoc;
use crate::err::Error;
use crate::idx::docids::{DocId, DocIds};
use crate::idx::docids::DocIds;
use crate::idx::ft::analyzer::{Analyzer, TermsList, TermsSet};
use crate::idx::ft::scorer::BM25Scorer;
use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::terms::Terms;
use crate::idx::ft::{FtIndex, MatchRef};
use crate::idx::planner::checker::{HnswConditionChecker, MTreeConditionChecker};
use crate::idx::planner::iterators::{
DocIdsIterator, IndexEqualThingIterator, IndexJoinThingIterator, IndexRangeThingIterator,
IndexUnionThingIterator, MatchesThingIterator, ThingIterator, ThingsIterator,
UniqueEqualThingIterator, UniqueJoinThingIterator, UniqueRangeThingIterator,
UniqueUnionThingIterator,
IndexEqualThingIterator, IndexJoinThingIterator, IndexRangeThingIterator,
IndexUnionThingIterator, IteratorRecord, IteratorRef, KnnIterator, KnnIteratorResult,
MatchesThingIterator, ThingIterator, UniqueEqualThingIterator, UniqueJoinThingIterator,
UniqueRangeThingIterator, UniqueUnionThingIterator,
};
use crate::idx::planner::knn::KnnPriorityList;
use crate::idx::planner::knn::{KnnBruteForceResult, KnnPriorityList};
use crate::idx::planner::plan::IndexOperator::Matches;
use crate::idx::planner::plan::{IndexOperator, IndexOption, RangeValue};
use crate::idx::planner::tree::{IdiomPosition, IndexRef, IndexesMap};
use crate::idx::planner::{IterationStage, KnnSet};
use crate::idx::planner::IterationStage;
use crate::idx::trees::mtree::MTreeIndex;
use crate::idx::trees::store::hnsw::SharedHnswIndex;
use crate::idx::IndexKeyBase;
use crate::kvs;
use crate::kvs::{Key, TransactionType};
use crate::sql::index::{Distance, Index};
use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Array, Expression, Idiom, Number, Object, Table, Thing, Value};
use crate::sql::{Cond, Expression, Idiom, Number, Object, Table, Thing, Value};
use reblessive::tree::Stk;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet, VecDeque};
use std::sync::Arc;
use tokio::sync::RwLock;
pub(super) type KnnEntry = (KnnPriorityList, Idiom, Arc<Vec<Number>>, Distance);
pub(super) type KnnExpressions = HashMap<Arc<Expression>, (u32, Idiom, Arc<Vec<Number>>, Distance)>;
pub(super) type AnnExpressions = HashMap<Arc<Expression>, (usize, Idiom, Arc<Vec<Number>>, usize)>;
pub(super) type KnnBruteForceEntry = (KnnPriorityList, Idiom, Arc<Vec<Number>>, Distance);
/// Parameters of a KNN expression that is evaluated by brute force
/// (no suitable vector index available).
pub(super) struct KnnBruteForceExpression {
	/// Number of nearest neighbours requested.
	k: u32,
	/// The idiom (field path) that yields the vector on each record.
	id: Idiom,
	/// The query vector to compare against.
	obj: Arc<Vec<Number>>,
	/// The distance metric used for the comparison.
	d: Distance,
}
impl KnnBruteForceExpression {
	/// Builds the descriptor for a brute-force KNN expression:
	/// `k` neighbours of vector `obj`, read via idiom `id`, under metric `d`.
	pub(super) fn new(k: u32, id: Idiom, obj: Arc<Vec<Number>>, d: Distance) -> Self {
		Self {
			k,
			id,
			obj,
			d,
		}
	}
}
pub(super) type KnnBruteForceExpressions = HashMap<Arc<Expression>, KnnBruteForceExpression>;
pub(super) type KnnExpressions = HashSet<Arc<Expression>>;
#[derive(Clone)]
pub(crate) struct QueryExecutor(Arc<InnerQueryExecutor>);
@ -49,7 +69,7 @@ pub(super) struct InnerQueryExecutor {
index_definitions: Vec<DefineIndexStatement>,
mt_entries: HashMap<Arc<Expression>, MtEntry>,
hnsw_entries: HashMap<Arc<Expression>, HnswEntry>,
knn_entries: HashMap<Arc<Expression>, KnnEntry>,
knn_bruteforce_entries: HashMap<Arc<Expression>, KnnBruteForceEntry>,
}
impl From<InnerQueryExecutor> for QueryExecutor {
@ -58,8 +78,6 @@ impl From<InnerQueryExecutor> for QueryExecutor {
}
}
pub(crate) type IteratorRef = u16;
pub(super) enum IteratorEntry {
Single(Arc<Expression>, IndexOption),
Range(HashSet<Arc<Expression>>, IndexRef, RangeValue, RangeValue),
@ -82,6 +100,7 @@ impl IteratorEntry {
}
}
impl InnerQueryExecutor {
#[allow(clippy::too_many_arguments)]
pub(super) async fn new(
stk: &mut Stk,
ctx: &Context<'_>,
@ -90,6 +109,8 @@ impl InnerQueryExecutor {
table: &Table,
im: IndexesMap,
knns: KnnExpressions,
kbtes: KnnBruteForceExpressions,
knn_condition: Option<Cond>,
) -> Result<Self, Error> {
let mut mr_entries = HashMap::default();
let mut exp_entries = HashMap::default();
@ -98,7 +119,8 @@ impl InnerQueryExecutor {
let mut mt_entries = HashMap::default();
let mut hnsw_map: HashMap<IndexRef, SharedHnswIndex> = HashMap::default();
let mut hnsw_entries = HashMap::default();
let mut knn_entries = HashMap::with_capacity(knns.len());
let mut knn_bruteforce_entries = HashMap::with_capacity(knns.len());
let knn_condition = knn_condition.map(Arc::new);
// Create all the instances of FtIndex
// Build the FtEntries and map them to Idioms and MatchRef
@ -141,11 +163,23 @@ impl InnerQueryExecutor {
}
Index::MTree(p) => {
if let IndexOperator::Knn(a, k) = io.op() {
let mut tx = txn.lock().await;
let entry = match mt_map.entry(ix_ref) {
Entry::Occupied(e) => MtEntry::new(&mut tx, e.get(), a, *k).await?,
Entry::Occupied(e) => {
MtEntry::new(
stk,
ctx,
opt,
txn,
e.get(),
a,
*k,
knn_condition.clone(),
)
.await?
}
Entry::Vacant(e) => {
let ikb = IndexKeyBase::new(opt, idx_def);
let mut tx = txn.lock().await;
let mt = MTreeIndex::new(
ctx.get_index_stores(),
&mut tx,
@ -154,7 +188,18 @@ impl InnerQueryExecutor {
TransactionType::Read,
)
.await?;
let entry = MtEntry::new(&mut tx, &mt, a, *k).await?;
drop(tx);
let entry = MtEntry::new(
stk,
ctx,
opt,
txn,
&mt,
a,
*k,
knn_condition.clone(),
)
.await?;
e.insert(mt);
entry
}
@ -163,17 +208,39 @@ impl InnerQueryExecutor {
}
}
Index::Hnsw(p) => {
if let IndexOperator::Ann(a, n, ef) = io.op() {
if let IndexOperator::Ann(a, k, ef) = io.op() {
let entry = match hnsw_map.entry(ix_ref) {
Entry::Occupied(e) => {
HnswEntry::new(e.get().clone(), a, *n, *ef).await?
HnswEntry::new(
stk,
ctx,
opt,
txn,
e.get().clone(),
a,
*k,
*ef,
knn_condition.clone(),
)
.await?
}
Entry::Vacant(e) => {
let hnsw = ctx
.get_index_stores()
.get_index_hnsw(opt, idx_def, p)
.await;
let entry = HnswEntry::new(hnsw.clone(), a, *n, *ef).await?;
let entry = HnswEntry::new(
stk,
ctx,
opt,
txn,
hnsw.clone(),
a,
*k,
*ef,
knn_condition.clone(),
)
.await?;
e.insert(hnsw);
entry
}
@ -186,8 +253,9 @@ impl InnerQueryExecutor {
}
}
for (exp, (knn, id, obj, dist)) in knns {
knn_entries.insert(exp, (KnnPriorityList::new(knn as usize), id, obj, dist));
for (exp, knn) in kbtes {
knn_bruteforce_entries
.insert(exp, (KnnPriorityList::new(knn.k as usize), knn.id, knn.obj, knn.d));
}
Ok(Self {
@ -199,7 +267,7 @@ impl InnerQueryExecutor {
index_definitions: im.definitions,
mt_entries,
hnsw_entries,
knn_entries,
knn_bruteforce_entries,
})
}
@ -223,18 +291,12 @@ impl QueryExecutor {
exp: &Expression,
) -> Result<Value, Error> {
if let Some(IterationStage::Iterate(e)) = ctx.get_iteration_stage() {
if let Some(e) = e {
if let Some(e) = e.get(thg.tb.as_str()) {
if let Some(things) = e.get(exp) {
if things.contains(thg) {
return Ok(Value::Bool(true));
}
}
}
if let Some(results) = e {
return Ok(results.contains(exp, thg).into());
}
Ok(Value::Bool(false))
} else {
if let Some((p, id, val, dist)) = self.0.knn_entries.get(exp) {
if let Some((p, id, val, dist)) = self.0.knn_bruteforce_entries.get(exp) {
let v: Vec<Number> = id.compute(stk, ctx, opt, txn, doc).await?.try_into()?;
let dist = dist.compute(&v, val.as_ref())?;
p.add(dist, thg).await;
@ -243,25 +305,25 @@ impl QueryExecutor {
}
}
pub(super) async fn build_knn_set(&self) -> KnnSet {
let mut set = HashMap::with_capacity(self.0.knn_entries.len());
for (exp, (p, _, _, _)) in &self.0.knn_entries {
set.insert(exp.clone(), p.build().await);
pub(super) async fn build_bruteforce_knn_result(&self) -> KnnBruteForceResult {
let mut result = KnnBruteForceResult::with_capacity(self.0.knn_bruteforce_entries.len());
for (e, (p, _, _, _)) in &self.0.knn_bruteforce_entries {
result.insert(e.clone(), p.build().await);
}
set
result
}
	/// Returns `true` if `tb` is the table this query executor was built for.
	pub(crate) fn is_table(&self, tb: &str) -> bool {
		self.0.table.eq(tb)
	}
pub(crate) fn has_knn(&self) -> bool {
!self.0.knn_entries.is_empty()
pub(crate) fn has_bruteforce_knn(&self) -> bool {
!self.0.knn_bruteforce_entries.is_empty()
}
/// Returns `true` if the expression is matching the current iterator.
pub(crate) fn is_iterator_expression(&self, ir: IteratorRef, exp: &Expression) -> bool {
match self.0.it_entries.get(ir as usize) {
pub(crate) fn is_iterator_expression(&self, irf: IteratorRef, exp: &Expression) -> bool {
match self.0.it_entries.get(irf as usize) {
Some(IteratorEntry::Single(e, ..)) => exp.eq(e.as_ref()),
Some(IteratorEntry::Range(es, ..)) => es.contains(exp),
_ => false,
@ -287,13 +349,13 @@ impl QueryExecutor {
pub(crate) async fn new_iterator(
&self,
opt: &Options,
it_ref: IteratorRef,
irf: IteratorRef,
) -> Result<Option<ThingIterator>, Error> {
if let Some(it_entry) = self.0.it_entries.get(it_ref as usize) {
if let Some(it_entry) = self.0.it_entries.get(irf as usize) {
match it_entry {
IteratorEntry::Single(_, io) => self.new_single_iterator(opt, it_ref, io).await,
IteratorEntry::Range(_, ir, from, to) => {
Ok(self.new_range_iterator(opt, *ir, from, to))
IteratorEntry::Single(_, io) => self.new_single_iterator(opt, irf, io).await,
IteratorEntry::Range(_, ixr, from, to) => {
Ok(self.new_range_iterator(opt, *ixr, from, to))
}
}
} else {
@ -304,20 +366,18 @@ impl QueryExecutor {
async fn new_single_iterator(
&self,
opt: &Options,
it_ref: IteratorRef,
irf: IteratorRef,
io: &IndexOption,
) -> Result<Option<ThingIterator>, Error> {
if let Some(ix) = self.get_index_def(io.ix_ref()) {
match ix.index {
Index::Idx => Ok(self.new_index_iterator(opt, it_ref, ix, io.clone()).await?),
Index::Uniq => {
Ok(self.new_unique_index_iterator(opt, it_ref, ix, io.clone()).await?)
}
Index::Idx => Ok(self.new_index_iterator(opt, irf, ix, io.clone()).await?),
Index::Uniq => Ok(self.new_unique_index_iterator(opt, irf, ix, io.clone()).await?),
Index::Search {
..
} => self.new_search_index_iterator(it_ref, io.clone()).await,
Index::MTree(_) => Ok(self.new_mtree_index_knn_iterator(it_ref)),
Index::Hnsw(_) => Ok(self.new_hnsw_index_ann_iterator(it_ref)),
} => self.new_search_index_iterator(irf, io.clone()).await,
Index::MTree(_) => Ok(self.new_mtree_index_knn_iterator(irf)),
Index::Hnsw(_) => Ok(self.new_hnsw_index_ann_iterator(irf)),
}
} else {
Ok(None)
@ -327,13 +387,14 @@ impl QueryExecutor {
async fn new_index_iterator(
&self,
opt: &Options,
it_ref: IteratorRef,
irf: IteratorRef,
ix: &DefineIndexStatement,
io: IndexOption,
) -> Result<Option<ThingIterator>, Error> {
Ok(match io.op() {
IndexOperator::Equality(value) | IndexOperator::Exactness(value) => {
Some(ThingIterator::IndexEqual(IndexEqualThingIterator::new(
irf,
opt.ns(),
opt.db(),
&ix.what,
@ -342,11 +403,11 @@ impl QueryExecutor {
)))
}
IndexOperator::Union(value) => Some(ThingIterator::IndexUnion(
IndexUnionThingIterator::new(opt.ns(), opt.db(), &ix.what, &ix.name, value),
IndexUnionThingIterator::new(irf, opt.ns(), opt.db(), &ix.what, &ix.name, value),
)),
IndexOperator::Join(ios) => {
let iterators = self.build_iterators(opt, it_ref, ios).await?;
let index_join = Box::new(IndexJoinThingIterator::new(opt, ix, iterators));
let iterators = self.build_iterators(opt, irf, ios).await?;
let index_join = Box::new(IndexJoinThingIterator::new(irf, opt, ix, iterators));
Some(ThingIterator::IndexJoin(index_join))
}
_ => None,
@ -364,6 +425,7 @@ impl QueryExecutor {
match ix.index {
Index::Idx => {
return Some(ThingIterator::IndexRange(IndexRangeThingIterator::new(
ir,
opt.ns(),
opt.db(),
&ix.what,
@ -374,6 +436,7 @@ impl QueryExecutor {
}
Index::Uniq => {
return Some(ThingIterator::UniqueRange(UniqueRangeThingIterator::new(
ir,
opt.ns(),
opt.db(),
&ix.what,
@ -391,20 +454,20 @@ impl QueryExecutor {
async fn new_unique_index_iterator(
&self,
opt: &Options,
it_ref: IteratorRef,
irf: IteratorRef,
ix: &DefineIndexStatement,
io: IndexOption,
) -> Result<Option<ThingIterator>, Error> {
Ok(match io.op() {
IndexOperator::Equality(value) => Some(ThingIterator::UniqueEqual(
UniqueEqualThingIterator::new(opt.ns(), opt.db(), &ix.what, &ix.name, value),
UniqueEqualThingIterator::new(irf, opt.ns(), opt.db(), &ix.what, &ix.name, value),
)),
IndexOperator::Union(value) => {
Some(ThingIterator::UniqueUnion(UniqueUnionThingIterator::new(opt, ix, value)))
Some(ThingIterator::UniqueUnion(UniqueUnionThingIterator::new(irf, opt, ix, value)))
}
IndexOperator::Join(ios) => {
let iterators = self.build_iterators(opt, it_ref, ios).await?;
let unique_join = Box::new(UniqueJoinThingIterator::new(opt, ix, iterators));
let iterators = self.build_iterators(opt, irf, ios).await?;
let unique_join = Box::new(UniqueJoinThingIterator::new(irf, opt, ix, iterators));
Some(ThingIterator::UniqueJoin(unique_join))
}
_ => None,
@ -413,14 +476,15 @@ impl QueryExecutor {
async fn new_search_index_iterator(
&self,
it_ref: IteratorRef,
irf: IteratorRef,
io: IndexOption,
) -> Result<Option<ThingIterator>, Error> {
if let Some(IteratorEntry::Single(exp, ..)) = self.0.it_entries.get(it_ref as usize) {
if let Some(IteratorEntry::Single(exp, ..)) = self.0.it_entries.get(irf as usize) {
if let Matches(_, _) = io.op() {
if let Some(fti) = self.0.ft_map.get(&io.ix_ref()) {
if let Some(fte) = self.0.exp_entries.get(exp) {
let it = MatchesThingIterator::new(fti, fte.0.terms_docs.clone()).await?;
let it =
MatchesThingIterator::new(irf, fti, fte.0.terms_docs.clone()).await?;
return Ok(Some(ThingIterator::Matches(it)));
}
}
@ -429,24 +493,21 @@ impl QueryExecutor {
Ok(None)
}
fn new_mtree_index_knn_iterator(&self, it_ref: IteratorRef) -> Option<ThingIterator> {
if let Some(IteratorEntry::Single(exp, ..)) = self.0.it_entries.get(it_ref as usize) {
fn new_mtree_index_knn_iterator(&self, irf: IteratorRef) -> Option<ThingIterator> {
if let Some(IteratorEntry::Single(exp, ..)) = self.0.it_entries.get(irf as usize) {
if let Some(mte) = self.0.mt_entries.get(exp) {
let it = DocIdsIterator::new(
mte.doc_ids.clone(),
mte.res.iter().map(|(d, _)| *d).collect(),
);
let it = KnnIterator::new(irf, mte.res.clone());
return Some(ThingIterator::Knn(it));
}
}
None
}
fn new_hnsw_index_ann_iterator(&self, it_ref: IteratorRef) -> Option<ThingIterator> {
if let Some(IteratorEntry::Single(exp, ..)) = self.0.it_entries.get(it_ref as usize) {
fn new_hnsw_index_ann_iterator(&self, irf: IteratorRef) -> Option<ThingIterator> {
if let Some(IteratorEntry::Single(exp, ..)) = self.0.it_entries.get(irf as usize) {
if let Some(he) = self.0.hnsw_entries.get(exp) {
let it = ThingsIterator::new(he.res.iter().map(|(thg, _)| thg.clone()).collect());
return Some(ThingIterator::Things(it));
let it = KnnIterator::new(irf, he.res.clone());
return Some(ThingIterator::Knn(it));
}
}
None
@ -455,12 +516,12 @@ impl QueryExecutor {
async fn build_iterators(
&self,
opt: &Options,
it_ref: IteratorRef,
irf: IteratorRef,
ios: &[IndexOption],
) -> Result<VecDeque<ThingIterator>, Error> {
let mut iterators = VecDeque::with_capacity(ios.len());
for io in ios {
if let Some(it) = Box::pin(self.new_single_iterator(opt, it_ref, io)).await? {
if let Some(it) = Box::pin(self.new_single_iterator(opt, irf, io)).await? {
iterators.push_back(it);
}
}
@ -504,9 +565,13 @@ impl QueryExecutor {
thg: &Thing,
ft: &FtEntry,
) -> Result<bool, Error> {
let mut run = txn.lock().await;
let doc_key: Key = thg.into();
if let Some(doc_id) = ft.0.doc_ids.read().await.get_doc_id(&mut run, doc_key).await? {
let mut run = txn.lock().await;
let di = ft.0.doc_ids.read().await;
let doc_id = di.get_doc_id(&mut run, doc_key).await?;
drop(di);
drop(run);
if let Some(doc_id) = doc_id {
let term_goals = ft.0.terms_docs.len();
// If there is no terms, it can't be a match
if term_goals == 0 {
@ -551,6 +616,7 @@ impl QueryExecutor {
let terms = ft.0.terms.read().await;
// Extract the terms set from the record
let t = ft.0.analyzer.extract_indexing_terms(stk, ctx, opt, txn, &terms, v).await?;
drop(terms);
Ok(ft.0.query_terms_set.is_subset(&t))
}
@ -584,7 +650,7 @@ impl QueryExecutor {
) -> Result<Value, Error> {
if let Some((e, ft)) = self.get_ft_entry_and_index(&match_ref) {
let mut run = txn.lock().await;
return ft
let res = ft
.highlight(
&mut run,
thg,
@ -596,6 +662,8 @@ impl QueryExecutor {
doc,
)
.await;
drop(run);
return res;
}
Ok(Value::None)
}
@ -609,7 +677,9 @@ impl QueryExecutor {
) -> Result<Value, Error> {
if let Some((e, ft)) = self.get_ft_entry_and_index(&match_ref) {
let mut run = txn.lock().await;
return ft.extract_offsets(&mut run, thg, &e.0.query_terms_list, partial).await;
let res = ft.extract_offsets(&mut run, thg, &e.0.query_terms_list, partial).await;
drop(run);
return res;
}
Ok(Value::None)
}
@ -619,21 +689,30 @@ impl QueryExecutor {
txn: &Transaction,
match_ref: &Value,
rid: &Thing,
mut doc_id: Option<DocId>,
ir: Option<&IteratorRecord>,
) -> Result<Value, Error> {
if let Some(e) = self.get_ft_entry(match_ref) {
if let Some(scorer) = &e.0.scorer {
let mut run = txn.lock().await;
let mut doc_id = if let Some(ir) = ir {
ir.doc_id()
} else {
None
};
if doc_id.is_none() {
let key: Key = rid.into();
doc_id = e.0.doc_ids.read().await.get_doc_id(&mut run, key).await?;
};
let di = e.0.doc_ids.read().await;
doc_id = di.get_doc_id(&mut run, key).await?;
drop(di);
}
if let Some(doc_id) = doc_id {
let score = scorer.score(&mut run, doc_id).await?;
if let Some(score) = score {
drop(run);
return Ok(Value::from(score));
}
}
drop(run);
}
}
Ok(Value::None)
@ -668,6 +747,7 @@ impl FtEntry {
ft.extract_querying_terms(stk, ctx, opt, txn, qs.to_owned()).await?;
let mut tx = txn.lock().await;
let terms_docs = Arc::new(ft.get_terms_docs(&mut tx, &terms_list).await?);
drop(tx);
Ok(Some(Self(Arc::new(Inner {
index_option: io,
doc_ids: ft.doc_ids(),
@ -686,33 +766,59 @@ impl FtEntry {
#[derive(Clone)]
pub(super) struct MtEntry {
doc_ids: Arc<RwLock<DocIds>>,
res: VecDeque<(DocId, f64)>,
res: VecDeque<KnnIteratorResult>,
}
impl MtEntry {
#[allow(clippy::too_many_arguments)]
async fn new(
tx: &mut kvs::Transaction,
stk: &mut Stk,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
mt: &MTreeIndex,
a: &Array,
o: &[Number],
k: u32,
cond: Option<Arc<Cond>>,
) -> Result<Self, Error> {
let res = mt.knn_search(tx, a, k as usize).await?;
let cond_checker = if let Some(cond) = cond {
MTreeConditionChecker::new_cond(ctx, opt, txn, cond)
} else {
MTreeConditionChecker::new(txn)
};
let res = mt.knn_search(stk, txn, o, k as usize, cond_checker).await?;
Ok(Self {
res,
doc_ids: mt.doc_ids(),
})
}
}
#[derive(Clone)]
pub(super) struct HnswEntry {
res: VecDeque<(Thing, f64)>,
res: VecDeque<KnnIteratorResult>,
}
impl HnswEntry {
async fn new(h: SharedHnswIndex, a: &Array, n: usize, ef: usize) -> Result<Self, Error> {
let res = h.read().await.knn_search(a, n, ef)?;
#[allow(clippy::too_many_arguments)]
async fn new(
stk: &mut Stk,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
h: SharedHnswIndex,
v: &[Number],
n: u32,
ef: u32,
cond: Option<Arc<Cond>>,
) -> Result<Self, Error> {
let cond_checker = if let Some(cond) = cond {
HnswConditionChecker::new_cond(ctx, opt, txn, cond)
} else {
HnswConditionChecker::default()
};
let h = h.read().await;
let res = h.knn_search(v, n as usize, ef as usize, stk, cond_checker).await?;
drop(h);
Ok(Self {
res,
})

View file

@ -1,17 +1,105 @@
use crate::dbs::{Options, Transaction};
use crate::ctx::Context;
use crate::dbs::Options;
use crate::err::Error;
use crate::idx::docids::{DocId, DocIds};
use crate::idx::docids::DocId;
use crate::idx::ft::termdocs::TermsDocs;
use crate::idx::ft::{FtIndex, HitsIterator};
use crate::idx::planner::plan::RangeValue;
use crate::key::index::Index;
use crate::kvs;
use crate::kvs::{Key, Limit, ScanPage};
use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Array, Ident, Thing, Value};
use radix_trie::Trie;
use std::collections::VecDeque;
use std::sync::Arc;
use tokio::sync::RwLock;
pub(crate) type IteratorRef = u16;
/// Metadata attached to each record emitted by an index iterator.
#[derive(Debug)]
pub(crate) struct IteratorRecord {
	/// The iterator that yielded this record.
	irf: IteratorRef,
	/// The index doc id, when the iterator knows it.
	doc_id: Option<DocId>,
	/// The distance to the query vector, when yielded by a KNN/ANN iterator.
	dist: Option<f64>,
}
impl IteratorRecord {
	/// The reference of the iterator that produced this record.
	pub(crate) fn irf(&self) -> IteratorRef {
		self.irf
	}

	/// The doc id associated with this record, if any.
	pub(crate) fn doc_id(&self) -> Option<DocId> {
		self.doc_id
	}

	/// The KNN distance associated with this record, if any.
	pub(crate) fn dist(&self) -> Option<f64> {
		self.dist
	}
}
impl From<IteratorRef> for IteratorRecord {
fn from(irf: IteratorRef) -> Self {
IteratorRecord {
irf,
doc_id: None,
dist: None,
}
}
}
/// Abstraction over the container used to collect a batch of records from a
/// `ThingIterator`; implemented for both `Vec` and `VecDeque`.
pub(crate) trait IteratorBatch {
	/// Creates an empty batch.
	fn empty() -> Self;
	/// Creates an empty batch with room for `capacity` records.
	fn with_capacity(capacity: usize) -> Self;
	/// Creates a batch holding the single `record`.
	fn from_one(record: CollectorRecord) -> Self;
	/// Appends `record` to the batch.
	fn add(&mut self, record: CollectorRecord);
	/// Returns the number of records in the batch.
	fn len(&self) -> usize;
	/// Returns `true` if the batch holds no records.
	fn is_empty(&self) -> bool;
}
/// Batch collection backed by a `Vec`.
impl IteratorBatch for Vec<CollectorRecord> {
	fn empty() -> Self {
		Vec::new()
	}

	fn with_capacity(capacity: usize) -> Self {
		Vec::with_capacity(capacity)
	}

	fn from_one(record: CollectorRecord) -> Self {
		vec![record]
	}

	fn add(&mut self, record: CollectorRecord) {
		// Fully-qualified call to the inherent method, avoiding any
		// ambiguity with the trait method of the same role.
		Vec::push(self, record)
	}

	fn len(&self) -> usize {
		// Delegate through the slice view to the inherent length.
		self.as_slice().len()
	}

	fn is_empty(&self) -> bool {
		self.as_slice().is_empty()
	}
}
/// Batch collection backed by a `VecDeque` (records are consumed from the
/// front by the join iterators).
impl IteratorBatch for VecDeque<CollectorRecord> {
	fn empty() -> Self {
		VecDeque::new()
	}

	fn with_capacity(capacity: usize) -> Self {
		VecDeque::with_capacity(capacity)
	}

	fn from_one(record: CollectorRecord) -> Self {
		// Collect a one-element iterator into a fresh deque.
		std::iter::once(record).collect()
	}

	fn add(&mut self, record: CollectorRecord) {
		// Fully-qualified call to the inherent method, avoiding any
		// ambiguity with the trait method of the same role.
		VecDeque::push_back(self, record)
	}

	fn len(&self) -> usize {
		VecDeque::len(self)
	}

	fn is_empty(&self) -> bool {
		VecDeque::is_empty(self)
	}
}
pub(crate) enum ThingIterator {
IndexEqual(IndexEqualThingIterator),
@ -23,77 +111,68 @@ pub(crate) enum ThingIterator {
UniqueUnion(UniqueUnionThingIterator),
UniqueJoin(Box<UniqueJoinThingIterator>),
Matches(MatchesThingIterator),
Knn(DocIdsIterator),
Things(ThingsIterator),
Knn(KnnIterator),
}
impl ThingIterator {
pub(crate) async fn next_batch<T: ThingCollector>(
pub(crate) async fn next_batch<B: IteratorBatch>(
&mut self,
tx: &Transaction,
ctx: &Context<'_>,
tx: &mut kvs::Transaction,
size: u32,
collector: &mut T,
) -> Result<usize, Error> {
) -> Result<B, Error> {
match self {
Self::IndexEqual(i) => i.next_batch(tx, size, collector).await,
Self::UniqueEqual(i) => i.next_batch(tx, collector).await,
Self::IndexRange(i) => i.next_batch(tx, size, collector).await,
Self::UniqueRange(i) => i.next_batch(tx, size, collector).await,
Self::IndexUnion(i) => i.next_batch(tx, size, collector).await,
Self::UniqueUnion(i) => i.next_batch(tx, size, collector).await,
Self::Matches(i) => i.next_batch(tx, size, collector).await,
Self::Knn(i) => i.next_batch(tx, size, collector).await,
Self::IndexJoin(i) => Box::pin(i.next_batch(tx, size, collector)).await,
Self::UniqueJoin(i) => Box::pin(i.next_batch(tx, size, collector)).await,
Self::Things(i) => Ok(i.next_batch(size, collector)),
Self::IndexEqual(i) => i.next_batch(tx, size).await,
Self::UniqueEqual(i) => i.next_batch(tx).await,
Self::IndexRange(i) => i.next_batch(tx, size).await,
Self::UniqueRange(i) => i.next_batch(tx, size).await,
Self::IndexUnion(i) => i.next_batch(ctx, tx, size).await,
Self::UniqueUnion(i) => i.next_batch(ctx, tx, size).await,
Self::Matches(i) => i.next_batch(ctx, tx, size).await,
Self::Knn(i) => i.next_batch(ctx, size).await,
Self::IndexJoin(i) => Box::pin(i.next_batch(ctx, tx, size)).await,
Self::UniqueJoin(i) => Box::pin(i.next_batch(ctx, tx, size)).await,
}
}
}
pub(crate) trait ThingCollector {
fn add(&mut self, thing: Thing, doc_id: Option<DocId>);
}
impl ThingCollector for Vec<(Thing, Option<DocId>)> {
fn add(&mut self, thing: Thing, doc_id: Option<DocId>) {
self.push((thing, doc_id));
}
}
impl ThingCollector for VecDeque<(Thing, Option<DocId>)> {
fn add(&mut self, thing: Thing, doc_id: Option<DocId>) {
self.push_back((thing, doc_id));
}
}
pub(crate) type CollectorRecord = (Thing, IteratorRecord, Option<Value>);
/// Iterates over all entries of a (non-unique) index matching an equality
/// value, by scanning the key range `[beg, end)`.
pub(crate) struct IndexEqualThingIterator {
	/// The iterator reference attached to every emitted record.
	irf: IteratorRef,
	/// Current scan position; advanced past the last key after each batch.
	beg: Vec<u8>,
	/// Exclusive end of the key range.
	end: Vec<u8>,
}
impl IndexEqualThingIterator {
pub(super) fn new(ns: &str, db: &str, ix_what: &Ident, ix_name: &Ident, v: &Value) -> Self {
pub(super) fn new(
irf: IteratorRef,
ns: &str,
db: &str,
ix_what: &Ident,
ix_name: &Ident,
v: &Value,
) -> Self {
let a = Array::from(v.clone());
let beg = Index::prefix_ids_beg(ns, db, ix_what, ix_name, &a);
let end = Index::prefix_ids_end(ns, db, ix_what, ix_name, &a);
Self {
irf,
beg,
end,
}
}
async fn next_scan<T: ThingCollector>(
txn: &Transaction,
async fn next_scan<B: IteratorBatch>(
tx: &mut kvs::Transaction,
irf: IteratorRef,
beg: &mut Vec<u8>,
end: &[u8],
limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
) -> Result<B, Error> {
let min = beg.clone();
let max = end.to_owned();
let res = txn
.lock()
.await
let res = tx
.scan_paged(
ScanPage {
range: min..max,
@ -108,18 +187,17 @@ impl IndexEqualThingIterator {
key.push(0x00);
*beg = key;
}
let count = res.len();
res.into_iter().for_each(|(_, val)| collector.add(val.into(), None));
Ok(count)
let mut records = B::with_capacity(res.len());
res.into_iter().for_each(|(_, val)| records.add((val.into(), irf.into(), None)));
Ok(records)
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
txn: &Transaction,
tx: &mut kvs::Transaction,
limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
Self::next_scan(txn, &mut self.beg, &self.end, limit, collector).await
) -> Result<B, Error> {
Self::next_scan(tx, self.irf, &mut self.beg, &self.end, limit).await
}
}
@ -168,11 +246,13 @@ impl RangeScan {
}
/// Iterates over the entries of a (non-unique) index within a value range,
/// post-filtering scanned keys against the inclusive/exclusive bounds in `r`.
pub(crate) struct IndexRangeThingIterator {
	/// The iterator reference attached to every emitted record.
	irf: IteratorRef,
	/// The key range and bound flags driving the scan.
	r: RangeScan,
}
impl IndexRangeThingIterator {
pub(super) fn new(
irf: IteratorRef,
ns: &str,
db: &str,
ix_what: &Ident,
@ -183,6 +263,7 @@ impl IndexRangeThingIterator {
let beg = Self::compute_beg(ns, db, ix_what, ix_name, from);
let end = Self::compute_end(ns, db, ix_what, ix_name, to);
Self {
irf,
r: RangeScan::new(beg, from.inclusive, end, to.inclusive),
}
}
@ -223,17 +304,14 @@ impl IndexRangeThingIterator {
}
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
txn: &Transaction,
tx: &mut kvs::Transaction,
limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
) -> Result<B, Error> {
let min = self.r.beg.clone();
let max = self.r.end.clone();
let res = txn
.lock()
.await
let res = tx
.scan_paged(
ScanPage {
range: min..max,
@ -247,24 +325,29 @@ impl IndexRangeThingIterator {
self.r.beg.clone_from(key);
self.r.beg.push(0x00);
}
let mut count = 0;
for (k, v) in res {
if self.r.matches(&k) {
collector.add(v.into(), None);
count += 1;
}
}
Ok(count)
let mut records = B::with_capacity(res.len());
res.into_iter()
.filter(|(k, _)| self.r.matches(k))
.for_each(|(_, v)| records.add((v.into(), self.irf.into(), None)));
Ok(records)
}
}
/// Iterates over the index entries matching any value of an array (union),
/// scanning each value's key range one after the other.
pub(crate) struct IndexUnionThingIterator {
	/// The iterator reference attached to every emitted record.
	irf: IteratorRef,
	/// Remaining (begin, end) key ranges, one per value of the array.
	values: VecDeque<(Vec<u8>, Vec<u8>)>,
	/// The range currently being scanned; `None` once all are exhausted.
	current: Option<(Vec<u8>, Vec<u8>)>,
}
impl IndexUnionThingIterator {
pub(super) fn new(ns: &str, db: &str, ix_what: &Ident, ix_name: &Ident, a: &Array) -> Self {
pub(super) fn new(
irf: IteratorRef,
ns: &str,
db: &str,
ix_what: &Ident,
ix_name: &Ident,
a: &Array,
) -> Self {
// We create a VecDeque to hold the prefix keys (begin and end) for each value in the array.
let mut values: VecDeque<(Vec<u8>, Vec<u8>)> =
a.0.iter()
@ -277,26 +360,30 @@ impl IndexUnionThingIterator {
.collect();
let current = values.pop_front();
Self {
irf,
values,
current,
}
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
txn: &Transaction,
ctx: &Context<'_>,
tx: &mut kvs::Transaction,
limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
) -> Result<B, Error> {
while let Some(r) = &mut self.current {
let count =
IndexEqualThingIterator::next_scan(txn, &mut r.0, &r.1, limit, collector).await?;
if count != 0 {
return Ok(count);
if ctx.is_done() {
break;
}
let records: B =
IndexEqualThingIterator::next_scan(tx, self.irf, &mut r.0, &r.1, limit).await?;
if !records.is_empty() {
return Ok(records);
}
self.current = self.values.pop_front();
}
Ok(0)
Ok(B::empty())
}
}
@ -307,7 +394,7 @@ struct JoinThingIterator {
ix_name: Ident,
remote_iterators: VecDeque<ThingIterator>,
current_remote: Option<ThingIterator>,
current_remote_batch: VecDeque<(Thing, Option<DocId>)>,
current_remote_batch: VecDeque<CollectorRecord>,
current_local: Option<ThingIterator>,
distinct: Trie<Key, bool>,
}
@ -324,7 +411,7 @@ impl JoinThingIterator {
ix_what: ix.what.clone(),
ix_name: ix.name.clone(),
current_remote: None,
current_remote_batch: VecDeque::with_capacity(0),
current_remote_batch: VecDeque::with_capacity(1),
remote_iterators,
current_local: None,
distinct: Default::default(),
@ -335,34 +422,37 @@ impl JoinThingIterator {
impl JoinThingIterator {
async fn next_current_remote_batch(
&mut self,
tx: &Transaction,
ctx: &Context<'_>,
tx: &mut kvs::Transaction,
limit: u32,
) -> Result<bool, Error> {
loop {
while !ctx.is_done() {
if let Some(it) = &mut self.current_remote {
self.current_remote_batch.clear();
if it.next_batch(tx, limit, &mut self.current_remote_batch).await? > 0 {
self.current_remote_batch = it.next_batch(ctx, tx, limit).await?;
if !self.current_remote_batch.is_empty() {
return Ok(true);
}
}
self.current_remote = self.remote_iterators.pop_front();
if self.current_remote.is_none() {
return Ok(false);
break;
}
}
Ok(false)
}
async fn next_current_local<F>(
&mut self,
tx: &Transaction,
ctx: &Context<'_>,
tx: &mut kvs::Transaction,
limit: u32,
new_iter: F,
) -> Result<bool, Error>
where
F: Fn(&str, &str, &Ident, &Ident, Value) -> ThingIterator,
{
loop {
while let Some((thing, _)) = self.current_remote_batch.pop_front() {
while !ctx.is_done() {
while let Some((thing, _, _)) = self.current_remote_batch.pop_front() {
let k: Key = (&thing).into();
let value = Value::from(thing);
if self.distinct.insert(k, true).is_none() {
@ -371,98 +461,109 @@ impl JoinThingIterator {
return Ok(true);
}
}
if !self.next_current_remote_batch(tx, limit).await? {
if !self.next_current_remote_batch(ctx, tx, limit).await? {
break;
}
}
Ok(false)
}
async fn next_batch<T: ThingCollector, F>(
async fn next_batch<F, B: IteratorBatch>(
&mut self,
tx: &Transaction,
ctx: &Context<'_>,
tx: &mut kvs::Transaction,
limit: u32,
collector: &mut T,
new_iter: F,
) -> Result<usize, Error>
) -> Result<B, Error>
where
F: Fn(&str, &str, &Ident, &Ident, Value) -> ThingIterator + Copy,
{
loop {
while !ctx.is_done() {
if let Some(current_local) = &mut self.current_local {
let n = current_local.next_batch(tx, limit, collector).await?;
if n > 0 {
return Ok(n);
let records: B = current_local.next_batch(ctx, tx, limit).await?;
if !records.is_empty() {
return Ok(records);
}
}
if !self.next_current_local(tx, limit, new_iter).await? {
return Ok(0);
if !self.next_current_local(ctx, tx, limit, new_iter).await? {
break;
}
}
Ok(B::empty())
}
}
pub(crate) struct IndexJoinThingIterator(JoinThingIterator);
pub(crate) struct IndexJoinThingIterator(IteratorRef, JoinThingIterator);
impl IndexJoinThingIterator {
pub(super) fn new(
irf: IteratorRef,
opt: &Options,
ix: &DefineIndexStatement,
remote_iterators: VecDeque<ThingIterator>,
) -> Self {
Self(JoinThingIterator::new(opt, ix, remote_iterators))
Self(irf, JoinThingIterator::new(opt, ix, remote_iterators))
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
tx: &Transaction,
ctx: &Context<'_>,
tx: &mut kvs::Transaction,
limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
) -> Result<B, Error> {
let new_iter = |ns: &str, db: &str, ix_what: &Ident, ix_name: &Ident, value: Value| {
let it = IndexEqualThingIterator::new(ns, db, ix_what, ix_name, &value);
let it = IndexEqualThingIterator::new(self.0, ns, db, ix_what, ix_name, &value);
ThingIterator::IndexEqual(it)
};
self.0.next_batch(tx, limit, collector, new_iter).await
self.1.next_batch(ctx, tx, limit, new_iter).await
}
}
pub(crate) struct UniqueEqualThingIterator {
irf: IteratorRef,
key: Option<Key>,
}
impl UniqueEqualThingIterator {
pub(super) fn new(ns: &str, db: &str, ix_what: &Ident, ix_name: &Ident, v: &Value) -> Self {
pub(super) fn new(
irf: IteratorRef,
ns: &str,
db: &str,
ix_what: &Ident,
ix_name: &Ident,
v: &Value,
) -> Self {
let a = Array::from(v.to_owned());
let key = Index::new(ns, db, ix_what, ix_name, &a, None).into();
Self {
irf,
key: Some(key),
}
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
txn: &Transaction,
collector: &mut T,
) -> Result<usize, Error> {
let mut count = 0;
tx: &mut kvs::Transaction,
) -> Result<B, Error> {
if let Some(key) = self.key.take() {
if let Some(val) = txn.lock().await.get(key).await? {
collector.add(val.into(), None);
count += 1;
if let Some(val) = tx.get(key).await? {
let record = (val.into(), self.irf.into(), None);
return Ok(B::from_one(record));
}
}
Ok(count)
Ok(B::empty())
}
}
pub(crate) struct UniqueRangeThingIterator {
irf: IteratorRef,
r: RangeScan,
done: bool,
}
impl UniqueRangeThingIterator {
pub(super) fn new(
irf: IteratorRef,
ns: &str,
db: &str,
ix_what: &Ident,
@ -473,6 +574,7 @@ impl UniqueRangeThingIterator {
let beg = Self::compute_beg(ns, db, ix_what, ix_name, from);
let end = Self::compute_end(ns, db, ix_what, ix_name, to);
Self {
irf,
r: RangeScan::new(beg, from.inclusive, end, to.inclusive),
done: false,
}
@ -508,19 +610,17 @@ impl UniqueRangeThingIterator {
.unwrap()
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
txn: &Transaction,
tx: &mut kvs::Transaction,
mut limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
) -> Result<B, Error> {
if self.done {
return Ok(0);
return Ok(B::empty());
}
let min = self.r.beg.clone();
let max = self.r.end.clone();
limit += 1;
let mut tx = txn.lock().await;
let res = tx
.scan_paged(
ScanPage {
@ -530,36 +630,40 @@ impl UniqueRangeThingIterator {
limit,
)
.await?;
let mut count = 0;
let mut records = B::with_capacity(res.values.len());
for (k, v) in res.values {
limit -= 1;
if limit == 0 {
self.r.beg = k;
return Ok(count);
return Ok(records);
}
if self.r.matches(&k) {
collector.add(v.into(), None);
count += 1;
records.add((v.into(), self.irf.into(), None));
}
}
let end = self.r.end.clone();
if self.r.matches(&end) {
if let Some(v) = tx.get(end).await? {
collector.add(v.into(), None);
count += 1;
records.add((v.into(), self.irf.into(), None));
}
}
self.done = true;
Ok(count)
Ok(records)
}
}
pub(crate) struct UniqueUnionThingIterator {
irf: IteratorRef,
keys: VecDeque<Key>,
}
impl UniqueUnionThingIterator {
pub(super) fn new(opt: &Options, ix: &DefineIndexStatement, a: &Array) -> Self {
pub(super) fn new(
irf: IteratorRef,
opt: &Options,
ix: &DefineIndexStatement,
a: &Array,
) -> Self {
// We create a VecDeque to hold the key for each value in the array.
let keys: VecDeque<Key> =
a.0.iter()
@ -570,146 +674,147 @@ impl UniqueUnionThingIterator {
})
.collect();
Self {
irf,
keys,
}
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
txn: &Transaction,
ctx: &Context<'_>,
tx: &mut kvs::Transaction,
limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
let mut run = txn.lock().await;
let mut count = 0;
) -> Result<B, Error> {
let limit = limit as usize;
let mut results = B::with_capacity(limit.min(self.keys.len()));
while let Some(key) = self.keys.pop_front() {
if let Some(val) = run.get(key).await? {
collector.add(val.into(), None);
count += 1;
if count >= limit {
if ctx.is_done() {
break;
}
if let Some(val) = tx.get(key).await? {
results.add((val.into(), self.irf.into(), None));
if results.len() >= limit {
break;
}
}
}
Ok(count as usize)
Ok(results)
}
}
pub(crate) struct UniqueJoinThingIterator(JoinThingIterator);
pub(crate) struct UniqueJoinThingIterator(IteratorRef, JoinThingIterator);
impl UniqueJoinThingIterator {
pub(super) fn new(
irf: IteratorRef,
opt: &Options,
ix: &DefineIndexStatement,
remote_iterators: VecDeque<ThingIterator>,
) -> Self {
Self(JoinThingIterator::new(opt, ix, remote_iterators))
Self(irf, JoinThingIterator::new(opt, ix, remote_iterators))
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
tx: &Transaction,
ctx: &Context<'_>,
tx: &mut kvs::Transaction,
limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
) -> Result<B, Error> {
let new_iter = |ns: &str, db: &str, ix_what: &Ident, ix_name: &Ident, value: Value| {
let it = UniqueEqualThingIterator::new(ns, db, ix_what, ix_name, &value);
let it = UniqueEqualThingIterator::new(self.0, ns, db, ix_what, ix_name, &value);
ThingIterator::UniqueEqual(it)
};
self.0.next_batch(tx, limit, collector, new_iter).await
self.1.next_batch(ctx, tx, limit, new_iter).await
}
}
pub(crate) struct MatchesThingIterator {
irf: IteratorRef,
hits_left: usize,
hits: Option<HitsIterator>,
}
impl MatchesThingIterator {
pub(super) async fn new(fti: &FtIndex, terms_docs: TermsDocs) -> Result<Self, Error> {
pub(super) async fn new(
irf: IteratorRef,
fti: &FtIndex,
terms_docs: TermsDocs,
) -> Result<Self, Error> {
let hits = fti.new_hits_iterator(terms_docs)?;
let hits_left = if let Some(h) = &hits {
h.len()
} else {
0
};
Ok(Self {
irf,
hits,
hits_left,
})
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
txn: &Transaction,
ctx: &Context<'_>,
tx: &mut kvs::Transaction,
limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
let mut count = 0;
) -> Result<B, Error> {
if let Some(hits) = &mut self.hits {
let mut run = txn.lock().await;
while limit > count {
if let Some((thg, doc_id)) = hits.next(&mut run).await? {
collector.add(thg, Some(doc_id));
count += 1;
let limit = limit as usize;
let mut records = B::with_capacity(limit.min(self.hits_left));
while limit > records.len() && !ctx.is_done() {
if let Some((thg, doc_id)) = hits.next(tx).await? {
let ir = IteratorRecord {
irf: self.irf,
doc_id: Some(doc_id),
dist: None,
};
records.add((thg, ir, None));
self.hits_left -= 1;
} else {
break;
}
}
Ok(records)
} else {
Ok(B::empty())
}
Ok(count as usize)
}
}
pub(crate) struct DocIdsIterator {
doc_ids: Arc<RwLock<DocIds>>,
res: VecDeque<DocId>,
pub(crate) type KnnIteratorResult = (Thing, f64, Option<Value>);
pub(crate) struct KnnIterator {
irf: IteratorRef,
res: VecDeque<KnnIteratorResult>,
}
impl DocIdsIterator {
pub(super) fn new(doc_ids: Arc<RwLock<DocIds>>, res: VecDeque<DocId>) -> Self {
impl KnnIterator {
pub(super) fn new(irf: IteratorRef, res: VecDeque<KnnIteratorResult>) -> Self {
Self {
doc_ids,
irf,
res,
}
}
async fn next_batch<T: ThingCollector>(
async fn next_batch<B: IteratorBatch>(
&mut self,
txn: &Transaction,
ctx: &Context<'_>,
limit: u32,
collector: &mut T,
) -> Result<usize, Error> {
let mut tx = txn.lock().await;
let mut count = 0;
while limit > count {
if let Some(doc_id) = self.res.pop_front() {
if let Some(doc_key) =
self.doc_ids.read().await.get_doc_key(&mut tx, doc_id).await?
{
collector.add(doc_key.into(), Some(doc_id));
count += 1;
}
) -> Result<B, Error> {
let limit = limit as usize;
let mut records = B::with_capacity(limit.min(self.res.len()));
while limit > records.len() && !ctx.is_done() {
if let Some((thing, dist, val)) = self.res.pop_front() {
let ir = IteratorRecord {
irf: self.irf,
doc_id: None,
dist: Some(dist),
};
records.add((thing, ir, val));
} else {
break;
}
}
Ok(count as usize)
}
}
pub(crate) struct ThingsIterator {
res: VecDeque<Thing>,
}
impl ThingsIterator {
pub(super) fn new(res: VecDeque<Thing>) -> Self {
Self {
res,
}
}
fn next_batch<T: ThingCollector>(&mut self, limit: u32, collector: &mut T) -> usize {
let mut count = 0;
while limit > count {
if let Some(thg) = self.res.pop_front() {
collector.add(thg, None);
count += 1;
} else {
break;
}
}
count as usize
Ok(records)
}
}

View file

@ -1,6 +1,7 @@
use crate::sql::{Number, Thing};
use crate::sql::{Expression, Number, Thing};
use hashbrown::{HashMap, HashSet};
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, HashSet, VecDeque};
use std::collections::BTreeMap;
use std::sync::Arc;
use tokio::sync::Mutex;
@ -25,10 +26,14 @@ impl KnnPriorityList {
if i.check_add(&dist) {
i.add(dist, thing);
}
drop(i);
}
pub(super) async fn build(&self) -> HashSet<Arc<Thing>> {
self.0.lock().await.build()
pub(super) async fn build(&self) -> HashMap<Arc<Thing>, Number> {
let l = self.0.lock().await;
let r = l.build();
drop(l);
r
}
}
@ -73,21 +78,21 @@ impl Inner {
}
}
fn build(&self) -> HashSet<Arc<Thing>> {
let mut sorted_docs = VecDeque::with_capacity(self.knn);
fn build(&self) -> HashMap<Arc<Thing>, Number> {
let mut result = HashMap::with_capacity(self.knn);
#[cfg(debug_assertions)]
debug!("self.priority_list: {:?} - self.docs: {:?}", self.priority_list, self.docs);
let mut left = self.knn;
for docs in self.priority_list.values() {
for (dist, docs) in &self.priority_list {
let dl = docs.len();
if dl > left {
for doc_id in docs.iter().take(left) {
sorted_docs.push_back(doc_id);
result.insert(doc_id.clone(), dist.clone());
}
break;
}
for doc_id in docs {
sorted_docs.push_back(doc_id);
result.insert(doc_id.clone(), dist.clone());
}
left -= dl;
// We don't expect any more results, so we can leave
@ -95,12 +100,55 @@ impl Inner {
break;
}
}
debug!("sorted_docs: {:?}", sorted_docs);
let mut r = HashSet::with_capacity(sorted_docs.len());
for id in sorted_docs {
r.insert(id.clone());
}
r
result
}
}
pub(crate) struct KnnBruteForceResult {
exp: HashMap<Arc<Expression>, usize>,
res: Vec<HashMap<Arc<Thing>, Number>>,
}
impl KnnBruteForceResult {
pub(super) fn with_capacity(capacity: usize) -> Self {
Self {
exp: HashMap::with_capacity(capacity),
res: Vec::with_capacity(capacity),
}
}
pub(super) fn insert(&mut self, e: Arc<Expression>, m: HashMap<Arc<Thing>, Number>) {
self.exp.insert(e.clone(), self.res.len());
self.res.push(m);
}
}
#[derive(Clone)]
pub(crate) struct KnnBruteForceResults(Arc<std::collections::HashMap<String, KnnBruteForceResult>>);
impl From<std::collections::HashMap<String, KnnBruteForceResult>> for KnnBruteForceResults {
fn from(map: std::collections::HashMap<String, KnnBruteForceResult>) -> Self {
Self(map.into())
}
}
impl KnnBruteForceResults {
pub(super) fn contains(&self, exp: &Expression, thg: &Thing) -> bool {
if let Some(result) = self.0.get(thg.tb.as_str()) {
if let Some(&pos) = result.exp.get(exp) {
if let Some(things) = result.res.get(pos) {
return things.contains_key(thg);
}
}
}
false
}
pub(crate) fn get_dist(&self, pos: usize, thg: &Thing) -> Option<Number> {
if let Some(result) = self.0.get(thg.tb.as_str()) {
if let Some(things) = result.res.get(pos) {
return things.get(thg).cloned();
}
}
None
}
}

View file

@ -1,23 +1,24 @@
pub mod checker;
pub(crate) mod executor;
pub(crate) mod iterators;
pub(in crate::idx) mod knn;
pub(crate) mod plan;
mod tree;
pub(in crate::idx) mod rewriter;
pub(in crate::idx) mod tree;
use crate::ctx::Context;
use crate::dbs::{Iterable, Iterator, Options, Transaction};
use crate::err::Error;
use crate::idx::planner::executor::{
InnerQueryExecutor, IteratorEntry, IteratorRef, QueryExecutor,
};
use crate::idx::planner::executor::{InnerQueryExecutor, IteratorEntry, QueryExecutor};
use crate::idx::planner::iterators::IteratorRef;
use crate::idx::planner::knn::KnnBruteForceResults;
use crate::idx::planner::plan::{Plan, PlanBuilder};
use crate::idx::planner::tree::Tree;
use crate::sql::with::With;
use crate::sql::{Cond, Expression, Table, Thing};
use crate::sql::{Cond, Table};
use reblessive::tree::Stk;
use std::collections::{HashMap, HashSet};
use std::collections::HashMap;
use std::sync::atomic::{AtomicU8, Ordering};
use std::sync::Arc;
pub(crate) struct QueryPlanner<'a> {
opt: &'a Options,
@ -66,6 +67,8 @@ impl<'a> QueryPlanner<'a> {
&t,
tree.index_map,
tree.knn_expressions,
tree.knn_brute_force_expressions,
tree.knn_condition,
)
.await?;
match PlanBuilder::build(tree.root, self.with, tree.with_indexes)? {
@ -149,27 +152,24 @@ impl<'a> QueryPlanner<'a> {
let pos = self.iteration_index.fetch_add(1, Ordering::Relaxed);
match self.iteration_workflow.get(pos as usize) {
Some(IterationStage::BuildKnn) => {
Some(IterationStage::Iterate(Some(self.build_knn_sets().await)))
Some(IterationStage::Iterate(Some(self.build_bruteforce_knn_results().await)))
}
is => is.cloned(),
}
}
async fn build_knn_sets(&self) -> KnnSets {
async fn build_bruteforce_knn_results(&self) -> KnnBruteForceResults {
let mut results = HashMap::with_capacity(self.executors.len());
for (tb, exe) in &self.executors {
results.insert(tb.clone(), exe.build_knn_set().await);
results.insert(tb.clone(), exe.build_bruteforce_knn_result().await);
}
Arc::new(results)
results.into()
}
}
pub(crate) type KnnSet = HashMap<Arc<Expression>, HashSet<Arc<Thing>>>;
pub(crate) type KnnSets = Arc<HashMap<String, KnnSet>>;
#[derive(Clone)]
pub(crate) enum IterationStage {
Iterate(Option<KnnSets>),
Iterate(Option<KnnBruteForceResults>),
CollectKnn,
BuildKnn,
}

View file

@ -3,7 +3,7 @@ use crate::idx::ft::MatchRef;
use crate::idx::planner::tree::{GroupRef, IdiomPosition, IndexRef, Node};
use crate::sql::statements::DefineIndexStatement;
use crate::sql::with::With;
use crate::sql::{Array, Expression, Idiom, Object};
use crate::sql::{Array, Expression, Idiom, Number, Object};
use crate::sql::{Operator, Value};
use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, HashMap, HashSet};
@ -166,7 +166,7 @@ pub(super) enum Plan {
#[derive(Debug, Eq, PartialEq, Hash, Clone)]
pub(super) struct IndexOption {
/// A reference to the index definition
ir: IndexRef,
ix_ref: IndexRef,
id: Idiom,
id_pos: IdiomPosition,
op: Arc<IndexOperator>,
@ -180,14 +180,19 @@ pub(super) enum IndexOperator {
Join(Vec<IndexOption>),
RangePart(Operator, Value),
Matches(String, Option<MatchRef>),
Knn(Array, u32),
Ann(Array, usize, usize),
Knn(Arc<Vec<Number>>, u32),
Ann(Arc<Vec<Number>>, u32, u32),
}
impl IndexOption {
pub(super) fn new(ir: IndexRef, id: Idiom, id_pos: IdiomPosition, op: IndexOperator) -> Self {
pub(super) fn new(
ix_ref: IndexRef,
id: Idiom,
id_pos: IdiomPosition,
op: IndexOperator,
) -> Self {
Self {
ir,
ix_ref,
id,
id_pos,
op: Arc::new(op),
@ -199,7 +204,7 @@ impl IndexOption {
}
pub(super) fn ix_ref(&self) -> IndexRef {
self.ir
self.ix_ref
}
pub(super) fn op(&self) -> &IndexOperator {
@ -225,7 +230,7 @@ impl IndexOption {
pub(crate) fn explain(&self, ix_def: &[DefineIndexStatement]) -> Value {
let mut e = HashMap::new();
if let Some(ix) = ix_def.get(self.ir as usize) {
if let Some(ix) = ix_def.get(self.ix_ref as usize) {
e.insert("index", Value::from(ix.name.0.to_owned()));
}
match self.op() {
@ -259,12 +264,16 @@ impl IndexOption {
e.insert("value", v.to_owned());
}
IndexOperator::Knn(a, k) => {
e.insert("operator", Value::from(format!("<{}>", k)));
e.insert("value", Value::Array(a.clone()));
let op = Value::from(Operator::Knn(*k, None).to_string());
let val = Value::Array(Array::from(a.as_ref().clone()));
e.insert("operator", op);
e.insert("value", val);
}
IndexOperator::Ann(a, n, ef) => {
e.insert("operator", Value::from(format!("<{},{}>", n, ef)));
e.insert("value", Value::Array(a.clone()));
IndexOperator::Ann(a, k, ef) => {
let op = Value::from(Operator::Ann(*k, *ef).to_string());
let val = Value::Array(Array::from(a.as_ref().clone()));
e.insert("operator", op);
e.insert("value", val);
}
};
Value::from(e)

View file

@ -0,0 +1,232 @@
use crate::idx::planner::executor::KnnExpressions;
use crate::sql::{
Array, Cast, Cond, Expression, Function, Id, Idiom, Model, Object, Part, Range, Thing, Value,
};
use std::collections::BTreeMap;
use std::ops::Bound;
/// Rewrites a `WHERE` condition, replacing every expression that belongs to
/// the collected KNN expression set with a literal `true`.
///
/// The KNN expressions are evaluated separately by the planner (brute-force
/// KNN pass); substituting `true` here lets the remainder of the condition be
/// evaluated as usual without re-running the KNN match.
pub(super) struct KnnConditionRewriter<'a>(&'a KnnExpressions);

impl<'a> KnnConditionRewriter<'a> {
	// This function rebuilds the same condition, but replaces any KnnExpression with a `true` value
	pub(super) fn build(expressions: &'a KnnExpressions, cond: &Cond) -> Option<Cond> {
		let b = Self(expressions);
		b.eval_value(&cond.0).map(Cond)
	}

	/// Rebuilds a value. Returns `None` when the value contains a construct
	/// that cannot be rewritten (edges, blocks, futures, subqueries, graph
	/// parts), which aborts the whole rewrite in `build`.
	fn eval_value(&self, v: &Value) -> Option<Value> {
		match v {
			Value::Array(a) => self.eval_value_array(a),
			Value::Object(o) => self.eval_value_object(o),
			Value::Thing(t) => self.eval_value_thing(t),
			Value::Idiom(i) => self.eval_value_idiom(i),
			Value::Cast(c) => self.eval_value_cast(c),
			Value::Range(r) => self.eval_value_range(r),
			// These constructs can't be rewritten: give up on the rewrite.
			Value::Edges(_)
			| Value::Block(_)
			| Value::Future(_)
			| Value::Subquery(_)
			| Value::Query(_) => None,
			Value::Function(f) => self.eval_value_function(f),
			Value::Expression(e) => self.eval_value_expression(e),
			Value::Model(m) => self.eval_value_model(m),
			// Scalars and other leaf values are kept as-is.
			Value::None
			| Value::Null
			| Value::Bool(_)
			| Value::Number(_)
			| Value::Strand(_)
			| Value::Duration(_)
			| Value::Datetime(_)
			| Value::Uuid(_)
			| Value::Geometry(_)
			| Value::Bytes(_)
			| Value::Param(_)
			| Value::Table(_)
			| Value::Mock(_)
			| Value::Regex(_)
			| Value::Constant(_) => Some(v.clone()),
		}
	}

	fn eval_value_array(&self, a: &Array) -> Option<Value> {
		self.eval_array(a).map(|a| a.into())
	}

	fn eval_array(&self, a: &Array) -> Option<Array> {
		self.eval_values(&a.0).map(|v| v.into())
	}

	/// Rebuilds each value in the slice; fails fast on the first `None`.
	fn eval_values(&self, values: &[Value]) -> Option<Vec<Value>> {
		let mut new_vec = Vec::with_capacity(values.len());
		for v in values {
			if let Some(v) = self.eval_value(v) {
				new_vec.push(v);
			} else {
				return None;
			}
		}
		Some(new_vec)
	}

	fn eval_value_object(&self, o: &Object) -> Option<Value> {
		self.eval_object(o).map(|o| o.into())
	}

	/// Rebuilds every entry of the object; fails fast on the first `None`.
	fn eval_object(&self, o: &Object) -> Option<Object> {
		let mut new_o = BTreeMap::new();
		for (k, v) in &o.0 {
			if let Some(v) = self.eval_value(v) {
				new_o.insert(k.to_owned(), v);
			} else {
				return None;
			}
		}
		Some(new_o.into())
	}

	fn eval_value_thing(&self, t: &Thing) -> Option<Value> {
		self.eval_thing(t).map(|t| t.into())
	}

	fn eval_thing(&self, t: &Thing) -> Option<Thing> {
		self.eval_id(&t.id).map(|id| Thing {
			tb: t.tb.clone(),
			id,
		})
	}

	fn eval_id(&self, id: &Id) -> Option<Id> {
		match id {
			Id::Number(_) | Id::String(_) | Id::Generate(_) => Some(id.clone()),
			Id::Array(a) => self.eval_array(a).map(Id::Array),
			Id::Object(o) => self.eval_object(o).map(Id::Object),
		}
	}

	fn eval_value_idiom(&self, i: &Idiom) -> Option<Value> {
		self.eval_idiom(i).map(|i| i.into())
	}

	/// Rebuilds each part of the idiom; fails fast on the first `None`.
	fn eval_idiom(&self, i: &Idiom) -> Option<Idiom> {
		let mut new_i = Vec::with_capacity(i.0.len());
		for p in &i.0 {
			if let Some(p) = self.eval_part(p) {
				new_i.push(p);
			} else {
				return None;
			}
		}
		Some(new_i.into())
	}

	fn eval_part(&self, p: &Part) -> Option<Part> {
		match p {
			Part::All
			| Part::Flatten
			| Part::Last
			| Part::First
			| Part::Field(_)
			| Part::Index(_) => Some(p.clone()),
			Part::Where(v) => self.eval_value(v).map(Part::Where),
			// Graph traversals can't be rewritten.
			Part::Graph(_) => None,
			Part::Value(v) => self.eval_value(v).map(Part::Value),
			Part::Start(v) => self.eval_value(v).map(Part::Start),
			Part::Method(n, p) => self.eval_values(p).map(|v| Part::Method(n.clone(), v)),
		}
	}

	fn eval_value_cast(&self, c: &Cast) -> Option<Value> {
		self.eval_cast(c).map(|c| c.into())
	}

	fn eval_cast(&self, c: &Cast) -> Option<Cast> {
		self.eval_value(&c.1).map(|v| Cast(c.0.clone(), v))
	}

	fn eval_value_range(&self, r: &Range) -> Option<Value> {
		self.eval_range(r).map(|r| r.into())
	}

	fn eval_range(&self, r: &Range) -> Option<Range> {
		if let Some(beg) = self.eval_bound(&r.beg) {
			self.eval_bound(&r.end).map(|end| Range {
				tb: r.tb.clone(),
				beg,
				end,
			})
		} else {
			None
		}
	}

	fn eval_bound(&self, b: &Bound<Id>) -> Option<Bound<Id>> {
		match b {
			Bound::Included(id) => self.eval_id(id).map(Bound::Included),
			Bound::Excluded(id) => self.eval_id(id).map(Bound::Excluded),
			Bound::Unbounded => Some(Bound::Unbounded),
		}
	}

	fn eval_value_function(&self, f: &Function) -> Option<Value> {
		self.eval_function(f).map(|f| f.into())
	}

	/// Rebuilds the function's arguments, preserving its name and kind.
	fn eval_function(&self, f: &Function) -> Option<Function> {
		match f {
			Function::Normal(s, args) => {
				self.eval_values(args).map(|args| Function::Normal(s.clone(), args))
			}
			Function::Custom(s, args) => {
				self.eval_values(args).map(|args| Function::Custom(s.clone(), args))
			}
			Function::Script(s, args) => {
				self.eval_values(args).map(|args| Function::Script(s.clone(), args))
			}
		}
	}

	fn eval_value_model(&self, m: &Model) -> Option<Value> {
		self.eval_model(m).map(|m| m.into())
	}

	fn eval_model(&self, m: &Model) -> Option<Model> {
		self.eval_values(&m.args).map(|args| Model {
			name: m.name.clone(),
			version: m.version.clone(),
			args,
		})
	}

	/// The core substitution: an expression registered as a KNN expression
	/// becomes `true`; any other expression is rebuilt recursively.
	fn eval_value_expression(&self, e: &Expression) -> Option<Value> {
		if self.0.contains(e) {
			return Some(Value::Bool(true));
		}
		self.eval_expression(e).map(|e| e.into())
	}

	fn eval_expression(&self, e: &Expression) -> Option<Expression> {
		match e {
			Expression::Unary {
				o,
				v,
			} => self.eval_value(v).map(|v| Expression::Unary {
				o: o.clone(),
				v,
			}),
			Expression::Binary {
				l,
				o,
				r,
			} => {
				if let Some(l) = self.eval_value(l) {
					self.eval_value(r).map(|r| Expression::Binary {
						l,
						o: o.clone(),
						r,
					})
				} else {
					None
				}
			}
		}
	}
}

View file

@ -1,10 +1,13 @@
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::planner::executor::{AnnExpressions, KnnExpressions};
use crate::idx::planner::executor::{
KnnBruteForceExpression, KnnBruteForceExpressions, KnnExpressions,
};
use crate::idx::planner::plan::{IndexOperator, IndexOption};
use crate::idx::planner::rewriter::KnnConditionRewriter;
use crate::kvs;
use crate::sql::index::{Distance, Index};
use crate::sql::index::Index;
use crate::sql::statements::{DefineFieldStatement, DefineIndexStatement};
use crate::sql::{
Array, Cond, Expression, Idiom, Kind, Number, Operator, Part, Subquery, Table, Value, With,
@ -18,6 +21,8 @@ pub(super) struct Tree {
pub(super) index_map: IndexesMap,
pub(super) with_indexes: Vec<IndexRef>,
pub(super) knn_expressions: KnnExpressions,
pub(super) knn_brute_force_expressions: KnnBruteForceExpressions,
pub(super) knn_condition: Option<Cond>,
}
impl Tree {
@ -35,11 +40,18 @@ impl Tree {
let mut b = TreeBuilder::new(ctx, opt, txn, table, with);
if let Some(cond) = cond {
let root = b.eval_value(stk, 0, &cond.0).await?;
let knn_condition = if b.knn_expressions.is_empty() {
None
} else {
KnnConditionRewriter::build(&b.knn_expressions, cond)
};
Ok(Some(Self {
root,
index_map: b.index_map,
with_indexes: b.with_indexes,
knn_expressions: b.knn_expressions,
knn_brute_force_expressions: b.knn_brute_force_expressions,
knn_condition,
}))
} else {
Ok(None)
@ -59,8 +71,8 @@ struct TreeBuilder<'a> {
resolved_idioms: HashMap<Idiom, Node>,
index_map: IndexesMap,
with_indexes: Vec<IndexRef>,
knn_brute_force_expressions: HashMap<Arc<Expression>, KnnBruteForceExpression>,
knn_expressions: KnnExpressions,
ann_expressions: AnnExpressions,
idioms_record_options: HashMap<Idiom, RecordOptions>,
group_sequence: GroupRef,
}
@ -98,8 +110,8 @@ impl<'a> TreeBuilder<'a> {
resolved_idioms: Default::default(),
index_map: Default::default(),
with_indexes,
knn_brute_force_expressions: Default::default(),
knn_expressions: Default::default(),
ann_expressions: Default::default(),
idioms_record_options: Default::default(),
group_sequence: 0,
}
@ -201,9 +213,11 @@ impl<'a> TreeBuilder<'a> {
}
// Try to detect an indexed record field
if let Some(ro) = self.resolve_record_field(&mut tx, schema.fields.as_ref(), i).await? {
drop(tx);
return Ok(Node::RecordField(i.clone(), ro));
}
}
drop(tx);
Ok(Node::NonIndexedField(i.clone()))
}
@ -329,9 +343,9 @@ impl<'a> TreeBuilder<'a> {
remote_irs,
)?;
} else if let Some(id) = left.is_non_indexed_field() {
self.eval_knn(id, &right, &exp)?;
self.eval_bruteforce_knn(id, &right, &exp)?;
} else if let Some(id) = right.is_non_indexed_field() {
self.eval_knn(id, &left, &exp)?;
self.eval_bruteforce_knn(id, &left, &exp)?;
}
let re = ResolvedExpression {
group,
@ -393,8 +407,8 @@ impl<'a> TreeBuilder<'a> {
Index::Search {
..
} => Self::eval_matches_operator(op, n),
Index::MTree(_) => self.eval_indexed_knn(e, op, n, id)?,
Index::Hnsw(_) => self.eval_indexed_ann(e, op, n, id)?,
Index::MTree(_) => self.eval_mtree_knn(e, op, n)?,
Index::Hnsw(_) => self.eval_hnsw_knn(e, op, n)?,
};
if let Some(op) = op {
let io = IndexOption::new(*ir, id.clone(), p, op);
@ -427,61 +441,51 @@ impl<'a> TreeBuilder<'a> {
None
}
fn eval_indexed_knn(
fn eval_mtree_knn(
&mut self,
exp: &Arc<Expression>,
op: &Operator,
n: &Node,
id: &Idiom,
) -> Result<Option<IndexOperator>, Error> {
if let Operator::Knn(k, d) = op {
if let Operator::Knn(k, None) = op {
if let Node::Computed(v) = n {
let vec: Vec<Number> = v.as_ref().try_into()?;
self.knn_expressions.insert(
exp.clone(),
(*k, id.clone(), Arc::new(vec), d.clone().unwrap_or(Distance::Euclidean)),
);
if let Value::Array(a) = v.as_ref() {
match d {
None | Some(Distance::Euclidean) | Some(Distance::Manhattan) => {
return Ok(Some(IndexOperator::Knn(a.clone(), *k)))
}
_ => {}
}
}
let vec: Arc<Vec<Number>> = Arc::new(v.as_ref().try_into()?);
self.knn_expressions.insert(exp.clone());
return Ok(Some(IndexOperator::Knn(vec, *k)));
}
}
Ok(None)
}
fn eval_indexed_ann(
fn eval_hnsw_knn(
&mut self,
exp: &Arc<Expression>,
op: &Operator,
nd: &Node,
id: &Idiom,
n: &Node,
) -> Result<Option<IndexOperator>, Error> {
if let Operator::Ann(n, ef) = op {
if let Node::Computed(v) = nd {
let vec: Vec<Number> = v.as_ref().try_into()?;
let n = *n as usize;
let ef = *ef as usize;
self.ann_expressions.insert(exp.clone(), (n, id.clone(), Arc::new(vec), ef));
if let Value::Array(a) = v.as_ref() {
return Ok(Some(IndexOperator::Ann(a.clone(), n, ef)));
}
if let Operator::Ann(k, ef) = op {
if let Node::Computed(v) = n {
let vec: Arc<Vec<Number>> = Arc::new(v.as_ref().try_into()?);
self.knn_expressions.insert(exp.clone());
return Ok(Some(IndexOperator::Ann(vec, *k, *ef)));
}
}
Ok(None)
}
fn eval_knn(&mut self, id: &Idiom, val: &Node, exp: &Arc<Expression>) -> Result<(), Error> {
if let Operator::Knn(k, d) = exp.operator() {
fn eval_bruteforce_knn(
&mut self,
id: &Idiom,
val: &Node,
exp: &Arc<Expression>,
) -> Result<(), Error> {
if let Operator::Knn(k, Some(d)) = exp.operator() {
if let Node::Computed(v) = val {
let vec: Vec<Number> = v.as_ref().try_into()?;
self.knn_expressions.insert(
let vec: Arc<Vec<Number>> = Arc::new(v.as_ref().try_into()?);
self.knn_expressions.insert(exp.clone());
self.knn_brute_force_expressions.insert(
exp.clone(),
(*k, id.clone(), Arc::new(vec), d.clone().unwrap_or(Distance::Euclidean)),
KnnBruteForceExpression::new(*k, id.clone(), vec, d.clone()),
);
}
}

View file

@ -0,0 +1,56 @@
use crate::idx::docids::DocId;
use crate::kvs::Key;
use crate::sql::Thing;
use radix_trie::Trie;
use roaring::RoaringTreemap;
/// In-memory bidirectional mapping between record keys (`Thing`) and the
/// compact `DocId`s used by the HNSW index.
#[derive(Default)]
pub(in crate::idx) struct HnswDocs {
	// Maps the serialized record key to its assigned `DocId`.
	doc_ids: Trie<Key, DocId>,
	// Reverse lookup: the `DocId` is the index into this vector; removed
	// records leave a `None` slot behind.
	ids_doc: Vec<Option<Thing>>,
	// `DocId`s freed by `remove`, available for reuse by `resolve`.
	available: RoaringTreemap,
}
impl HnswDocs {
	/// Returns the `DocId` associated with `rid`, allocating one (possibly
	/// recycling the id of a previously removed record) if the record has
	/// not been seen before.
	pub(super) fn resolve(&mut self, rid: &Thing) -> DocId {
		let doc_key: Key = rid.into();
		if let Some(doc_id) = self.doc_ids.get(&doc_key) {
			*doc_id
		} else {
			let doc_id = self.next_doc_id();
			let n = doc_id as usize;
			if n < self.ids_doc.len() {
				// Recycled id: write the record into the vacated slot so that
				// `get_thing(doc_id)` resolves to this record. (Previously a
				// new entry was pushed instead, leaving the recycled slot
				// `None` and the pushed entry unreachable.)
				self.ids_doc[n] = Some(rid.clone());
			} else {
				// Fresh id: it is exactly `ids_doc.len()`, so push.
				self.ids_doc.push(Some(rid.clone()));
			}
			self.doc_ids.insert(doc_key, doc_id);
			doc_id
		}
	}

	/// Picks the smallest recycled id if any, otherwise the next fresh id.
	fn next_doc_id(&mut self) -> DocId {
		if let Some(doc_id) = self.available.min() {
			self.available.remove(doc_id);
			doc_id
		} else {
			self.ids_doc.len() as DocId
		}
	}

	/// Returns the record key (`Thing`) for `doc_id`, or `None` if the id
	/// is unknown or its record has been removed.
	pub(in crate::idx) fn get_thing(&self, doc_id: DocId) -> Option<&Thing> {
		self.ids_doc.get(doc_id as usize).and_then(|r| r.as_ref())
	}

	/// Removes `rid`, returning its `DocId` if it was present. The slot is
	/// cleared and the id is recycled for future `resolve` calls.
	pub(super) fn remove(&mut self, rid: &Thing) -> Option<DocId> {
		let doc_key: Key = rid.into();
		if let Some(doc_id) = self.doc_ids.remove(&doc_key) {
			let n = doc_id as usize;
			if n < self.ids_doc.len() {
				self.ids_doc[n] = None;
			}
			self.available.insert(doc_id);
			Some(doc_id)
		} else {
			None
		}
	}
}

View file

@ -0,0 +1,58 @@
use crate::idx::trees::hnsw::ElementId;
use crate::idx::trees::vector::SharedVector;
use crate::sql::index::Distance;
use hashbrown::HashMap;
/// In-memory store of the vectors indexed by HNSW, keyed by `ElementId`,
/// together with the distance metric used to compare them.
pub(super) struct HnswElements {
	// All inserted vectors, keyed by their element id.
	elements: HashMap<ElementId, SharedVector>,
	// Id that will be handed out to the next inserted element.
	next_element_id: ElementId,
	// Distance metric shared by all distance computations.
	dist: Distance,
}
impl HnswElements {
	/// Creates an empty element store using `dist` as the distance metric.
	pub(super) fn new(dist: Distance) -> Self {
		Self {
			elements: Default::default(),
			next_element_id: 0,
			dist,
		}
	}

	/// The id that will be assigned to the next inserted element.
	pub(super) fn next_element_id(&self) -> ElementId {
		self.next_element_id
	}

	#[cfg(test)]
	pub(super) fn len(&self) -> usize {
		self.elements.len()
	}

	#[cfg(test)]
	pub(super) fn contains(&self, e_id: &ElementId) -> bool {
		self.elements.contains_key(e_id)
	}

	/// Advances the id counter once an element id has been consumed.
	pub(super) fn inc_next_element_id(&mut self) {
		self.next_element_id += 1;
	}

	/// Stores `pt` under `id`, replacing any previous vector with that id.
	pub(super) fn insert(&mut self, id: ElementId, pt: SharedVector) {
		self.elements.insert(id, pt);
	}

	/// Returns the vector stored under `e_id`, if any.
	pub(super) fn get_vector(&self, e_id: &ElementId) -> Option<&SharedVector> {
		self.elements.get(e_id)
	}

	/// Computes the distance between two vectors with the configured metric.
	pub(super) fn distance(&self, a: &SharedVector, b: &SharedVector) -> f64 {
		self.dist.calculate(a, b)
	}

	/// Distance between `q` and the stored element `e_id`, or `None` if the
	/// element is unknown. Delegates to `get_vector` and `distance`.
	pub(super) fn get_distance(&self, q: &SharedVector, e_id: &ElementId) -> Option<f64> {
		self.get_vector(e_id).map(|e_pt| self.distance(e_pt, q))
	}

	/// Drops the element `e_id` from the store, if present.
	pub(super) fn remove(&mut self, e_id: &ElementId) {
		self.elements.remove(e_id);
	}
}

View file

@ -0,0 +1,201 @@
use crate::err::Error;
use crate::idx::planner::checker::HnswConditionChecker;
use crate::idx::trees::dynamicset::{ArraySet, HashBrownSet};
use crate::idx::trees::hnsw::docs::HnswDocs;
use crate::idx::trees::hnsw::index::VecDocs;
use crate::idx::trees::hnsw::{ElementId, Hnsw, HnswSearch};
use crate::idx::trees::vector::SharedVector;
use crate::sql::index::HnswParams;
use reblessive::tree::Stk;
pub(super) type ASet<const N: usize> = ArraySet<ElementId, N>;
pub(super) type HSet = HashBrownSet<ElementId>;
/// Statically-dispatched HNSW variants. Neighbourhood-set sizes are bounded
/// by the index parameters `m` (upper layers) and `m0` (layer 0); small
/// bounds use fixed-capacity `ArraySet`s, larger ones fall back to a hash
/// set. Variant names encode the two capacities (see `HnswFlavor::new` for
/// the exact parameter-range → variant mapping); `set` denotes the
/// hash-set fallback.
pub(super) enum HnswFlavor {
	H5_9(Hnsw<ASet<9>, ASet<5>>),
	H5_17(Hnsw<ASet<17>, ASet<5>>),
	H5_25(Hnsw<ASet<25>, ASet<5>>),
	H5set(Hnsw<HSet, ASet<5>>),
	H9_17(Hnsw<ASet<17>, ASet<9>>),
	H9_25(Hnsw<ASet<25>, ASet<9>>),
	H9set(Hnsw<HSet, ASet<9>>),
	H13_25(Hnsw<ASet<25>, ASet<13>>),
	H13set(Hnsw<HSet, ASet<13>>),
	H17set(Hnsw<HSet, ASet<17>>),
	H21set(Hnsw<HSet, ASet<21>>),
	H25set(Hnsw<HSet, ASet<25>>),
	H29set(Hnsw<HSet, ASet<29>>),
	Hset(Hnsw<HSet, HSet>),
}
impl HnswFlavor {
pub(super) fn new(p: &HnswParams) -> Self {
match p.m {
1..=4 => match p.m0 {
1..=8 => Self::H5_9(Hnsw::<ASet<9>, ASet<5>>::new(p)),
9..=16 => Self::H5_17(Hnsw::<ASet<17>, ASet<5>>::new(p)),
17..=24 => Self::H5_25(Hnsw::<ASet<25>, ASet<5>>::new(p)),
_ => Self::H5set(Hnsw::<HSet, ASet<5>>::new(p)),
},
5..=8 => match p.m0 {
1..=16 => Self::H9_17(Hnsw::<ASet<17>, ASet<9>>::new(p)),
17..=24 => Self::H9_25(Hnsw::<ASet<25>, ASet<9>>::new(p)),
_ => Self::H9set(Hnsw::<HSet, ASet<9>>::new(p)),
},
9..=12 => match p.m0 {
17..=24 => Self::H13_25(Hnsw::<ASet<25>, ASet<13>>::new(p)),
_ => Self::H13set(Hnsw::<HSet, ASet<13>>::new(p)),
},
13..=16 => Self::H17set(Hnsw::<HSet, ASet<17>>::new(p)),
17..=20 => Self::H21set(Hnsw::<HSet, ASet<21>>::new(p)),
21..=24 => Self::H25set(Hnsw::<HSet, ASet<25>>::new(p)),
25..=28 => Self::H29set(Hnsw::<HSet, ASet<29>>::new(p)),
_ => Self::Hset(Hnsw::<HSet, HSet>::new(p)),
}
}
pub(super) fn insert(&mut self, q_pt: SharedVector) -> ElementId {
match self {
HnswFlavor::H5_9(h) => h.insert(q_pt),
HnswFlavor::H5_17(h) => h.insert(q_pt),
HnswFlavor::H5_25(h) => h.insert(q_pt),
HnswFlavor::H5set(h) => h.insert(q_pt),
HnswFlavor::H9_17(h) => h.insert(q_pt),
HnswFlavor::H9_25(h) => h.insert(q_pt),
HnswFlavor::H9set(h) => h.insert(q_pt),
HnswFlavor::H13_25(h) => h.insert(q_pt),
HnswFlavor::H13set(h) => h.insert(q_pt),
HnswFlavor::H17set(h) => h.insert(q_pt),
HnswFlavor::H21set(h) => h.insert(q_pt),
HnswFlavor::H25set(h) => h.insert(q_pt),
HnswFlavor::H29set(h) => h.insert(q_pt),
HnswFlavor::Hset(h) => h.insert(q_pt),
}
}
pub(super) fn remove(&mut self, e_id: ElementId) -> bool {
match self {
HnswFlavor::H5_9(h) => h.remove(e_id),
HnswFlavor::H5_17(h) => h.remove(e_id),
HnswFlavor::H5_25(h) => h.remove(e_id),
HnswFlavor::H5set(h) => h.remove(e_id),
HnswFlavor::H9_17(h) => h.remove(e_id),
HnswFlavor::H9_25(h) => h.remove(e_id),
HnswFlavor::H9set(h) => h.remove(e_id),
HnswFlavor::H13_25(h) => h.remove(e_id),
HnswFlavor::H13set(h) => h.remove(e_id),
HnswFlavor::H17set(h) => h.remove(e_id),
HnswFlavor::H21set(h) => h.remove(e_id),
HnswFlavor::H25set(h) => h.remove(e_id),
HnswFlavor::H29set(h) => h.remove(e_id),
HnswFlavor::Hset(h) => h.remove(e_id),
}
}
pub(super) fn knn_search(&self, search: &HnswSearch) -> Vec<(f64, ElementId)> {
match self {
HnswFlavor::H5_9(h) => h.knn_search(search),
HnswFlavor::H5_17(h) => h.knn_search(search),
HnswFlavor::H5_25(h) => h.knn_search(search),
HnswFlavor::H5set(h) => h.knn_search(search),
HnswFlavor::H9_17(h) => h.knn_search(search),
HnswFlavor::H9_25(h) => h.knn_search(search),
HnswFlavor::H9set(h) => h.knn_search(search),
HnswFlavor::H13_25(h) => h.knn_search(search),
HnswFlavor::H13set(h) => h.knn_search(search),
HnswFlavor::H17set(h) => h.knn_search(search),
HnswFlavor::H21set(h) => h.knn_search(search),
HnswFlavor::H25set(h) => h.knn_search(search),
HnswFlavor::H29set(h) => h.knn_search(search),
HnswFlavor::Hset(h) => h.knn_search(search),
}
}
pub(super) async fn knn_search_checked(
&self,
search: &HnswSearch,
hnsw_docs: &HnswDocs,
vec_docs: &VecDocs,
stk: &mut Stk,
chk: &mut HnswConditionChecker<'_>,
) -> Result<Vec<(f64, ElementId)>, Error> {
match self {
HnswFlavor::H5_9(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H5_17(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H5_25(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H5set(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H9_17(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H9_25(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H9set(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H13_25(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H13set(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H17set(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H21set(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H25set(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::H29set(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
HnswFlavor::Hset(h) => {
h.knn_search_checked(search, hnsw_docs, vec_docs, stk, chk).await
}
}
}
pub(super) fn get_vector(&self, e_id: &ElementId) -> Option<&SharedVector> {
match self {
HnswFlavor::H5_9(h) => h.get_vector(e_id),
HnswFlavor::H5_17(h) => h.get_vector(e_id),
HnswFlavor::H5_25(h) => h.get_vector(e_id),
HnswFlavor::H5set(h) => h.get_vector(e_id),
HnswFlavor::H9_17(h) => h.get_vector(e_id),
HnswFlavor::H9_25(h) => h.get_vector(e_id),
HnswFlavor::H9set(h) => h.get_vector(e_id),
HnswFlavor::H13_25(h) => h.get_vector(e_id),
HnswFlavor::H13set(h) => h.get_vector(e_id),
HnswFlavor::H17set(h) => h.get_vector(e_id),
HnswFlavor::H21set(h) => h.get_vector(e_id),
HnswFlavor::H25set(h) => h.get_vector(e_id),
HnswFlavor::H29set(h) => h.get_vector(e_id),
HnswFlavor::Hset(h) => h.get_vector(e_id),
}
}
#[cfg(test)]
pub(super) fn check_hnsw_properties(&self, expected_count: usize) {
match self {
HnswFlavor::H5_9(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H5_17(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H5_25(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H5set(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H9_17(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H9_25(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H9set(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H13_25(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H13set(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H17set(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H21set(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H25set(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::H29set(h) => h.check_hnsw_properties(expected_count),
HnswFlavor::Hset(h) => h.check_hnsw_properties(expected_count),
}
}
}

View file

@ -0,0 +1,208 @@
use crate::err::Error;
use crate::idx::docids::DocId;
use crate::idx::planner::checker::HnswConditionChecker;
use crate::idx::planner::iterators::KnnIteratorResult;
use crate::idx::trees::hnsw::docs::HnswDocs;
use crate::idx::trees::hnsw::elements::HnswElements;
use crate::idx::trees::hnsw::flavor::HnswFlavor;
use crate::idx::trees::hnsw::{ElementId, HnswSearch};
use crate::idx::trees::knn::{Ids64, KnnResult, KnnResultBuilder};
use crate::idx::trees::vector::{SharedVector, Vector};
use crate::sql::index::{HnswParams, VectorType};
use crate::sql::{Number, Thing, Value};
use hashbrown::hash_map::Entry;
use hashbrown::HashMap;
use reblessive::tree::Stk;
use std::collections::VecDeque;
/// In-memory HNSW index over record vectors.
///
/// Keeps the mapping between record ids and compact doc ids, the mapping
/// between distinct vectors and the documents holding them, and the
/// monomorphized HNSW graph itself.
pub struct HnswIndex {
	// Expected dimension of every indexed vector.
	dim: usize,
	// Numeric representation used when converting values into vectors.
	vector_type: VectorType,
	// The HNSW graph (concrete flavor chosen from the index parameters).
	hnsw: HnswFlavor,
	// Mapping between `Thing` record ids and `DocId`s.
	docs: HnswDocs,
	// For each distinct vector: the documents containing it and its element id.
	vec_docs: VecDocs,
}

/// Maps a vector to the set of documents holding it and its HNSW element id.
pub(super) type VecDocs = HashMap<SharedVector, (Ids64, ElementId)>;
/// Read-only state shared by the condition-checked HNSW search routines.
pub(super) struct HnswCheckedSearchContext<'a> {
	elements: &'a HnswElements,
	docs: &'a HnswDocs,
	vec_docs: &'a VecDocs,
	pt: &'a SharedVector,
	ef: usize,
}

impl<'a> HnswCheckedSearchContext<'a> {
	/// Bundles the borrowed search state into a single context.
	pub(super) fn new(
		elements: &'a HnswElements,
		docs: &'a HnswDocs,
		vec_docs: &'a VecDocs,
		pt: &'a SharedVector,
		ef: usize,
	) -> Self {
		Self {
			elements,
			docs,
			vec_docs,
			pt,
			ef,
		}
	}

	/// The query point being searched for.
	pub(super) fn pt(&self) -> &SharedVector {
		self.pt
	}

	/// Size of the dynamic candidate list.
	pub(super) fn ef(&self) -> usize {
		self.ef
	}

	/// The document store of the index.
	pub(super) fn docs(&self) -> &HnswDocs {
		self.docs
	}

	/// Documents associated with the given vector, if it is indexed.
	pub(super) fn get_docs(&self, pt: &SharedVector) -> Option<&Ids64> {
		let (doc_ids, _) = self.vec_docs.get(pt)?;
		Some(doc_ids)
	}

	/// The element store of the underlying HNSW graph.
	pub(super) fn elements(&self) -> &HnswElements {
		self.elements
	}
}
impl HnswIndex {
	/// Creates an empty index configured from the HNSW parameters.
	pub fn new(p: &HnswParams) -> Self {
		Self {
			dim: p.dimension as usize,
			vector_type: p.vector_type,
			hnsw: HnswFlavor::new(p),
			docs: HnswDocs::default(),
			vec_docs: HashMap::default(),
		}
	}

	/// Indexes every value of `content` as a vector attached to the record `rid`.
	///
	/// Fails if a value cannot be converted to a vector of the configured
	/// type, or if its dimension does not match the index.
	pub fn index_document(&mut self, rid: &Thing, content: &Vec<Value>) -> Result<(), Error> {
		// Resolve the doc_id
		let doc_id = self.docs.resolve(rid);
		// Index the values
		for value in content {
			// Extract the vector
			let vector = Vector::try_from_value(self.vector_type, self.dim, value)?;
			vector.check_dimension(self.dim)?;
			self.insert(vector.into(), doc_id);
		}
		Ok(())
	}

	/// Associates document `d` with vector `o`.
	///
	/// The vector is inserted into the HNSW graph only the first time it is
	/// seen; later insertions of the same vector just extend its document set.
	pub(super) fn insert(&mut self, o: SharedVector, d: DocId) {
		match self.vec_docs.entry(o) {
			Entry::Occupied(mut e) => {
				// The vector is already indexed: add the document to its set.
				let (docs, element_id) = e.get_mut();
				if let Some(new_docs) = docs.insert(d) {
					// `Ids64::insert` returns a new variant when it has to
					// change representation; store it back in the entry.
					let element_id = *element_id;
					e.insert((new_docs, element_id));
				}
			}
			Entry::Vacant(e) => {
				// First occurrence of this vector: add it to the HNSW graph.
				let o = e.key().clone();
				let element_id = self.hnsw.insert(o);
				e.insert((Ids64::One(d), element_id));
			}
		}
	}

	/// Detaches document `d` from vector `o`.
	///
	/// The vector is removed from the HNSW graph once no document refers
	/// to it anymore.
	pub(super) fn remove(&mut self, o: SharedVector, d: DocId) {
		if let Entry::Occupied(mut e) = self.vec_docs.entry(o) {
			let (docs, e_id) = e.get_mut();
			if let Some(new_docs) = docs.remove(d) {
				let e_id = *e_id;
				if new_docs.is_empty() {
					// Last document referencing this vector: drop the graph
					// element as well.
					e.remove();
					self.hnsw.remove(e_id);
				} else {
					e.insert((new_docs, e_id));
				}
			}
		}
	}

	/// Removes every vector of `content` indexed for the record `rid`.
	/// A no-op if the record was never indexed.
	pub(crate) fn remove_document(
		&mut self,
		rid: &Thing,
		content: &Vec<Value>,
	) -> Result<(), Error> {
		if let Some(doc_id) = self.docs.remove(rid) {
			for v in content {
				// Extract the vector
				let vector = Vector::try_from_value(self.vector_type, self.dim, v)?;
				vector.check_dimension(self.dim)?;
				// Remove the vector
				self.remove(vector.into(), doc_id);
			}
		}
		Ok(())
	}

	/// Runs a k-nearest-neighbour search for the query point `pt`.
	///
	/// `ef` is the size of the dynamic candidate list; `chk` optionally
	/// filters candidates against the query condition and converts the
	/// resulting doc ids back into record results.
	pub async fn knn_search(
		&self,
		pt: &[Number],
		k: usize,
		ef: usize,
		stk: &mut Stk,
		mut chk: HnswConditionChecker<'_>,
	) -> Result<VecDeque<KnnIteratorResult>, Error> {
		// Extract the vector
		let vector: SharedVector = Vector::try_from_vector(self.vector_type, pt)?.into();
		vector.check_dimension(self.dim)?;
		let search = HnswSearch::new(vector, k, ef);
		// Do the search
		let result = self.search(&search, stk, &mut chk).await?;
		let res = chk.convert_result(&self.docs, result.docs).await?;
		Ok(res)
	}

	/// Dispatches to the checked or unchecked graph traversal depending on
	/// the checker variant, then folds the neighbours into a `KnnResult`.
	pub(super) async fn search(
		&self,
		search: &HnswSearch,
		stk: &mut Stk,
		chk: &mut HnswConditionChecker<'_>,
	) -> Result<KnnResult, Error> {
		// Do the search
		let neighbors = match chk {
			HnswConditionChecker::Hnsw(_) => self.hnsw.knn_search(search),
			HnswConditionChecker::HnswCondition(_) => {
				self.hnsw.knn_search_checked(search, &self.docs, &self.vec_docs, stk, chk).await?
			}
		};
		Ok(self.build_result(neighbors, search.k, chk))
	}

	/// Converts raw `(distance, element)` neighbours into a doc-id result
	/// capped at `n` entries; docs evicted by the cap are reported to the
	/// checker so any cached state for them can be expired.
	fn build_result(
		&self,
		neighbors: Vec<(f64, ElementId)>,
		n: usize,
		chk: &mut HnswConditionChecker<'_>,
	) -> KnnResult {
		let mut builder = KnnResultBuilder::new(n);
		for (e_dist, e_id) in neighbors {
			if builder.check_add(e_dist) {
				if let Some(v) = self.hnsw.get_vector(&e_id) {
					if let Some((docs, _)) = self.vec_docs.get(v) {
						let evicted_docs = builder.add(e_dist, docs);
						chk.expires(evicted_docs);
					}
				}
			}
		}
		builder.build(
			#[cfg(debug_assertions)]
			HashMap::new(),
		)
	}

	#[cfg(test)]
	pub(super) fn check_hnsw_properties(&self, expected_count: usize) {
		self.hnsw.check_hnsw_properties(expected_count)
	}
}

View file

@ -1,10 +1,14 @@
use crate::err::Error;
use crate::idx::planner::checker::HnswConditionChecker;
use crate::idx::trees::dynamicset::DynamicSet;
use crate::idx::trees::graph::UndirectedGraph;
use crate::idx::trees::hnsw::heuristic::Heuristic;
use crate::idx::trees::hnsw::index::HnswCheckedSearchContext;
use crate::idx::trees::hnsw::{ElementId, HnswElements};
use crate::idx::trees::knn::DoublePriorityQueue;
use crate::idx::trees::vector::SharedVector;
use hashbrown::HashSet;
use reblessive::tree::Stk;
#[derive(Debug)]
pub(super) struct HnswLayer<S>
@ -40,7 +44,7 @@ where
pub(super) fn search_single(
&self,
elements: &HnswElements,
q: &SharedVector,
pt: &SharedVector,
ep_dist: f64,
ep_id: ElementId,
ef: usize,
@ -48,51 +52,67 @@ where
let visited = HashSet::from([ep_id]);
let candidates = DoublePriorityQueue::from(ep_dist, ep_id);
let w = candidates.clone();
self.search(elements, q, candidates, visited, w, ef)
self.search(elements, pt, candidates, visited, w, ef)
}
	/// Condition-checked counterpart of `search_single`: explores the layer
	/// starting from the entry point `ep_id`, only admitting into the result
	/// queue elements whose documents satisfy the condition checker `chk`.
	pub(super) async fn search_single_checked(
		&self,
		search: &HnswCheckedSearchContext<'_>,
		ep_pt: &SharedVector,
		ep_dist: f64,
		ep_id: ElementId,
		stk: &mut Stk,
		chk: &mut HnswConditionChecker<'_>,
	) -> Result<DoublePriorityQueue, Error> {
		// Mark the entry point as visited and seed the candidate queue with it.
		let visited = HashSet::from([ep_id]);
		let candidates = DoublePriorityQueue::from(ep_dist, ep_id);
		// Unlike the unchecked variant, the result queue only receives the
		// entry point if its documents pass the condition check.
		let mut w = DoublePriorityQueue::default();
		Self::add_if_truthy(search, &mut w, ep_pt, ep_dist, ep_id, stk, chk).await?;
		self.search_checked(search, candidates, visited, w, stk, chk).await
	}
pub(super) fn search_multi(
&self,
elements: &HnswElements,
q: &SharedVector,
pt: &SharedVector,
candidates: DoublePriorityQueue,
ef: usize,
) -> DoublePriorityQueue {
let w = candidates.clone();
let visited = w.to_set();
self.search(elements, q, candidates, visited, w, ef)
self.search(elements, pt, candidates, visited, w, ef)
}
pub(super) fn search_single_ignore_ep(
&self,
elements: &HnswElements,
q: &SharedVector,
pt: &SharedVector,
ep_id: ElementId,
) -> Option<(f64, ElementId)> {
let visited = HashSet::from([ep_id]);
let candidates = DoublePriorityQueue::from(0.0, ep_id);
let w = candidates.clone();
let q = self.search(elements, q, candidates, visited, w, 1);
let q = self.search(elements, pt, candidates, visited, w, 1);
q.peek_first()
}
pub(super) fn search_multi_ignore_ep(
&self,
elements: &HnswElements,
q: &SharedVector,
pt: &SharedVector,
ep_id: ElementId,
ef: usize,
efc: usize,
) -> DoublePriorityQueue {
let visited = HashSet::from([ep_id]);
let candidates = DoublePriorityQueue::from(0.0, ep_id);
let w = DoublePriorityQueue::default();
self.search(elements, q, candidates, visited, w, ef)
self.search(elements, pt, candidates, visited, w, efc)
}
pub(super) fn search(
&self,
elements: &HnswElements,
q: &SharedVector,
pt: &SharedVector,
mut candidates: DoublePriorityQueue,
mut visited: HashSet<ElementId>,
mut w: DoublePriorityQueue,
@ -109,15 +129,59 @@ where
}
if let Some(neighbourhood) = self.graph.get_edges(&doc) {
for &e_id in neighbourhood.iter() {
if visited.insert(e_id) {
if let Some(e_pt) = elements.get_vector(&e_id) {
let e_dist = elements.distance(e_pt, q);
if e_dist < f_dist || w.len() < ef {
candidates.push(e_dist, e_id);
w.push(e_dist, e_id);
if w.len() > ef {
w.pop_last();
}
// Did we already visit it?
if !visited.insert(e_id) {
continue;
}
if let Some(e_pt) = elements.get_vector(&e_id) {
let e_dist = elements.distance(e_pt, pt);
if e_dist < f_dist || w.len() < ef {
candidates.push(e_dist, e_id);
w.push(e_dist, e_id);
if w.len() > ef {
w.pop_last();
}
f_dist = w.peek_last_dist().unwrap(); // w can't be empty
}
}
}
}
}
w
}
pub(super) async fn search_checked(
&self,
search: &HnswCheckedSearchContext<'_>,
mut candidates: DoublePriorityQueue,
mut visited: HashSet<ElementId>,
mut w: DoublePriorityQueue,
stk: &mut Stk,
chk: &mut HnswConditionChecker<'_>,
) -> Result<DoublePriorityQueue, Error> {
let mut f_dist = w.peek_last_dist().unwrap_or(f64::MAX);
let ef = search.ef();
let pt = search.pt();
let elements = search.elements();
while let Some((dist, doc)) = candidates.pop_first() {
if dist > f_dist {
break;
}
if let Some(neighbourhood) = self.graph.get_edges(&doc) {
for &e_id in neighbourhood.iter() {
// Did we already visit it?
if !visited.insert(e_id) {
continue;
}
if let Some(e_pt) = elements.get_vector(&e_id) {
let e_dist = elements.distance(e_pt, pt);
if e_dist < f_dist || w.len() < ef {
candidates.push(e_dist, e_id);
if Self::add_if_truthy(search, &mut w, e_pt, e_dist, e_id, stk, chk)
.await?
{
f_dist = w.peek_last_dist().unwrap(); // w can't be empty
}
}
@ -125,7 +189,30 @@ where
}
}
}
w
Ok(w)
}
	/// Pushes `(e_dist, e_id)` onto the result queue `w` if the documents
	/// holding the vector `e_pt` match the query condition.
	///
	/// When `w` grows beyond `ef`, the furthest entry is evicted and reported
	/// to the checker so any cached state for it can be expired.
	/// Returns whether the element was accepted.
	pub(super) async fn add_if_truthy(
		search: &HnswCheckedSearchContext<'_>,
		w: &mut DoublePriorityQueue,
		e_pt: &SharedVector,
		e_dist: f64,
		e_id: ElementId,
		stk: &mut Stk,
		chk: &mut HnswConditionChecker<'_>,
	) -> Result<bool, Error> {
		if let Some(docs) = search.get_docs(e_pt) {
			// Evaluate the condition against the documents holding this vector.
			if chk.check_truthy(stk, search.docs(), docs).await? {
				w.push(e_dist, e_id);
				if w.len() > search.ef() {
					// Keep the result queue bounded by `ef`.
					if let Some((_, id)) = w.pop_last() {
						chk.expire(id);
					}
				}
				return Ok(true);
			}
		}
		Ok(false)
	}
pub(super) fn insert(
@ -223,12 +310,7 @@ where
S: DynamicSet<ElementId>,
{
pub(in crate::idx::trees::hnsw) fn check_props(&self, elements: &HnswElements) {
assert!(
self.graph.len() <= elements.elements.len(),
"{} - {}",
self.graph.len(),
elements.elements.len()
);
assert!(self.graph.len() <= elements.len(), "{} - {}", self.graph.len(), elements.len());
for (e_id, f_ids) in self.graph.nodes() {
assert!(
f_ids.len() <= self.m_max,
@ -238,7 +320,7 @@ where
);
assert!(!f_ids.contains(e_id), "!f_ids.contains(e_id) - el: {e_id} - f_ids: {f_ids:?}");
assert!(
elements.elements.contains_key(e_id),
elements.contains(e_id),
"h.elements.contains_key(e_id) - el: {e_id} - f_ids: {f_ids:?}"
);
}

View file

@ -1,279 +1,40 @@
pub(in crate::idx) mod docs;
mod elements;
mod flavor;
mod heuristic;
pub mod index;
mod layer;
use crate::err::Error;
use crate::idx::docids::DocId;
use crate::idx::trees::dynamicset::{ArraySet, DynamicSet, HashBrownSet};
use crate::idx::planner::checker::HnswConditionChecker;
use crate::idx::trees::dynamicset::DynamicSet;
use crate::idx::trees::hnsw::docs::HnswDocs;
use crate::idx::trees::hnsw::elements::HnswElements;
use crate::idx::trees::hnsw::heuristic::Heuristic;
use crate::idx::trees::hnsw::index::{HnswCheckedSearchContext, VecDocs};
use crate::idx::trees::hnsw::layer::HnswLayer;
use crate::idx::trees::knn::{DoublePriorityQueue, Ids64, KnnResult, KnnResultBuilder};
use crate::idx::trees::vector::{SharedVector, Vector};
use crate::kvs::Key;
use crate::sql::index::{Distance, HnswParams, VectorType};
use crate::sql::{Array, Thing, Value};
use hashbrown::hash_map::Entry;
use hashbrown::HashMap;
use radix_trie::Trie;
use crate::idx::trees::knn::DoublePriorityQueue;
use crate::idx::trees::vector::SharedVector;
use crate::sql::index::HnswParams;
use rand::prelude::SmallRng;
use rand::{Rng, SeedableRng};
use roaring::RoaringTreemap;
use std::collections::VecDeque;
use reblessive::tree::Stk;
pub struct HnswIndex {
dim: usize,
vector_type: VectorType,
hnsw: Box<dyn HnswMethods>,
docs: HnswDocs,
vec_docs: HashMap<SharedVector, (Ids64, ElementId)>,
struct HnswSearch {
pt: SharedVector,
k: usize,
ef: usize,
}
type ASet<const N: usize> = ArraySet<ElementId, N>;
type HSet = HashBrownSet<ElementId>;
impl HnswIndex {
pub fn new(p: &HnswParams) -> Self {
impl HnswSearch {
pub(super) fn new(pt: SharedVector, k: usize, ef: usize) -> Self {
Self {
dim: p.dimension as usize,
vector_type: p.vector_type,
hnsw: Self::new_hnsw(p),
docs: HnswDocs::default(),
vec_docs: HashMap::default(),
}
}
fn new_hnsw(p: &HnswParams) -> Box<dyn HnswMethods> {
match p.m {
1..=4 => match p.m0 {
1..=8 => Box::new(Hnsw::<ASet<9>, ASet<5>>::new(p)),
9..=16 => Box::new(Hnsw::<ASet<17>, ASet<5>>::new(p)),
17..=24 => Box::new(Hnsw::<ASet<25>, ASet<5>>::new(p)),
_ => Box::new(Hnsw::<HSet, ASet<5>>::new(p)),
},
5..=8 => match p.m0 {
1..=16 => Box::new(Hnsw::<ASet<17>, ASet<9>>::new(p)),
17..=24 => Box::new(Hnsw::<ASet<25>, ASet<9>>::new(p)),
_ => Box::new(Hnsw::<HSet, ASet<9>>::new(p)),
},
9..=12 => match p.m0 {
17..=24 => Box::new(Hnsw::<ASet<25>, ASet<13>>::new(p)),
_ => Box::new(Hnsw::<HSet, ASet<13>>::new(p)),
},
13..=16 => Box::new(Hnsw::<HSet, ASet<17>>::new(p)),
17..=20 => Box::new(Hnsw::<HSet, ASet<21>>::new(p)),
21..=24 => Box::new(Hnsw::<HSet, ASet<25>>::new(p)),
25..=28 => Box::new(Hnsw::<HSet, ASet<29>>::new(p)),
_ => Box::new(Hnsw::<HSet, HSet>::new(p)),
}
}
pub fn index_document(&mut self, rid: &Thing, content: &Vec<Value>) -> Result<(), Error> {
// Resolve the doc_id
let doc_id = self.docs.resolve(rid);
// Index the values
for value in content {
// Extract the vector
let vector = Vector::try_from_value(self.vector_type, self.dim, value)?;
vector.check_dimension(self.dim)?;
self.insert(vector.into(), doc_id);
}
Ok(())
}
fn insert(&mut self, o: SharedVector, d: DocId) {
match self.vec_docs.entry(o) {
Entry::Occupied(mut e) => {
let (docs, element_id) = e.get_mut();
if let Some(new_docs) = docs.insert(d) {
let element_id = *element_id;
e.insert((new_docs, element_id));
}
}
Entry::Vacant(e) => {
let o = e.key().clone();
let element_id = self.hnsw.insert(o);
e.insert((Ids64::One(d), element_id));
}
}
}
fn remove(&mut self, o: SharedVector, d: DocId) {
if let Entry::Occupied(mut e) = self.vec_docs.entry(o) {
let (docs, e_id) = e.get_mut();
if let Some(new_docs) = docs.remove(d) {
let e_id = *e_id;
if new_docs.is_empty() {
e.remove();
self.hnsw.remove(e_id);
} else {
e.insert((new_docs, e_id));
}
}
}
}
pub(crate) fn remove_document(
&mut self,
rid: &Thing,
content: &Vec<Value>,
) -> Result<(), Error> {
if let Some(doc_id) = self.docs.remove(rid) {
for v in content {
// Extract the vector
let vector = Vector::try_from_value(self.vector_type, self.dim, v)?;
vector.check_dimension(self.dim)?;
// Remove the vector
self.remove(vector.into(), doc_id);
}
}
Ok(())
}
pub fn knn_search(
&self,
a: &Array,
n: usize,
ef: usize,
) -> Result<VecDeque<(Thing, f64)>, Error> {
// Extract the vector
let vector = Vector::try_from_array(self.vector_type, a)?;
vector.check_dimension(self.dim)?;
// Do the search
let res = self.search(&vector.into(), n, ef);
Ok(self.result(res))
}
fn result(&self, res: KnnResult) -> VecDeque<(Thing, f64)> {
res.docs
.into_iter()
.filter_map(|(doc_id, dist)| self.docs.get(doc_id).map(|t| (t.clone(), dist)))
.collect()
}
fn search(&self, o: &SharedVector, n: usize, ef: usize) -> KnnResult {
let neighbors = self.hnsw.knn_search(o, n, ef);
let mut builder = KnnResultBuilder::new(n);
for (e_dist, e_id) in neighbors {
if builder.check_add(e_dist) {
if let Some(v) = self.hnsw.get_vector(&e_id) {
if let Some((docs, _)) = self.vec_docs.get(v) {
builder.add(e_dist, docs);
}
}
}
}
builder.build(
#[cfg(debug_assertions)]
HashMap::new(),
)
}
}
#[derive(Default)]
struct HnswDocs {
doc_ids: Trie<Key, DocId>,
ids_doc: Vec<Option<Thing>>,
available: RoaringTreemap,
}
impl HnswDocs {
fn resolve(&mut self, rid: &Thing) -> DocId {
let doc_key: Key = rid.into();
if let Some(doc_id) = self.doc_ids.get(&doc_key) {
*doc_id
} else {
let doc_id = self.next_doc_id();
self.ids_doc.push(Some(rid.clone()));
self.doc_ids.insert(doc_key, doc_id);
doc_id
}
}
fn next_doc_id(&mut self) -> DocId {
if let Some(doc_id) = self.available.iter().next() {
self.available.remove(doc_id);
doc_id
} else {
self.ids_doc.len() as DocId
}
}
fn get(&self, doc_id: DocId) -> Option<Thing> {
if let Some(t) = self.ids_doc.get(doc_id as usize) {
t.clone()
} else {
None
}
}
fn remove(&mut self, rid: &Thing) -> Option<DocId> {
let doc_key: Key = rid.into();
if let Some(doc_id) = self.doc_ids.remove(&doc_key) {
let n = doc_id as usize;
if n < self.ids_doc.len() {
self.ids_doc[n] = None;
}
self.available.insert(doc_id);
Some(doc_id)
} else {
None
pt,
k,
ef,
}
}
}
trait HnswMethods: Send + Sync {
fn insert(&mut self, q_pt: SharedVector) -> ElementId;
fn remove(&mut self, e_id: ElementId) -> bool;
fn knn_search(&self, q: &SharedVector, k: usize, efs: usize) -> Vec<(f64, ElementId)>;
fn get_vector(&self, e_id: &ElementId) -> Option<&SharedVector>;
#[cfg(test)]
fn check_hnsw_properties(&self, expected_count: usize);
}
#[cfg(test)]
fn check_hnsw_props<L0, L>(h: &Hnsw<L0, L>, expected_count: usize)
where
L0: DynamicSet<ElementId>,
L: DynamicSet<ElementId>,
{
assert_eq!(h.elements.elements.len(), expected_count);
for layer in h.layers.iter() {
layer.check_props(&h.elements);
}
}
struct HnswElements {
elements: HashMap<ElementId, SharedVector>,
next_element_id: ElementId,
dist: Distance,
}
impl HnswElements {
fn new(dist: Distance) -> Self {
Self {
elements: Default::default(),
next_element_id: 0,
dist,
}
}
fn get_vector(&self, e_id: &ElementId) -> Option<&SharedVector> {
self.elements.get(e_id)
}
fn distance(&self, a: &SharedVector, b: &SharedVector) -> f64 {
self.dist.calculate(a, b)
}
fn get_distance(&self, q: &SharedVector, e_id: &ElementId) -> Option<f64> {
self.elements.get(e_id).map(|e_pt| self.dist.calculate(e_pt, q))
}
fn remove(&mut self, e_id: &ElementId) {
self.elements.remove(e_id);
}
}
struct Hnsw<L0, L>
where
L0: DynamicSet<ElementId>,
@ -314,7 +75,7 @@ where
fn insert_level(&mut self, q_pt: SharedVector, q_level: usize) -> ElementId {
// Attribute an ID to the vector
let q_id = self.elements.next_element_id;
let q_id = self.elements.next_element_id();
let top_up_layers = self.layers.len();
// Be sure we have existing (up) layers if required
@ -323,7 +84,7 @@ where
}
// Store the vector
self.elements.elements.insert(q_id, q_pt.clone());
self.elements.insert(q_id, q_pt.clone());
if let Some(ep_id) = self.enter_point {
// We already have an enter_point, let's insert the element in the layers
@ -333,7 +94,7 @@ where
self.insert_first_element(q_id, q_level);
}
self.elements.next_element_id += 1;
self.elements.inc_next_element_id();
q_id
}
@ -395,13 +156,7 @@ where
self.enter_point = Some(q_id);
}
}
}
impl<L0, L> HnswMethods for Hnsw<L0, L>
where
L0: DynamicSet<ElementId>,
L: DynamicSet<ElementId>,
{
fn insert(&mut self, q_pt: SharedVector) -> ElementId {
let q_level = self.get_random_level();
self.insert_level(q_pt, q_level)
@ -448,31 +203,56 @@ where
removed
}
fn knn_search(&self, q: &SharedVector, k: usize, efs: usize) -> Vec<(f64, ElementId)> {
#[cfg(debug_assertions)]
let expected_w_len = self.elements.elements.len().min(k);
fn knn_search(&self, search: &HnswSearch) -> Vec<(f64, ElementId)> {
if let Some((ep_dist, ep_id)) = self.search_ep(&search.pt) {
let w =
self.layer0.search_single(&self.elements, &search.pt, ep_dist, ep_id, search.ef);
w.to_vec_limit(search.k)
} else {
vec![]
}
}
async fn knn_search_checked(
&self,
search: &HnswSearch,
hnsw_docs: &HnswDocs,
vec_docs: &VecDocs,
stk: &mut Stk,
chk: &mut HnswConditionChecker<'_>,
) -> Result<Vec<(f64, ElementId)>, Error> {
if let Some((ep_dist, ep_id)) = self.search_ep(&search.pt) {
if let Some(ep_pt) = self.elements.get_vector(&ep_id) {
let search_ctx = HnswCheckedSearchContext::new(
&self.elements,
hnsw_docs,
vec_docs,
&search.pt,
search.ef,
);
let w = self
.layer0
.search_single_checked(&search_ctx, ep_pt, ep_dist, ep_id, stk, chk)
.await?;
return Ok(w.to_vec_limit(search.k));
}
}
Ok(vec![])
}
fn search_ep(&self, pt: &SharedVector) -> Option<(f64, ElementId)> {
if let Some(mut ep_id) = self.enter_point {
let mut ep_dist =
self.elements.get_distance(q, &ep_id).unwrap_or_else(|| unreachable!());
self.elements.get_distance(pt, &ep_id).unwrap_or_else(|| unreachable!());
for layer in self.layers.iter().rev() {
(ep_dist, ep_id) = layer
.search_single(&self.elements, q, ep_dist, ep_id, 1)
.search_single(&self.elements, pt, ep_dist, ep_id, 1)
.peek_first()
.unwrap_or_else(|| unreachable!());
}
{
let w = self.layer0.search_single(&self.elements, q, ep_dist, ep_id, efs);
#[cfg(debug_assertions)]
if w.len() < expected_w_len {
debug!(
"0 search_layer - ep_id: {ep_id:?} - ef_search: {efs} - k: {k} - w.len: {} < {expected_w_len}",
w.len()
);
}
w.to_vec_limit(k)
}
Some((ep_dist, ep_id))
} else {
vec![]
None
}
}
@ -485,23 +265,39 @@ where
}
}
#[cfg(test)]
fn check_hnsw_props<L0, L>(h: &Hnsw<L0, L>, expected_count: usize)
where
L0: DynamicSet<ElementId>,
L: DynamicSet<ElementId>,
{
assert_eq!(h.elements.len(), expected_count);
for layer in h.layers.iter() {
layer.check_props(&h.elements);
}
}
#[cfg(test)]
mod tests {
use crate::err::Error;
use crate::idx::docids::DocId;
use crate::idx::trees::hnsw::{HnswIndex, HnswMethods};
use crate::idx::planner::checker::HnswConditionChecker;
use crate::idx::trees::hnsw::flavor::HnswFlavor;
use crate::idx::trees::hnsw::index::HnswIndex;
use crate::idx::trees::hnsw::HnswSearch;
use crate::idx::trees::knn::tests::{new_vectors_from_file, TestCollection};
use crate::idx::trees::knn::{Ids64, KnnResult, KnnResultBuilder};
use crate::idx::trees::vector::{SharedVector, Vector};
use crate::sql::index::{Distance, HnswParams, VectorType};
use hashbrown::{hash_map::Entry, HashMap, HashSet};
use ndarray::Array1;
use reblessive::tree::Stk;
use roaring::RoaringTreemap;
use std::sync::Arc;
use test_log::test;
fn insert_collection_hnsw(
h: &mut Box<dyn HnswMethods>,
h: &mut HnswFlavor,
collection: &TestCollection,
) -> HashSet<SharedVector> {
let mut set = HashSet::new();
@ -513,12 +309,12 @@ mod tests {
}
set
}
fn find_collection_hnsw(h: &Box<dyn HnswMethods>, collection: &TestCollection) {
fn find_collection_hnsw(h: &HnswFlavor, collection: &TestCollection) {
let max_knn = 20.min(collection.len());
for (_, obj) in collection.to_vec_ref() {
let obj = obj.clone().into();
for knn in 1..max_knn {
let res = h.knn_search(&obj, knn, 80);
let search = HnswSearch::new(obj.clone(), knn, 80);
let res = h.knn_search(&search);
if collection.is_unique() {
let mut found = false;
for (_, e_id) in &res {
@ -556,7 +352,7 @@ mod tests {
}
fn test_hnsw_collection(p: &HnswParams, collection: &TestCollection) {
let mut h = HnswIndex::new_hnsw(p);
let mut h = HnswFlavor::new(p);
insert_collection_hnsw(&mut h, collection);
find_collection_hnsw(&h, &collection);
}
@ -648,17 +444,22 @@ mod tests {
e.insert(HashSet::from([*doc_id]));
}
}
h.hnsw.check_hnsw_properties(map.len());
h.check_hnsw_properties(map.len());
}
map
}
fn find_collection_hnsw_index(h: &mut HnswIndex, collection: &TestCollection) {
async fn find_collection_hnsw_index(
stk: &mut Stk,
h: &mut HnswIndex,
collection: &TestCollection,
) {
let max_knn = 20.min(collection.len());
for (doc_id, obj) in collection.to_vec_ref() {
for knn in 1..max_knn {
let obj: SharedVector = obj.clone().into();
let res = h.search(&obj, knn, 500);
let mut chk = HnswConditionChecker::default();
let search = HnswSearch::new(obj.clone(), knn, 500);
let res = h.search(&search, stk, &mut chk).await.unwrap();
if knn == 1 && res.docs.len() == 1 && res.docs[0].1 > 0.0 {
let docs: Vec<DocId> = res.docs.iter().map(|(d, _)| *d).collect();
if collection.is_unique() {
@ -701,11 +502,11 @@ mod tests {
e.remove();
}
}
h.hnsw.check_hnsw_properties(map.len());
h.check_hnsw_properties(map.len());
}
}
fn test_hnsw_index(collection_size: usize, unique: bool, p: HnswParams) {
async fn test_hnsw_index(collection_size: usize, unique: bool, p: HnswParams) {
info!("test_hnsw_index - coll size: {collection_size} - params: {p:?}");
let collection = TestCollection::new(
unique,
@ -716,7 +517,13 @@ mod tests {
);
let mut h = HnswIndex::new(&p);
let map = insert_collection_hnsw_index(&mut h, &collection);
find_collection_hnsw_index(&mut h, &collection);
let mut stack = reblessive::tree::TreeStack::new();
stack
.enter(|stk| async {
find_collection_hnsw_index(stk, &mut h, &collection).await;
})
.finish()
.await;
delete_hnsw_index_collection(&mut h, &collection, map);
}
@ -744,7 +551,7 @@ mod tests {
for unique in [false, true] {
let p = new_params(dim, vt, dist.clone(), 8, 150, extend, keep);
let f = tokio::spawn(async move {
test_hnsw_index(30, unique, p);
test_hnsw_index(30, unique, p).await;
});
futures.push(f);
}
@ -773,13 +580,11 @@ mod tests {
(10, new_i16_vec(0, 3)),
]);
let p = new_params(2, VectorType::I16, Distance::Euclidean, 3, 500, true, true);
let mut h = HnswIndex::new_hnsw(&p);
let mut h = HnswFlavor::new(&p);
insert_collection_hnsw(&mut h, &collection);
let pt = new_i16_vec(-2, -3);
let knn = 10;
let efs = 501;
let res = h.knn_search(&pt, knn, efs);
assert_eq!(res.len(), knn);
let search = HnswSearch::new(new_i16_vec(-2, -3), 10, 501);
let res = h.knn_search(&search);
assert_eq!(res.len(), 10);
}
async fn test_recall(
@ -820,24 +625,32 @@ mod tests {
let collection = collection.clone();
let h = h.clone();
let f = tokio::spawn(async move {
let mut total_recall = 0.0;
for (_, pt) in queries.to_vec_ref() {
let knn = 10;
let hnsw_res = h.search(pt, knn, efs);
assert_eq!(hnsw_res.docs.len(), knn, "Different size - knn: {knn}",);
let brute_force_res = collection.knn(pt, Distance::Euclidean, knn);
let rec = brute_force_res.recall(&hnsw_res);
if rec == 1.0 {
assert_eq!(brute_force_res.docs, hnsw_res.docs);
}
total_recall += rec;
}
let recall = total_recall / queries.to_vec_ref().len() as f64;
info!("EFS: {efs} - Recall: {recall}");
assert!(
recall >= expected_recall,
"EFS: {efs} - Recall: {recall} - Expected: {expected_recall}"
);
let mut stack = reblessive::tree::TreeStack::new();
stack
.enter(|stk| async {
let mut total_recall = 0.0;
for (_, pt) in queries.to_vec_ref() {
let knn = 10;
let mut chk = HnswConditionChecker::default();
let search = HnswSearch::new(pt.clone(), knn, efs);
let hnsw_res = h.search(&search, stk, &mut chk).await.unwrap();
assert_eq!(hnsw_res.docs.len(), knn, "Different size - knn: {knn}",);
let brute_force_res = collection.knn(pt, Distance::Euclidean, knn);
let rec = brute_force_res.recall(&hnsw_res);
if rec == 1.0 {
assert_eq!(brute_force_res.docs, hnsw_res.docs);
}
total_recall += rec;
}
let recall = total_recall / queries.to_vec_ref().len() as f64;
info!("EFS: {efs} - Recall: {recall}");
assert!(
recall >= expected_recall,
"EFS: {efs} - Recall: {recall} - Expected: {expected_recall}"
);
})
.finish()
.await;
});
futures.push(f);
}

View file

@ -175,7 +175,7 @@ impl Ord for FloatKey {
/// When identifiers are added or removed, the method returned the most appropriate
/// variant (if required).
#[derive(Debug, Clone, PartialEq)]
pub(super) enum Ids64 {
pub(in crate::idx) enum Ids64 {
#[allow(dead_code)] // Will be used with HNSW
Empty,
One(u64),
@ -354,7 +354,7 @@ impl Ids64 {
}
}
fn iter(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
pub(in crate::idx) fn iter(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
match &self {
Self::Empty => Box::new(EmptyIterator {}),
Self::One(d) => Box::new(OneDocIterator(Some(*d))),
@ -531,16 +531,17 @@ impl KnnResultBuilder {
}
}
pub(super) fn check_add(&self, dist: f64) -> bool {
if self.docs.len() < self.knn {
true
} else if let Some(pr) = self.priority_list.keys().last() {
dist <= pr.0
} else {
true
if self.docs.len() >= self.knn {
if let Some(pr) = self.priority_list.keys().last() {
if dist > pr.0 {
return false;
}
}
}
true
}
pub(super) fn add(&mut self, dist: f64, docs: &Ids64) {
pub(super) fn add(&mut self, dist: f64, docs: &Ids64) -> Ids64 {
let pr = FloatKey(dist);
docs.append_to(&mut self.docs);
match self.priority_list.entry(pr) {
@ -562,10 +563,12 @@ impl KnnResultBuilder {
if docs_len - d.len() >= self.knn {
if let Some((_, evicted_docs)) = self.priority_list.pop_last() {
evicted_docs.remove_to(&mut self.docs);
return evicted_docs;
}
}
}
}
Ids64::Empty
}
pub(super) fn build(
@ -614,7 +617,7 @@ pub(super) mod tests {
use crate::idx::trees::knn::{DoublePriorityQueue, FloatKey, Ids64, KnnResultBuilder};
use crate::idx::trees::vector::{SharedVector, Vector};
use crate::sql::index::{Distance, VectorType};
use crate::sql::{Array, Number};
use crate::sql::{Array, Number, Value};
use crate::syn::Parse;
use flate2::read::GzDecoder;
#[cfg(debug_assertions)]
@ -683,7 +686,7 @@ pub(super) mod tests {
}
let line = line_result?;
let array = Array::parse(&line);
let vec = Vector::try_from_array(t, &array)?.into();
let vec = Vector::try_from_value(t, array.len(), &Value::Array(array))?.into();
res.push((i as DocId, vec));
}
Ok(res)
@ -699,7 +702,7 @@ pub(super) mod tests {
for _ in 0..dim {
vec.push(gen.generate(rng));
}
let vec = Vector::try_from_array(t, &Array::from(vec)).unwrap();
let vec = Vector::try_from_vector(t, &vec).unwrap();
if vec.is_null() {
// Some similarities (cosine) is undefined for null vector.
new_random_vec(rng, t, dim, gen)

View file

@ -3,7 +3,7 @@ pub mod btree;
pub mod dynamicset;
mod graph;
pub mod hnsw;
mod knn;
pub(in crate::idx) mod knn;
pub mod mtree;
pub mod store;
pub mod vector;

View file

@ -1,3 +1,4 @@
use crate::dbs;
use hashbrown::hash_map::Entry;
use hashbrown::{HashMap, HashSet};
use reblessive::tree::Stk;
@ -13,6 +14,8 @@ use tokio::sync::RwLock;
use crate::err::Error;
use crate::idx::docids::{DocId, DocIds};
use crate::idx::planner::checker::MTreeConditionChecker;
use crate::idx::planner::iterators::KnnIteratorResult;
use crate::idx::trees::btree::BStatistics;
use crate::idx::trees::knn::{Ids64, KnnResult, KnnResultBuilder, PriorityNode};
use crate::idx::trees::store::{
@ -22,9 +25,10 @@ use crate::idx::trees::vector::{SharedVector, Vector};
use crate::idx::{IndexKeyBase, VersionedSerdeState};
use crate::kvs::{Key, Transaction, TransactionType, Val};
use crate::sql::index::{Distance, MTreeParams, VectorType};
use crate::sql::{Array, Object, Thing, Value};
use crate::sql::{Number, Object, Thing, Value};
pub(crate) struct MTreeIndex {
#[non_exhaustive]
pub struct MTreeIndex {
ixs: IndexStores,
state_key: Key,
dim: usize,
@ -34,8 +38,15 @@ pub(crate) struct MTreeIndex {
mtree: Arc<RwLock<MTree>>,
}
struct MTreeSearchContext<'a> {
txn: &'a dbs::Transaction,
pt: SharedVector,
k: usize,
store: &'a MTreeStore,
}
impl MTreeIndex {
pub(crate) async fn new(
pub async fn new(
ixs: &IndexStores,
tx: &mut Transaction,
ikb: IndexKeyBase,
@ -70,7 +81,7 @@ impl MTreeIndex {
store,
})
}
pub(crate) async fn index_document(
pub async fn index_document(
&mut self,
stk: &mut Stk,
tx: &mut Transaction,
@ -78,8 +89,10 @@ impl MTreeIndex {
content: &Vec<Value>,
) -> Result<(), Error> {
// Resolve the doc_id
let resolved = self.doc_ids.write().await.resolve_doc_id(tx, rid.into()).await?;
let mut doc_ids = self.doc_ids.write().await;
let resolved = doc_ids.resolve_doc_id(tx, rid.into()).await?;
let doc_id = *resolved.doc_id();
drop(doc_ids);
// Index the values
let mut mtree = self.mtree.write().await;
for v in content {
@ -89,34 +102,51 @@ impl MTreeIndex {
// Insert the vector in the index
mtree.insert(stk, tx, &mut self.store, vector.into(), doc_id).await?;
}
drop(mtree);
Ok(())
}
pub(crate) async fn knn_search(
pub async fn knn_search(
&self,
tx: &mut Transaction,
a: &Array,
stk: &mut Stk,
txn: &dbs::Transaction,
v: &[Number],
k: usize,
) -> Result<VecDeque<(DocId, f64)>, Error> {
mut chk: MTreeConditionChecker<'_>,
) -> Result<VecDeque<KnnIteratorResult>, Error> {
// Extract the vector
let vector = Vector::try_from_array(self.vector_type, a)?;
let vector = Vector::try_from_vector(self.vector_type, v)?;
vector.check_dimension(self.dim)?;
let vector: SharedVector = vector.into();
// Lock the index
// Build the search context
let search = MTreeSearchContext {
txn,
pt: vector.into(),
k,
store: &self.store,
};
// Lock the tree and the docs
let mtree = self.mtree.read().await;
let doc_ids = self.doc_ids.read().await;
// Do the search
let res = mtree.knn_search(tx, &self.store, &vector, k).await?;
Ok(res.docs)
let res = mtree.knn_search(&search, &doc_ids, stk, &mut chk).await?;
drop(mtree);
// Resolve the doc_id to Thing and the optional value
let res = chk.convert_result(&doc_ids, res.docs).await;
drop(doc_ids);
res
}
pub(crate) async fn remove_document(
pub async fn remove_document(
&mut self,
stk: &mut Stk,
tx: &mut Transaction,
rid: &Thing,
content: &Vec<Value>,
) -> Result<(), Error> {
if let Some(doc_id) = self.doc_ids.write().await.remove_doc(tx, rid.into()).await? {
let mut doc_ids = self.doc_ids.write().await;
let doc_id = doc_ids.remove_doc(tx, rid.into()).await?;
drop(doc_ids);
if let Some(doc_id) = doc_id {
// Lock the index
let mut mtree = self.mtree.write().await;
for v in content {
@ -126,28 +156,28 @@ impl MTreeIndex {
// Remove the vector
mtree.delete(stk, tx, &mut self.store, vector.into(), doc_id).await?;
}
drop(mtree);
}
Ok(())
}
pub(in crate::idx) fn doc_ids(&self) -> Arc<RwLock<DocIds>> {
self.doc_ids.clone()
}
pub(crate) async fn statistics(&self, tx: &mut Transaction) -> Result<MtStatistics, Error> {
Ok(MtStatistics {
doc_ids: self.doc_ids.read().await.statistics(tx).await?,
})
}
pub(crate) async fn finish(&mut self, tx: &mut Transaction) -> Result<(), Error> {
self.doc_ids.write().await.finish(tx).await?;
pub async fn finish(&mut self, tx: &mut Transaction) -> Result<(), Error> {
let mut doc_ids = self.doc_ids.write().await;
doc_ids.finish(tx).await?;
drop(doc_ids);
let mut mtree = self.mtree.write().await;
if let Some(new_cache) = self.store.finish(tx).await? {
mtree.state.generation += 1;
tx.set(self.state_key.clone(), mtree.state.try_to_val()?).await?;
self.ixs.advance_store_mtree(new_cache);
}
drop(mtree);
Ok(())
}
}
@ -155,14 +185,14 @@ impl MTreeIndex {
// https://en.wikipedia.org/wiki/M-tree
// https://arxiv.org/pdf/1004.4216.pdf
#[non_exhaustive]
pub struct MTree {
struct MTree {
state: MState,
distance: Distance,
minimum: usize,
}
impl MTree {
pub fn new(state: MState, distance: Distance) -> Self {
fn new(state: MState, distance: Distance) -> Self {
let minimum = (state.capacity + 1) as usize / 2;
Self {
state,
@ -171,17 +201,17 @@ impl MTree {
}
}
pub async fn knn_search(
async fn knn_search(
&self,
tx: &mut Transaction,
store: &MTreeStore,
v: &SharedVector,
k: usize,
search: &MTreeSearchContext<'_>,
doc_ids: &DocIds,
stk: &mut Stk,
chk: &mut MTreeConditionChecker<'_>,
) -> Result<KnnResult, Error> {
#[cfg(debug_assertions)]
debug!("knn_search - v: {:?} - k: {}", v, k);
debug!("knn_search - pt: {:?} - k: {}", search.pt, search.k);
let mut queue = BinaryHeap::new();
let mut res = KnnResultBuilder::new(k);
let mut res = KnnResultBuilder::new(search.k);
if let Some(root_id) = self.state.root {
queue.push(PriorityNode::new(0.0, root_id));
}
@ -189,7 +219,7 @@ impl MTree {
let mut visited_nodes = HashMap::new();
while let Some(e) = queue.pop() {
let id = e.id();
let node = store.get_node(tx, id).await?;
let node = search.store.get_node_txn(search.txn, id).await?;
#[cfg(debug_assertions)]
{
debug!("Visit node id: {}", id);
@ -202,11 +232,22 @@ impl MTree {
#[cfg(debug_assertions)]
debug!("Leaf found - id: {} - len: {}", node.id, n.len(),);
for (o, p) in n {
let d = self.calculate_distance(o, v)?;
let d = self.calculate_distance(o, &search.pt)?;
if res.check_add(d) {
#[cfg(debug_assertions)]
debug!("Add: {d} - obj: {o:?} - docs: {:?}", p.docs);
res.add(d, &Ids64::Bits(p.docs.clone()));
let mut docs = Ids64::Empty;
for doc in &p.docs {
if chk.check_truthy(stk, doc_ids, doc).await? {
if let Some(new_docs) = docs.insert(doc) {
docs = new_docs;
}
}
}
if !docs.is_empty() {
let evicted_docs = res.add(d, &docs);
chk.expires(evicted_docs);
}
}
}
}
@ -214,7 +255,7 @@ impl MTree {
#[cfg(debug_assertions)]
debug!("Internal found - id: {} - {:?}", node.id, n);
for (o, p) in n {
let d = self.calculate_distance(o, v)?;
let d = self.calculate_distance(o, &search.pt)?;
let min_dist = (d - p.radius).max(0.0);
if res.check_add(min_dist) {
debug!("Queue add - dist: {} - node: {}", min_dist, p.node);
@ -252,7 +293,7 @@ impl MTree {
new_node_id
}
pub async fn insert(
async fn insert(
&mut self,
stk: &mut Stk,
tx: &mut Transaction,
@ -1430,45 +1471,50 @@ impl VersionedSerdeState for MState {}
#[cfg(test)]
mod tests {
use futures::lock::Mutex;
use hashbrown::{HashMap, HashSet};
use reblessive::tree::Stk;
use std::collections::VecDeque;
use std::sync::Arc;
use crate::dbs;
use crate::err::Error;
use test_log::test;
use crate::idx::docids::DocId;
use crate::idx::docids::{DocId, DocIds};
use crate::idx::planner::checker::MTreeConditionChecker;
use crate::idx::trees::knn::tests::TestCollection;
use crate::idx::trees::mtree::{MState, MTree, MTreeNode, MTreeStore};
use crate::idx::trees::mtree::{MState, MTree, MTreeNode, MTreeSearchContext, MTreeStore};
use crate::idx::trees::store::{NodeId, TreeNodeProvider, TreeStore};
use crate::idx::trees::vector::SharedVector;
use crate::idx::IndexKeyBase;
use crate::kvs::LockType::*;
use crate::kvs::Transaction;
use crate::kvs::{Datastore, TransactionType};
use crate::sql::index::{Distance, VectorType};
async fn new_operation(
async fn new_operation<'a>(
ds: &Datastore,
t: &MTree,
tt: TransactionType,
cache_size: usize,
) -> (TreeStore<MTreeNode>, Transaction) {
) -> (dbs::Transaction, TreeStore<MTreeNode>) {
let st = ds
.index_store()
.get_store_mtree(TreeNodeProvider::Debug, t.state.generation, tt, cache_size)
.await;
let tx = ds.transaction(tt, Optimistic).await.unwrap();
(st, tx)
let tx = Arc::new(Mutex::new(ds.transaction(tt, Optimistic).await.unwrap()));
(tx, st)
}
async fn finish_operation(
ds: &Datastore,
t: &mut MTree,
mut tx: Transaction,
tx: &mut Transaction,
mut st: TreeStore<MTreeNode>,
commit: bool,
) -> Result<(), Error> {
if let Some(new_cache) = st.finish(&mut tx).await? {
if let Some(new_cache) = st.finish(tx).await? {
assert!(new_cache.len() > 0, "new_cache.len() = {}", new_cache.len());
t.state.generation += 1;
ds.index_store().advance_store_mtree(new_cache);
@ -1492,17 +1538,19 @@ mod tests {
let mut c = 0;
for (doc_id, obj) in collection.to_vec_ref() {
{
let (mut st, mut tx) =
new_operation(ds, t, TransactionType::Write, cache_size).await;
let (txn, mut st) = new_operation(ds, t, TransactionType::Write, cache_size).await;
let mut tx = txn.lock().await;
t.insert(stk, &mut tx, &mut st, obj.clone(), *doc_id).await?;
finish_operation(ds, t, tx, st, true).await?;
finish_operation(ds, t, &mut tx, st, true).await?;
drop(tx);
map.insert(*doc_id, obj.clone());
}
c += 1;
{
let (mut st, mut tx) =
new_operation(ds, t, TransactionType::Read, cache_size).await;
let (txn, mut st) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let mut tx = txn.lock().await;
let p = check_tree_properties(&mut tx, &mut st, t).await?;
drop(tx);
assert_eq!(p.doc_count, c);
}
}
@ -1518,16 +1566,20 @@ mod tests {
) -> Result<HashMap<DocId, SharedVector>, Error> {
let mut map = HashMap::with_capacity(collection.len());
{
let (mut st, mut tx) = new_operation(ds, t, TransactionType::Write, cache_size).await;
let (txn, mut st) = new_operation(ds, t, TransactionType::Write, cache_size).await;
let mut tx = txn.lock().await;
for (doc_id, obj) in collection.to_vec_ref() {
t.insert(stk, &mut tx, &mut st, obj.clone(), *doc_id).await?;
map.insert(*doc_id, obj.clone());
}
finish_operation(ds, t, tx, st, true).await?;
finish_operation(ds, t, &mut tx, st, true).await?;
drop(tx);
}
{
let (mut st, mut tx) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let (txn, mut st) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let mut tx = txn.lock().await;
check_tree_properties(&mut tx, &mut st, t).await?;
drop(tx);
}
Ok(map)
}
@ -1535,6 +1587,7 @@ mod tests {
async fn delete_collection(
stk: &mut Stk,
ds: &Datastore,
doc_ids: &DocIds,
t: &mut MTree,
collection: &TestCollection,
cache_size: usize,
@ -1543,16 +1596,24 @@ mod tests {
for (doc_id, obj) in collection.to_vec_ref() {
let deleted = {
debug!("### Remove {} {:?}", doc_id, obj);
let (mut st, mut tx) =
new_operation(ds, t, TransactionType::Write, cache_size).await;
let (txn, mut st) = new_operation(ds, t, TransactionType::Write, cache_size).await;
let mut tx = txn.lock().await;
let deleted = t.delete(stk, &mut tx, &mut st, obj.clone(), *doc_id).await?;
finish_operation(ds, t, tx, st, true).await?;
finish_operation(ds, t, &mut tx, st, true).await?;
drop(tx);
deleted
};
all_deleted = all_deleted && deleted;
if deleted {
let (st, mut tx) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let res = t.knn_search(&mut tx, &st, obj, 1).await?;
let (txn, st) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let mut chk = MTreeConditionChecker::new(&txn);
let search = MTreeSearchContext {
txn: &txn,
pt: obj.clone(),
k: 1,
store: &st,
};
let res = t.knn_search(&search, doc_ids, stk, &mut chk).await?;
assert!(
!res.docs.iter().any(|(id, _)| id == doc_id),
"Found: {} {:?}",
@ -1564,30 +1625,42 @@ mod tests {
warn!("Delete failed: {} {:?}", doc_id, obj);
}
{
let (mut st, mut tx) =
new_operation(ds, t, TransactionType::Read, cache_size).await;
let (txn, mut st) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let mut tx = txn.lock().await;
check_tree_properties(&mut tx, &mut st, t).await?;
drop(tx);
}
}
if all_deleted {
let (mut st, mut tx) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let (txn, mut st) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let mut tx = txn.lock().await;
check_tree_properties(&mut tx, &mut st, t).await?.check(0, 0, None, None, 0, 0);
drop(tx);
}
Ok(())
}
async fn find_collection(
stk: &mut Stk,
ds: &Datastore,
doc_ids: &DocIds,
t: &mut MTree,
collection: &TestCollection,
cache_size: usize,
) -> Result<(), Error> {
let (mut st, mut tx) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let (txn, mut st) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let max_knn = 20.max(collection.len());
for (doc_id, obj) in collection.to_vec_ref() {
for knn in 1..max_knn {
let res = t.knn_search(&mut tx, &st, obj, knn).await?;
let mut chk = MTreeConditionChecker::new(&txn);
let search = MTreeSearchContext {
txn: &txn,
pt: obj.clone(),
k: knn,
store: &st,
};
let res = t.knn_search(&search, doc_ids, stk, &mut chk).await?;
let docs: Vec<DocId> = res.docs.iter().map(|(d, _)| *d).collect();
if collection.is_unique() {
assert!(
@ -1603,7 +1676,9 @@ mod tests {
if expected_len != res.docs.len() {
#[cfg(debug_assertions)]
debug!("{:?}", res.visited_nodes);
let mut tx = txn.lock().await;
check_tree_properties(&mut tx, &mut st, t).await?;
drop(tx);
}
assert_eq!(
expected_len,
@ -1619,14 +1694,23 @@ mod tests {
}
async fn check_full_knn(
stk: &mut Stk,
ds: &Datastore,
doc_ids: &DocIds,
t: &mut MTree,
map: &HashMap<DocId, SharedVector>,
cache_size: usize,
) -> Result<(), Error> {
let (st, mut tx) = new_operation(ds, t, TransactionType::Read, cache_size).await;
let (txn, st) = new_operation(ds, t, TransactionType::Read, cache_size).await;
for obj in map.values() {
let res = t.knn_search(&mut tx, &st, obj, map.len()).await?;
let mut chk = MTreeConditionChecker::new(&txn);
let search = MTreeSearchContext {
txn: &txn,
pt: obj.clone(),
k: map.len(),
store: &st,
};
let res = t.knn_search(&search, doc_ids, stk, &mut chk).await?;
assert_eq!(
map.len(),
res.docs.len(),
@ -1671,21 +1755,36 @@ mod tests {
vector_type,
);
let ds = Datastore::new("memory").await?;
let mut t = MTree::new(MState::new(*capacity), distance.clone());
let (txn, _st) = new_operation(&ds, &t, TransactionType::Read, cache_size).await;
let mut tx = txn.lock().await;
let doc_ids = DocIds::new(
ds.index_store(),
&mut tx,
TransactionType::Read,
IndexKeyBase::default(),
7,
100,
)
.await
.unwrap();
drop(tx);
let map = if collection.len() < 1000 {
insert_collection_one_by_one(stk, &ds, &mut t, &collection, cache_size).await?
} else {
insert_collection_batch(stk, &ds, &mut t, &collection, cache_size).await?
};
if check_find {
find_collection(&ds, &mut t, &collection, cache_size).await?;
find_collection(stk, &ds, &doc_ids, &mut t, &collection, cache_size).await?;
}
if check_full {
check_full_knn(&ds, &mut t, &map, cache_size).await?;
check_full_knn(stk, &ds, &doc_ids, &mut t, &map, cache_size).await?;
}
if check_delete {
delete_collection(stk, &ds, &mut t, &collection, cache_size).await?;
delete_collection(stk, &ds, &doc_ids, &mut t, &collection, cache_size).await?;
}
}
}

View file

@ -1,4 +1,4 @@
use crate::idx::trees::hnsw::HnswIndex;
use crate::idx::trees::hnsw::index::HnswIndex;
use crate::idx::IndexKeyBase;
use crate::kvs::Key;
use crate::sql::index::HnswParams;
@ -20,30 +20,36 @@ impl Default for HnswIndexes {
impl HnswIndexes {
pub(super) async fn get(&self, ikb: &IndexKeyBase, p: &HnswParams) -> SharedHnswIndex {
let key = ikb.new_vm_key(None);
{
let r = self.0.read().await;
if let Some(h) = r.get(&key).cloned() {
return h;
}
let r = self.0.read().await;
let h = r.get(&key).cloned();
drop(r);
if let Some(h) = h {
return h;
}
let mut w = self.0.write().await;
match w.entry(key) {
let ix = match w.entry(key) {
Entry::Occupied(e) => e.get().clone(),
Entry::Vacant(e) => {
let h = Arc::new(RwLock::new(HnswIndex::new(p)));
e.insert(h.clone());
h
}
}
};
drop(w);
ix
}
pub(super) async fn remove(&self, ikb: &IndexKeyBase) {
let key = ikb.new_vm_key(None);
let mut w = self.0.write().await;
w.remove(&key);
drop(w);
}
pub(super) async fn is_empty(&self) -> bool {
self.0.read().await.is_empty()
let h = self.0.read().await;
let r = h.is_empty();
drop(h);
r
}
}

View file

@ -47,7 +47,10 @@ where
// Locate the shard
let n = key as usize % self.shards_count;
// Get and promote the key
self.shards[n].lock().await.get_and_promote(key)
let mut shard = self.shards[n].lock().await;
let v = shard.get_and_promote(key);
drop(shard);
v
}
pub(super) async fn insert<K: Into<CacheKey>>(&self, key: K, val: V) {
@ -55,7 +58,9 @@ where
// Locate the shard
let shard = key as usize % self.shards_count;
// Insert the key/object in the shard and get the new length
let new_length = self.shards[shard].lock().await.insert(key, val, self.full.load(Relaxed));
let mut s = self.shards[shard].lock().await;
let new_length = s.insert(key, val, self.full.load(Relaxed));
drop(s);
// Update lengths
self.check_length(new_length, shard);
}
@ -65,7 +70,9 @@ where
// Locate the shard
let shard = key as usize % self.shards_count;
// Remove the key
let new_length = self.shards[shard].lock().await.remove(key);
let mut s = self.shards[shard].lock().await;
let new_length = s.remove(key);
drop(s);
// Update lengths
self.check_length(new_length, shard);
}
@ -94,7 +101,9 @@ where
.shards
.iter()
.map(|s| async {
let shard = s.lock().await.duplicate(filter);
let s = s.lock().await;
let shard = s.duplicate(filter);
drop(s);
(shard.map.len(), Mutex::new(shard))
})
.collect();

View file

@ -3,6 +3,7 @@ pub(crate) mod hnsw;
mod lru;
pub(crate) mod tree;
use crate::dbs;
use crate::dbs::Options;
use crate::err::Error;
use crate::idx::trees::bkeys::{FstKeys, TrieKeys};
@ -66,6 +67,22 @@ where
}
}
pub(in crate::idx) async fn get_node_txn(
&self,
txn: &dbs::Transaction,
node_id: NodeId,
) -> Result<Arc<StoredNode<N>>, Error> {
match self {
Self::Read(r) => {
let mut tx = txn.lock().await;
let n = r.get_node(&mut tx, node_id).await;
drop(tx);
n
}
_ => Err(Error::Unreachable("TreeStore::get_node_txn")),
}
}
pub(in crate::idx) async fn set_node(
&mut self,
node: StoredNode<N>,

View file

@ -1,7 +1,7 @@
use crate::err::Error;
use crate::fnc::util::math::ToFloat;
use crate::sql::index::{Distance, VectorType};
use crate::sql::{Array, Number, Value};
use crate::sql::{Number, Value};
use ahash::AHasher;
use hashbrown::HashSet;
use linfa_linalg::norm::Norm;
@ -446,50 +446,43 @@ impl Vector {
}
}
pub fn try_from_array(t: VectorType, a: &Array) -> Result<Self, Error> {
pub fn try_from_vector(t: VectorType, v: &[Number]) -> Result<Self, Error> {
let res = match t {
VectorType::F64 => {
let mut vec = Vec::with_capacity(a.len());
Self::check_vector_array(a, &mut vec)?;
let mut vec = Vec::with_capacity(v.len());
Self::check_vector_number(v, &mut vec)?;
Vector::F64(Array1::from_vec(vec))
}
VectorType::F32 => {
let mut vec = Vec::with_capacity(a.len());
Self::check_vector_array(a, &mut vec)?;
let mut vec = Vec::with_capacity(v.len());
Self::check_vector_number(v, &mut vec)?;
Vector::F32(Array1::from_vec(vec))
}
VectorType::I64 => {
let mut vec = Vec::with_capacity(a.len());
Self::check_vector_array(a, &mut vec)?;
let mut vec = Vec::with_capacity(v.len());
Self::check_vector_number(v, &mut vec)?;
Vector::I64(Array1::from_vec(vec))
}
VectorType::I32 => {
let mut vec = Vec::with_capacity(a.len());
Self::check_vector_array(a, &mut vec)?;
let mut vec = Vec::with_capacity(v.len());
Self::check_vector_number(v, &mut vec)?;
Vector::I32(Array1::from_vec(vec))
}
VectorType::I16 => {
let mut vec = Vec::with_capacity(a.len());
Self::check_vector_array(a, &mut vec)?;
let mut vec = Vec::with_capacity(v.len());
Self::check_vector_number(v, &mut vec)?;
Vector::I16(Array1::from_vec(vec))
}
};
Ok(res)
}
fn check_vector_array<T>(a: &Array, vec: &mut Vec<T>) -> Result<(), Error>
fn check_vector_number<T>(v: &[Number], vec: &mut Vec<T>) -> Result<(), Error>
where
T: for<'a> TryFrom<&'a Number, Error = Error>,
{
for v in &a.0 {
if let Value::Number(n) = v {
vec.push(n.try_into()?);
} else {
return Err(Error::InvalidVectorType {
current: v.clone().to_string(),
expected: "Number",
});
}
for n in v {
vec.push(n.try_into()?);
}
Ok(())
}
@ -540,7 +533,6 @@ mod tests {
use crate::idx::trees::knn::tests::{get_seed_rnd, new_random_vec, RandomItemGenerator};
use crate::idx::trees::vector::{SharedVector, Vector};
use crate::sql::index::{Distance, VectorType};
use crate::sql::Array;
fn test_distance(dist: Distance, a1: &[f64], a2: &[f64], res: f64) {
// Convert the arrays to Vec<Number>
@ -554,10 +546,8 @@ mod tests {
// Check the "Vector" optimised implementations
for t in [VectorType::F64] {
let v1: SharedVector =
Vector::try_from_array(t, &Array::from(v1.clone())).unwrap().into();
let v2: SharedVector =
Vector::try_from_array(t, &Array::from(v2.clone())).unwrap().into();
let v1: SharedVector = Vector::try_from_vector(t, &v1).unwrap().into();
let v2: SharedVector = Vector::try_from_vector(t, &v2).unwrap().into();
assert_eq!(dist.calculate(&v1, &v2), res);
}
}

View file

@ -16,7 +16,6 @@ pub(in crate::kvs) fn construct_document(
match mutation {
TableMutation::Set(id, current_value) => {
let doc = Document::new_artificial(
None,
Some(id),
None,
Cow::Borrowed(current_value),
@ -31,7 +30,6 @@ pub(in crate::kvs) fn construct_document(
"id" => Value::Thing(id.clone()),
}));
let doc = Document::new_artificial(
None,
Some(id),
None,
Cow::Owned(Value::None),
@ -49,7 +47,6 @@ pub(in crate::kvs) fn construct_document(
operations.iter().map(|op| Value::Object(Object::from(op.clone()))).collect(),
)))?;
let doc = Document::new_artificial(
None,
Some(id),
None,
Cow::Borrowed(current_value),
@ -62,7 +59,6 @@ pub(in crate::kvs) fn construct_document(
}
TableMutation::DelWithOriginal(id, val) => {
let doc = Document::new_artificial(
None,
Some(id),
None,
Cow::Owned(Value::None),

View file

@ -73,6 +73,27 @@ pub struct MTreeParams {
}
impl MTreeParams {
pub fn new(
dimension: u16,
distance: Distance,
vector_type: VectorType,
capacity: u16,
doc_ids_order: u32,
doc_ids_cache: u32,
mtree_cache: u32,
) -> Self {
Self {
dimension,
_distance: Default::default(),
distance,
vector_type,
capacity,
doc_ids_order,
doc_ids_cache,
mtree_cache,
}
}
fn convert_old_distance(
&mut self,
_revision: u16,

View file

@ -149,7 +149,7 @@ impl fmt::Display for Operator {
}
}
Self::Ann(k, ef) => {
write!(f, "<{k},{ef}>")
write!(f, "<|{k},{ef}|>")
}
}
}

View file

@ -110,16 +110,7 @@ impl serde::ser::SerializeStruct for SerializeMTree {
#[test]
fn mtree_params() {
let params = MTreeParams {
dimension: 1,
_distance: Default::default(),
distance: Default::default(),
vector_type: Default::default(),
capacity: 2,
doc_ids_order: 3,
doc_ids_cache: 4,
mtree_cache: 5,
};
let params = MTreeParams::new(1, Default::default(), Default::default(), 2, 3, 4, 5);
let serialized = params.serialize(Serializer.wrap()).unwrap();
assert_eq!(params, serialized);
}

View file

@ -353,6 +353,7 @@ pub(crate) static PATHS: phf::Map<UniCase<&'static str>, PathKind> = phf_map! {
UniCase::ascii("vector::distance::chebyshev") => PathKind::Function,
UniCase::ascii("vector::distance::euclidean") => PathKind::Function,
UniCase::ascii("vector::distance::hamming") => PathKind::Function,
UniCase::ascii("vector::distance::knn") => PathKind::Function,
UniCase::ascii("vector::distance::mahalanobis") => PathKind::Function,
UniCase::ascii("vector::distance::manhattan") => PathKind::Function,
UniCase::ascii("vector::distance::minkowski") => PathKind::Function,

View file

@ -707,7 +707,7 @@ impl Parser<'_> {
loop {
match self.peek_kind() {
// COLUMS and FIELDS are the same tokenkind
// COLUMNS and FIELDS are the same tokenkind
t!("FIELDS") => {
self.pop_peek();
res.cols = Idioms(vec![self.parse_local_idiom()?]);
@ -852,16 +852,15 @@ impl Parser<'_> {
_ => break,
}
}
res.index = Index::MTree(crate::sql::index::MTreeParams {
res.index = Index::MTree(crate::sql::index::MTreeParams::new(
dimension,
_distance: Default::default(),
distance,
vector_type,
capacity,
doc_ids_order,
doc_ids_cache,
mtree_cache,
vector_type,
})
))
}
t!("HNSW") => {
self.pop_peek();
@ -909,8 +908,7 @@ impl Parser<'_> {
self.pop_peek();
keep_pruned_connections = true;
}
t => {
println!("TOKEN: {t:?}");
_ => {
break;
}
}

View file

@ -1,15 +1,17 @@
use criterion::measurement::WallTime;
use criterion::{criterion_group, criterion_main, BenchmarkGroup, Criterion, Throughput};
use flate2::read::GzDecoder;
use reblessive::TreeStack;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::time::Duration;
use surrealdb::idx::trees::hnsw::HnswIndex;
use surrealdb::sql::index::Distance;
use surrealdb_core::dbs::Session;
use surrealdb_core::idx::planner::checker::HnswConditionChecker;
use surrealdb_core::idx::trees::hnsw::index::HnswIndex;
use surrealdb_core::kvs::Datastore;
use surrealdb_core::sql::index::{HnswParams, VectorType};
use surrealdb_core::sql::{value, Array, Id, Thing, Value};
use surrealdb_core::sql::{value, Array, Id, Number, Thing, Value};
use tokio::runtime::{Builder, Runtime};
const EF_CONSTRUCTION: u16 = 150;
@ -45,14 +47,15 @@ fn bench_hnsw_no_db(c: &mut Criterion) {
let hnsw = insert_objects(&samples);
let samples = new_vectors_from_file(QUERYING_SOURCE);
let samples: Vec<Array> = samples.into_iter().map(|(_, a)| a).collect();
let samples: Vec<Vec<Number>> =
samples.into_iter().map(|(_, a)| convert_array_to_vec_number(a)).collect();
// Knn lookup benchmark group
{
let mut group = get_group(c, GROUP_NAME, samples.len(), 10);
let id = format!("lookup len: {}", samples.len());
group.bench_function(id, |b| {
b.iter(|| knn_lookup_objects(&hnsw, &samples));
b.to_async(Runtime::new().unwrap()).iter(|| knn_lookup_objects(&hnsw, &samples));
});
group.finish();
}
@ -175,6 +178,19 @@ fn new_vectors_from_file(path: &str) -> Vec<(Thing, Array)> {
}
res
}
fn convert_array_to_vec_number(a: Array) -> Vec<Number> {
a.into_iter()
.map(|v| {
if let Value::Number(n) = v {
n
} else {
panic!("Wrong value {}", v);
}
})
.collect()
}
async fn init_datastore(session: &Session, with_index: bool) -> Datastore {
let ds = Datastore::new("memory").await.unwrap();
if with_index {
@ -215,11 +231,20 @@ async fn insert_objects_db(session: &Session, create_index: bool, inserts: &[Str
ds
}
fn knn_lookup_objects(h: &HnswIndex, samples: &[Array]) {
for a in samples {
let r = h.knn_search(a, NN, EF_SEARCH).unwrap();
assert_eq!(r.len(), NN);
}
async fn knn_lookup_objects(h: &HnswIndex, samples: &[Vec<Number>]) {
let mut stack = TreeStack::new();
stack
.enter(|stk| async {
for v in samples {
let r = h
.knn_search(v, NN, EF_SEARCH, stk, HnswConditionChecker::default())
.await
.unwrap();
assert_eq!(r.len(), NN);
}
})
.finish()
.await;
}
async fn knn_lookup_objects_db(ds: &Datastore, session: &Session, selects: &[String]) {

View file

@ -1,50 +1,77 @@
use criterion::measurement::WallTime;
use criterion::{criterion_group, criterion_main, BenchmarkGroup, Criterion, Throughput};
use futures::executor::block_on;
use rand::prelude::ThreadRng;
use rand::{thread_rng, Rng};
use futures::lock::Mutex;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use reblessive::TreeStack;
use std::time::Duration;
use surrealdb::idx::docids::DocId;
use surrealdb::idx::trees::mtree::{MState, MTree};
use surrealdb::idx::trees::store::TreeNodeProvider;
use surrealdb::idx::trees::vector::Vector;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use surrealdb::kvs::Datastore;
use surrealdb::kvs::LockType::Optimistic;
use surrealdb::kvs::TransactionType::{Read, Write};
use surrealdb::sql::index::Distance;
use tokio::runtime::Runtime;
use surrealdb_core::idx::planner::checker::MTreeConditionChecker;
use surrealdb_core::idx::trees::mtree::MTreeIndex;
use surrealdb_core::idx::IndexKeyBase;
use surrealdb_core::kvs::{Transaction, TransactionType};
use surrealdb_core::sql::index::{Distance, MTreeParams, VectorType};
use surrealdb_core::sql::{Id, Number, Thing, Value};
use tokio::runtime::{Builder, Runtime};
use tokio::task;
fn bench_index_mtree_dim_3(c: &mut Criterion) {
bench_index_mtree(c, 1_000, 100_000, 3, 120, 100);
bench_index_mtree(c, 250, 25_000, 3, 100);
}
fn bench_index_mtree_dim_3_full_cache(c: &mut Criterion) {
bench_index_mtree(c, 1_000, 100_000, 3, 120, 0);
bench_index_mtree(c, 250, 25_000, 3, 0);
}
fn bench_index_mtree_dim_50(c: &mut Criterion) {
bench_index_mtree(c, 100, 10_000, 50, 20, 100);
bench_index_mtree(c, 100, 10_000, 50, 100);
}
fn bench_index_mtree_dim_50_full_cache(c: &mut Criterion) {
bench_index_mtree(c, 100, 10_000, 50, 20, 0);
bench_index_mtree(c, 100, 10_000, 50, 0);
}
fn bench_index_mtree_dim_300(c: &mut Criterion) {
bench_index_mtree(c, 50, 5_000, 300, 40, 100);
bench_index_mtree(c, 50, 5_000, 300, 100);
}
fn bench_index_mtree_dim_300_full_cache(c: &mut Criterion) {
bench_index_mtree(c, 50, 5_000, 300, 40, 0);
bench_index_mtree(c, 50, 5_000, 300, 0);
}
fn bench_index_mtree_dim_2048(c: &mut Criterion) {
bench_index_mtree(c, 10, 1_000, 2048, 60, 100);
bench_index_mtree(c, 10, 1_000, 2048, 100);
}
fn bench_index_mtree_dim_2048_full_cache(c: &mut Criterion) {
bench_index_mtree(c, 10, 1_000, 2048, 60, 0);
bench_index_mtree(c, 10, 1_000, 2048, 0);
}
/// Creates an `MTreeIndex` over a default key base, configured for the given
/// vector `dimension`, with both node caches sized to `cache_size`
/// (0 = unbounded / full cache in these benches).
async fn mtree_index(
	ds: &Datastore,
	tx: &mut Transaction,
	dimension: usize,
	cache_size: usize,
	tt: TransactionType,
) -> MTreeIndex {
	let params = MTreeParams::new(
		dimension as u16,
		Distance::Euclidean,
		VectorType::F64,
		40,
		100,
		cache_size as u32,
		cache_size as u32,
	);
	let ikb = IndexKeyBase::default();
	MTreeIndex::new(ds.index_store(), tx, ikb, &params, tt).await.unwrap()
}
/// Builds the multi-threaded Tokio runtime (4 worker threads, all drivers
/// enabled) shared by the benchmark groups.
fn runtime() -> Runtime {
	let mut builder = Builder::new_multi_thread();
	builder.worker_threads(4).enable_all();
	builder.build().unwrap()
}
fn bench_index_mtree(
@ -52,7 +79,6 @@ fn bench_index_mtree(
debug_samples_len: usize,
release_samples_len: usize,
vector_dimension: usize,
measurement_secs: u64,
cache_size: usize,
) {
let samples_len = if cfg!(debug_assertions) {
@ -66,10 +92,10 @@ fn bench_index_mtree(
// Indexing benchmark group
{
let mut group = get_group(c, "index_mtree_insert", samples_len, measurement_secs);
let mut group = get_group(c, "index_mtree_insert", samples_len);
let id = format!("len_{}_dim_{}_cache_{}", samples_len, vector_dimension, cache_size);
group.bench_function(id, |b| {
b.to_async(Runtime::new().unwrap())
b.to_async(runtime())
.iter(|| insert_objects(&ds, samples_len, vector_dimension, cache_size));
});
group.finish();
@ -77,15 +103,15 @@ fn bench_index_mtree(
// Knn lookup benchmark group
{
let mut group = get_group(c, "index_mtree_lookup", samples_len, 10);
let mut group = get_group(c, "index_mtree_lookup", samples_len);
for knn in [1, 10] {
let id = format!(
"knn_{}_len_{}_dim_{}_cache_{}",
knn, samples_len, vector_dimension, cache_size
);
group.bench_function(id, |b| {
b.to_async(Runtime::new().unwrap()).iter(|| {
knn_lookup_objects(&ds, samples_len, vector_dimension, knn, cache_size)
b.to_async(runtime()).iter(|| {
knn_lookup_objects(&ds, samples_len, vector_dimension, cache_size, knn)
});
});
}
@ -97,24 +123,18 @@ fn get_group<'a>(
c: &'a mut Criterion,
group_name: &str,
samples_len: usize,
measurement_secs: u64,
) -> BenchmarkGroup<'a, WallTime> {
let mut group = c.benchmark_group(group_name);
group.throughput(Throughput::Elements(samples_len as u64));
group.sample_size(10);
group.measurement_time(Duration::from_secs(measurement_secs));
group
}
fn random_object(rng: &mut ThreadRng, vector_size: usize) -> Vector {
fn random_object(rng: &mut StdRng, vector_size: usize) -> Vec<Number> {
let mut vec = Vec::with_capacity(vector_size);
for _ in 0..vector_size {
vec.push(rng.gen_range(-1.0..=1.0));
vec.push(rng.gen_range(-1.0..=1.0).into());
}
Vector::F32(vec.into())
}
fn mtree() -> MTree {
MTree::new(MState::new(40), Distance::Euclidean)
vec
}
async fn insert_objects(
@ -123,27 +143,22 @@ async fn insert_objects(
vector_size: usize,
cache_size: usize,
) {
let mut rng = thread_rng();
let mut t = mtree();
let mut tx = ds.transaction(Write, Optimistic).await.unwrap();
let mut s =
ds.index_store().get_store_mtree(TreeNodeProvider::Debug, 0, Write, cache_size).await;
let mut mt = mtree_index(ds, &mut tx, vector_size, cache_size, Write).await;
let mut stack = TreeStack::new();
let mut rng = StdRng::from_entropy();
stack
.enter(|stk| async {
for i in 0..samples_size {
let object = random_object(&mut rng, vector_size).into();
let vector: Vec<Number> = random_object(&mut rng, vector_size);
// Insert the sample
t.insert(stk, &mut tx, &mut s, object, i as DocId).await.unwrap();
let rid = Thing::from(("test", Id::from(i as i64)));
mt.index_document(stk, &mut tx, &rid, &vec![Value::from(vector)]).await.unwrap();
}
})
.finish()
.await;
if let Some(new_cache) = s.finish(&mut tx).await.unwrap() {
ds.index_store().advance_store_mtree(new_cache);
}
mt.finish(&mut tx).await.unwrap();
tx.commit().await.unwrap();
}
@ -151,19 +166,48 @@ async fn knn_lookup_objects(
ds: &Datastore,
samples_size: usize,
vector_size: usize,
knn: usize,
cache_size: usize,
knn: usize,
) {
let mut rng = thread_rng();
let t = mtree();
let mut tx = ds.transaction(Read, Optimistic).await.unwrap();
let s = ds.index_store().get_store_mtree(TreeNodeProvider::Debug, 0, Read, cache_size).await;
for _ in 0..samples_size {
let object = random_object(&mut rng, vector_size).into();
// Insert the sample
t.knn_search(&mut tx, &s, &object, knn).await.unwrap();
let txn = Arc::new(Mutex::new(ds.transaction(Read, Optimistic).await.unwrap()));
let mut tx = txn.lock().await;
let mt = Arc::new(mtree_index(ds, &mut tx, vector_size, cache_size, Read).await);
drop(tx);
let counter = Arc::new(AtomicUsize::new(0));
let mut consumers = Vec::with_capacity(4);
for _ in 0..4 {
let (txn, mt, counter) = (txn.clone(), mt.clone(), counter.clone());
let c = task::spawn(async move {
let mut rng = StdRng::from_entropy();
while counter.fetch_add(1, Ordering::Relaxed) < samples_size {
let object = random_object(&mut rng, vector_size);
knn_lookup_object(mt.as_ref(), &txn, object, knn).await;
}
});
consumers.push(c);
}
tx.rollback_with_panic();
for c in consumers {
c.await.unwrap();
}
}
/// Performs one MTree KNN search for `object` and asserts that exactly
/// `knn` results are returned.
async fn knn_lookup_object(
	mt: &MTreeIndex,
	txn: &Arc<Mutex<Transaction>>,
	object: Vec<Number>,
	knn: usize,
) {
	let mut stack = TreeStack::new();
	let lookup = stack.enter(|stk| async {
		let checker = MTreeConditionChecker::new(&txn);
		let results = mt.knn_search(stk, txn, &object, knn, checker).await.unwrap();
		assert_eq!(results.len(), knn);
	});
	lookup.finish().await;
}
criterion_group!(

View file

@ -14,7 +14,7 @@ async fn select_where_mtree_knn() -> Result<(), Error> {
CREATE pts:3 SET point = [8,9,10,11];
DEFINE INDEX mt_pts ON pts FIELDS point MTREE DIMENSION 4 TYPE F32;
LET $pt = [2,3,4,5];
SELECT id, vector::distance::euclidean(point, $pt) AS dist FROM pts WHERE point <|2|> $pt;
SELECT id, vector::distance::knn() AS dist FROM pts WHERE point <|2|> $pt;
SELECT id FROM pts WHERE point <|2|> $pt EXPLAIN;
";
let dbs = new_ds().await?;
@ -46,7 +46,7 @@ async fn select_where_mtree_knn() -> Result<(), Error> {
detail: {
plan: {
index: 'mt_pts',
operator: '<2>',
operator: '<|2|>',
value: [2,3,4,5]
},
table: 'pts',
@ -76,7 +76,7 @@ async fn delete_update_mtree_index() -> Result<(), Error> {
DELETE pts:2;
UPDATE pts:3 SET point = [12,13,14,15];
LET $pt = [2,3,4,5];
SELECT id, vector::distance::euclidean(point, $pt) AS dist FROM pts WHERE point <|5|> $pt ORDER BY dist;
SELECT id, vector::distance::knn() AS dist FROM pts WHERE point <|5|> $pt ORDER BY dist;
";
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
@ -162,8 +162,8 @@ async fn select_where_brute_force_knn() -> Result<(), Error> {
CREATE pts:3 SET point = [8,9,10,11];
LET $pt = [2,3,4,5];
SELECT id FROM pts WHERE point <|2,EUCLIDEAN|> $pt EXPLAIN;
SELECT id, vector::distance::euclidean(point, $pt) AS dist FROM pts WHERE point <|2,EUCLIDEAN|> $pt;
SELECT id, vector::distance::euclidean(point, $pt) AS dist FROM pts WHERE point <|2,EUCLIDEAN|> $pt PARALLEL;
SELECT id, vector::distance::knn() AS dist FROM pts WHERE point <|2,EUCLIDEAN|> $pt;
SELECT id, vector::distance::knn() AS dist FROM pts WHERE point <|2,EUCLIDEAN|> $pt PARALLEL;
";
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
@ -224,7 +224,7 @@ async fn select_where_hnsw_knn() -> Result<(), Error> {
CREATE pts:3 SET point = [8,9,10,11];
DEFINE INDEX hnsw_pts ON pts FIELDS point HNSW DIMENSION 4 DIST EUCLIDEAN TYPE F32 EFC 500 M 12;
LET $pt = [2,3,4,5];
SELECT id, vector::distance::euclidean(point, $pt) AS dist FROM pts WHERE point <|2,100|> $pt;
SELECT id, vector::distance::knn() AS dist FROM pts WHERE point <|2,100|> $pt;
SELECT id FROM pts WHERE point <|2,100|> $pt EXPLAIN;
";
let dbs = new_ds().await?;
@ -256,7 +256,7 @@ async fn select_where_hnsw_knn() -> Result<(), Error> {
detail: {
plan: {
index: 'hnsw_pts',
operator: '<2,100>',
operator: '<|2,100|>',
value: [2,3,4,5]
},
table: 'pts',
@ -274,3 +274,219 @@ async fn select_where_hnsw_knn() -> Result<(), Error> {
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
Ok(())
}
// Verifies that an MTree KNN operator (`<|2|>`) composed with a regular WHERE
// condition (`flag = true`) only returns matching records, and that
// `vector::distance::knn()` exposes the distance computed by the index.
#[tokio::test]
async fn select_mtree_knn_with_condition() -> Result<(), Error> {
	// Define a 1-dimensional MTree index, insert rows with alternating flags,
	// then run the same filtered KNN query twice: EXPLAIN first, then for real.
	let sql = r"
		DEFINE INDEX mt_pt1 ON pts FIELDS point MTREE DIMENSION 1;
		INSERT INTO pts [
			{ id: pts:1, point: [ 10f ], flag: true },
			{ id: pts:2, point: [ 20f ], flag: false },
			{ id: pts:3, point: [ 30f ], flag: true },
			{ id: pts:4, point: [ 40f ], flag: false },
			{ id: pts:5, point: [ 50f ], flag: true },
			{ id: pts:6, point: [ 60f ], flag: false },
			{ id: pts:7, point: [ 70f ], flag: true }
		];
		LET $pt = [44f];
		SELECT id, flag, vector::distance::knn() AS distance FROM pts
			WHERE flag = true && point <|2|> $pt
			ORDER BY distance EXPLAIN;
		SELECT id, flag, vector::distance::knn() AS distance FROM pts
			WHERE flag = true && point <|2|> $pt
			ORDER BY distance;
	";
	let dbs = new_ds().await?;
	let ses = Session::owner().with_ns("test").with_db("test");
	let mut res = &mut dbs.execute(sql, &ses, None).await?;
	// Five statements: DEFINE, INSERT, LET, EXPLAIN SELECT, SELECT.
	assert_eq!(res.len(), 5);
	//
	// Skip DEFINE/INSERT/LET results; only the two SELECTs are inspected.
	skip_ok(&mut res, 3)?;
	//
	// The EXPLAIN output must show the planner chose the MTree index.
	let tmp = res.remove(0).result?;
	let val = Value::parse(
		"[
			{
				detail: {
					plan: {
						index: 'mt_pt1',
						operator: '<|2|>',
						value: [44f]
					},
					table: 'pts',
				},
				operation: 'Iterate Index'
			},
			{
				detail: {
					type: 'Memory'
				},
				operation: 'Collector'
			}
		]",
	);
	assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
	//
	// Only flag=true records may appear, ordered by distance to [44f]:
	// pts:5 (|50-44| = 6) then pts:3 (|30-44| = 14).
	let tmp = res.remove(0).result?;
	let val = Value::parse(
		"[
			{
				id: pts:5,
				flag: true,
				distance: 6f
			},
			{
				id: pts:3,
				flag: true,
				distance: 14f
			}
		]",
	);
	assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
	//
	Ok(())
}
// Verifies that an HNSW KNN operator (`<|2,40|>`, i.e. k=2 with EF=40)
// composed with a WHERE condition (`flag = true`) only returns matching
// records, with `vector::distance::knn()` exposing the index distance.
#[test_log::test(tokio::test)]
async fn select_hnsw_knn_with_condition() -> Result<(), Error> {
	// Define a 1-dimensional HNSW index, insert rows with alternating flags,
	// then run the same filtered KNN query twice: EXPLAIN first, then for real.
	let sql = r"
		DEFINE INDEX hn_pt1 ON pts FIELDS point HNSW DIMENSION 1;
		INSERT INTO pts [
			{ id: pts:1, point: [ 10f ], flag: true },
			{ id: pts:2, point: [ 20f ], flag: false },
			{ id: pts:3, point: [ 30f ], flag: true },
			{ id: pts:4, point: [ 40f ], flag: false },
			{ id: pts:5, point: [ 50f ], flag: true },
			{ id: pts:6, point: [ 60f ], flag: false },
			{ id: pts:7, point: [ 70f ], flag: true }
		];
		LET $pt = [44f];
		SELECT id, flag, vector::distance::knn() AS distance FROM pts
			WHERE flag = true AND point <|2,40|> $pt
			ORDER BY distance EXPLAIN;
		SELECT id, flag, vector::distance::knn() AS distance FROM pts
			WHERE flag = true AND point <|2,40|> $pt
			ORDER BY distance;
	";
	let dbs = new_ds().await?;
	let ses = Session::owner().with_ns("test").with_db("test");
	let mut res = &mut dbs.execute(sql, &ses, None).await?;
	// Five statements: DEFINE, INSERT, LET, EXPLAIN SELECT, SELECT.
	assert_eq!(res.len(), 5);
	//
	// Skip DEFINE/INSERT/LET results; only the two SELECTs are inspected.
	skip_ok(&mut res, 3)?;
	//
	// The EXPLAIN output must show the planner chose the HNSW index.
	let tmp = res.remove(0).result?;
	let val = Value::parse(
		"[
			{
				detail: {
					plan: {
						index: 'hn_pt1',
						operator: '<|2,40|>',
						value: [44f]
					},
					table: 'pts',
				},
				operation: 'Iterate Index'
			},
			{
				detail: {
					type: 'Memory'
				},
				operation: 'Collector'
			}
		]",
	);
	assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
	//
	// Only flag=true records may appear, ordered by distance to [44f]:
	// pts:5 (|50-44| = 6) then pts:3 (|30-44| = 14).
	let tmp = res.remove(0).result?;
	let val = Value::parse(
		"[
			{
				distance: 6f,
				flag: true,
				id: pts:5
			},
			{
				distance: 14f,
				flag: true,
				id: pts:3
			}
		]",
	);
	assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
	//
	Ok(())
}
// Verifies the brute-force KNN operator (`<|2,EUCLIDEAN|>`) combined with a
// WHERE condition when NO index exists: the planner must fall back to a full
// table scan, yet still filter on the condition and compute KNN distances.
#[test_log::test(tokio::test)]
async fn select_bruteforce_knn_with_condition() -> Result<(), Error> {
	// No DEFINE INDEX statement here — this is the point of the test.
	let sql = r"
		INSERT INTO pts [
			{ id: pts:1, point: [ 10f ], flag: true },
			{ id: pts:2, point: [ 20f ], flag: false },
			{ id: pts:3, point: [ 30f ], flag: true },
			{ id: pts:4, point: [ 40f ], flag: false },
			{ id: pts:5, point: [ 50f ], flag: true },
			{ id: pts:6, point: [ 60f ], flag: false },
			{ id: pts:7, point: [ 70f ], flag: true }
		];
		LET $pt = [44f];
		SELECT id, flag, vector::distance::knn() AS distance FROM pts
			WHERE flag = true AND point <|2,EUCLIDEAN|> $pt
			ORDER BY distance EXPLAIN;
		SELECT id, flag, vector::distance::knn() AS distance FROM pts
			WHERE flag = true AND point <|2,EUCLIDEAN|> $pt
			ORDER BY distance;
	";
	let dbs = new_ds().await?;
	let ses = Session::owner().with_ns("test").with_db("test");
	let mut res = &mut dbs.execute(sql, &ses, None).await?;
	// Four statements: INSERT, LET, EXPLAIN SELECT, SELECT.
	assert_eq!(res.len(), 4);
	//
	// Skip INSERT/LET results; only the two SELECTs are inspected.
	skip_ok(&mut res, 2)?;
	//
	// With no usable index the EXPLAIN must show a table-scan fallback.
	let tmp = res.remove(0).result?;
	let val = Value::parse(
		"[
			{
				detail: {
					table: 'pts'
				},
				operation: 'Iterate Table'
			},
			{
				detail: {
					reason: 'NO INDEX FOUND'
				},
				operation: 'Fallback'
			},
			{
				detail: {
					type: 'Memory'
				},
				operation: 'Collector'
			}
		]",
	);
	assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
	//
	// Brute-force results must match the indexed variants: flag=true records
	// ordered by distance to [44f] — pts:5 (6) then pts:3 (14).
	let tmp = res.remove(0).result?;
	let val = Value::parse(
		"[
			{
				distance: 6f,
				flag: true,
				id: pts:5
			},
			{
				distance: 14f,
				flag: true,
				id: pts:3
			}
		]",
	);
	assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
	//
	Ok(())
}