Feat: Analyzers to support functions (#2974)

Co-authored-by: Yusuke Kuoka <ykuoka@gmail.com>
Emmanuel Keller 2023-11-20 18:36:21 +00:00 committed by GitHub
parent 5b23602359
commit 6efd3e3d87
28 changed files with 863 additions and 337 deletions
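For orientation, a minimal SurrealQL sketch of the feature this commit introduces: a DEFINE ANALYZER statement can now name a user-defined function that pre-processes the input string before the tokenizers and filters run, and the function must return a string. The identifiers below are illustrative, not taken from this diff.

DEFINE FUNCTION fn::stripHtml($input: string) {
    RETURN string::replace($input, /<[^>]*>/, "");
};
DEFINE ANALYZER html_text FUNCTION fn::stripHtml TOKENIZERS blank,class FILTERS lowercase;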

View file

@ -1,5 +0,0 @@
[advisories]
ignore = [
# Acknowledged in: https://github.com/surrealdb/surrealdb/pull/3005
"RUSTSEC-2021-0127"
]

View file

@ -265,6 +265,7 @@
"uuid"
"rand::uuid::v4("
"rand::uuid::v7("
"search::analyze("
"search::score("
"search::highlight("
"search::offsets("
@ -300,6 +301,7 @@
"string::join("
"string::len("
"string::lowercase("
"string::matches("
"string::repeat("
"string::replace("
"string::reverse("

View file

@ -265,6 +265,7 @@
"uuid"
"rand::uuid::v4("
"rand::uuid::v7("
"search::analyze("
"search::score("
"search::highlight("
"search::offsets("
@ -299,6 +300,7 @@
"string::join("
"string::len("
"string::lowercase("
"string::matches("
"string::repeat("
"string::replace("
"string::reverse("

View file

@ -7,11 +7,11 @@ use crate::idx::ft::FtIndex;
use crate::idx::trees::mtree::MTreeIndex;
use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase;
use crate::key;
use crate::sql::array::Array;
use crate::sql::index::{Index, MTreeParams, SearchParams};
use crate::sql::statements::DefineIndexStatement;
use crate::sql::{Part, Thing, Value};
use crate::{key, kvs};
impl<'a> Document<'a> {
pub async fn index(
@ -45,18 +45,15 @@ impl<'a> Document<'a> {
// Update the index entries
if opt.force || o != n {
// Claim transaction
let mut run = txn.lock().await;
// Store all the variables and parameters required by the index operation
let mut ic = IndexOperation::new(opt, ix, o, n, rid);
// Index operation dispatching
match &ix.index {
Index::Uniq => ic.index_unique(&mut run).await?,
Index::Idx => ic.index_non_unique(&mut run).await?,
Index::Search(p) => ic.index_full_text(&mut run, p).await?,
Index::MTree(p) => ic.index_mtree(&mut run, p).await?,
Index::Uniq => ic.index_unique(txn).await?,
Index::Idx => ic.index_non_unique(txn).await?,
Index::Search(p) => ic.index_full_text(ctx, txn, p).await?,
Index::MTree(p) => ic.index_mtree(txn, p).await?,
};
}
}
@ -257,7 +254,8 @@ impl<'a> IndexOperation<'a> {
)
}
async fn index_unique(&mut self, run: &mut kvs::Transaction) -> Result<(), Error> {
async fn index_unique(&mut self, txn: &Transaction) -> Result<(), Error> {
let mut run = txn.lock().await;
// Delete the old index data
if let Some(o) = self.o.take() {
let i = Indexable::new(o, self.ix);
@ -288,7 +286,8 @@ impl<'a> IndexOperation<'a> {
Ok(())
}
async fn index_non_unique(&mut self, run: &mut kvs::Transaction) -> Result<(), Error> {
async fn index_non_unique(&mut self, txn: &Transaction) -> Result<(), Error> {
let mut run = txn.lock().await;
// Delete the old index data
if let Some(o) = self.o.take() {
let i = Indexable::new(o, self.ix);
@ -330,35 +329,34 @@ impl<'a> IndexOperation<'a> {
async fn index_full_text(
&mut self,
run: &mut kvs::Transaction,
ctx: &Context<'_>,
txn: &Transaction,
p: &SearchParams,
) -> Result<(), Error> {
let ikb = IndexKeyBase::new(self.opt, self.ix);
let az = run.get_db_analyzer(self.opt.ns(), self.opt.db(), p.az.as_str()).await?;
let mut ft = FtIndex::new(run, az, ikb, p, TreeStoreType::Write).await?;
let mut ft = FtIndex::new(self.opt, txn, &p.az, ikb, p, TreeStoreType::Write).await?;
if let Some(n) = self.n.take() {
ft.index_document(run, self.rid, n).await?;
ft.index_document(ctx, self.opt, txn, self.rid, n).await?;
} else {
ft.remove_document(run, self.rid).await?;
ft.remove_document(txn, self.rid).await?;
}
ft.finish(run).await
ft.finish(txn).await
}
async fn index_mtree(
&mut self,
run: &mut kvs::Transaction,
p: &MTreeParams,
) -> Result<(), Error> {
async fn index_mtree(&mut self, txn: &Transaction, p: &MTreeParams) -> Result<(), Error> {
let mut tx = txn.lock().await;
let ikb = IndexKeyBase::new(self.opt, self.ix);
let mut mt = MTreeIndex::new(run, ikb, p, TreeStoreType::Write).await?;
let mut mt = MTreeIndex::new(&mut tx, ikb, p, TreeStoreType::Write).await?;
// Delete the old index data
if let Some(o) = self.o.take() {
mt.remove_document(run, self.rid, o).await?;
mt.remove_document(&mut tx, self.rid, o).await?;
}
// Create the new index data
if let Some(n) = self.n.take() {
mt.index_document(run, self.rid, n).await?;
mt.index_document(&mut tx, self.rid, n).await?;
}
mt.finish(run).await
mt.finish(&mut tx).await
}
}

View file

@ -230,6 +230,10 @@ pub enum Error {
current: String,
},
/// Invalid regular expression
#[error("Invalid regular expression: {0:?}")]
InvalidRegex(String),
/// The query timed out
#[error("The query was not executed because it exceeded the timeout")]
QueryTimedout,
@ -738,6 +742,12 @@ impl From<JWTError> for Error {
}
}
impl From<regex::Error> for Error {
fn from(error: regex::Error) -> Self {
Error::InvalidRegex(error.to_string())
}
}
#[cfg(feature = "kv-mem")]
impl From<echodb::err::Error> for Error {
fn from(e: echodb::err::Error) -> Error {

View file

@ -1,6 +1,6 @@
use crate::err::Error;
use crate::sql::value::Value;
use crate::sql::{Array, Bytes, Datetime, Duration, Kind, Number, Strand, Thing};
use crate::sql::{Array, Bytes, Datetime, Duration, Kind, Number, Regex, Strand, Thing};
/// Implemented by types that are commonly used, in a certain way, as arguments.
pub trait FromArg: Sized {
@ -13,6 +13,12 @@ impl FromArg for Value {
}
}
impl FromArg for Regex {
fn from_arg(arg: Value) -> Result<Self, Error> {
arg.coerce_to_regex()
}
}
impl FromArg for String {
fn from_arg(arg: Value) -> Result<Self, Error> {
arg.coerce_to_string()

View file

@ -230,6 +230,7 @@ pub fn synchronous(ctx: &Context<'_>, name: &str, args: Vec<Value>) -> Result<Va
"string::join" => string::join,
"string::len" => string::len,
"string::lowercase" => string::lowercase,
"string::matches" => string::matches,
"string::repeat" => string::repeat,
"string::replace" => string::replace,
"string::reverse" => string::reverse,
@ -388,6 +389,7 @@ pub async fn asynchronous(
"http::patch" => http::patch(ctx).await,
"http::delete" => http::delete(ctx).await,
//
"search::analyze" => search::analyze((ctx, txn, opt)).await,
"search::score" => search::score((ctx, txn, doc)).await,
"search::highlight" => search::highlight((ctx,txn, doc)).await,
"search::offsets" => search::offsets((ctx, txn, doc)).await,

View file

@ -7,6 +7,7 @@ pub struct Package;
impl_module_def!(
Package,
"search",
"analyze" => fut Async,
"highlight" => fut Async,
"offsets" => fut Async,
"score" => fut Async

View file

@ -17,6 +17,7 @@ impl_module_def!(
"join" => run,
"len" => run,
"lowercase" => run,
"matches" => run,
"repeat" => run,
"replace" => run,
"reverse" => run,

View file

@ -1,7 +1,8 @@
use crate::ctx::Context;
use crate::dbs::Transaction;
use crate::dbs::{Options, Transaction};
use crate::doc::CursorDoc;
use crate::err::Error;
use crate::idx::ft::analyzer::Analyzer;
use crate::idx::planner::executor::QueryExecutor;
use crate::sql::{Thing, Value};
@ -24,6 +25,19 @@ fn get_execution_context<'a>(
None
}
pub async fn analyze(
(ctx, txn, opt): (&Context<'_>, Option<&Transaction>, Option<&Options>),
(az, val): (Value, Value),
) -> Result<Value, Error> {
if let (Some(txn), Some(opt), Value::Strand(az), Value::Strand(val)) = (txn, opt, az, val) {
let az: Analyzer =
txn.lock().await.get_db_analyzer(opt.ns(), opt.db(), az.as_str()).await?.into();
az.analyze(ctx, opt, txn, val.0).await
} else {
Ok(Value::None)
}
}
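A usage sketch of the new native function (analyzer name and input are placeholders): it takes an analyzer name and a string, runs the analyzer, and returns the resulting tokens as an array of strings (NONE if no transaction or options are available).

RETURN search::analyze('my_analyzer', 'Two Words');
-- e.g. ['two', 'words'] if the analyzer lowercases and splits on blanks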
pub async fn score(
(ctx, txn, doc): (&Context<'_>, Option<&Transaction>, Option<&CursorDoc<'_>>),
(match_ref,): (Value,),

View file

@ -1,6 +1,7 @@
use crate::err::Error;
use crate::fnc::util::string;
use crate::sql::value::Value;
use crate::sql::Regex;
/// Returns `true` if a string of this length is too much to allocate.
fn limit(name: &str, n: usize) -> Result<(), Error> {
@ -64,17 +65,32 @@ pub fn repeat((val, num): (String, usize)) -> Result<Value, Error> {
Ok(val.repeat(num).into())
}
pub fn replace((val, old, new): (String, String, String)) -> Result<Value, Error> {
if new.len() > old.len() {
let increase = new.len() - old.len();
limit(
"string::replace",
val.len().saturating_add(val.matches(&old).count().saturating_mul(increase)),
)?;
}
Ok(val.replace(&old, &new).into())
pub fn matches((val, regex): (String, Regex)) -> Result<Value, Error> {
Ok(regex.0.is_match(&val).into())
}
pub fn replace((val, old_or_regexp, new): (String, Value, String)) -> Result<Value, Error> {
match old_or_regexp {
Value::Strand(old) => {
if new.len() > old.len() {
let increase = new.len() - old.len();
limit(
"string::replace",
val.len().saturating_add(val.matches(&old.0).count().saturating_mul(increase)),
)?;
}
Ok(val.replace(&old.0, &new).into())
}
Value::Regex(r) => Ok(r.0.replace_all(&val, new).into_owned().into()),
_ => Err(Error::InvalidArguments {
name: "string::replace".to_string(),
message: format!(
"Argument 2 was the wrong type. Expected a string but found {}",
old_or_regexp
),
}),
}
}
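A usage sketch of the two functions touched above, assuming the /.../ regex literal syntax:

RETURN string::matches("foo bar", /fo+/);         -- true
RETURN string::replace("foo bar", /o+/, "0");     -- 'f0 bar' (the regex path uses replace_all)
RETURN string::replace("foo bar", "bar", "baz");  -- 'foo baz' (the plain string path is unchanged)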
pub fn reverse((string,): (String,)) -> Result<Value, Error> {
Ok(string.chars().rev().collect::<String>().into())
}
@ -180,6 +196,7 @@ pub mod is {
pub fn datetime((arg, fmt): (String, String)) -> Result<Value, Error> {
Ok(NaiveDateTime::parse_from_str(&arg, &fmt).is_ok().into())
}
pub fn domain((arg,): (String,)) -> Result<Value, Error> {
Ok(addr::parse_domain_name(arg.as_str()).is_ok().into())
}
@ -245,7 +262,7 @@ pub mod similarity {
#[cfg(test)]
mod tests {
use super::{contains, slice};
use super::{contains, matches, replace, slice};
use crate::sql::Value;
#[test]
@ -288,6 +305,41 @@ mod tests {
test("好世界", "你好", false);
}
#[test]
fn string_replace() {
fn test(base: &str, pattern: Value, replacement: &str, expected: &str) {
assert_eq!(
replace((base.to_string(), pattern.clone(), replacement.to_string())).unwrap(),
Value::from(expected),
"replace({},{},{})",
base,
pattern,
replacement
);
}
test("foo bar", Value::Regex("foo".parse().unwrap()), "bar", "bar bar");
test("foo bar", "bar".into(), "foo", "foo foo");
}
#[test]
fn string_matches() {
fn test(base: &str, regex: &str, expected: bool) {
assert_eq!(
matches((base.to_string(), regex.parse().unwrap())).unwrap(),
Value::from(expected),
"matches({},{})",
base,
regex
);
}
test("bar", "foo", false);
test("", "foo", false);
test("foo bar", "foo", true);
test("foo bar", "bar", true);
}
#[test]
fn is_alphanum() {
let value = super::is::alphanum((String::from("abc123"),)).unwrap();

View file

@ -182,8 +182,8 @@ pub(super) enum Term {
mod tests {
use crate::idx::ft::analyzer::tests::test_analyzer;
#[test]
fn test_arabic_stemmer() {
#[tokio::test]
async fn test_arabic_stemmer() {
let input =
"الكلاب تحب الجري في الحديقة، لكن كلبي الصغير يفضل النوم في سريره بدلاً من الجري";
let output = vec![
@ -194,17 +194,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(arabic);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ar);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ar);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(ara);",
input,
&output,
);
)
.await;
}
#[test]
fn test_danish_stemmer() {
#[tokio::test]
async fn test_danish_stemmer() {
let input = "Hunde elsker at løbe i parken, men min lille hund foretrækker at sove i sin kurv frem for at løbe.";
let output = vec![
"hund",
@ -234,17 +237,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(danish);",
input,
&output,
);
)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(dan);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(da);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(da);", input, &output)
.await;
}
#[test]
fn test_dutch_stemmer() {
#[tokio::test]
async fn test_dutch_stemmer() {
let input = "Honden houden ervan om in het park te rennen, maar mijn kleine hond slaapt liever in zijn mand dan te rennen.";
let output = vec![
"hond", "houd", "ervan", "om", "in", "het", "park", "te", "renn", ",", "mar", "mijn",
@ -254,17 +260,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(dutch);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(nl);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(nl);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(nld);",
input,
&output,
);
)
.await;
}
#[test]
fn test_english_stemmer() {
#[tokio::test]
async fn test_english_stemmer() {
let input = "Teachers are often teaching, but my favorite teacher prefers reading in her spare time rather than teaching.";
let output = vec![
"teacher", "are", "often", "teach", ",", "but", "my", "favorit", "teacher", "prefer",
@ -274,17 +283,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(english);",
input,
&output,
);
)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(eng);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(en);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(en);", input, &output)
.await;
}
#[test]
fn test_french_stemmer() {
#[tokio::test]
async fn test_french_stemmer() {
let input = "Les chiens adorent courir dans le parc, mais mon petit chien aime plutôt se blottir sur le canapé que de courir";
let output = [
"le", "chien", "adorent", "cour", "dan", "le", "parc", ",", "mais", "mon", "pet",
@ -294,17 +306,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(french);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(fr);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(fr);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(fra);",
input,
&output,
);
)
.await;
}
#[test]
fn test_german_stemmer() {
#[tokio::test]
async fn test_german_stemmer() {
let input = "Hunde lieben es, im Park zu laufen, aber mein kleiner Hund zieht es vor, auf dem Sofa zu schlafen, statt zu laufen.";
let output = [
"hund", "lieb", "es", ",", "im", "park", "zu", "lauf", ",", "aber", "mein", "klein",
@ -315,17 +330,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(german);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(de);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(de);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(deu);",
input,
&output,
);
)
.await;
}
#[test]
fn test_greek_stemmer() {
#[tokio::test]
async fn test_greek_stemmer() {
let input = "Τα σκυλιά αγαπούν να τρέχουν στο πάρκο, αλλά ο μικρός μου σκύλος προτιμά να κοιμάται στο κρεβάτι του αντί να τρέχει.";
let output = [
"τα",
@ -356,17 +374,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(greek);",
input,
&output,
);
)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(ell);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(el);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(el);", input, &output)
.await;
}
#[test]
fn test_hungarian_stemmer() {
#[tokio::test]
async fn test_hungarian_stemmer() {
let input = "A kutyák szeretnek futni a parkban, de az én kicsi kutyám inkább alszik a kosarában, mintsem fut.";
let output = [
"a", "kutya", "szeret", "futn", "a", "par", ",", "de", "az", "én", "kics", "kutya",
@ -376,17 +397,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(hungarian);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(hu);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(hu);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(hun);",
input,
&output,
);
)
.await;
}
#[test]
fn test_italian_stemmer() {
#[tokio::test]
async fn test_italian_stemmer() {
let input = "I cani amano correre nel parco, ma il mio piccolo cane preferisce dormire nel suo cesto piuttosto che correre.";
let output = [
"i", "can", "aman", "corr", "nel", "parc", ",", "ma", "il", "mio", "piccol", "can",
@ -396,17 +420,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(italian);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(it);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(it);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(ita);",
input,
&output,
);
)
.await;
}
#[test]
fn test_norwegian_stemmer() {
#[tokio::test]
async fn test_norwegian_stemmer() {
let input = "Hunder elsker å løpe i parken, men min lille hund foretrekker å sove i sengen sin heller enn å løpe.";
let output = [
"hund",
@ -436,17 +463,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(norwegian);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(no);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(no);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(nor);",
input,
&output,
);
)
.await;
}
#[test]
fn test_portuguese_stemmer() {
#[tokio::test]
async fn test_portuguese_stemmer() {
let input = "Os cães adoram correr no parque, mas o meu pequeno cão prefere dormir na sua cama em vez de correr.";
let output = [
"os", "", "ador", "corr", "no", "parqu", ",", "mas", "o", "meu", "pequen", "",
@ -456,17 +486,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(portuguese);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(pt);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(pt);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(por);",
input,
&output,
);
)
.await;
}
#[test]
fn test_romanian_stemmer() {
#[tokio::test]
async fn test_romanian_stemmer() {
let input = "Câinii adoră să alerge în parc, dar cățelul meu preferă să doarmă în coșul lui decât să alerge.";
let output = [
"câin", "ador", "", "alerg", "în", "parc", ",", "dar", "cățel", "meu", "prefer",
@ -476,17 +509,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(romanian);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ro);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ro);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(ron);",
input,
&output,
);
)
.await;
}
#[test]
fn test_russian_stemmer() {
#[tokio::test]
async fn test_russian_stemmer() {
let input = "Собаки любят бегать в парке, но моя маленькая собака предпочитает спать в своей корзине, а не бегать.";
let output = [
"собак",
@ -514,17 +550,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(russian);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ru);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ru);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(rus);",
input,
&output,
);
)
.await;
}
#[test]
fn test_spanish_stemmer() {
#[tokio::test]
async fn test_spanish_stemmer() {
let input = "Los perros aman correr en el parque, pero mi pequeño perro prefiere dormir en su cama en lugar de correr.";
let output = [
"los", "perr", "aman", "corr", "en", "el", "parqu", ",", "per", "mi", "pequeñ", "perr",
@ -534,17 +573,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(spanish);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(es);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(es);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(spa);",
input,
&output,
);
)
.await;
}
#[test]
fn test_swedish_stemmer() {
#[tokio::test]
async fn test_swedish_stemmer() {
let input = "Hundar älskar att springa i parken, men min lilla hund föredrar att sova i sin säng istället för att springa.";
let output = [
"hund",
@ -574,17 +616,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(swedish);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(sv);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(sv);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(swe);",
input,
&output,
);
)
.await;
}
#[test]
fn test_tamil_stemmer() {
#[tokio::test]
async fn test_tamil_stemmer() {
let input = "நாய்கள் பூங்காவில் ஓடுவதை விரும்புகின்றன, ஆனால் என் சிறிய நாய் அதன் படுகையில் தூங்குவதை விரும்புகின்றது, ஓட இல்லை.";
let output = [
"\u{bbe}",
@ -617,17 +662,20 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(tamil);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ta);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ta);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(tam);",
input,
&output,
);
)
.await;
}
#[test]
fn test_turkish_stemmer() {
#[tokio::test]
async fn test_turkish_stemmer() {
let input = "Köpekler parkta koşmayı sever, ama benim küçük köpeğim koşmaktansa yatağında uyumayı tercih eder.";
let output = [
"köpek", "park", "koşma", "sever", ",", "am", "be", "küçük", "köpek", "koşmak",
@ -637,30 +685,35 @@ mod tests {
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(turkish);",
input,
&output,
);
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(tr);", input, &output);
)
.await;
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(tr);", input, &output)
.await;
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(tur);",
input,
&output,
);
)
.await;
}
#[test]
fn test_ngram() {
#[tokio::test]
async fn test_ngram() {
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS lowercase,ngram(2,3);",
"Ālea iacta est",
&["āl", "āle", "le", "lea", "ia", "iac", "ac", "act", "ct", "cta", "es", "est"],
);
)
.await;
}
#[test]
fn test_edgengram() {
#[tokio::test]
async fn test_edgengram() {
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS lowercase,edgengram(2,3);",
"Ālea iacta est",
&["āl", "āle", "ia", "iac", "es", "est"],
);
)
.await;
}
}

View file

@ -1,13 +1,16 @@
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::ft::analyzer::tokenizer::{Tokenizer, Tokens};
use crate::idx::ft::doclength::DocLength;
use crate::idx::ft::offsets::{Offset, OffsetRecords};
use crate::idx::ft::postings::TermFrequency;
use crate::idx::ft::terms::{TermId, Terms};
use crate::kvs::Transaction;
use crate::sql::statements::DefineAnalyzerStatement;
use crate::sql::tokenizer::Tokenizer as SqlTokenizer;
use crate::sql::Value;
use crate::syn::path_like;
use async_recursion::async_recursion;
use filter::Filter;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
@ -15,28 +18,31 @@ use std::collections::{HashMap, HashSet};
mod filter;
mod tokenizer;
pub(super) struct Analyzer {
t: Option<Vec<SqlTokenizer>>,
f: Option<Vec<Filter>>,
pub(crate) struct Analyzer {
function: Option<String>,
tokenizers: Option<Vec<SqlTokenizer>>,
filters: Option<Vec<Filter>>,
}
impl From<DefineAnalyzerStatement> for Analyzer {
fn from(az: DefineAnalyzerStatement) -> Self {
Self {
t: az.tokenizers,
f: Filter::from(az.filters),
function: az.function.map(|i| i.0),
tokenizers: az.tokenizers,
filters: Filter::from(az.filters),
}
}
}
impl Analyzer {
pub(super) async fn extract_terms(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
t: &Terms,
tx: &mut Transaction,
query_string: String,
) -> Result<Vec<Option<TermId>>, Error> {
let tokens = self.analyze(query_string)?;
let tokens = self.generate_tokens(ctx, opt, txn, query_string).await?;
// We first collect every unique term
// as the input can contain duplicates
let mut terms = HashSet::new();
@ -45,8 +51,9 @@ impl Analyzer {
}
// Now we can extract the term ids
let mut res = Vec::with_capacity(terms.len());
let mut tx = txn.lock().await;
for term in terms {
let opt_term_id = t.get_term_id(tx, tokens.get_token_string(term)?).await?;
let opt_term_id = t.get_term_id(&mut tx, tokens.get_token_string(term)?).await?;
res.push(opt_term_id);
}
Ok(res)
@ -56,15 +63,17 @@ impl Analyzer {
/// It will create new term ids for terms that do not already exist.
pub(super) async fn extract_terms_with_frequencies(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
terms: &mut Terms,
tx: &mut Transaction,
field_content: Vec<Value>,
) -> Result<(DocLength, Vec<(TermId, TermFrequency)>), Error> {
let mut dl = 0;
// Let's first collect all the inputs, and collect the tokens.
// We need to store them because everything after is zero-copy
let mut inputs = vec![];
self.analyze_content(field_content, &mut inputs)?;
self.analyze_content(ctx, opt, txn, field_content, &mut inputs).await?;
// We then collect every unique term and count its frequency
let mut tf: HashMap<&str, TermFrequency> = HashMap::new();
for tks in &inputs {
@ -83,8 +92,9 @@ impl Analyzer {
}
// Now we can resolve the term ids
let mut tfid = Vec::with_capacity(tf.len());
let mut tx = txn.lock().await;
for (t, f) in tf {
tfid.push((terms.resolve_term_id(tx, t).await?, f));
tfid.push((terms.resolve_term_id(&mut tx, t).await?, f));
}
Ok((dl, tfid))
}
@ -93,15 +103,17 @@ impl Analyzer {
/// It will create new term ids for terms that do not already exist.
pub(super) async fn extract_terms_with_frequencies_with_offsets(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
terms: &mut Terms,
tx: &mut Transaction,
content: Vec<Value>,
) -> Result<(DocLength, Vec<(TermId, TermFrequency)>, Vec<(TermId, OffsetRecords)>), Error> {
let mut dl = 0;
// Let's first collect all the inputs, and collect the tokens.
// We need to store them because everything after is zero-copy
let mut inputs = Vec::with_capacity(content.len());
self.analyze_content(content, &mut inputs)?;
self.analyze_content(ctx, opt, txn, content, &mut inputs).await?;
// We then collect every unique term, count its frequency, and extract the offsets
let mut tfos: HashMap<&str, Vec<Offset>> = HashMap::new();
for (i, tks) in inputs.iter().enumerate() {
@ -121,34 +133,53 @@ impl Analyzer {
// Now we can resolve the term ids
let mut tfid = Vec::with_capacity(tfos.len());
let mut osid = Vec::with_capacity(tfos.len());
let mut tx = txn.lock().await;
for (t, o) in tfos {
let id = terms.resolve_term_id(tx, t).await?;
let id = terms.resolve_term_id(&mut tx, t).await?;
tfid.push((id, o.len() as TermFrequency));
osid.push((id, OffsetRecords(o)));
}
Ok((dl, tfid, osid))
}
fn analyze_content(&self, content: Vec<Value>, tks: &mut Vec<Tokens>) -> Result<(), Error> {
#[cfg_attr(not(target_arch = "wasm32"), async_recursion)]
#[cfg_attr(target_arch = "wasm32", async_recursion(?Send))]
async fn analyze_content(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
content: Vec<Value>,
tks: &mut Vec<Tokens>,
) -> Result<(), Error> {
for v in content {
self.analyze_value(v, tks)?;
self.analyze_value(ctx, opt, txn, v, tks).await?;
}
Ok(())
}
fn analyze_value(&self, val: Value, tks: &mut Vec<Tokens>) -> Result<(), Error> {
#[cfg_attr(not(target_arch = "wasm32"), async_recursion)]
#[cfg_attr(target_arch = "wasm32", async_recursion(?Send))]
async fn analyze_value(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
val: Value,
tks: &mut Vec<Tokens>,
) -> Result<(), Error> {
match val {
Value::Strand(s) => tks.push(self.analyze(s.0)?),
Value::Number(n) => tks.push(self.analyze(n.to_string())?),
Value::Bool(b) => tks.push(self.analyze(b.to_string())?),
Value::Strand(s) => tks.push(self.generate_tokens(ctx, opt, txn, s.0).await?),
Value::Number(n) => tks.push(self.generate_tokens(ctx, opt, txn, n.to_string()).await?),
Value::Bool(b) => tks.push(self.generate_tokens(ctx, opt, txn, b.to_string()).await?),
Value::Array(a) => {
for v in a.0 {
self.analyze_value(v, tks)?;
self.analyze_value(ctx, opt, txn, v, tks).await?;
}
}
Value::Object(o) => {
for (_, v) in o.0 {
self.analyze_value(v, tks)?;
self.analyze_value(ctx, opt, txn, v, tks).await?;
}
}
_ => {}
@ -156,33 +187,84 @@ impl Analyzer {
Ok(())
}
fn analyze(&self, input: String) -> Result<Tokens, Error> {
if let Some(t) = &self.t {
async fn generate_tokens(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
mut input: String,
) -> Result<Tokens, Error> {
if let Some(function_name) = &self.function {
let fns = format!("fn::{function_name}(\"{input}\")");
match path_like(&fns) {
Ok(func_value) => {
let val = func_value.compute(ctx, opt, txn, None).await?;
if let Value::Strand(val) = val {
input = val.0;
} else {
return Err(Error::InvalidFunction {
name: function_name.to_string(),
message: "The function should return a string.".to_string(),
});
}
}
Err(e) => {
return Err(Error::InvalidFunction {
name: function_name.to_string(),
message: e.to_string(),
})
}
}
}
if let Some(t) = &self.tokenizers {
if !input.is_empty() {
let t = Tokenizer::tokenize(t, input);
return Filter::apply_filters(t, &self.f);
return Filter::apply_filters(t, &self.filters);
}
}
Ok(Tokens::new(input))
}
/// Used for exposing the analyzer as the native function `search::analyze`
pub(crate) async fn analyze(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
input: String,
) -> Result<Value, Error> {
self.generate_tokens(ctx, opt, txn, input).await?.try_into()
}
}
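The contract implied by generate_tokens: when an analyzer has a FUNCTION clause, the input is passed to fn::<name>("<input>") and the call must return a string, which is then fed to the tokenizers and filters; any other return type surfaces as an InvalidFunction error. A minimal sketch with placeholder names:

DEFINE FUNCTION fn::normalize($input: string) {
    -- must return a string, otherwise analysis fails with InvalidFunction
    RETURN string::lowercase(string::trim($input));
};
DEFINE ANALYZER normalized FUNCTION fn::normalize TOKENIZERS blank;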
#[cfg(test)]
mod tests {
use super::Analyzer;
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::kvs::{Datastore, LockType, TransactionType};
use crate::{
sql::{statements::DefineStatement, Statement},
syn,
};
use futures::lock::Mutex;
use std::sync::Arc;
pub(super) async fn test_analyzer(def: &str, input: &str, expected: &[&str]) {
let ds = Datastore::new("memory").await.unwrap();
let tx = ds.transaction(TransactionType::Read, LockType::Optimistic).await.unwrap();
let txn: Transaction = Arc::new(Mutex::new(tx));
pub(super) fn test_analyzer(def: &str, input: &str, expected: &[&str]) {
let mut stmt = syn::parse(&format!("DEFINE {def}")).unwrap();
let Some(Statement::Define(DefineStatement::Analyzer(az))) = stmt.0 .0.pop() else {
panic!()
};
let a: Analyzer = az.into();
let tokens = a.analyze(input.to_string()).unwrap();
let tokens = a
.generate_tokens(&Context::default(), &Options::default(), &txn, input.to_string())
.await
.unwrap();
let mut res = vec![];
for t in tokens.list() {
res.push(tokens.get_token_string(t).unwrap());

View file

@ -1,7 +1,9 @@
use crate::err;
use crate::err::Error;
use crate::idx::ft::analyzer::filter::{Filter, FilterResult, Term};
use crate::idx::ft::offsets::{Offset, Position};
use crate::sql::tokenizer::Tokenizer as SqlTokenizer;
use crate::sql::Value;
pub(super) struct Tokens {
/// The input string
@ -67,6 +69,18 @@ impl Tokens {
}
}
impl TryFrom<Tokens> for Value {
type Error = err::Error;
fn try_from(tokens: Tokens) -> Result<Self, Error> {
let mut vec: Vec<Value> = Vec::with_capacity(tokens.t.len());
for token in tokens.t {
vec.push(token.get_str(&tokens.i)?.into())
}
Ok(vec.into())
}
}
#[derive(Clone, Debug, PartialOrd, PartialEq, Eq, Ord, Hash)]
pub(super) enum Token {
Ref {
@ -310,8 +324,8 @@ impl Splitter {
mod tests {
use crate::idx::ft::analyzer::tests::test_analyzer;
#[test]
fn test_tokenize_blank_class() {
#[tokio::test]
async fn test_tokenize_blank_class() {
test_analyzer(
"ANALYZER test TOKENIZERS blank,class FILTERS lowercase",
"Abc12345xYZ DL1809 item123456 978-3-16-148410-0 1HGCM82633A123456",
@ -319,11 +333,12 @@ mod tests {
"abc", "12345", "xyz", "dl", "1809", "item", "123456", "978", "-", "3", "-", "16",
"-", "148410", "-", "0", "1", "hgcm", "82633", "a", "123456",
],
);
)
.await;
}
#[test]
fn test_tokenize_source_code() {
#[tokio::test]
async fn test_tokenize_source_code() {
test_analyzer(
"ANALYZER test TOKENIZERS blank,class,camel,punct FILTERS lowercase",
r#"struct MyRectangle {
@ -366,6 +381,7 @@ static LANGUAGE: &str = "Rust";"#,
"\"",
";",
],
);
)
.await;
}
}

View file

@ -7,6 +7,8 @@ pub(super) mod scorer;
pub(super) mod termdocs;
pub(crate) mod terms;
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::docids::{DocId, DocIds};
use crate::idx::ft::analyzer::Analyzer;
@ -20,7 +22,8 @@ use crate::idx::ft::terms::{TermId, Terms};
use crate::idx::trees::btree::BStatistics;
use crate::idx::trees::store::TreeStoreType;
use crate::idx::{IndexKeyBase, VersionedSerdeState};
use crate::kvs::{Key, Transaction};
use crate::kvs;
use crate::kvs::Key;
use crate::sql::index::SearchParams;
use crate::sql::scoring::Scoring;
use crate::sql::statements::DefineAnalyzerStatement;
@ -94,29 +97,42 @@ impl VersionedSerdeState for State {}
impl FtIndex {
pub(crate) async fn new(
tx: &mut Transaction,
opt: &Options,
txn: &Transaction,
az: &str,
index_key_base: IndexKeyBase,
p: &SearchParams,
store_type: TreeStoreType,
) -> Result<Self, Error> {
let mut tx = txn.lock().await;
let az = tx.get_db_analyzer(opt.ns(), opt.db(), az).await?;
Self::with_analyzer(&mut tx, az, index_key_base, p, store_type).await
}
async fn with_analyzer(
run: &mut kvs::Transaction,
az: DefineAnalyzerStatement,
index_key_base: IndexKeyBase,
p: &SearchParams,
store_type: TreeStoreType,
) -> Result<Self, Error> {
let state_key: Key = index_key_base.new_bs_key();
let state: State = if let Some(val) = tx.get(state_key.clone()).await? {
let state: State = if let Some(val) = run.get(state_key.clone()).await? {
State::try_from_val(val)?
} else {
State::default()
};
let doc_ids = Arc::new(RwLock::new(
DocIds::new(tx, index_key_base.clone(), p.doc_ids_order, store_type).await?,
DocIds::new(run, index_key_base.clone(), p.doc_ids_order, store_type).await?,
));
let doc_lengths = Arc::new(RwLock::new(
DocLengths::new(tx, index_key_base.clone(), p.doc_lengths_order, store_type).await?,
DocLengths::new(run, index_key_base.clone(), p.doc_lengths_order, store_type).await?,
));
let postings = Arc::new(RwLock::new(
Postings::new(tx, index_key_base.clone(), p.postings_order, store_type).await?,
Postings::new(run, index_key_base.clone(), p.postings_order, store_type).await?,
));
let terms = Arc::new(RwLock::new(
Terms::new(tx, index_key_base.clone(), p.terms_order, store_type).await?,
Terms::new(run, index_key_base.clone(), p.terms_order, store_type).await?,
));
let termdocs = TermDocs::new(index_key_base.clone());
let offsets = Offsets::new(index_key_base.clone());
@ -153,16 +169,17 @@ impl FtIndex {
pub(crate) async fn remove_document(
&mut self,
tx: &mut Transaction,
txn: &Transaction,
rid: &Thing,
) -> Result<(), Error> {
let mut tx = txn.lock().await;
// Extract and remove the doc_id (if any)
if let Some(doc_id) = self.doc_ids.write().await.remove_doc(tx, rid.into()).await? {
if let Some(doc_id) = self.doc_ids.write().await.remove_doc(&mut tx, rid.into()).await? {
self.state.doc_count -= 1;
// Remove the doc length
if let Some(doc_lengths) =
self.doc_lengths.write().await.remove_doc_length(tx, doc_id).await?
self.doc_lengths.write().await.remove_doc_length(&mut tx, doc_id).await?
{
self.state.total_docs_lengths -= doc_lengths as u128;
}
@ -174,18 +191,18 @@ impl FtIndex {
let mut p = self.postings.write().await;
let mut t = self.terms.write().await;
for term_id in &term_list {
p.remove_posting(tx, term_id, doc_id).await?;
p.remove_posting(&mut tx, term_id, doc_id).await?;
// if the term is not present in any document in the index, we can remove it
let doc_count = self.term_docs.remove_doc(tx, term_id, doc_id).await?;
let doc_count = self.term_docs.remove_doc(&mut tx, term_id, doc_id).await?;
if doc_count == 0 {
t.remove_term_id(tx, term_id).await?;
t.remove_term_id(&mut tx, term_id).await?;
}
}
// Remove the offsets if any
if self.highlighting {
for term_id in term_list {
// TODO?: Removal can be done with a prefix on doc_id
self.offsets.remove_offsets(tx, doc_id, term_id).await?;
self.offsets.remove_offsets(&mut tx, doc_id, term_id).await?;
}
}
}
@ -195,36 +212,43 @@ impl FtIndex {
pub(crate) async fn index_document(
&mut self,
tx: &mut Transaction,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
rid: &Thing,
content: Vec<Value>,
) -> Result<(), Error> {
// Resolve the doc_id
let resolved = self.doc_ids.write().await.resolve_doc_id(tx, rid.into()).await?;
let mut tx = txn.lock().await;
let resolved = self.doc_ids.write().await.resolve_doc_id(&mut tx, rid.into()).await?;
let doc_id = *resolved.doc_id();
drop(tx);
// Extract the doc_lengths, terms and frequencies (and offsets)
let mut t = self.terms.write().await;
let (doc_length, terms_and_frequencies, offsets) = if self.highlighting {
let (dl, tf, ofs) = self
.analyzer
.extract_terms_with_frequencies_with_offsets(&mut t, tx, content)
.extract_terms_with_frequencies_with_offsets(ctx, opt, txn, &mut t, content)
.await?;
(dl, tf, Some(ofs))
} else {
let (dl, tf) =
self.analyzer.extract_terms_with_frequencies(&mut t, tx, content).await?;
let (dl, tf) = self
.analyzer
.extract_terms_with_frequencies(ctx, opt, txn, &mut t, content)
.await?;
(dl, tf, None)
};
// Set the doc length
let mut tx = txn.lock().await;
let mut dl = self.doc_lengths.write().await;
if resolved.was_existing() {
if let Some(old_doc_length) = dl.get_doc_length(tx, doc_id).await? {
if let Some(old_doc_length) = dl.get_doc_length(&mut tx, doc_id).await? {
self.state.total_docs_lengths -= old_doc_length as u128;
}
}
dl.set_doc_length(tx, doc_id, doc_length).await?;
dl.set_doc_length(&mut tx, doc_id, doc_length).await?;
// Retrieve the existing terms for this document (if any)
let term_ids_key = self.index_key_base.new_bk_key(doc_id);
@ -238,22 +262,22 @@ impl FtIndex {
let mut terms_ids = RoaringTreemap::default();
let mut p = self.postings.write().await;
for (term_id, term_freq) in terms_and_frequencies {
p.update_posting(tx, term_id, doc_id, term_freq).await?;
p.update_posting(&mut tx, term_id, doc_id, term_freq).await?;
if let Some(old_term_ids) = &mut old_term_ids {
old_term_ids.remove(term_id);
}
self.term_docs.set_doc(tx, term_id, doc_id).await?;
self.term_docs.set_doc(&mut tx, term_id, doc_id).await?;
terms_ids.insert(term_id);
}
// Remove any remaining postings
if let Some(old_term_ids) = &old_term_ids {
for old_term_id in old_term_ids {
p.remove_posting(tx, old_term_id, doc_id).await?;
let doc_count = self.term_docs.remove_doc(tx, old_term_id, doc_id).await?;
p.remove_posting(&mut tx, old_term_id, doc_id).await?;
let doc_count = self.term_docs.remove_doc(&mut tx, old_term_id, doc_id).await?;
// if the term does not have any more postings, we can remove the term
if doc_count == 0 {
t.remove_term_id(tx, old_term_id).await?;
t.remove_term_id(&mut tx, old_term_id).await?;
}
}
}
@ -263,14 +287,14 @@ impl FtIndex {
if let Some(ofs) = offsets {
if !ofs.is_empty() {
for (tid, or) in ofs {
self.offsets.set_offsets(tx, doc_id, tid, or).await?;
self.offsets.set_offsets(&mut tx, doc_id, tid, or).await?;
}
}
}
// In case of an update, we remove the offsets for terms that no longer exist
if let Some(old_term_ids) = old_term_ids {
for old_term_id in old_term_ids {
self.offsets.remove_offsets(tx, doc_id, old_term_id).await?;
self.offsets.remove_offsets(&mut tx, doc_id, old_term_id).await?;
}
}
}
@ -293,17 +317,19 @@ impl FtIndex {
pub(super) async fn extract_terms(
&self,
tx: &mut Transaction,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
query_string: String,
) -> Result<Vec<Option<TermId>>, Error> {
let t = self.terms.read().await;
let terms = self.analyzer.extract_terms(&t, tx, query_string).await?;
let terms = self.analyzer.extract_terms(ctx, opt, txn, &t, query_string).await?;
Ok(terms)
}
pub(super) async fn get_terms_docs(
&self,
tx: &mut Transaction,
tx: &mut kvs::Transaction,
terms: &Vec<Option<TermId>>,
) -> Result<Vec<Option<(TermId, RoaringTreemap)>>, Error> {
let mut terms_docs = Vec::with_capacity(terms.len());
@ -363,7 +389,7 @@ impl FtIndex {
#[allow(clippy::too_many_arguments)]
pub(super) async fn highlight(
&self,
tx: &mut Transaction,
tx: &mut kvs::Transaction,
thg: &Thing,
terms: &[Option<TermId>],
prefix: Value,
@ -387,7 +413,7 @@ impl FtIndex {
pub(super) async fn extract_offsets(
&self,
tx: &mut Transaction,
tx: &mut kvs::Transaction,
thg: &Thing,
terms: &[Option<TermId>],
) -> Result<Value, Error> {
@ -405,21 +431,23 @@ impl FtIndex {
Ok(Value::None)
}
pub(crate) async fn statistics(&self, tx: &mut Transaction) -> Result<FtStatistics, Error> {
pub(crate) async fn statistics(&self, txn: &Transaction) -> Result<FtStatistics, Error> {
// TODO do parallel execution
let mut run = txn.lock().await;
Ok(FtStatistics {
doc_ids: self.doc_ids.read().await.statistics(tx).await?,
terms: self.terms.read().await.statistics(tx).await?,
doc_lengths: self.doc_lengths.read().await.statistics(tx).await?,
postings: self.postings.read().await.statistics(tx).await?,
doc_ids: self.doc_ids.read().await.statistics(&mut run).await?,
terms: self.terms.read().await.statistics(&mut run).await?,
doc_lengths: self.doc_lengths.read().await.statistics(&mut run).await?,
postings: self.postings.read().await.statistics(&mut run).await?,
})
}
pub(crate) async fn finish(self, tx: &mut Transaction) -> Result<(), Error> {
self.doc_ids.write().await.finish(tx).await?;
self.doc_lengths.write().await.finish(tx).await?;
self.postings.write().await.finish(tx).await?;
self.terms.write().await.finish(tx).await?;
pub(crate) async fn finish(self, tx: &Transaction) -> Result<(), Error> {
let mut run = tx.lock().await;
self.doc_ids.write().await.finish(&mut run).await?;
self.doc_lengths.write().await.finish(&mut run).await?;
self.postings.write().await.finish(&mut run).await?;
self.terms.write().await.finish(&mut run).await?;
Ok(())
}
}
@ -439,7 +467,7 @@ impl HitsIterator {
pub(crate) async fn next(
&mut self,
tx: &mut Transaction,
tx: &mut kvs::Transaction,
) -> Result<Option<(Thing, DocId)>, Error> {
for doc_id in self.iter.by_ref() {
if let Some(doc_key) = self.doc_ids.read().await.get_doc_key(tx, doc_id).await? {
@ -452,30 +480,34 @@ impl HitsIterator {
#[cfg(test)]
mod tests {
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::idx::ft::scorer::{BM25Scorer, Score};
use crate::idx::ft::{FtIndex, HitsIterator};
use crate::idx::trees::store::TreeStoreType;
use crate::idx::IndexKeyBase;
use crate::kvs::{Datastore, LockType::*, Transaction};
use crate::kvs::{Datastore, LockType::*};
use crate::sql::index::SearchParams;
use crate::sql::scoring::Scoring;
use crate::sql::statements::{DefineAnalyzerStatement, DefineStatement};
use crate::sql::{Statement, Thing, Value};
use crate::syn;
use futures::lock::Mutex;
use std::collections::HashMap;
use std::sync::Arc;
use test_log::test;
async fn check_hits(
tx: &mut Transaction,
txn: &Transaction,
hits: Option<HitsIterator>,
scr: BM25Scorer,
e: Vec<(&Thing, Option<Score>)>,
) {
let mut tx = txn.lock().await;
if let Some(mut hits) = hits {
let mut map = HashMap::new();
while let Some((k, d)) = hits.next(tx).await.unwrap() {
let s = scr.score(tx, d).await.unwrap();
while let Some((k, d)) = hits.next(&mut tx).await.unwrap() {
let s = scr.score(&mut tx, d).await.unwrap();
map.insert(k, s);
}
assert_eq!(map.len(), e.len());
@ -488,27 +520,33 @@ mod tests {
}
async fn search(
tx: &mut Transaction,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
fti: &FtIndex,
qs: &str,
) -> (Option<HitsIterator>, BM25Scorer) {
let t = fti.extract_terms(tx, qs.to_string()).await.unwrap();
let td = Arc::new(fti.get_terms_docs(tx, &t).await.unwrap());
let t = fti.extract_terms(ctx, opt, txn, qs.to_string()).await.unwrap();
let mut tx = txn.lock().await;
let td = Arc::new(fti.get_terms_docs(&mut tx, &t).await.unwrap());
drop(tx);
let scr = fti.new_scorer(td.clone()).unwrap().unwrap();
let hits = fti.new_hits_iterator(td).unwrap();
(hits, scr)
}
pub(super) async fn tx_fti(
pub(super) async fn tx_fti<'a>(
ds: &Datastore,
store_type: TreeStoreType,
az: &DefineAnalyzerStatement,
order: u32,
hl: bool,
) -> (Transaction, FtIndex) {
) -> (Context<'a>, Options, Transaction, FtIndex) {
let write = matches!(store_type, TreeStoreType::Write);
let mut tx = ds.transaction(write.into(), Optimistic).await.unwrap();
let fti = FtIndex::new(
let tx = ds.transaction(write.into(), Optimistic).await.unwrap();
let txn = Arc::new(Mutex::new(tx));
let mut tx = txn.lock().await;
let fti = FtIndex::with_analyzer(
&mut tx,
az.clone(),
IndexKeyBase::default(),
@ -525,12 +563,13 @@ mod tests {
)
.await
.unwrap();
(tx, fti)
drop(tx);
(Context::default(), Options::default(), txn, fti)
}
pub(super) async fn finish(mut tx: Transaction, fti: FtIndex) {
fti.finish(&mut tx).await.unwrap();
tx.commit().await.unwrap();
pub(super) async fn finish(txn: &Transaction, fti: FtIndex) {
fti.finish(txn).await.unwrap();
txn.lock().await.commit().await.unwrap();
}
#[test(tokio::test)]
@ -549,93 +588,99 @@ mod tests {
{
// Add one document
let (mut tx, mut fti) =
let (ctx, opt, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc1, vec![Value::from("hello the world")]).await.unwrap();
finish(tx, fti).await;
fti.index_document(&ctx, &opt, &txn, &doc1, vec![Value::from("hello the world")])
.await
.unwrap();
finish(&txn, fti).await;
}
{
// Add two documents
let (mut tx, mut fti) =
let (ctx, opt, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc2, vec![Value::from("a yellow hello")]).await.unwrap();
fti.index_document(&mut tx, &doc3, vec![Value::from("foo bar")]).await.unwrap();
finish(tx, fti).await;
fti.index_document(&ctx, &opt, &txn, &doc2, vec![Value::from("a yellow hello")])
.await
.unwrap();
fti.index_document(&ctx, &opt, &txn, &doc3, vec![Value::from("foo bar")])
.await
.unwrap();
finish(&txn, fti).await;
}
{
let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
let (ctx, opt, txn, fti) =
tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
// Check the statistics
let statistics = fti.statistics(&mut tx).await.unwrap();
let statistics = fti.statistics(&txn).await.unwrap();
assert_eq!(statistics.terms.keys_count, 7);
assert_eq!(statistics.postings.keys_count, 8);
assert_eq!(statistics.doc_ids.keys_count, 3);
assert_eq!(statistics.doc_lengths.keys_count, 3);
// Search & score
let (hits, scr) = search(&mut tx, &fti, "hello").await;
check_hits(
&mut tx,
hits,
scr,
vec![(&doc1, Some(-0.4859746)), (&doc2, Some(-0.4859746))],
)
.await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "hello").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(-0.4859746)), (&doc2, Some(-0.4859746))])
.await;
let (hits, scr) = search(&mut tx, &fti, "world").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.4859746))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "world").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.4859746))]).await;
let (hits, scr) = search(&mut tx, &fti, "yellow").await;
check_hits(&mut tx, hits, scr, vec![(&doc2, Some(0.4859746))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "yellow").await;
check_hits(&txn, hits, scr, vec![(&doc2, Some(0.4859746))]).await;
let (hits, scr) = search(&mut tx, &fti, "foo").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "foo").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, scr) = search(&mut tx, &fti, "bar").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "bar").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, _) = search(&mut tx, &fti, "dummy").await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "dummy").await;
assert!(hits.is_none());
}
{
// Reindex one document
let (mut tx, mut fti) =
let (ctx, opt, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc3, vec![Value::from("nobar foo")]).await.unwrap();
finish(tx, fti).await;
fti.index_document(&ctx, &opt, &txn, &doc3, vec![Value::from("nobar foo")])
.await
.unwrap();
finish(&txn, fti).await;
let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
let (ctx, opt, txn, fti) =
tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
// We can still find 'foo'
let (hits, scr) = search(&mut tx, &fti, "foo").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "foo").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
// We can't anymore find 'bar'
let (hits, _) = search(&mut tx, &fti, "bar").await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "bar").await;
assert!(hits.is_none());
// We can now find 'nobar'
let (hits, scr) = search(&mut tx, &fti, "nobar").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "nobar").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
}
{
// Remove documents
let (mut tx, mut fti) =
let (_, _, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.remove_document(&mut tx, &doc1).await.unwrap();
fti.remove_document(&mut tx, &doc2).await.unwrap();
fti.remove_document(&mut tx, &doc3).await.unwrap();
finish(tx, fti).await;
fti.remove_document(&txn, &doc1).await.unwrap();
fti.remove_document(&txn, &doc2).await.unwrap();
fti.remove_document(&txn, &doc3).await.unwrap();
finish(&txn, fti).await;
}
{
let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
let (hits, _) = search(&mut tx, &fti, "hello").await;
let (ctx, opt, txn, fti) =
tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "hello").await;
assert!(hits.is_none());
let (hits, _) = search(&mut tx, &fti, "foo").await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "foo").await;
assert!(hits.is_none());
}
}
@ -659,51 +704,60 @@ mod tests {
let btree_order = 5;
{
let (mut tx, mut fti) =
let (ctx, opt, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, hl).await;
fti.index_document(
&mut tx,
&ctx,
&opt,
&txn,
&doc1,
vec![Value::from("the quick brown fox jumped over the lazy dog")],
)
.await
.unwrap();
fti.index_document(
&mut tx,
&ctx,
&opt,
&txn,
&doc2,
vec![Value::from("the fast fox jumped over the lazy dog")],
)
.await
.unwrap();
fti.index_document(
&mut tx,
&ctx,
&opt,
&txn,
&doc3,
vec![Value::from("the dog sat there and did nothing")],
)
.await
.unwrap();
fti.index_document(
&mut tx,
&ctx,
&opt,
&txn,
&doc4,
vec![Value::from("the other animals sat there watching")],
)
.await
.unwrap();
finish(tx, fti).await;
finish(&txn, fti).await;
}
{
let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, hl).await;
let (ctx, opt, txn, fti) =
tx_fti(&ds, TreeStoreType::Read, &az, btree_order, hl).await;
let statistics = fti.statistics(&mut tx).await.unwrap();
let statistics = fti.statistics(&txn).await.unwrap();
assert_eq!(statistics.terms.keys_count, 17);
assert_eq!(statistics.postings.keys_count, 28);
assert_eq!(statistics.doc_ids.keys_count, 4);
assert_eq!(statistics.doc_lengths.keys_count, 4);
let (hits, scr) = search(&mut tx, &fti, "the").await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "the").await;
check_hits(
&mut tx,
&txn,
hits,
scr,
vec![
@ -715,9 +769,9 @@ mod tests {
)
.await;
let (hits, scr) = search(&mut tx, &fti, "dog").await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "dog").await;
check_hits(
&mut tx,
&txn,
hits,
scr,
vec![
@ -728,25 +782,25 @@ mod tests {
)
.await;
let (hits, scr) = search(&mut tx, &fti, "fox").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "fox").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&mut tx, &fti, "over").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "over").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&mut tx, &fti, "lazy").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "lazy").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&mut tx, &fti, "jumped").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "jumped").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&mut tx, &fti, "nothing").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.87105393))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "nothing").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.87105393))]).await;
let (hits, scr) = search(&mut tx, &fti, "animals").await;
check_hits(&mut tx, hits, scr, vec![(&doc4, Some(0.92279965))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "animals").await;
check_hits(&txn, hits, scr, vec![(&doc4, Some(0.92279965))]).await;
let (hits, _) = search(&mut tx, &fti, "dummy").await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "dummy").await;
assert!(hits.is_none());
}
}

View file

@ -1,3 +1,4 @@
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::docids::{DocId, DocIds};
@ -58,12 +59,12 @@ impl IteratorEntry {
}
impl QueryExecutor {
pub(super) async fn new(
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
table: &Table,
im: IndexesMap,
) -> Result<Self, Error> {
let mut run = txn.lock().await;
let mut mr_entries = HashMap::default();
let mut exp_entries = HashMap::default();
let mut ft_map = HashMap::default();
@ -80,15 +81,15 @@ impl QueryExecutor {
let mut ft_entry = None;
if let Some(ft) = ft_map.get(&ix_ref) {
if ft_entry.is_none() {
ft_entry = FtEntry::new(&mut run, ft, io).await?;
ft_entry = FtEntry::new(ctx, opt, txn, ft, io).await?;
}
} else {
let ikb = IndexKeyBase::new(opt, idx_def);
let az = run.get_db_analyzer(opt.ns(), opt.db(), p.az.as_str()).await?;
let ft =
FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Read).await?;
FtIndex::new(opt, txn, p.az.as_str(), ikb, p, TreeStoreType::Read)
.await?;
if ft_entry.is_none() {
ft_entry = FtEntry::new(&mut run, &ft, io).await?;
ft_entry = FtEntry::new(ctx, opt, txn, &ft, io).await?;
}
ft_map.insert(ix_ref, ft);
}
@ -105,13 +106,14 @@ impl QueryExecutor {
}
Index::MTree(p) => {
if let IndexOperator::Knn(a, k) = io.op() {
let mut tx = txn.lock().await;
let entry = if let Some(mt) = mt_map.get(&ix_ref) {
MtEntry::new(&mut run, mt, a.clone(), *k).await?
MtEntry::new(&mut tx, mt, a.clone(), *k).await?
} else {
let ikb = IndexKeyBase::new(opt, idx_def);
let mt =
MTreeIndex::new(&mut run, ikb, p, TreeStoreType::Read).await?;
let entry = MtEntry::new(&mut run, &mt, a.clone(), *k).await?;
MTreeIndex::new(&mut tx, ikb, p, TreeStoreType::Read).await?;
let entry = MtEntry::new(&mut tx, &mt, a.clone(), *k).await?;
mt_map.insert(ix_ref, mt);
entry
};
@ -437,13 +439,16 @@ struct Inner {
impl FtEntry {
async fn new(
tx: &mut kvs::Transaction,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
ft: &FtIndex,
io: IndexOption,
) -> Result<Option<Self>, Error> {
if let Matches(qs, _) = io.op() {
let terms = ft.extract_terms(tx, qs.to_owned()).await?;
let terms_docs = Arc::new(ft.get_terms_docs(tx, &terms).await?);
let terms = ft.extract_terms(ctx, opt, txn, qs.to_owned()).await?;
let mut tx = txn.lock().await;
let terms_docs = Arc::new(ft.get_terms_docs(&mut tx, &terms).await?);
Ok(Some(Self(Arc::new(Inner {
index_option: io,
doc_ids: ft.doc_ids(),

View file

@ -44,7 +44,7 @@ impl<'a> QueryPlanner<'a> {
) -> Result<(), Error> {
match Tree::build(ctx, self.opt, txn, &t, self.cond, self.with).await? {
Some((node, im, with_indexes)) => {
let mut exe = QueryExecutor::new(self.opt, txn, &t, im).await?;
let mut exe = QueryExecutor::new(ctx, self.opt, txn, &t, im).await?;
match PlanBuilder::build(node, self.with, with_indexes)? {
Plan::SingleIndex(exp, io) => {
if io.require_distinct() {

View file

@ -1,3 +1,5 @@
use lru::LruCache;
use once_cell::sync::Lazy;
use revision::revisioned;
use serde::{
de::{self, Visitor},
@ -7,8 +9,10 @@ use std::cmp::Ordering;
use std::fmt::Debug;
use std::fmt::{self, Display, Formatter};
use std::hash::{Hash, Hasher};
use std::str;
use std::num::NonZeroUsize;
use std::str::FromStr;
use std::sync::Mutex;
use std::{env, str};
pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Regex";
@ -23,6 +27,25 @@ impl Regex {
}
}
fn regex_new(str: &str) -> Result<regex::Regex, regex::Error> {
static REGEX_CACHE: Lazy<Mutex<LruCache<String, regex::Regex>>> = Lazy::new(|| {
let cache_size: usize = env::var("SURREAL_REGEX_CACHE_SIZE")
.map_or(1000, |v| v.parse().unwrap_or(1000))
.max(10); // The minimum cache size is 10
Mutex::new(LruCache::new(NonZeroUsize::new(cache_size).unwrap()))
});
let mut cache = match REGEX_CACHE.lock() {
Ok(guard) => guard,
Err(poisoned) => poisoned.into_inner(),
};
if let Some(re) = cache.get(str) {
return Ok(re.clone());
}
let re = regex::Regex::new(str)?;
cache.put(str.to_owned(), re.clone());
Ok(re)
}
impl FromStr for Regex {
type Err = <regex::Regex as FromStr>::Err;
@ -30,7 +53,7 @@ impl FromStr for Regex {
if s.contains('\0') {
Err(regex::Error::Syntax("regex contained NUL byte".to_owned()))
} else {
regex::Regex::new(&s.replace("\\/", "/")).map(Self)
regex_new(&s.replace("\\/", "/")).map(Self)
}
}
}
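Note on the change above: compiled regexes are now memoised in a process-wide LRU cache whose capacity is read once from the SURREAL_REGEX_CACHE_SIZE environment variable (default 1000, clamped to a minimum of 10). The following is a minimal crate-internal sketch of how the cache is exercised through the FromStr impl; the Display-based comparison is an assumption added for illustration only.

use std::str::FromStr;

// Minimal sketch, assuming the crate-internal `crate::sql::Regex` shown above.
fn regex_cache_demo() -> Result<(), regex::Error> {
    // The cache size is read once, on first use; values below 10 are clamped to 10.
    std::env::set_var("SURREAL_REGEX_CACHE_SIZE", "250");
    // First parse compiles "<[^>]*>" and stores the compiled regex in the cache.
    let first = crate::sql::Regex::from_str("<[^>]*>")?;
    // A second parse of the same pattern is served from the cache as a cheap clone.
    let second = crate::sql::Regex::from_str("<[^>]*>")?;
    // Comparing the Display form (assumed to render the pattern) for illustration.
    assert_eq!(first.to_string(), second.to_string());
    Ok(())
}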

View file

@ -37,10 +37,10 @@ impl AnalyzeStatement {
AnalyzeStatement::Idx(tb, idx) => {
// Allowed to run?
opt.is_allowed(Action::View, ResourceKind::Index, &Base::Db)?;
// Claim transaction
let mut run = txn.lock().await;
// Read the index
let ix = run
let ix = txn
.lock()
.await
.get_and_cache_tb_index(opt.ns(), opt.db(), tb.as_str(), idx.as_str())
.await?;
let ikb = IndexKeyBase::new(opt, &ix);
@ -48,15 +48,15 @@ impl AnalyzeStatement {
// Index operation dispatching
let value: Value = match &ix.index {
Index::Search(p) => {
let az = run.get_db_analyzer(opt.ns(), opt.db(), p.az.as_str()).await?;
let ft =
FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Traversal).await?;
ft.statistics(&mut run).await?.into()
FtIndex::new(opt, txn, p.az.as_str(), ikb, p, TreeStoreType::Traversal)
.await?;
ft.statistics(txn).await?.into()
}
Index::MTree(p) => {
let mt =
MTreeIndex::new(&mut run, ikb, p, TreeStoreType::Traversal).await?;
mt.statistics(&mut run).await?.into()
let mut tx = txn.lock().await;
let mt = MTreeIndex::new(&mut tx, ikb, p, TreeStoreType::Traversal).await?;
mt.statistics(&mut tx).await?.into()
}
_ => {
return Err(Error::FeatureNotYetImplemented {

View file

@ -10,9 +10,11 @@ use serde::{Deserialize, Serialize};
use std::fmt::{self, Display};
#[derive(Clone, Debug, Default, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Store, Hash)]
#[revisioned(revision = 1)]
#[revisioned(revision = 2)]
pub struct DefineAnalyzerStatement {
pub name: Ident,
#[revision(start = 2)]
pub function: Option<Ident>,
pub tokenizers: Option<Vec<Tokenizer>>,
pub filters: Option<Vec<Filter>>,
pub comment: Option<Strand>,
@ -47,6 +49,9 @@ impl DefineAnalyzerStatement {
impl Display for DefineAnalyzerStatement {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "DEFINE ANALYZER {}", self.name)?;
if let Some(ref i) = self.function {
write!(f, " FUNCTION fn::{i}")?
}
if let Some(v) = &self.tokenizers {
let tokens: Vec<String> = v.iter().map(|f| f.to_string()).collect();
write!(f, " TOKENIZERS {}", tokens.join(","))?;
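With the new optional field, the Display impl emits the FUNCTION clause directly after the analyzer name. A minimal sketch of the expected rendering, assuming the remaining optional fields stay at their None defaults and constructing Ident directly as elsewhere in this diff:

// Minimal sketch using the crate-internal types from this file.
use crate::sql::statements::DefineAnalyzerStatement;
use crate::sql::Ident;

fn display_demo() {
    let stmt = DefineAnalyzerStatement {
        name: Ident("htmlAnalyzer".to_owned()),
        function: Some(Ident("stripHtml".to_owned())),
        ..Default::default()
    };
    // Tokenizers, filters and comment are None, so only the FUNCTION clause follows.
    assert_eq!(stmt.to_string(), "DEFINE ANALYZER htmlAnalyzer FUNCTION fn::stripHtml");
}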

View file

@ -39,6 +39,7 @@ impl ser::Serializer for Serializer {
#[derive(Default)]
pub struct SerializeDefineAnalyzerStatement {
name: Ident,
function: Option<Strand>,
tokenizers: Option<Vec<Tokenizer>>,
filters: Option<Vec<Filter>>,
comment: Option<Strand>,
@ -56,6 +57,9 @@ impl serde::ser::SerializeStruct for SerializeDefineAnalyzerStatement {
"name" => {
self.name = Ident(value.serialize(ser::string::Serializer.wrap())?);
}
"function" => {
self.function = value.serialize(ser::strand::opt::Serializer.wrap())?;
}
"tokenizers" => {
self.tokenizers = value.serialize(ser::tokenizer::vec::opt::Serializer.wrap())?;
}
@ -77,6 +81,7 @@ impl serde::ser::SerializeStruct for SerializeDefineAnalyzerStatement {
fn end(self) -> Result<Self::Ok, Error> {
Ok(DefineAnalyzerStatement {
name: self.name,
function: self.function.map(|s| Ident(s.0)),
tokenizers: self.tokenizers,
filters: self.filters,
comment: self.comment,

View file

@ -1394,6 +1394,21 @@ impl Value {
}
}
/// Try to coerce this value to a `Regex`
pub(crate) fn coerce_to_regex(self) -> Result<Regex, Error> {
match self {
// Allow any Regex value
Value::Regex(v) => Ok(v),
// Allow any string value
Value::Strand(v) => Ok(v.as_str().parse()?),
// Anything else raises an error
_ => Err(Error::CoerceTo {
from: self,
into: "regex".into(),
}),
}
}
/// Try to coerce this value to a `String`
pub(crate) fn coerce_to_string(self) -> Result<String, Error> {
match self {
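The new coercion lets callers such as string::matches and string::replace accept either a regex literal or a plain string as the pattern argument. A minimal crate-internal sketch of the three branches (the method is pub(crate); the From<regex::Error> conversion is assumed, as it is already relied on by the string branch above):

use crate::err::Error;
use crate::sql::Value;

fn coerce_demo() -> Result<(), Error> {
    // A regex value passes through unchanged.
    let re = Value::Regex("<[^>]*>".parse()?);
    let _ = re.coerce_to_regex()?;
    // A string value is parsed into a regex.
    let _ = Value::from("<[^>]*>").coerce_to_regex()?;
    // Anything else raises Error::CoerceTo.
    assert!(Value::from(true).coerce_to_regex().is_err());
    Ok(())
}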

View file

@ -5,7 +5,8 @@ pub mod error;
pub mod v1;
pub use v1::{
datetime, datetime_raw, duration, idiom, json, parse, range, subquery, thing, thing_raw, value,
datetime, datetime_raw, duration, idiom, json, parse, path_like, range, subquery, thing,
thing_raw, value,
};
#[cfg(test)]

View file

@ -329,6 +329,7 @@ pub(crate) fn builtin_name(i: &str) -> IResult<&str, BuiltinName<&str>, ParseErr
},
},
search => {
analyze => { fn },
score => { fn },
highlight => { fn },
offsets => { fn },
@ -350,6 +351,7 @@ pub(crate) fn builtin_name(i: &str) -> IResult<&str, BuiltinName<&str>, ParseErr
join => { fn },
len => { fn },
lowercase => { fn },
matches => { fn },
repeat => { fn },
replace => { fn },
reverse => { fn },

View file

@ -91,6 +91,10 @@ pub fn duration(input: &str) -> Result<Duration, Error> {
parse_impl(input, literal::duration)
}
pub fn path_like(input: &str) -> Result<Value, Error> {
parse_impl(input, value::path_like)
}
pub fn range(input: &str) -> Result<Range, Error> {
parse_impl(input, literal::range)
}

View file

@ -5,15 +5,17 @@ use super::super::super::{
literal::{filters, ident, strand, tokenizer::tokenizers},
IResult,
};
use crate::sql::{filter::Filter, statements::DefineAnalyzerStatement, Strand, Tokenizer};
use nom::{branch::alt, bytes::complete::tag_no_case, combinator::cut, multi::many0};
use crate::sql::{filter::Filter, statements::DefineAnalyzerStatement, Ident, Strand, Tokenizer};
use nom::{
branch::alt, bytes::complete::tag, bytes::complete::tag_no_case, combinator::cut, multi::many0,
};
pub fn analyzer(i: &str) -> IResult<&str, DefineAnalyzerStatement> {
let (i, _) = tag_no_case("ANALYZER")(i)?;
let (i, _) = shouldbespace(i)?;
let (i, name) = cut(ident)(i)?;
let (i, opts) = many0(analyzer_opts)(i)?;
let (i, _) = expected("one of FILTERS, TOKENIZERS, or COMMENT", ending::query)(i)?;
let (i, _) = expected("one of FUNCTION, FILTERS, TOKENIZERS, or COMMENT", ending::query)(i)?;
// Create the base statement
let mut res = DefineAnalyzerStatement {
name,
@ -22,6 +24,9 @@ pub fn analyzer(i: &str) -> IResult<&str, DefineAnalyzerStatement> {
// Assign any defined options
for opt in opts {
match opt {
DefineAnalyzerOption::Function(i) => {
res.function = Some(i);
}
DefineAnalyzerOption::Comment(v) => {
res.comment = Some(v);
}
@ -38,13 +43,23 @@ pub fn analyzer(i: &str) -> IResult<&str, DefineAnalyzerStatement> {
}
enum DefineAnalyzerOption {
Function(Ident),
Comment(Strand),
Filters(Vec<Filter>),
Tokenizers(Vec<Tokenizer>),
}
fn analyzer_opts(i: &str) -> IResult<&str, DefineAnalyzerOption> {
alt((analyzer_comment, analyzer_filters, analyzer_tokenizers))(i)
alt((analyzer_function, analyzer_comment, analyzer_filters, analyzer_tokenizers))(i)
}
fn analyzer_function(i: &str) -> IResult<&str, DefineAnalyzerOption> {
let (i, _) = shouldbespace(i)?;
let (i, _) = tag_no_case("FUNCTION")(i)?;
let (i, _) = shouldbespace(i)?;
let (i, _) = tag("fn::")(i)?;
let (i, name) = ident(i)?;
Ok((i, DefineAnalyzerOption::Function(name)))
}
fn analyzer_comment(i: &str) -> IResult<&str, DefineAnalyzerOption> {
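The FUNCTION clause therefore accepts exactly `FUNCTION fn::<ident>`: the literal `fn::` prefix is consumed and only the identifier is stored on the statement. A minimal sketch of what the parser above accepts, assumed to live alongside it in this module, with a trailing `;` to satisfy `ending::query`:

// Minimal sketch using the crate-internal `analyzer` combinator from this file.
fn parse_function_clause_demo() {
    let sql = "ANALYZER htmlAnalyzer FUNCTION fn::stripHtml TOKENIZERS blank,class;";
    let (_, stmt) = analyzer(sql).unwrap();
    // Only the identifier after the mandatory "fn::" prefix is kept.
    assert_eq!(stmt.function, Some(Ident("stripHtml".to_owned())));
}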

View file

@ -1146,39 +1146,44 @@ async fn define_statement_index_multiple_unique_embedded_multiple() -> Result<()
#[tokio::test]
async fn define_statement_analyzer() -> Result<(), Error> {
let sql = "
let sql = r#"
DEFINE ANALYZER english TOKENIZERS blank,class FILTERS lowercase,snowball(english);
DEFINE ANALYZER autocomplete FILTERS lowercase,edgengram(2,10);
DEFINE FUNCTION fn::stripHtml($html: string) {
RETURN string::replace($html, /<[^>]*>/, "");
};
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::stripHtml TOKENIZERS blank,class;
INFO FOR DB;
";
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 3);
assert_eq!(res.len(), 5);
//
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
for _ in 0..4 {
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
}
//
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
//
let tmp = res.remove(0).result?;
let val = Value::parse(
"{
r#"{
analyzers: {
autocomplete: 'DEFINE ANALYZER autocomplete FILTERS LOWERCASE,EDGENGRAM(2,10)',
english: 'DEFINE ANALYZER english TOKENIZERS BLANK,CLASS FILTERS LOWERCASE,SNOWBALL(ENGLISH)',
htmlAnalyzer: 'DEFINE ANALYZER htmlAnalyzer FUNCTION fn::stripHtml TOKENIZERS BLANK,CLASS'
},
tokens: {},
functions: {},
functions: {
stripHtml: "DEFINE FUNCTION fn::stripHtml($html: string) { RETURN string::replace($html, /<[^>]*>/, ''); }"
},
params: {},
scopes: {},
tables: {},
users: {},
}",
}"#,
);
assert_eq!(tmp, val);
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
Ok(())
}

View file

@ -5,6 +5,7 @@ use helpers::new_ds;
use surrealdb::dbs::Session;
use surrealdb::err::Error;
use surrealdb::sql::{Number, Value};
use test_log::test;
async fn test_queries(sql: &str, desired_responses: &[&str]) -> Result<(), Error> {
let db = new_ds().await?;
@ -3297,6 +3298,120 @@ async fn function_string_ends_with() -> Result<(), Error> {
Ok(())
}
#[test(tokio::test)]
async fn function_search_analyzer() -> Result<(), Error> {
let sql = r#"
DEFINE FUNCTION fn::stripHtml($html: string) {
RETURN string::replace($html, /<[^>]*>/, "");
};
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::stripHtml TOKENIZERS blank,class;
RETURN search::analyze('htmlAnalyzer', '<p>This is a <em>sample</em> of HTML</p>');
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 3);
//
for _ in 0..2 {
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
}
//
let tmp = res.remove(0).result?;
let val = Value::parse("['This', 'is', 'a', 'sample', 'of', 'HTML']");
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
Ok(())
}
#[test(tokio::test)]
async fn function_search_analyzer_invalid_arguments() -> Result<(), Error> {
let sql = r#"
DEFINE FUNCTION fn::unsupportedFunction() {
RETURN 1;
};
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::unsupportedFunction TOKENIZERS blank,class;
RETURN search::analyze('htmlAnalyzer', '<p>This is a <em>sample</em> of HTML</p>');
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 3);
//
for _ in 0..2 {
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
}
//
match res.remove(0).result {
Err(Error::InvalidArguments {
name,
message,
}) => {
assert_eq!(&name, "fn::unsupportedFunction");
assert_eq!(&message, "The function expects 0 arguments.");
}
_ => panic!("Should have failed!"),
}
Ok(())
}
#[test(tokio::test)]
async fn function_search_analyzer_invalid_return_type() -> Result<(), Error> {
let sql = r#"
DEFINE FUNCTION fn::unsupportedReturnedType($html: string) {
RETURN 1;
};
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::unsupportedReturnedType TOKENIZERS blank,class;
RETURN search::analyze('htmlAnalyzer', '<p>This is a <em>sample</em> of HTML</p>');
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 3);
//
for _ in 0..2 {
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
}
//
match res.remove(0).result {
Err(Error::InvalidFunction {
name,
message,
}) => {
assert_eq!(&name, "unsupportedReturnedType");
assert_eq!(&message, "The function should return a string.");
}
r => panic!("Unexpected result: {:?}", r),
}
Ok(())
}
#[test(tokio::test)]
async fn function_search_analyzer_invalid_function_name() -> Result<(), Error> {
let sql = r#"
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::doesNotExist TOKENIZERS blank,class;
RETURN search::analyze('htmlAnalyzer', '<p>This is a <em>sample</em> of HTML</p>');
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 2);
//
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
//
match res.remove(0).result {
Err(Error::FcNotFound {
value,
}) => {
assert_eq!(&value, "doesNotExist");
}
r => panic!("Unexpected result: {:?}", r),
}
Ok(())
}
#[tokio::test]
async fn function_parse_is_alphanum() -> Result<(), Error> {
let sql = r#"
@ -3664,6 +3779,49 @@ async fn function_string_lowercase() -> Result<(), Error> {
Ok(())
}
// "<[^>]*>" , ""
#[tokio::test]
async fn function_string_replace_with_regex() -> Result<(), Error> {
let sql = r#"
RETURN string::replace('<p>This is a <em>sample</em> string with <a href="\\#">HTML</a> tags.</p>', /<[^>]*>/, "");
RETURN string::replace('<p>This one is already <strong>compiled!<strong></p>', /<[^>]*>/, "");
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 2);
//
let tmp = res.remove(0).result?;
let val = Value::from("This is a sample string with HTML tags.");
assert_eq!(tmp, val);
//
let tmp = res.remove(0).result?;
let val = Value::from("This one is already compiled!");
assert_eq!(tmp, val);
Ok(())
}
#[tokio::test]
async fn function_string_matches() -> Result<(), Error> {
let sql = r#"
RETURN string::matches("foo", /foo/);
RETURN string::matches("bar", /foo/);
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
//
let tmp = res.remove(0).result?;
let val = Value::from(true);
assert_eq!(tmp, val);
//
let tmp = res.remove(0).result?;
let val = Value::from(false);
assert_eq!(tmp, val);
Ok(())
}
#[tokio::test]
async fn function_string_repeat() -> Result<(), Error> {
let sql = r#"