Feat: Analyzers to support functions (#2974)
Co-authored-by: Yusuke Kuoka <ykuoka@gmail.com>
parent 5b23602359
commit 6efd3e3d87
28 changed files with 863 additions and 337 deletions
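
The functions touched by this commit can be exercised directly from SurrealQL. The sketch below is illustrative only and is not part of the diff: the function names (search::analyze, string::matches, string::replace) and the analyzer definition are taken from the changes, while the argument coercion details and the results noted in the comments are assumptions.

DEFINE ANALYZER simple TOKENIZERS blank,class FILTERS lowercase;
-- Tokenize a string with a named analyzer; expected to return an array of tokens.
RETURN search::analyze('simple', 'Hello World');
-- Test a string against a regular expression (a string pattern is assumed to coerce to a regex).
RETURN string::matches('foo bar', 'fo+');
-- string::replace now accepts either a plain string or a regex as the pattern argument.
RETURN string::replace('foo bar', 'foo', 'baz');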
@ -1,5 +0,0 @@
[advisories]
ignore = [
# Acknowledged in: https://github.com/surrealdb/surrealdb/pull/3005
"RUSTSEC-2021-0127"
]
@ -265,6 +265,7 @@
"uuid"
"rand::uuid::v4("
"rand::uuid::v7("
"search::analyze("
"search::score("
"search::highlight("
"search::offsets("
@ -300,6 +301,7 @@
"string::join("
"string::len("
"string::lowercase("
"string::matches("
"string::repeat("
"string::replace("
"string::reverse("

@ -265,6 +265,7 @@
"uuid"
"rand::uuid::v4("
"rand::uuid::v7("
"search::analyze("
"search::score("
"search::highlight("
"search::offsets("
@ -299,6 +300,7 @@
"string::join("
"string::len("
"string::lowercase("
"string::matches("
"string::repeat("
"string::replace("
"string::reverse("
@ -7,11 +7,11 @@ use crate::idx::ft::FtIndex;
|
|||
use crate::idx::trees::mtree::MTreeIndex;
|
||||
use crate::idx::trees::store::TreeStoreType;
|
||||
use crate::idx::IndexKeyBase;
|
||||
use crate::key;
|
||||
use crate::sql::array::Array;
|
||||
use crate::sql::index::{Index, MTreeParams, SearchParams};
|
||||
use crate::sql::statements::DefineIndexStatement;
|
||||
use crate::sql::{Part, Thing, Value};
|
||||
use crate::{key, kvs};
|
||||
|
||||
impl<'a> Document<'a> {
|
||||
pub async fn index(
|
||||
|
@ -45,18 +45,15 @@ impl<'a> Document<'a> {
|
|||
|
||||
// Update the index entries
|
||||
if opt.force || o != n {
|
||||
// Claim transaction
|
||||
let mut run = txn.lock().await;
|
||||
|
||||
// Store all the variable and parameters required by the index operation
|
||||
let mut ic = IndexOperation::new(opt, ix, o, n, rid);
|
||||
|
||||
// Index operation dispatching
|
||||
match &ix.index {
|
||||
Index::Uniq => ic.index_unique(&mut run).await?,
|
||||
Index::Idx => ic.index_non_unique(&mut run).await?,
|
||||
Index::Search(p) => ic.index_full_text(&mut run, p).await?,
|
||||
Index::MTree(p) => ic.index_mtree(&mut run, p).await?,
|
||||
Index::Uniq => ic.index_unique(txn).await?,
|
||||
Index::Idx => ic.index_non_unique(txn).await?,
|
||||
Index::Search(p) => ic.index_full_text(ctx, txn, p).await?,
|
||||
Index::MTree(p) => ic.index_mtree(txn, p).await?,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -257,7 +254,8 @@ impl<'a> IndexOperation<'a> {
|
|||
)
|
||||
}
|
||||
|
||||
async fn index_unique(&mut self, run: &mut kvs::Transaction) -> Result<(), Error> {
|
||||
async fn index_unique(&mut self, txn: &Transaction) -> Result<(), Error> {
|
||||
let mut run = txn.lock().await;
|
||||
// Delete the old index data
|
||||
if let Some(o) = self.o.take() {
|
||||
let i = Indexable::new(o, self.ix);
|
||||
|
@ -288,7 +286,8 @@ impl<'a> IndexOperation<'a> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
async fn index_non_unique(&mut self, run: &mut kvs::Transaction) -> Result<(), Error> {
|
||||
async fn index_non_unique(&mut self, txn: &Transaction) -> Result<(), Error> {
|
||||
let mut run = txn.lock().await;
|
||||
// Delete the old index data
|
||||
if let Some(o) = self.o.take() {
|
||||
let i = Indexable::new(o, self.ix);
|
||||
|
@ -330,35 +329,34 @@ impl<'a> IndexOperation<'a> {
|
|||
|
||||
async fn index_full_text(
|
||||
&mut self,
|
||||
run: &mut kvs::Transaction,
|
||||
ctx: &Context<'_>,
|
||||
txn: &Transaction,
|
||||
p: &SearchParams,
|
||||
) -> Result<(), Error> {
|
||||
let ikb = IndexKeyBase::new(self.opt, self.ix);
|
||||
let az = run.get_db_analyzer(self.opt.ns(), self.opt.db(), p.az.as_str()).await?;
|
||||
let mut ft = FtIndex::new(run, az, ikb, p, TreeStoreType::Write).await?;
|
||||
|
||||
let mut ft = FtIndex::new(self.opt, txn, &p.az, ikb, p, TreeStoreType::Write).await?;
|
||||
|
||||
if let Some(n) = self.n.take() {
|
||||
ft.index_document(run, self.rid, n).await?;
|
||||
ft.index_document(ctx, self.opt, txn, self.rid, n).await?;
|
||||
} else {
|
||||
ft.remove_document(run, self.rid).await?;
|
||||
ft.remove_document(txn, self.rid).await?;
|
||||
}
|
||||
ft.finish(run).await
|
||||
ft.finish(txn).await
|
||||
}
|
||||
|
||||
async fn index_mtree(
|
||||
&mut self,
|
||||
run: &mut kvs::Transaction,
|
||||
p: &MTreeParams,
|
||||
) -> Result<(), Error> {
|
||||
async fn index_mtree(&mut self, txn: &Transaction, p: &MTreeParams) -> Result<(), Error> {
|
||||
let mut tx = txn.lock().await;
|
||||
let ikb = IndexKeyBase::new(self.opt, self.ix);
|
||||
let mut mt = MTreeIndex::new(run, ikb, p, TreeStoreType::Write).await?;
|
||||
let mut mt = MTreeIndex::new(&mut tx, ikb, p, TreeStoreType::Write).await?;
|
||||
// Delete the old index data
|
||||
if let Some(o) = self.o.take() {
|
||||
mt.remove_document(run, self.rid, o).await?;
|
||||
mt.remove_document(&mut tx, self.rid, o).await?;
|
||||
}
|
||||
// Create the new index data
|
||||
if let Some(n) = self.n.take() {
|
||||
mt.index_document(run, self.rid, n).await?;
|
||||
mt.index_document(&mut tx, self.rid, n).await?;
|
||||
}
|
||||
mt.finish(run).await
|
||||
mt.finish(&mut tx).await
|
||||
}
|
||||
}
|
||||
|
|
|
@ -230,6 +230,10 @@ pub enum Error {
current: String,
},

/// Invalid regular expression
#[error("Invalid regular expression: {0:?}")]
InvalidRegex(String),

/// The query timedout
#[error("The query was not executed because it exceeded the timeout")]
QueryTimedout,
@ -738,6 +742,12 @@ impl From<JWTError> for Error {
}
}

impl From<regex::Error> for Error {
fn from(error: regex::Error) -> Self {
Error::InvalidRegex(error.to_string())
}
}

#[cfg(feature = "kv-mem")]
impl From<echodb::err::Error> for Error {
fn from(e: echodb::err::Error) -> Error {
@ -1,6 +1,6 @@
use crate::err::Error;
use crate::sql::value::Value;
use crate::sql::{Array, Bytes, Datetime, Duration, Kind, Number, Strand, Thing};
use crate::sql::{Array, Bytes, Datetime, Duration, Kind, Number, Regex, Strand, Thing};

/// Implemented by types that are commonly used, in a certain way, as arguments.
pub trait FromArg: Sized {
@ -13,6 +13,12 @@ impl FromArg for Value {
}
}

impl FromArg for Regex {
fn from_arg(arg: Value) -> Result<Self, Error> {
arg.coerce_to_regex()
}
}

impl FromArg for String {
fn from_arg(arg: Value) -> Result<Self, Error> {
arg.coerce_to_string()
@ -230,6 +230,7 @@ pub fn synchronous(ctx: &Context<'_>, name: &str, args: Vec<Value>) -> Result<Va
"string::join" => string::join,
"string::len" => string::len,
"string::lowercase" => string::lowercase,
"string::matches" => string::matches,
"string::repeat" => string::repeat,
"string::replace" => string::replace,
"string::reverse" => string::reverse,
@ -388,6 +389,7 @@ pub async fn asynchronous(
"http::patch" => http::patch(ctx).await,
"http::delete" => http::delete(ctx).await,
//
"search::analyze" => search::analyze((ctx, txn, opt)).await,
"search::score" => search::score((ctx, txn, doc)).await,
"search::highlight" => search::highlight((ctx,txn, doc)).await,
"search::offsets" => search::offsets((ctx, txn, doc)).await,
@ -7,6 +7,7 @@ pub struct Package;
impl_module_def!(
Package,
"search",
"analyze" => fut Async,
"highlight" => fut Async,
"offsets" => fut Async,
"score" => fut Async
@ -17,6 +17,7 @@ impl_module_def!(
"join" => run,
"len" => run,
"lowercase" => run,
"matches" => run,
"repeat" => run,
"replace" => run,
"reverse" => run,
@ -1,7 +1,8 @@
use crate::ctx::Context;
use crate::dbs::Transaction;
use crate::dbs::{Options, Transaction};
use crate::doc::CursorDoc;
use crate::err::Error;
use crate::idx::ft::analyzer::Analyzer;
use crate::idx::planner::executor::QueryExecutor;
use crate::sql::{Thing, Value};

@ -24,6 +25,19 @@ fn get_execution_context<'a>(
None
}

pub async fn analyze(
(ctx, txn, opt): (&Context<'_>, Option<&Transaction>, Option<&Options>),
(az, val): (Value, Value),
) -> Result<Value, Error> {
if let (Some(txn), Some(opt), Value::Strand(az), Value::Strand(val)) = (txn, opt, az, val) {
let az: Analyzer =
txn.lock().await.get_db_analyzer(opt.ns(), opt.db(), az.as_str()).await?.into();
az.analyze(ctx, opt, txn, val.0).await
} else {
Ok(Value::None)
}
}

pub async fn score(
(ctx, txn, doc): (&Context<'_>, Option<&Transaction>, Option<&CursorDoc<'_>>),
(match_ref,): (Value,),
@ -1,6 +1,7 @@
use crate::err::Error;
use crate::fnc::util::string;
use crate::sql::value::Value;
use crate::sql::Regex;

/// Returns `true` if a string of this length is too much to allocate.
fn limit(name: &str, n: usize) -> Result<(), Error> {
@ -64,17 +65,32 @@ pub fn repeat((val, num): (String, usize)) -> Result<Value, Error> {
Ok(val.repeat(num).into())
}

pub fn replace((val, old, new): (String, String, String)) -> Result<Value, Error> {
if new.len() > old.len() {
let increase = new.len() - old.len();
limit(
"string::replace",
val.len().saturating_add(val.matches(&old).count().saturating_mul(increase)),
)?;
}
Ok(val.replace(&old, &new).into())
pub fn matches((val, regex): (String, Regex)) -> Result<Value, Error> {
Ok(regex.0.is_match(&val).into())
}

pub fn replace((val, old_or_regexp, new): (String, Value, String)) -> Result<Value, Error> {
match old_or_regexp {
Value::Strand(old) => {
if new.len() > old.len() {
let increase = new.len() - old.len();
limit(
"string::replace",
val.len().saturating_add(val.matches(&old.0).count().saturating_mul(increase)),
)?;
}
Ok(val.replace(&old.0, &new).into())
}
Value::Regex(r) => Ok(r.0.replace_all(&val, new).into_owned().into()),
_ => Err(Error::InvalidArguments {
name: "string::replace".to_string(),
message: format!(
"Argument 2 was the wrong type. Expected a string but found {}",
old_or_regexp
),
}),
}
}
pub fn reverse((string,): (String,)) -> Result<Value, Error> {
Ok(string.chars().rev().collect::<String>().into())
}
@ -180,6 +196,7 @@ pub mod is {
pub fn datetime((arg, fmt): (String, String)) -> Result<Value, Error> {
Ok(NaiveDateTime::parse_from_str(&arg, &fmt).is_ok().into())
}

pub fn domain((arg,): (String,)) -> Result<Value, Error> {
Ok(addr::parse_domain_name(arg.as_str()).is_ok().into())
}
@ -245,7 +262,7 @@ pub mod similarity {

#[cfg(test)]
mod tests {
use super::{contains, slice};
use super::{contains, matches, replace, slice};
use crate::sql::Value;

#[test]
@ -288,6 +305,41 @@ mod tests {
test("好世界", "你好", false);
}

#[test]
fn string_replace() {
fn test(base: &str, pattern: Value, replacement: &str, expected: &str) {
assert_eq!(
replace((base.to_string(), pattern.clone(), replacement.to_string())).unwrap(),
Value::from(expected),
"replace({},{},{})",
base,
pattern,
replacement
);
}

test("foo bar", Value::Regex("foo".parse().unwrap()), "bar", "bar bar");
test("foo bar", "bar".into(), "foo", "foo foo");
}

#[test]
fn string_matches() {
fn test(base: &str, regex: &str, expected: bool) {
assert_eq!(
matches((base.to_string(), regex.parse().unwrap())).unwrap(),
Value::from(expected),
"matches({},{})",
base,
regex
);
}

test("bar", "foo", false);
test("", "foo", false);
test("foo bar", "foo", true);
test("foo bar", "bar", true);
}

#[test]
fn is_alphanum() {
let value = super::is::alphanum((String::from("abc123"),)).unwrap();
@ -182,8 +182,8 @@ pub(super) enum Term {
|
|||
mod tests {
|
||||
use crate::idx::ft::analyzer::tests::test_analyzer;
|
||||
|
||||
#[test]
|
||||
fn test_arabic_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_arabic_stemmer() {
|
||||
let input =
|
||||
"الكلاب تحب الجري في الحديقة، لكن كلبي الصغير يفضل النوم في سريره بدلاً من الجري";
|
||||
let output = vec![
|
||||
|
@ -194,17 +194,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(arabic);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ar);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ar);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(ara);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_danish_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_danish_stemmer() {
|
||||
let input = "Hunde elsker at løbe i parken, men min lille hund foretrækker at sove i sin kurv frem for at løbe.";
|
||||
let output = vec![
|
||||
"hund",
|
||||
|
@ -234,17 +237,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(danish);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(dan);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(da);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(da);", input, &output)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dutch_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_dutch_stemmer() {
|
||||
let input = "Honden houden ervan om in het park te rennen, maar mijn kleine hond slaapt liever in zijn mand dan te rennen.";
|
||||
let output = vec![
|
||||
"hond", "houd", "ervan", "om", "in", "het", "park", "te", "renn", ",", "mar", "mijn",
|
||||
|
@ -254,17 +260,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(dutch);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(nl);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(nl);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(nld);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_english_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_english_stemmer() {
|
||||
let input = "Teachers are often teaching, but my favorite teacher prefers reading in her spare time rather than teaching.";
|
||||
let output = vec![
|
||||
"teacher", "are", "often", "teach", ",", "but", "my", "favorit", "teacher", "prefer",
|
||||
|
@ -274,17 +283,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(english);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(eng);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(en);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(en);", input, &output)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_french_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_french_stemmer() {
|
||||
let input = "Les chiens adorent courir dans le parc, mais mon petit chien aime plutôt se blottir sur le canapé que de courir";
|
||||
let output = [
|
||||
"le", "chien", "adorent", "cour", "dan", "le", "parc", ",", "mais", "mon", "pet",
|
||||
|
@ -294,17 +306,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(french);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(fr);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(fr);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(fra);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_german_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_german_stemmer() {
|
||||
let input = "Hunde lieben es, im Park zu laufen, aber mein kleiner Hund zieht es vor, auf dem Sofa zu schlafen, statt zu laufen.";
|
||||
let output = [
|
||||
"hund", "lieb", "es", ",", "im", "park", "zu", "lauf", ",", "aber", "mein", "klein",
|
||||
|
@ -315,17 +330,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(german);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(de);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(de);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(deu);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_greek_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_greek_stemmer() {
|
||||
let input = "Τα σκυλιά αγαπούν να τρέχουν στο πάρκο, αλλά ο μικρός μου σκύλος προτιμά να κοιμάται στο κρεβάτι του αντί να τρέχει.";
|
||||
let output = [
|
||||
"τα",
|
||||
|
@ -356,17 +374,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(greek);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(ell);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(el);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(el);", input, &output)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hungarian_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_hungarian_stemmer() {
|
||||
let input = "A kutyák szeretnek futni a parkban, de az én kicsi kutyám inkább alszik a kosarában, mintsem fut.";
|
||||
let output = [
|
||||
"a", "kutya", "szeret", "futn", "a", "par", ",", "de", "az", "én", "kics", "kutya",
|
||||
|
@ -376,17 +397,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(hungarian);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(hu);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(hu);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(hun);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_italian_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_italian_stemmer() {
|
||||
let input = "I cani amano correre nel parco, ma il mio piccolo cane preferisce dormire nel suo cesto piuttosto che correre.";
|
||||
let output = [
|
||||
"i", "can", "aman", "corr", "nel", "parc", ",", "ma", "il", "mio", "piccol", "can",
|
||||
|
@ -396,17 +420,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(italian);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(it);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(it);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(ita);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_norwegian_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_norwegian_stemmer() {
|
||||
let input = "Hunder elsker å løpe i parken, men min lille hund foretrekker å sove i sengen sin heller enn å løpe.";
|
||||
let output = [
|
||||
"hund",
|
||||
|
@ -436,17 +463,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(norwegian);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(no);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(no);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(nor);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_portuguese_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_portuguese_stemmer() {
|
||||
let input = "Os cães adoram correr no parque, mas o meu pequeno cão prefere dormir na sua cama em vez de correr.";
|
||||
let output = [
|
||||
"os", "cã", "ador", "corr", "no", "parqu", ",", "mas", "o", "meu", "pequen", "cã",
|
||||
|
@ -456,17 +486,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(portuguese);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(pt);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(pt);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(por);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_romanian_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_romanian_stemmer() {
|
||||
let input = "Câinii adoră să alerge în parc, dar cățelul meu preferă să doarmă în coșul lui decât să alerge.";
|
||||
let output = [
|
||||
"câin", "ador", "să", "alerg", "în", "parc", ",", "dar", "cățel", "meu", "prefer",
|
||||
|
@ -476,17 +509,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(romanian);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ro);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ro);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(ron);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_russian_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_russian_stemmer() {
|
||||
let input = "Собаки любят бегать в парке, но моя маленькая собака предпочитает спать в своей корзине, а не бегать.";
|
||||
let output = [
|
||||
"собак",
|
||||
|
@ -514,17 +550,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(russian);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ru);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ru);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(rus);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_spanish_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_spanish_stemmer() {
|
||||
let input = "Los perros aman correr en el parque, pero mi pequeño perro prefiere dormir en su cama en lugar de correr.";
|
||||
let output = [
|
||||
"los", "perr", "aman", "corr", "en", "el", "parqu", ",", "per", "mi", "pequeñ", "perr",
|
||||
|
@ -534,17 +573,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(spanish);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(es);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(es);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(spa);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_swedish_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_swedish_stemmer() {
|
||||
let input = "Hundar älskar att springa i parken, men min lilla hund föredrar att sova i sin säng istället för att springa.";
|
||||
let output = [
|
||||
"hund",
|
||||
|
@ -574,17 +616,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(swedish);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(sv);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(sv);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(swe);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tamil_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_tamil_stemmer() {
|
||||
let input = "நாய்கள் பூங்காவில் ஓடுவதை விரும்புகின்றன, ஆனால் என் சிறிய நாய் அதன் படுகையில் தூங்குவதை விரும்புகின்றது, ஓட இல்லை.";
|
||||
let output = [
|
||||
"ந\u{bbe}ய",
|
||||
|
@ -617,17 +662,20 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(tamil);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ta);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(ta);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(tam);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_turkish_stemmer() {
|
||||
#[tokio::test]
|
||||
async fn test_turkish_stemmer() {
|
||||
let input = "Köpekler parkta koşmayı sever, ama benim küçük köpeğim koşmaktansa yatağında uyumayı tercih eder.";
|
||||
let output = [
|
||||
"köpek", "park", "koşma", "sever", ",", "am", "be", "küçük", "köpek", "koşmak",
|
||||
|
@ -637,30 +685,35 @@ mod tests {
|
|||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(turkish);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(tr);", input, &output);
|
||||
)
|
||||
.await;
|
||||
test_analyzer("ANALYZER test TOKENIZERS blank,class FILTERS snowball(tr);", input, &output)
|
||||
.await;
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS snowball(tur);",
|
||||
input,
|
||||
&output,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ngram() {
|
||||
#[tokio::test]
|
||||
async fn test_ngram() {
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS lowercase,ngram(2,3);",
|
||||
"Ālea iacta est",
|
||||
&["āl", "āle", "le", "lea", "ia", "iac", "ac", "act", "ct", "cta", "es", "est"],
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_edgengram() {
|
||||
#[tokio::test]
|
||||
async fn test_edgengram() {
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS lowercase,edgengram(2,3);",
|
||||
"Ālea iacta est",
|
||||
&["āl", "āle", "ia", "iac", "es", "est"],
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,13 +1,16 @@
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::ft::analyzer::tokenizer::{Tokenizer, Tokens};
use crate::idx::ft::doclength::DocLength;
use crate::idx::ft::offsets::{Offset, OffsetRecords};
use crate::idx::ft::postings::TermFrequency;
use crate::idx::ft::terms::{TermId, Terms};
use crate::kvs::Transaction;
use crate::sql::statements::DefineAnalyzerStatement;
use crate::sql::tokenizer::Tokenizer as SqlTokenizer;
use crate::sql::Value;
use crate::syn::path_like;
use async_recursion::async_recursion;
use filter::Filter;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
@ -15,28 +18,31 @@ use std::collections::{HashMap, HashSet};
mod filter;
mod tokenizer;

pub(super) struct Analyzer {
t: Option<Vec<SqlTokenizer>>,
f: Option<Vec<Filter>>,
pub(crate) struct Analyzer {
function: Option<String>,
tokenizers: Option<Vec<SqlTokenizer>>,
filters: Option<Vec<Filter>>,
}

impl From<DefineAnalyzerStatement> for Analyzer {
fn from(az: DefineAnalyzerStatement) -> Self {
Self {
t: az.tokenizers,
f: Filter::from(az.filters),
function: az.function.map(|i| i.0),
tokenizers: az.tokenizers,
filters: Filter::from(az.filters),
}
}
}

impl Analyzer {
pub(super) async fn extract_terms(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
t: &Terms,
tx: &mut Transaction,
query_string: String,
) -> Result<Vec<Option<TermId>>, Error> {
let tokens = self.analyze(query_string)?;
let tokens = self.generate_tokens(ctx, opt, txn, query_string).await?;
// We first collect every unique terms
// as it can contains duplicates
let mut terms = HashSet::new();
@ -45,8 +51,9 @@ impl Analyzer {
|
|||
}
|
||||
// Now we can extract the term ids
|
||||
let mut res = Vec::with_capacity(terms.len());
|
||||
let mut tx = txn.lock().await;
|
||||
for term in terms {
|
||||
let opt_term_id = t.get_term_id(tx, tokens.get_token_string(term)?).await?;
|
||||
let opt_term_id = t.get_term_id(&mut tx, tokens.get_token_string(term)?).await?;
|
||||
res.push(opt_term_id);
|
||||
}
|
||||
Ok(res)
|
||||
|
@ -56,15 +63,17 @@ impl Analyzer {
|
|||
/// It will create new term ids for non already existing terms.
|
||||
pub(super) async fn extract_terms_with_frequencies(
|
||||
&self,
|
||||
ctx: &Context<'_>,
|
||||
opt: &Options,
|
||||
txn: &Transaction,
|
||||
terms: &mut Terms,
|
||||
tx: &mut Transaction,
|
||||
field_content: Vec<Value>,
|
||||
) -> Result<(DocLength, Vec<(TermId, TermFrequency)>), Error> {
|
||||
let mut dl = 0;
|
||||
// Let's first collect all the inputs, and collect the tokens.
|
||||
// We need to store them because everything after is zero-copy
|
||||
let mut inputs = vec![];
|
||||
self.analyze_content(field_content, &mut inputs)?;
|
||||
self.analyze_content(ctx, opt, txn, field_content, &mut inputs).await?;
|
||||
// We then collect every unique terms and count the frequency
|
||||
let mut tf: HashMap<&str, TermFrequency> = HashMap::new();
|
||||
for tks in &inputs {
|
||||
|
@ -83,8 +92,9 @@ impl Analyzer {
|
|||
}
|
||||
// Now we can resolve the term ids
|
||||
let mut tfid = Vec::with_capacity(tf.len());
|
||||
let mut tx = txn.lock().await;
|
||||
for (t, f) in tf {
|
||||
tfid.push((terms.resolve_term_id(tx, t).await?, f));
|
||||
tfid.push((terms.resolve_term_id(&mut tx, t).await?, f));
|
||||
}
|
||||
Ok((dl, tfid))
|
||||
}
|
||||
|
@ -93,15 +103,17 @@ impl Analyzer {
|
|||
/// It will create new term ids for non already existing terms.
|
||||
pub(super) async fn extract_terms_with_frequencies_with_offsets(
|
||||
&self,
|
||||
ctx: &Context<'_>,
|
||||
opt: &Options,
|
||||
txn: &Transaction,
|
||||
terms: &mut Terms,
|
||||
tx: &mut Transaction,
|
||||
content: Vec<Value>,
|
||||
) -> Result<(DocLength, Vec<(TermId, TermFrequency)>, Vec<(TermId, OffsetRecords)>), Error> {
|
||||
let mut dl = 0;
|
||||
// Let's first collect all the inputs, and collect the tokens.
|
||||
// We need to store them because everything after is zero-copy
|
||||
let mut inputs = Vec::with_capacity(content.len());
|
||||
self.analyze_content(content, &mut inputs)?;
|
||||
self.analyze_content(ctx, opt, txn, content, &mut inputs).await?;
|
||||
// We then collect every unique terms and count the frequency and extract the offsets
|
||||
let mut tfos: HashMap<&str, Vec<Offset>> = HashMap::new();
|
||||
for (i, tks) in inputs.iter().enumerate() {
|
||||
|
@ -121,34 +133,53 @@ impl Analyzer {
|
|||
// Now we can resolve the term ids
|
||||
let mut tfid = Vec::with_capacity(tfos.len());
|
||||
let mut osid = Vec::with_capacity(tfos.len());
|
||||
let mut tx = txn.lock().await;
|
||||
for (t, o) in tfos {
|
||||
let id = terms.resolve_term_id(tx, t).await?;
|
||||
let id = terms.resolve_term_id(&mut tx, t).await?;
|
||||
tfid.push((id, o.len() as TermFrequency));
|
||||
osid.push((id, OffsetRecords(o)));
|
||||
}
|
||||
Ok((dl, tfid, osid))
|
||||
}
|
||||
|
||||
fn analyze_content(&self, content: Vec<Value>, tks: &mut Vec<Tokens>) -> Result<(), Error> {
|
||||
#[cfg_attr(not(target_arch = "wasm32"), async_recursion)]
|
||||
#[cfg_attr(target_arch = "wasm32", async_recursion(?Send))]
|
||||
async fn analyze_content(
|
||||
&self,
|
||||
ctx: &Context<'_>,
|
||||
opt: &Options,
|
||||
txn: &Transaction,
|
||||
content: Vec<Value>,
|
||||
tks: &mut Vec<Tokens>,
|
||||
) -> Result<(), Error> {
|
||||
for v in content {
|
||||
self.analyze_value(v, tks)?;
|
||||
self.analyze_value(ctx, opt, txn, v, tks).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn analyze_value(&self, val: Value, tks: &mut Vec<Tokens>) -> Result<(), Error> {
|
||||
#[cfg_attr(not(target_arch = "wasm32"), async_recursion)]
|
||||
#[cfg_attr(target_arch = "wasm32", async_recursion(?Send))]
|
||||
async fn analyze_value(
|
||||
&self,
|
||||
ctx: &Context<'_>,
|
||||
opt: &Options,
|
||||
txn: &Transaction,
|
||||
val: Value,
|
||||
tks: &mut Vec<Tokens>,
|
||||
) -> Result<(), Error> {
|
||||
match val {
|
||||
Value::Strand(s) => tks.push(self.analyze(s.0)?),
|
||||
Value::Number(n) => tks.push(self.analyze(n.to_string())?),
|
||||
Value::Bool(b) => tks.push(self.analyze(b.to_string())?),
|
||||
Value::Strand(s) => tks.push(self.generate_tokens(ctx, opt, txn, s.0).await?),
|
||||
Value::Number(n) => tks.push(self.generate_tokens(ctx, opt, txn, n.to_string()).await?),
|
||||
Value::Bool(b) => tks.push(self.generate_tokens(ctx, opt, txn, b.to_string()).await?),
|
||||
Value::Array(a) => {
|
||||
for v in a.0 {
|
||||
self.analyze_value(v, tks)?;
|
||||
self.analyze_value(ctx, opt, txn, v, tks).await?;
|
||||
}
|
||||
}
|
||||
Value::Object(o) => {
|
||||
for (_, v) in o.0 {
|
||||
self.analyze_value(v, tks)?;
|
||||
self.analyze_value(ctx, opt, txn, v, tks).await?;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
|
@ -156,33 +187,84 @@ impl Analyzer {
Ok(())
}

fn analyze(&self, input: String) -> Result<Tokens, Error> {
if let Some(t) = &self.t {
async fn generate_tokens(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
mut input: String,
) -> Result<Tokens, Error> {
if let Some(function_name) = &self.function {
let fns = format!("fn::{function_name}(\"{input}\")");
match path_like(&fns) {
Ok(func_value) => {
let val = func_value.compute(ctx, opt, txn, None).await?;
if let Value::Strand(val) = val {
input = val.0;
} else {
return Err(Error::InvalidFunction {
name: function_name.to_string(),
message: "The function should return a string.".to_string(),
});
}
}
Err(e) => {
return Err(Error::InvalidFunction {
name: function_name.to_string(),
message: e.to_string(),
})
}
}
}
if let Some(t) = &self.tokenizers {
if !input.is_empty() {
let t = Tokenizer::tokenize(t, input);
return Filter::apply_filters(t, &self.f);
return Filter::apply_filters(t, &self.filters);
}
}
Ok(Tokens::new(input))
}

/// Used for exposing the analyzer as the native function `search::analyze`
pub(crate) async fn analyze(
&self,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
input: String,
) -> Result<Value, Error> {
self.generate_tokens(ctx, opt, txn, input).await?.try_into()
}
}

#[cfg(test)]
mod tests {
use super::Analyzer;
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::kvs::{Datastore, LockType, TransactionType};
use crate::{
sql::{statements::DefineStatement, Statement},
syn,
};
use futures::lock::Mutex;
use std::sync::Arc;

pub(super) async fn test_analyzer(def: &str, input: &str, expected: &[&str]) {
let ds = Datastore::new("memory").await.unwrap();
let tx = ds.transaction(TransactionType::Read, LockType::Optimistic).await.unwrap();
let txn: Transaction = Arc::new(Mutex::new(tx));

pub(super) fn test_analyzer(def: &str, input: &str, expected: &[&str]) {
let mut stmt = syn::parse(&format!("DEFINE {def}")).unwrap();
let Some(Statement::Define(DefineStatement::Analyzer(az))) = stmt.0 .0.pop() else {
panic!()
};
let a: Analyzer = az.into();

let tokens = a.analyze(input.to_string()).unwrap();
let tokens = a
.generate_tokens(&Context::default(), &Options::default(), &txn, input.to_string())
.await
.unwrap();
let mut res = vec![];
for t in tokens.list() {
res.push(tokens.get_token_string(t).unwrap());
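
For reference, the new generate_tokens path above builds the call string fn::<function_name>("<input>"), computes it, and requires a string result before tokenization proceeds. A minimal, hypothetical setup might look like the sketch below; the exact position of the FUNCTION clause in DEFINE ANALYZER and the fn::normalize helper are assumptions for illustration and are not taken from this diff.

DEFINE FUNCTION fn::normalize($input: string) {
-- Hypothetical pre-processing step executed before tokenization.
RETURN string::lowercase($input);
};
DEFINE ANALYZER normalized FUNCTION fn::normalize TOKENIZERS blank,class;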
@ -1,7 +1,9 @@
|
|||
use crate::err;
|
||||
use crate::err::Error;
|
||||
use crate::idx::ft::analyzer::filter::{Filter, FilterResult, Term};
|
||||
use crate::idx::ft::offsets::{Offset, Position};
|
||||
use crate::sql::tokenizer::Tokenizer as SqlTokenizer;
|
||||
use crate::sql::Value;
|
||||
|
||||
pub(super) struct Tokens {
|
||||
/// The input string
|
||||
|
@ -67,6 +69,18 @@ impl Tokens {
|
|||
}
|
||||
}
|
||||
|
||||
impl TryFrom<Tokens> for Value {
|
||||
type Error = err::Error;
|
||||
|
||||
fn try_from(tokens: Tokens) -> Result<Self, Error> {
|
||||
let mut vec: Vec<Value> = Vec::with_capacity(tokens.t.len());
|
||||
for token in tokens.t {
|
||||
vec.push(token.get_str(&tokens.i)?.into())
|
||||
}
|
||||
Ok(vec.into())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialOrd, PartialEq, Eq, Ord, Hash)]
|
||||
pub(super) enum Token {
|
||||
Ref {
|
||||
|
@ -310,8 +324,8 @@ impl Splitter {
|
|||
mod tests {
|
||||
use crate::idx::ft::analyzer::tests::test_analyzer;
|
||||
|
||||
#[test]
|
||||
fn test_tokenize_blank_class() {
|
||||
#[tokio::test]
|
||||
async fn test_tokenize_blank_class() {
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class FILTERS lowercase",
|
||||
"Abc12345xYZ DL1809 item123456 978-3-16-148410-0 1HGCM82633A123456",
|
||||
|
@ -319,11 +333,12 @@ mod tests {
|
|||
"abc", "12345", "xyz", "dl", "1809", "item", "123456", "978", "-", "3", "-", "16",
|
||||
"-", "148410", "-", "0", "1", "hgcm", "82633", "a", "123456",
|
||||
],
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tokenize_source_code() {
|
||||
#[tokio::test]
|
||||
async fn test_tokenize_source_code() {
|
||||
test_analyzer(
|
||||
"ANALYZER test TOKENIZERS blank,class,camel,punct FILTERS lowercase",
|
||||
r#"struct MyRectangle {
|
||||
|
@ -366,6 +381,7 @@ static LANGUAGE: &str = "Rust";"#,
|
|||
"\"",
|
||||
";",
|
||||
],
|
||||
);
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,6 +7,8 @@ pub(super) mod scorer;
|
|||
pub(super) mod termdocs;
|
||||
pub(crate) mod terms;
|
||||
|
||||
use crate::ctx::Context;
|
||||
use crate::dbs::{Options, Transaction};
|
||||
use crate::err::Error;
|
||||
use crate::idx::docids::{DocId, DocIds};
|
||||
use crate::idx::ft::analyzer::Analyzer;
|
||||
|
@ -20,7 +22,8 @@ use crate::idx::ft::terms::{TermId, Terms};
|
|||
use crate::idx::trees::btree::BStatistics;
|
||||
use crate::idx::trees::store::TreeStoreType;
|
||||
use crate::idx::{IndexKeyBase, VersionedSerdeState};
|
||||
use crate::kvs::{Key, Transaction};
|
||||
use crate::kvs;
|
||||
use crate::kvs::Key;
|
||||
use crate::sql::index::SearchParams;
|
||||
use crate::sql::scoring::Scoring;
|
||||
use crate::sql::statements::DefineAnalyzerStatement;
|
||||
|
@ -94,29 +97,42 @@ impl VersionedSerdeState for State {}
|
|||
|
||||
impl FtIndex {
|
||||
pub(crate) async fn new(
|
||||
tx: &mut Transaction,
|
||||
opt: &Options,
|
||||
txn: &Transaction,
|
||||
az: &str,
|
||||
index_key_base: IndexKeyBase,
|
||||
p: &SearchParams,
|
||||
store_type: TreeStoreType,
|
||||
) -> Result<Self, Error> {
|
||||
let mut tx = txn.lock().await;
|
||||
let az = tx.get_db_analyzer(opt.ns(), opt.db(), az).await?;
|
||||
Self::with_analyzer(&mut tx, az, index_key_base, p, store_type).await
|
||||
}
|
||||
|
||||
async fn with_analyzer(
|
||||
run: &mut kvs::Transaction,
|
||||
az: DefineAnalyzerStatement,
|
||||
index_key_base: IndexKeyBase,
|
||||
p: &SearchParams,
|
||||
store_type: TreeStoreType,
|
||||
) -> Result<Self, Error> {
|
||||
let state_key: Key = index_key_base.new_bs_key();
|
||||
let state: State = if let Some(val) = tx.get(state_key.clone()).await? {
|
||||
let state: State = if let Some(val) = run.get(state_key.clone()).await? {
|
||||
State::try_from_val(val)?
|
||||
} else {
|
||||
State::default()
|
||||
};
|
||||
let doc_ids = Arc::new(RwLock::new(
|
||||
DocIds::new(tx, index_key_base.clone(), p.doc_ids_order, store_type).await?,
|
||||
DocIds::new(run, index_key_base.clone(), p.doc_ids_order, store_type).await?,
|
||||
));
|
||||
let doc_lengths = Arc::new(RwLock::new(
|
||||
DocLengths::new(tx, index_key_base.clone(), p.doc_lengths_order, store_type).await?,
|
||||
DocLengths::new(run, index_key_base.clone(), p.doc_lengths_order, store_type).await?,
|
||||
));
|
||||
let postings = Arc::new(RwLock::new(
|
||||
Postings::new(tx, index_key_base.clone(), p.postings_order, store_type).await?,
|
||||
Postings::new(run, index_key_base.clone(), p.postings_order, store_type).await?,
|
||||
));
|
||||
let terms = Arc::new(RwLock::new(
|
||||
Terms::new(tx, index_key_base.clone(), p.terms_order, store_type).await?,
|
||||
Terms::new(run, index_key_base.clone(), p.terms_order, store_type).await?,
|
||||
));
|
||||
let termdocs = TermDocs::new(index_key_base.clone());
|
||||
let offsets = Offsets::new(index_key_base.clone());
|
||||
|
@ -153,16 +169,17 @@ impl FtIndex {
|
|||
|
||||
pub(crate) async fn remove_document(
|
||||
&mut self,
|
||||
tx: &mut Transaction,
|
||||
txn: &Transaction,
|
||||
rid: &Thing,
|
||||
) -> Result<(), Error> {
|
||||
let mut tx = txn.lock().await;
|
||||
// Extract and remove the doc_id (if any)
|
||||
if let Some(doc_id) = self.doc_ids.write().await.remove_doc(tx, rid.into()).await? {
|
||||
if let Some(doc_id) = self.doc_ids.write().await.remove_doc(&mut tx, rid.into()).await? {
|
||||
self.state.doc_count -= 1;
|
||||
|
||||
// Remove the doc length
|
||||
if let Some(doc_lengths) =
|
||||
self.doc_lengths.write().await.remove_doc_length(tx, doc_id).await?
|
||||
self.doc_lengths.write().await.remove_doc_length(&mut tx, doc_id).await?
|
||||
{
|
||||
self.state.total_docs_lengths -= doc_lengths as u128;
|
||||
}
|
||||
|
@ -174,18 +191,18 @@ impl FtIndex {
|
|||
let mut p = self.postings.write().await;
|
||||
let mut t = self.terms.write().await;
|
||||
for term_id in &term_list {
|
||||
p.remove_posting(tx, term_id, doc_id).await?;
|
||||
p.remove_posting(&mut tx, term_id, doc_id).await?;
|
||||
// if the term is not present in any document in the index, we can remove it
|
||||
let doc_count = self.term_docs.remove_doc(tx, term_id, doc_id).await?;
|
||||
let doc_count = self.term_docs.remove_doc(&mut tx, term_id, doc_id).await?;
|
||||
if doc_count == 0 {
|
||||
t.remove_term_id(tx, term_id).await?;
|
||||
t.remove_term_id(&mut tx, term_id).await?;
|
||||
}
|
||||
}
|
||||
// Remove the offsets if any
|
||||
if self.highlighting {
|
||||
for term_id in term_list {
|
||||
// TODO?: Removal can be done with a prefix on doc_id
|
||||
self.offsets.remove_offsets(tx, doc_id, term_id).await?;
|
||||
self.offsets.remove_offsets(&mut tx, doc_id, term_id).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -195,36 +212,43 @@ impl FtIndex {
|
|||
|
||||
pub(crate) async fn index_document(
|
||||
&mut self,
|
||||
tx: &mut Transaction,
|
||||
ctx: &Context<'_>,
|
||||
opt: &Options,
|
||||
txn: &Transaction,
|
||||
rid: &Thing,
|
||||
content: Vec<Value>,
|
||||
) -> Result<(), Error> {
|
||||
// Resolve the doc_id
|
||||
let resolved = self.doc_ids.write().await.resolve_doc_id(tx, rid.into()).await?;
|
||||
let mut tx = txn.lock().await;
|
||||
let resolved = self.doc_ids.write().await.resolve_doc_id(&mut tx, rid.into()).await?;
|
||||
let doc_id = *resolved.doc_id();
|
||||
drop(tx);
|
||||
|
||||
// Extract the doc_lengths, terms en frequencies (and offset)
|
||||
let mut t = self.terms.write().await;
|
||||
let (doc_length, terms_and_frequencies, offsets) = if self.highlighting {
|
||||
let (dl, tf, ofs) = self
|
||||
.analyzer
|
||||
.extract_terms_with_frequencies_with_offsets(&mut t, tx, content)
|
||||
.extract_terms_with_frequencies_with_offsets(ctx, opt, txn, &mut t, content)
|
||||
.await?;
|
||||
(dl, tf, Some(ofs))
|
||||
} else {
|
||||
let (dl, tf) =
|
||||
self.analyzer.extract_terms_with_frequencies(&mut t, tx, content).await?;
|
||||
let (dl, tf) = self
|
||||
.analyzer
|
||||
.extract_terms_with_frequencies(ctx, opt, txn, &mut t, content)
|
||||
.await?;
|
||||
(dl, tf, None)
|
||||
};
|
||||
|
||||
// Set the doc length
|
||||
let mut tx = txn.lock().await;
|
||||
let mut dl = self.doc_lengths.write().await;
|
||||
if resolved.was_existing() {
|
||||
if let Some(old_doc_length) = dl.get_doc_length(tx, doc_id).await? {
|
||||
if let Some(old_doc_length) = dl.get_doc_length(&mut tx, doc_id).await? {
|
||||
self.state.total_docs_lengths -= old_doc_length as u128;
|
||||
}
|
||||
}
|
||||
dl.set_doc_length(tx, doc_id, doc_length).await?;
|
||||
dl.set_doc_length(&mut tx, doc_id, doc_length).await?;
|
||||
|
||||
// Retrieve the existing terms for this document (if any)
|
||||
let term_ids_key = self.index_key_base.new_bk_key(doc_id);
|
||||
|
@ -238,22 +262,22 @@ impl FtIndex {
|
|||
let mut terms_ids = RoaringTreemap::default();
|
||||
let mut p = self.postings.write().await;
|
||||
for (term_id, term_freq) in terms_and_frequencies {
|
||||
p.update_posting(tx, term_id, doc_id, term_freq).await?;
|
||||
p.update_posting(&mut tx, term_id, doc_id, term_freq).await?;
|
||||
if let Some(old_term_ids) = &mut old_term_ids {
|
||||
old_term_ids.remove(term_id);
|
||||
}
|
||||
self.term_docs.set_doc(tx, term_id, doc_id).await?;
|
||||
self.term_docs.set_doc(&mut tx, term_id, doc_id).await?;
|
||||
terms_ids.insert(term_id);
|
||||
}
|
||||
|
||||
// Remove any remaining postings
|
||||
if let Some(old_term_ids) = &old_term_ids {
|
||||
for old_term_id in old_term_ids {
|
||||
p.remove_posting(tx, old_term_id, doc_id).await?;
|
||||
let doc_count = self.term_docs.remove_doc(tx, old_term_id, doc_id).await?;
|
||||
p.remove_posting(&mut tx, old_term_id, doc_id).await?;
|
||||
let doc_count = self.term_docs.remove_doc(&mut tx, old_term_id, doc_id).await?;
|
||||
// if the term does not have anymore postings, we can remove the term
|
||||
if doc_count == 0 {
|
||||
t.remove_term_id(tx, old_term_id).await?;
|
||||
t.remove_term_id(&mut tx, old_term_id).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -263,14 +287,14 @@ impl FtIndex {
|
|||
if let Some(ofs) = offsets {
|
||||
if !ofs.is_empty() {
|
||||
for (tid, or) in ofs {
|
||||
self.offsets.set_offsets(tx, doc_id, tid, or).await?;
|
||||
self.offsets.set_offsets(&mut tx, doc_id, tid, or).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
// In case of an update, w remove the offset for the terms that does not exist anymore
|
||||
if let Some(old_term_ids) = old_term_ids {
|
||||
for old_term_id in old_term_ids {
|
||||
self.offsets.remove_offsets(tx, doc_id, old_term_id).await?;
|
||||
self.offsets.remove_offsets(&mut tx, doc_id, old_term_id).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -293,17 +317,19 @@ impl FtIndex {
|
|||
|
||||
pub(super) async fn extract_terms(
|
||||
&self,
|
||||
tx: &mut Transaction,
|
||||
ctx: &Context<'_>,
|
||||
opt: &Options,
|
||||
txn: &Transaction,
|
||||
query_string: String,
|
||||
) -> Result<Vec<Option<TermId>>, Error> {
|
||||
let t = self.terms.read().await;
|
||||
let terms = self.analyzer.extract_terms(&t, tx, query_string).await?;
|
||||
let terms = self.analyzer.extract_terms(ctx, opt, txn, &t, query_string).await?;
|
||||
Ok(terms)
|
||||
}
|
||||
|
||||
pub(super) async fn get_terms_docs(
|
||||
&self,
|
||||
tx: &mut Transaction,
|
||||
tx: &mut kvs::Transaction,
|
||||
terms: &Vec<Option<TermId>>,
|
||||
) -> Result<Vec<Option<(TermId, RoaringTreemap)>>, Error> {
|
||||
let mut terms_docs = Vec::with_capacity(terms.len());
|
||||
|
@ -363,7 +389,7 @@ impl FtIndex {
|
|||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) async fn highlight(
|
||||
&self,
|
||||
tx: &mut Transaction,
|
||||
tx: &mut kvs::Transaction,
|
||||
thg: &Thing,
|
||||
terms: &[Option<TermId>],
|
||||
prefix: Value,
|
||||
|
@ -387,7 +413,7 @@ impl FtIndex {
|
|||
|
||||
pub(super) async fn extract_offsets(
|
||||
&self,
|
||||
tx: &mut Transaction,
|
||||
tx: &mut kvs::Transaction,
|
||||
thg: &Thing,
|
||||
terms: &[Option<TermId>],
|
||||
) -> Result<Value, Error> {
|
||||
|
@ -405,21 +431,23 @@ impl FtIndex {
|
|||
Ok(Value::None)
|
||||
}
|
||||
|
||||
pub(crate) async fn statistics(&self, tx: &mut Transaction) -> Result<FtStatistics, Error> {
|
||||
pub(crate) async fn statistics(&self, txn: &Transaction) -> Result<FtStatistics, Error> {
|
||||
// TODO do parallel execution
|
||||
let mut run = txn.lock().await;
|
||||
Ok(FtStatistics {
|
||||
doc_ids: self.doc_ids.read().await.statistics(tx).await?,
|
||||
terms: self.terms.read().await.statistics(tx).await?,
|
||||
doc_lengths: self.doc_lengths.read().await.statistics(tx).await?,
|
||||
postings: self.postings.read().await.statistics(tx).await?,
|
||||
doc_ids: self.doc_ids.read().await.statistics(&mut run).await?,
|
||||
terms: self.terms.read().await.statistics(&mut run).await?,
|
||||
doc_lengths: self.doc_lengths.read().await.statistics(&mut run).await?,
|
||||
postings: self.postings.read().await.statistics(&mut run).await?,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(self, tx: &mut Transaction) -> Result<(), Error> {
|
||||
self.doc_ids.write().await.finish(tx).await?;
|
||||
self.doc_lengths.write().await.finish(tx).await?;
|
||||
self.postings.write().await.finish(tx).await?;
|
||||
self.terms.write().await.finish(tx).await?;
|
||||
pub(crate) async fn finish(self, tx: &Transaction) -> Result<(), Error> {
|
||||
let mut run = tx.lock().await;
|
||||
self.doc_ids.write().await.finish(&mut run).await?;
|
||||
self.doc_lengths.write().await.finish(&mut run).await?;
|
||||
self.postings.write().await.finish(&mut run).await?;
|
||||
self.terms.write().await.finish(&mut run).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
@ -439,7 +467,7 @@ impl HitsIterator {
|
|||
|
||||
pub(crate) async fn next(
|
||||
&mut self,
|
||||
tx: &mut Transaction,
|
||||
tx: &mut kvs::Transaction,
|
||||
) -> Result<Option<(Thing, DocId)>, Error> {
|
||||
for doc_id in self.iter.by_ref() {
|
||||
if let Some(doc_key) = self.doc_ids.read().await.get_doc_key(tx, doc_id).await? {
|
||||
|
@ -452,30 +480,34 @@ impl HitsIterator {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::ctx::Context;
|
||||
use crate::dbs::{Options, Transaction};
|
||||
use crate::idx::ft::scorer::{BM25Scorer, Score};
|
||||
use crate::idx::ft::{FtIndex, HitsIterator};
|
||||
use crate::idx::trees::store::TreeStoreType;
|
||||
use crate::idx::IndexKeyBase;
|
||||
use crate::kvs::{Datastore, LockType::*, Transaction};
|
||||
use crate::kvs::{Datastore, LockType::*};
|
||||
use crate::sql::index::SearchParams;
|
||||
use crate::sql::scoring::Scoring;
|
||||
use crate::sql::statements::{DefineAnalyzerStatement, DefineStatement};
use crate::sql::{Statement, Thing, Value};
use crate::syn;
use futures::lock::Mutex;
use std::collections::HashMap;
use std::sync::Arc;
use test_log::test;

async fn check_hits(
tx: &mut Transaction,
txn: &Transaction,
hits: Option<HitsIterator>,
scr: BM25Scorer,
e: Vec<(&Thing, Option<Score>)>,
) {
let mut tx = txn.lock().await;
if let Some(mut hits) = hits {
let mut map = HashMap::new();
while let Some((k, d)) = hits.next(tx).await.unwrap() {
let s = scr.score(tx, d).await.unwrap();
while let Some((k, d)) = hits.next(&mut tx).await.unwrap() {
let s = scr.score(&mut tx, d).await.unwrap();
map.insert(k, s);
}
assert_eq!(map.len(), e.len());

@@ -488,27 +520,33 @@ mod tests {
}

async fn search(
tx: &mut Transaction,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
fti: &FtIndex,
qs: &str,
) -> (Option<HitsIterator>, BM25Scorer) {
let t = fti.extract_terms(tx, qs.to_string()).await.unwrap();
let td = Arc::new(fti.get_terms_docs(tx, &t).await.unwrap());
let t = fti.extract_terms(ctx, opt, txn, qs.to_string()).await.unwrap();
let mut tx = txn.lock().await;
let td = Arc::new(fti.get_terms_docs(&mut tx, &t).await.unwrap());
drop(tx);
let scr = fti.new_scorer(td.clone()).unwrap().unwrap();
let hits = fti.new_hits_iterator(td).unwrap();
(hits, scr)
}

pub(super) async fn tx_fti(
pub(super) async fn tx_fti<'a>(
ds: &Datastore,
store_type: TreeStoreType,
az: &DefineAnalyzerStatement,
order: u32,
hl: bool,
) -> (Transaction, FtIndex) {
) -> (Context<'a>, Options, Transaction, FtIndex) {
let write = matches!(store_type, TreeStoreType::Write);
let mut tx = ds.transaction(write.into(), Optimistic).await.unwrap();
let fti = FtIndex::new(
let tx = ds.transaction(write.into(), Optimistic).await.unwrap();
let txn = Arc::new(Mutex::new(tx));
let mut tx = txn.lock().await;
let fti = FtIndex::with_analyzer(
&mut tx,
az.clone(),
IndexKeyBase::default(),

@@ -525,12 +563,13 @@ mod tests {
)
.await
.unwrap();
(tx, fti)
drop(tx);
(Context::default(), Options::default(), txn, fti)
}

pub(super) async fn finish(mut tx: Transaction, fti: FtIndex) {
fti.finish(&mut tx).await.unwrap();
tx.commit().await.unwrap();
pub(super) async fn finish(txn: &Transaction, fti: FtIndex) {
fti.finish(txn).await.unwrap();
txn.lock().await.commit().await.unwrap();
}

#[test(tokio::test)]

@@ -549,93 +588,99 @@ mod tests {

{
// Add one document
let (mut tx, mut fti) =
let (ctx, opt, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc1, vec![Value::from("hello the world")]).await.unwrap();
finish(tx, fti).await;
fti.index_document(&ctx, &opt, &txn, &doc1, vec![Value::from("hello the world")])
.await
.unwrap();
finish(&txn, fti).await;
}

{
// Add two documents
let (mut tx, mut fti) =
let (ctx, opt, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc2, vec![Value::from("a yellow hello")]).await.unwrap();
fti.index_document(&mut tx, &doc3, vec![Value::from("foo bar")]).await.unwrap();
finish(tx, fti).await;
fti.index_document(&ctx, &opt, &txn, &doc2, vec![Value::from("a yellow hello")])
.await
.unwrap();
fti.index_document(&ctx, &opt, &txn, &doc3, vec![Value::from("foo bar")])
.await
.unwrap();
finish(&txn, fti).await;
}

{
let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
let (ctx, opt, txn, fti) =
tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
// Check the statistics
let statistics = fti.statistics(&mut tx).await.unwrap();
let statistics = fti.statistics(&txn).await.unwrap();
assert_eq!(statistics.terms.keys_count, 7);
assert_eq!(statistics.postings.keys_count, 8);
assert_eq!(statistics.doc_ids.keys_count, 3);
assert_eq!(statistics.doc_lengths.keys_count, 3);

// Search & score
let (hits, scr) = search(&mut tx, &fti, "hello").await;
check_hits(
&mut tx,
hits,
scr,
vec![(&doc1, Some(-0.4859746)), (&doc2, Some(-0.4859746))],
)
.await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "hello").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(-0.4859746)), (&doc2, Some(-0.4859746))])
.await;

let (hits, scr) = search(&mut tx, &fti, "world").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.4859746))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "world").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.4859746))]).await;

let (hits, scr) = search(&mut tx, &fti, "yellow").await;
check_hits(&mut tx, hits, scr, vec![(&doc2, Some(0.4859746))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "yellow").await;
check_hits(&txn, hits, scr, vec![(&doc2, Some(0.4859746))]).await;

let (hits, scr) = search(&mut tx, &fti, "foo").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "foo").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.56902087))]).await;

let (hits, scr) = search(&mut tx, &fti, "bar").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "bar").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.56902087))]).await;

let (hits, _) = search(&mut tx, &fti, "dummy").await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "dummy").await;
assert!(hits.is_none());
}

{
// Reindex one document
let (mut tx, mut fti) =
let (ctx, opt, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.index_document(&mut tx, &doc3, vec![Value::from("nobar foo")]).await.unwrap();
finish(tx, fti).await;
fti.index_document(&ctx, &opt, &txn, &doc3, vec![Value::from("nobar foo")])
.await
.unwrap();
finish(&txn, fti).await;

let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
let (ctx, opt, txn, fti) =
tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;

// We can still find 'foo'
let (hits, scr) = search(&mut tx, &fti, "foo").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "foo").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.56902087))]).await;

// We can't anymore find 'bar'
let (hits, _) = search(&mut tx, &fti, "bar").await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "bar").await;
assert!(hits.is_none());

// We can now find 'nobar'
let (hits, scr) = search(&mut tx, &fti, "nobar").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "nobar").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.56902087))]).await;
}

{
// Remove documents
let (mut tx, mut fti) =
let (_, _, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, false).await;
fti.remove_document(&mut tx, &doc1).await.unwrap();
fti.remove_document(&mut tx, &doc2).await.unwrap();
fti.remove_document(&mut tx, &doc3).await.unwrap();
finish(tx, fti).await;
fti.remove_document(&txn, &doc1).await.unwrap();
fti.remove_document(&txn, &doc2).await.unwrap();
fti.remove_document(&txn, &doc3).await.unwrap();
finish(&txn, fti).await;
}

{
let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
let (hits, _) = search(&mut tx, &fti, "hello").await;
let (ctx, opt, txn, fti) =
tx_fti(&ds, TreeStoreType::Read, &az, btree_order, false).await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "hello").await;
assert!(hits.is_none());
let (hits, _) = search(&mut tx, &fti, "foo").await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "foo").await;
assert!(hits.is_none());
}
}

@@ -659,51 +704,60 @@ mod tests {

let btree_order = 5;
{
let (mut tx, mut fti) =
let (ctx, opt, txn, mut fti) =
tx_fti(&ds, TreeStoreType::Write, &az, btree_order, hl).await;
fti.index_document(
&mut tx,
&ctx,
&opt,
&txn,
&doc1,
vec![Value::from("the quick brown fox jumped over the lazy dog")],
)
.await
.unwrap();
fti.index_document(
&mut tx,
&ctx,
&opt,
&txn,
&doc2,
vec![Value::from("the fast fox jumped over the lazy dog")],
)
.await
.unwrap();
fti.index_document(
&mut tx,
&ctx,
&opt,
&txn,
&doc3,
vec![Value::from("the dog sat there and did nothing")],
)
.await
.unwrap();
fti.index_document(
&mut tx,
&ctx,
&opt,
&txn,
&doc4,
vec![Value::from("the other animals sat there watching")],
)
.await
.unwrap();
finish(tx, fti).await;
finish(&txn, fti).await;
}

{
let (mut tx, fti) = tx_fti(&ds, TreeStoreType::Read, &az, btree_order, hl).await;
let (ctx, opt, txn, fti) =
tx_fti(&ds, TreeStoreType::Read, &az, btree_order, hl).await;

let statistics = fti.statistics(&mut tx).await.unwrap();
let statistics = fti.statistics(&txn).await.unwrap();
assert_eq!(statistics.terms.keys_count, 17);
assert_eq!(statistics.postings.keys_count, 28);
assert_eq!(statistics.doc_ids.keys_count, 4);
assert_eq!(statistics.doc_lengths.keys_count, 4);

let (hits, scr) = search(&mut tx, &fti, "the").await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "the").await;
check_hits(
&mut tx,
&txn,
hits,
scr,
vec![

@@ -715,9 +769,9 @@ mod tests {
)
.await;

let (hits, scr) = search(&mut tx, &fti, "dog").await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "dog").await;
check_hits(
&mut tx,
&txn,
hits,
scr,
vec![

@@ -728,25 +782,25 @@ mod tests {
)
.await;

let (hits, scr) = search(&mut tx, &fti, "fox").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "fox").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;

let (hits, scr) = search(&mut tx, &fti, "over").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "over").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;

let (hits, scr) = search(&mut tx, &fti, "lazy").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "lazy").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;

let (hits, scr) = search(&mut tx, &fti, "jumped").await;
check_hits(&mut tx, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "jumped").await;
check_hits(&txn, hits, scr, vec![(&doc1, Some(0.0)), (&doc2, Some(0.0))]).await;

let (hits, scr) = search(&mut tx, &fti, "nothing").await;
check_hits(&mut tx, hits, scr, vec![(&doc3, Some(0.87105393))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "nothing").await;
check_hits(&txn, hits, scr, vec![(&doc3, Some(0.87105393))]).await;

let (hits, scr) = search(&mut tx, &fti, "animals").await;
check_hits(&mut tx, hits, scr, vec![(&doc4, Some(0.92279965))]).await;
let (hits, scr) = search(&ctx, &opt, &txn, &fti, "animals").await;
check_hits(&txn, hits, scr, vec![(&doc4, Some(0.92279965))]).await;

let (hits, _) = search(&mut tx, &fti, "dummy").await;
let (hits, _) = search(&ctx, &opt, &txn, &fti, "dummy").await;
assert!(hits.is_none());
}
}

@@ -1,3 +1,4 @@
use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::err::Error;
use crate::idx::docids::{DocId, DocIds};

@@ -58,12 +59,12 @@ impl IteratorEntry {
}
impl QueryExecutor {
pub(super) async fn new(
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
table: &Table,
im: IndexesMap,
) -> Result<Self, Error> {
let mut run = txn.lock().await;
let mut mr_entries = HashMap::default();
let mut exp_entries = HashMap::default();
let mut ft_map = HashMap::default();

@@ -80,15 +81,15 @@ impl QueryExecutor {
let mut ft_entry = None;
if let Some(ft) = ft_map.get(&ix_ref) {
if ft_entry.is_none() {
ft_entry = FtEntry::new(&mut run, ft, io).await?;
ft_entry = FtEntry::new(ctx, opt, txn, ft, io).await?;
}
} else {
let ikb = IndexKeyBase::new(opt, idx_def);
let az = run.get_db_analyzer(opt.ns(), opt.db(), p.az.as_str()).await?;
let ft =
FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Read).await?;
FtIndex::new(opt, txn, p.az.as_str(), ikb, p, TreeStoreType::Read)
.await?;
if ft_entry.is_none() {
ft_entry = FtEntry::new(&mut run, &ft, io).await?;
ft_entry = FtEntry::new(ctx, opt, txn, &ft, io).await?;
}
ft_map.insert(ix_ref, ft);
}

@@ -105,13 +106,14 @@ impl QueryExecutor {
}
Index::MTree(p) => {
if let IndexOperator::Knn(a, k) = io.op() {
let mut tx = txn.lock().await;
let entry = if let Some(mt) = mt_map.get(&ix_ref) {
MtEntry::new(&mut run, mt, a.clone(), *k).await?
MtEntry::new(&mut tx, mt, a.clone(), *k).await?
} else {
let ikb = IndexKeyBase::new(opt, idx_def);
let mt =
MTreeIndex::new(&mut run, ikb, p, TreeStoreType::Read).await?;
let entry = MtEntry::new(&mut run, &mt, a.clone(), *k).await?;
MTreeIndex::new(&mut tx, ikb, p, TreeStoreType::Read).await?;
let entry = MtEntry::new(&mut tx, &mt, a.clone(), *k).await?;
mt_map.insert(ix_ref, mt);
entry
};

@@ -437,13 +439,16 @@ struct Inner {

impl FtEntry {
async fn new(
tx: &mut kvs::Transaction,
ctx: &Context<'_>,
opt: &Options,
txn: &Transaction,
ft: &FtIndex,
io: IndexOption,
) -> Result<Option<Self>, Error> {
if let Matches(qs, _) = io.op() {
let terms = ft.extract_terms(tx, qs.to_owned()).await?;
let terms_docs = Arc::new(ft.get_terms_docs(tx, &terms).await?);
let terms = ft.extract_terms(ctx, opt, txn, qs.to_owned()).await?;
let mut tx = txn.lock().await;
let terms_docs = Arc::new(ft.get_terms_docs(&mut tx, &terms).await?);
Ok(Some(Self(Arc::new(Inner {
index_option: io,
doc_ids: ft.doc_ids(),

@@ -44,7 +44,7 @@ impl<'a> QueryPlanner<'a> {
) -> Result<(), Error> {
match Tree::build(ctx, self.opt, txn, &t, self.cond, self.with).await? {
Some((node, im, with_indexes)) => {
let mut exe = QueryExecutor::new(self.opt, txn, &t, im).await?;
let mut exe = QueryExecutor::new(ctx, self.opt, txn, &t, im).await?;
match PlanBuilder::build(node, self.with, with_indexes)? {
Plan::SingleIndex(exp, io) => {
if io.require_distinct() {

@@ -1,3 +1,5 @@
use lru::LruCache;
use once_cell::sync::Lazy;
use revision::revisioned;
use serde::{
de::{self, Visitor},

@@ -7,8 +9,10 @@ use std::cmp::Ordering;
use std::fmt::Debug;
use std::fmt::{self, Display, Formatter};
use std::hash::{Hash, Hasher};
use std::str;
use std::num::NonZeroUsize;
use std::str::FromStr;
use std::sync::Mutex;
use std::{env, str};

pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Regex";

@@ -23,6 +27,25 @@ impl Regex {
}
}

fn regex_new(str: &str) -> Result<regex::Regex, regex::Error> {
static REGEX_CACHE: Lazy<Mutex<LruCache<String, regex::Regex>>> = Lazy::new(|| {
let cache_size: usize = env::var("SURREAL_REGEX_CACHE_SIZE")
.map_or(1000, |v| v.parse().unwrap_or(1000))
.max(10); // The minimum cache size is 10
Mutex::new(LruCache::new(NonZeroUsize::new(cache_size).unwrap()))
});
let mut cache = match REGEX_CACHE.lock() {
Ok(guard) => guard,
Err(poisoned) => poisoned.into_inner(),
};
if let Some(re) = cache.get(str) {
return Ok(re.clone());
}
let re = regex::Regex::new(str)?;
cache.put(str.to_owned(), re.clone());
Ok(re)
}

impl FromStr for Regex {
type Err = <regex::Regex as FromStr>::Err;

@@ -30,7 +53,7 @@ impl FromStr for Regex {
if s.contains('\0') {
Err(regex::Error::Syntax("regex contained NUL byte".to_owned()))
} else {
regex::Regex::new(&s.replace("\\/", "/")).map(Self)
regex_new(&s.replace("\\/", "/")).map(Self)
}
}
}

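The hunk above adds a process-wide LRU cache so a given regex pattern is compiled only once and then reused, with the size tunable via SURREAL_REGEX_CACHE_SIZE (minimum 10, default 1000). Below is a minimal standalone sketch of the same pattern for reference; it is an illustration rather than part of the commit, it assumes the lru, once_cell and regex crates as dependencies, and the helper name compile_cached is hypothetical.

use lru::LruCache;
use once_cell::sync::Lazy;
use std::num::NonZeroUsize;
use std::sync::Mutex;

// Process-wide cache of compiled regexes, mirroring the approach taken by regex_new above.
static CACHE: Lazy<Mutex<LruCache<String, regex::Regex>>> =
    Lazy::new(|| Mutex::new(LruCache::new(NonZeroUsize::new(100).unwrap())));

fn compile_cached(pattern: &str) -> Result<regex::Regex, regex::Error> {
    // Recover the guard even if a previous holder panicked, so the cache stays usable.
    let mut cache = CACHE.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
    if let Some(re) = cache.get(pattern) {
        return Ok(re.clone()); // cache hit: no recompilation
    }
    let re = regex::Regex::new(pattern)?;
    cache.put(pattern.to_owned(), re.clone());
    Ok(re)
}

fn main() {
    // The second call with the same pattern is served from the cache.
    let first = compile_cached(r"<[^>]*>").unwrap();
    let second = compile_cached(r"<[^>]*>").unwrap();
    assert_eq!(first.as_str(), second.as_str());
}

Cloning an already compiled regex::Regex is much cheaper than recompiling the pattern, which is why the cache stores and hands out clones rather than rebuilding the regex on every query.
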
@@ -37,10 +37,10 @@ impl AnalyzeStatement {
AnalyzeStatement::Idx(tb, idx) => {
// Allowed to run?
opt.is_allowed(Action::View, ResourceKind::Index, &Base::Db)?;
// Claim transaction
let mut run = txn.lock().await;
// Read the index
let ix = run
let ix = txn
.lock()
.await
.get_and_cache_tb_index(opt.ns(), opt.db(), tb.as_str(), idx.as_str())
.await?;
let ikb = IndexKeyBase::new(opt, &ix);

@@ -48,15 +48,15 @@ impl AnalyzeStatement {
// Index operation dispatching
let value: Value = match &ix.index {
Index::Search(p) => {
let az = run.get_db_analyzer(opt.ns(), opt.db(), p.az.as_str()).await?;
let ft =
FtIndex::new(&mut run, az, ikb, p, TreeStoreType::Traversal).await?;
ft.statistics(&mut run).await?.into()
FtIndex::new(opt, txn, p.az.as_str(), ikb, p, TreeStoreType::Traversal)
.await?;
ft.statistics(txn).await?.into()
}
Index::MTree(p) => {
let mt =
MTreeIndex::new(&mut run, ikb, p, TreeStoreType::Traversal).await?;
mt.statistics(&mut run).await?.into()
let mut tx = txn.lock().await;
let mt = MTreeIndex::new(&mut tx, ikb, p, TreeStoreType::Traversal).await?;
mt.statistics(&mut tx).await?.into()
}
_ => {
return Err(Error::FeatureNotYetImplemented {

@@ -10,9 +10,11 @@ use serde::{Deserialize, Serialize};
use std::fmt::{self, Display};

#[derive(Clone, Debug, Default, Eq, PartialEq, PartialOrd, Serialize, Deserialize, Store, Hash)]
#[revisioned(revision = 1)]
#[revisioned(revision = 2)]
pub struct DefineAnalyzerStatement {
pub name: Ident,
#[revision(start = 2)]
pub function: Option<Ident>,
pub tokenizers: Option<Vec<Tokenizer>>,
pub filters: Option<Vec<Filter>>,
pub comment: Option<Strand>,

@@ -47,6 +49,9 @@ impl DefineAnalyzerStatement {
impl Display for DefineAnalyzerStatement {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "DEFINE ANALYZER {}", self.name)?;
if let Some(ref i) = self.function {
write!(f, " FUNCTION fn::{i}")?
}
if let Some(v) = &self.tokenizers {
let tokens: Vec<String> = v.iter().map(|f| f.to_string()).collect();
write!(f, " TOKENIZERS {}", tokens.join(","))?;

@@ -39,6 +39,7 @@ impl ser::Serializer for Serializer {
#[derive(Default)]
pub struct SerializeDefineAnalyzerStatement {
name: Ident,
function: Option<Strand>,
tokenizers: Option<Vec<Tokenizer>>,
filters: Option<Vec<Filter>>,
comment: Option<Strand>,

@@ -56,6 +57,9 @@ impl serde::ser::SerializeStruct for SerializeDefineAnalyzerStatement {
"name" => {
self.name = Ident(value.serialize(ser::string::Serializer.wrap())?);
}
"function" => {
self.function = value.serialize(ser::strand::opt::Serializer.wrap())?;
}
"tokenizers" => {
self.tokenizers = value.serialize(ser::tokenizer::vec::opt::Serializer.wrap())?;
}

@@ -77,6 +81,7 @@ impl serde::ser::SerializeStruct for SerializeDefineAnalyzerStatement {
fn end(self) -> Result<Self::Ok, Error> {
Ok(DefineAnalyzerStatement {
name: self.name,
function: self.function.map(|s| Ident(s.0)),
tokenizers: self.tokenizers,
filters: self.filters,
comment: self.comment,

@@ -1394,6 +1394,21 @@ impl Value {
}
}

/// Try to coerce this value to a `Regex`
pub(crate) fn coerce_to_regex(self) -> Result<Regex, Error> {
match self {
// Allow any Regex value
Value::Regex(v) => Ok(v),
// Allow any string value
Value::Strand(v) => Ok(v.as_str().parse()?),
// Anything else raises an error
_ => Err(Error::CoerceTo {
from: self,
into: "regex".into(),
}),
}
}

/// Try to coerce this value to a `String`
pub(crate) fn coerce_to_string(self) -> Result<String, Error> {
match self {

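The new Value::coerce_to_regex passes an existing regex value through unchanged and otherwise parses a string into one, so call sites that need a Regex can also be handed a string. A tiny standalone analogue of that coercion logic, using a toy enum instead of SurrealDB's internal Value type (illustration only; it relies on regex::Regex implementing FromStr, as the Regex wrapper above does):

enum Val {
    Regex(regex::Regex),
    Str(String),
}

fn coerce_to_regex(v: Val) -> Result<regex::Regex, regex::Error> {
    match v {
        // An existing regex is passed through untouched.
        Val::Regex(r) => Ok(r),
        // A string is parsed via regex::Regex's FromStr implementation.
        Val::Str(s) => s.parse(),
    }
}

fn main() -> Result<(), regex::Error> {
    let re = coerce_to_regex(Val::Str("<[^>]*>".to_owned()))?;
    assert!(re.is_match("<p>hello</p>"));
    Ok(())
}
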
@@ -5,7 +5,8 @@ pub mod error;

pub mod v1;
pub use v1::{
datetime, datetime_raw, duration, idiom, json, parse, range, subquery, thing, thing_raw, value,
datetime, datetime_raw, duration, idiom, json, parse, path_like, range, subquery, thing,
thing_raw, value,
};

#[cfg(test)]

@@ -329,6 +329,7 @@ pub(crate) fn builtin_name(i: &str) -> IResult<&str, BuiltinName<&str>, ParseErr
},
},
search => {
analyze => { fn },
score => { fn },
highlight => { fn },
offsets => { fn },

@@ -350,6 +351,7 @@ pub(crate) fn builtin_name(i: &str) -> IResult<&str, BuiltinName<&str>, ParseErr
join => { fn },
len => { fn },
lowercase => { fn },
matches => {fn},
repeat => { fn },
replace => { fn },
reverse => { fn },

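These grammar entries register the new search::analyze and string::matches built-in function names with the parser. A hedged end-to-end sketch of calling them through the embedded datastore follows; it mirrors the tests added later in this commit, assumes the crate is built with an in-memory storage feature so that Datastore::new("memory") is available, and the analyzer name simple is purely illustrative.

use surrealdb::dbs::Session;
use surrealdb::err::Error;
use surrealdb::kvs::Datastore;

#[tokio::main]
async fn main() -> Result<(), Error> {
    let ds = Datastore::new("memory").await?;
    let ses = Session::owner().with_ns("test").with_db("test");
    let sql = r#"
        DEFINE ANALYZER simple TOKENIZERS blank,class;
        RETURN search::analyze('simple', 'Hello World');
        RETURN string::matches('Hello World', /World/);
    "#;
    // One response per statement: the last two should hold the token list and a boolean.
    for response in ds.execute(sql, &ses, None).await? {
        println!("{:?}", response.result);
    }
    Ok(())
}
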
@@ -91,6 +91,10 @@ pub fn duration(input: &str) -> Result<Duration, Error> {
parse_impl(input, literal::duration)
}

pub fn path_like(input: &str) -> Result<Value, Error> {
parse_impl(input, value::path_like)
}

pub fn range(input: &str) -> Result<Range, Error> {
parse_impl(input, literal::range)
}

@@ -5,15 +5,17 @@ use super::super::super::{
literal::{filters, ident, strand, tokenizer::tokenizers},
IResult,
};
use crate::sql::{filter::Filter, statements::DefineAnalyzerStatement, Strand, Tokenizer};
use nom::{branch::alt, bytes::complete::tag_no_case, combinator::cut, multi::many0};
use crate::sql::{filter::Filter, statements::DefineAnalyzerStatement, Ident, Strand, Tokenizer};
use nom::{
branch::alt, bytes::complete::tag, bytes::complete::tag_no_case, combinator::cut, multi::many0,
};

pub fn analyzer(i: &str) -> IResult<&str, DefineAnalyzerStatement> {
let (i, _) = tag_no_case("ANALYZER")(i)?;
let (i, _) = shouldbespace(i)?;
let (i, name) = cut(ident)(i)?;
let (i, opts) = many0(analyzer_opts)(i)?;
let (i, _) = expected("one of FILTERS, TOKENIZERS, or COMMENT", ending::query)(i)?;
let (i, _) = expected("one of FUNCTION, FILTERS, TOKENIZERS, or COMMENT", ending::query)(i)?;
// Create the base statement
let mut res = DefineAnalyzerStatement {
name,

@@ -22,6 +24,9 @@ pub fn analyzer(i: &str) -> IResult<&str, DefineAnalyzerStatement> {
// Assign any defined options
for opt in opts {
match opt {
DefineAnalyzerOption::Function(i) => {
res.function = Some(i);
}
DefineAnalyzerOption::Comment(v) => {
res.comment = Some(v);
}

@@ -38,13 +43,23 @@ pub fn analyzer(i: &str) -> IResult<&str, DefineAnalyzerStatement> {
}

enum DefineAnalyzerOption {
Function(Ident),
Comment(Strand),
Filters(Vec<Filter>),
Tokenizers(Vec<Tokenizer>),
}

fn analyzer_opts(i: &str) -> IResult<&str, DefineAnalyzerOption> {
alt((analyzer_comment, analyzer_filters, analyzer_tokenizers))(i)
alt((analyzer_function, analyzer_comment, analyzer_filters, analyzer_tokenizers))(i)
}

fn analyzer_function(i: &str) -> IResult<&str, DefineAnalyzerOption> {
let (i, _) = shouldbespace(i)?;
let (i, _) = tag_no_case("FUNCTION")(i)?;
let (i, _) = shouldbespace(i)?;
let (i, _) = tag("fn::")(i)?;
let (i, name) = ident(i)?;
Ok((i, DefineAnalyzerOption::Function(name)))
}

fn analyzer_comment(i: &str) -> IResult<&str, DefineAnalyzerOption> {

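With the FUNCTION clause wired into the grammar above, a DEFINE ANALYZER statement that delegates pre-processing to a user-defined function parses and prints back through the Display impl shown earlier. A small sketch, assuming the public surrealdb::sql::parse helper; the canonical form in the comment is taken from the INFO FOR DB expectations in the tests below.

use surrealdb::sql::parse;

fn main() -> Result<(), surrealdb::err::Error> {
    let ast = parse("DEFINE ANALYZER htmlAnalyzer FUNCTION fn::stripHtml TOKENIZERS blank,class;")?;
    // Prints roughly: DEFINE ANALYZER htmlAnalyzer FUNCTION fn::stripHtml TOKENIZERS BLANK,CLASS;
    println!("{ast}");
    Ok(())
}
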
@@ -1146,39 +1146,44 @@ async fn define_statement_index_multiple_unique_embedded_multiple() -> Result<()

#[tokio::test]
async fn define_statement_analyzer() -> Result<(), Error> {
let sql = "
let sql = r#"
DEFINE ANALYZER english TOKENIZERS blank,class FILTERS lowercase,snowball(english);
DEFINE ANALYZER autocomplete FILTERS lowercase,edgengram(2,10);
DEFINE FUNCTION fn::stripHtml($html: string) {
RETURN string::replace($html, /<[^>]*>/, "");
};
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::stripHtml TOKENIZERS blank,class;
INFO FOR DB;
";
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 3);
assert_eq!(res.len(), 5);
//
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
for _ in 0..4 {
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
}
//
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
//

let tmp = res.remove(0).result?;
let val = Value::parse(
"{
r#"{
analyzers: {
autocomplete: 'DEFINE ANALYZER autocomplete FILTERS LOWERCASE,EDGENGRAM(2,10)',
english: 'DEFINE ANALYZER english TOKENIZERS BLANK,CLASS FILTERS LOWERCASE,SNOWBALL(ENGLISH)',
htmlAnalyzer: 'DEFINE ANALYZER htmlAnalyzer FUNCTION fn::stripHtml TOKENIZERS BLANK,CLASS'
},
tokens: {},
functions: {},
functions: {
stripHtml: "DEFINE FUNCTION fn::stripHtml($html: string) { RETURN string::replace($html, /<[^>]*>/, ''); }"
},
params: {},
scopes: {},
tables: {},
users: {},
}",
}"#,
);
assert_eq!(tmp, val);
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
Ok(())
}

@@ -5,6 +5,7 @@ use helpers::new_ds;
use surrealdb::dbs::Session;
use surrealdb::err::Error;
use surrealdb::sql::{Number, Value};
use test_log::test;

async fn test_queries(sql: &str, desired_responses: &[&str]) -> Result<(), Error> {
let db = new_ds().await?;

@@ -3297,6 +3298,120 @@ async fn function_string_ends_with() -> Result<(), Error> {
Ok(())
}

#[test(tokio::test)]
async fn function_search_analyzer() -> Result<(), Error> {
let sql = r#"
DEFINE FUNCTION fn::stripHtml($html: string) {
RETURN string::replace($html, /<[^>]*>/, "");
};
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::stripHtml TOKENIZERS blank,class;
RETURN search::analyze('htmlAnalyzer', '<p>This is a <em>sample</em> of HTML</p>');
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 3);
//
for _ in 0..2 {
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
}
//
let tmp = res.remove(0).result?;
let val = Value::parse("['This', 'is', 'a', 'sample', 'of', 'HTML']");
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
Ok(())
}

#[test(tokio::test)]
async fn function_search_analyzer_invalid_arguments() -> Result<(), Error> {
let sql = r#"
DEFINE FUNCTION fn::unsupportedFunction() {
RETURN 1;
};
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::unsupportedFunction TOKENIZERS blank,class;
RETURN search::analyze('htmlAnalyzer', '<p>This is a <em>sample</em> of HTML</p>');
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 3);
//
for _ in 0..2 {
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
}
//
match res.remove(0).result {
Err(Error::InvalidArguments {
name,
message,
}) => {
assert_eq!(&name, "fn::unsupportedFunction");
assert_eq!(&message, "The function expects 0 arguments.");
}
_ => panic!("Should have fail!"),
}
Ok(())
}

#[test(tokio::test)]
async fn function_search_analyzer_invalid_return_type() -> Result<(), Error> {
let sql = r#"
DEFINE FUNCTION fn::unsupportedReturnedType($html: string) {
RETURN 1;
};
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::unsupportedReturnedType TOKENIZERS blank,class;
RETURN search::analyze('htmlAnalyzer', '<p>This is a <em>sample</em> of HTML</p>');
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 3);
//
for _ in 0..2 {
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
}
//
match res.remove(0).result {
Err(Error::InvalidFunction {
name,
message,
}) => {
assert_eq!(&name, "unsupportedReturnedType");
assert_eq!(&message, "The function should return a string.");
}
r => panic!("Unexpected result: {:?}", r),
}
Ok(())
}

#[test(tokio::test)]
async fn function_search_analyzer_invalid_function_name() -> Result<(), Error> {
let sql = r#"
DEFINE ANALYZER htmlAnalyzer FUNCTION fn::doesNotExist TOKENIZERS blank,class;
RETURN search::analyze('htmlAnalyzer', '<p>This is a <em>sample</em> of HTML</p>');
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 2);
//
let tmp = res.remove(0).result;
assert!(tmp.is_ok());
//
match res.remove(0).result {
Err(Error::FcNotFound {
value,
}) => {
assert_eq!(&value, "doesNotExist");
}
r => panic!("Unexpected result: {:?}", r),
}
Ok(())
}

#[tokio::test]
async fn function_parse_is_alphanum() -> Result<(), Error> {
let sql = r#"

@@ -3664,6 +3779,49 @@ async fn function_string_lowercase() -> Result<(), Error> {
Ok(())
}

// "<[^>]*>" , ""

#[tokio::test]
async fn function_string_replace_with_regex() -> Result<(), Error> {
let sql = r#"
RETURN string::replace('<p>This is a <em>sample</em> string with <a href="\\#">HTML</a> tags.</p>', /<[^>]*>/, "");
RETURN string::replace('<p>This one is already <strong>compiled!<strong></p>', /<[^>]*>/, "");
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
assert_eq!(res.len(), 2);
//
let tmp = res.remove(0).result?;
let val = Value::from("This is a sample string with HTML tags.");
assert_eq!(tmp, val);
//
let tmp = res.remove(0).result?;
let val = Value::from("This one is already compiled!");
assert_eq!(tmp, val);
Ok(())
}

#[tokio::test]
async fn function_string_matches() -> Result<(), Error> {
let sql = r#"
RETURN string::matches("foo", /foo/);
RETURN string::matches("bar", /foo/);
"#;
let dbs = new_ds().await?;
let ses = Session::owner().with_ns("test").with_db("test");
let res = &mut dbs.execute(sql, &ses, None).await?;
//
let tmp = res.remove(0).result?;
let val = Value::from(true);
assert_eq!(tmp, val);
//
let tmp = res.remove(0).result?;
let val = Value::from(false);
assert_eq!(tmp, val);
Ok(())
}

#[tokio::test]
async fn function_string_repeat() -> Result<(), Error> {
let sql = r#"