From b3a1b392363b0ef605773dc9acc4956d9eec8ca0 Mon Sep 17 00:00:00 2001 From: Eduardo Pereira de Sousa Date: Fri, 14 Jul 2023 17:37:52 -0300 Subject: [PATCH] Feature: Add additional functions for comparing strings for similarity. (#1904) Co-authored-by: Tobie Morgan Hitchcock --- lib/fuzz/fuzz_targets/fuzz_executor.dict | 17 +++++ lib/fuzz/fuzz_targets/fuzz_sql_parser.dict | 17 +++++ lib/src/fnc/mod.rs | 5 ++ .../modules/surrealdb/functions/string.rs | 4 ++ .../surrealdb/functions/string/distance.rs | 11 ++++ .../surrealdb/functions/string/similarity.rs | 12 ++++ lib/src/fnc/string.rs | 41 +++++++++++- lib/src/fnc/util/string/fuzzy.rs | 23 +++++++ lib/src/fnc/util/string/mod.rs | 25 +------- lib/src/fnc/util/string/slug.rs | 23 +++++++ lib/src/sql/function.rs | 2 + lib/src/sql/value/value.rs | 10 +-- lib/tests/function.rs | 64 +++++++++++++++++++ 13 files changed, 223 insertions(+), 31 deletions(-) create mode 100644 lib/src/fnc/script/modules/surrealdb/functions/string/distance.rs create mode 100644 lib/src/fnc/script/modules/surrealdb/functions/string/similarity.rs create mode 100644 lib/src/fnc/util/string/fuzzy.rs create mode 100644 lib/src/fnc/util/string/slug.rs diff --git a/lib/fuzz/fuzz_targets/fuzz_executor.dict b/lib/fuzz/fuzz_targets/fuzz_executor.dict index a03e5155..3d1c2d69 100644 --- a/lib/fuzz/fuzz_targets/fuzz_executor.dict +++ b/lib/fuzz/fuzz_targets/fuzz_executor.dict @@ -289,6 +289,8 @@ "string" "string::concat(" "string::contains(" +"string::distance::hamming(" +"string::distance::levenshtein(" "string::endsWith(" "string::join(" "string::len(" @@ -296,6 +298,9 @@ "string::repeat(" "string::replace(" "string::reverse(" +"string::similarity::fuzzy(" +"string::similarity::jaro(" +"string::similarity::smithwaterman(" "string::slice(" "string::slug(" "string::split(" @@ -338,5 +343,17 @@ "type::string(" "type::table(" "type::thing(" +"vector::dotproduct(", +"vector::magnitude(", +"vector::distance::chebyshev(", +"vector::distance::euclidean(", 
+"vector::distance::hamming(", +"vector::distance::mahalanobis(", +"vector::distance::manhattan(", +"vector::distance::minkowski(", +"vector::similarity::cosine(", +"vector::similarity::jaccard(", +"vector::similarity::pearson(", +"vector::similarity::spearman(", # TODO: Add Javascript keywords diff --git a/lib/fuzz/fuzz_targets/fuzz_sql_parser.dict b/lib/fuzz/fuzz_targets/fuzz_sql_parser.dict index 42fc862c..0028154f 100644 --- a/lib/fuzz/fuzz_targets/fuzz_sql_parser.dict +++ b/lib/fuzz/fuzz_targets/fuzz_sql_parser.dict @@ -288,6 +288,8 @@ "string" "string::concat(" "string::contains(" +"string::distance::hamming(" +"string::distance::levenshtein(" "string::endsWith(" "string::join(" "string::len(" @@ -295,6 +297,9 @@ "string::repeat(" "string::replace(" "string::reverse(" +"string::similarity::fuzzy(" +"string::similarity::jaro(" +"string::similarity::smithwaterman(" "string::slice(" "string::slug(" "string::split(" @@ -335,5 +340,17 @@ "type::string(" "type::table(" "type::thing(" +"vector::dotproduct(", +"vector::magnitude(", +"vector::distance::chebyshev(", +"vector::distance::euclidean(", +"vector::distance::hamming(", +"vector::distance::mahalanobis(", +"vector::distance::manhattan(", +"vector::distance::minkowski(", +"vector::similarity::cosine(", +"vector::similarity::jaccard(", +"vector::similarity::pearson(", +"vector::similarity::spearman(", # TODO: Add Javascript keywords diff --git a/lib/src/fnc/mod.rs b/lib/src/fnc/mod.rs index e49f4461..4ae4a206 100644 --- a/lib/src/fnc/mod.rs +++ b/lib/src/fnc/mod.rs @@ -248,6 +248,11 @@ pub fn synchronous(ctx: &Context<'_>, name: &str, args: Vec) -> Result string::trim, "string::uppercase" => string::uppercase, "string::words" => string::words, + "string::distance::hamming" => string::distance::hamming, + "string::distance::levenshtein" => string::distance::levenshtein, + "string::similarity::fuzzy" => string::similarity::fuzzy, + "string::similarity::jaro" => string::similarity::jaro, + 
"string::similarity::smithwaterman" => string::similarity::smithwaterman, // "time::ceil" => time::ceil, "time::day" => time::day, diff --git a/lib/src/fnc/script/modules/surrealdb/functions/string.rs b/lib/src/fnc/script/modules/surrealdb/functions/string.rs index c8ee1bd7..29c208a8 100644 --- a/lib/src/fnc/script/modules/surrealdb/functions/string.rs +++ b/lib/src/fnc/script/modules/surrealdb/functions/string.rs @@ -1,6 +1,8 @@ use super::run; use crate::fnc::script::modules::impl_module_def; +mod distance; +mod similarity; pub struct Package; impl_module_def!( @@ -8,6 +10,7 @@ impl_module_def!( "string", "concat" => run, "contains" => run, + "distance" => (distance::Package), "endsWith" => run, "join" => run, "len" => run, @@ -15,6 +18,7 @@ impl_module_def!( "repeat" => run, "replace" => run, "reverse" => run, + "similarity" => (similarity::Package), "slice" => run, "slug" => run, "split" => run, diff --git a/lib/src/fnc/script/modules/surrealdb/functions/string/distance.rs b/lib/src/fnc/script/modules/surrealdb/functions/string/distance.rs new file mode 100644 index 00000000..27cca681 --- /dev/null +++ b/lib/src/fnc/script/modules/surrealdb/functions/string/distance.rs @@ -0,0 +1,11 @@ +use super::run; +use crate::fnc::script::modules::impl_module_def; + +pub struct Package; + +impl_module_def!( + Package, + "string::distance", + "hamming" => run, + "levenshtein" => run +); diff --git a/lib/src/fnc/script/modules/surrealdb/functions/string/similarity.rs b/lib/src/fnc/script/modules/surrealdb/functions/string/similarity.rs new file mode 100644 index 00000000..1ed20456 --- /dev/null +++ b/lib/src/fnc/script/modules/surrealdb/functions/string/similarity.rs @@ -0,0 +1,12 @@ +use super::run; +use crate::fnc::script::modules::impl_module_def; + +pub struct Package; + +impl_module_def!( + Package, + "string::similarity", + "fuzzy" => run, + "jaro" => run, + "smithwaterman" => run +); diff --git a/lib/src/fnc/string.rs b/lib/src/fnc/string.rs index f14ab722..33b29af0 
100644 --- a/lib/src/fnc/string.rs +++ b/lib/src/fnc/string.rs @@ -110,7 +110,7 @@ pub fn slice((val, beg, lim): (String, Option, Option)) -> Result< } pub fn slug((string,): (String,)) -> Result { - Ok(string::slug(string).into()) + Ok(string::slug::slug(string).into()) } pub fn split((val, chr): (String, String)) -> Result { @@ -133,6 +133,45 @@ pub fn words((string,): (String,)) -> Result { Ok(string.split_whitespace().collect::>().into()) } +pub mod distance { + + use crate::err::Error; + use crate::sql::Value; + + pub fn hamming((_, _): (String, String)) -> Result { + Err(Error::FeatureNotYetImplemented { + feature: "string::distance::hamming() function", + }) + } + + pub fn levenshtein((_, _): (String, String)) -> Result { + Err(Error::FeatureNotYetImplemented { + feature: "string::distance::levenshtein() function", + }) + } +} + +pub mod similarity { + + use crate::err::Error; + use crate::fnc::util::string::fuzzy::Fuzzy; + use crate::sql::Value; + + pub fn fuzzy((a, b): (String, String)) -> Result { + Ok(a.as_str().fuzzy_score(b.as_str()).into()) + } + + pub fn jaro((_, _): (String, String)) -> Result { + Err(Error::FeatureNotYetImplemented { + feature: "string::similarity::jaro() function", + }) + } + + pub fn smithwaterman((a, b): (String, String)) -> Result { + Ok(a.as_str().fuzzy_score(b.as_str()).into()) + } +} + #[cfg(test)] mod tests { use super::{contains, slice}; diff --git a/lib/src/fnc/util/string/fuzzy.rs b/lib/src/fnc/util/string/fuzzy.rs new file mode 100644 index 00000000..f2044745 --- /dev/null +++ b/lib/src/fnc/util/string/fuzzy.rs @@ -0,0 +1,23 @@ +use fuzzy_matcher::skim::SkimMatcherV2; +use fuzzy_matcher::FuzzyMatcher; +use once_cell::sync::Lazy; + +static MATCHER: Lazy = Lazy::new(|| SkimMatcherV2::default().ignore_case()); + +pub trait Fuzzy { + /// Check if this &str matches another &str using a fuzzy algorithm + fn fuzzy_match(&self, other: &str) -> bool; + /// Retrieve the fuzzy similarity score of this &str compared to another &str
+ fn fuzzy_score(&self, other: &str) -> i64; +} + +impl Fuzzy for str { + /// Check if this &str matches another &str using a fuzzy algorithm + fn fuzzy_match(&self, other: &str) -> bool { + MATCHER.fuzzy_match(self, other).is_some() + } + /// Retrieve the fuzzy similarity score of this &str compared to another &str + fn fuzzy_score(&self, other: &str) -> i64 { + MATCHER.fuzzy_match(self, other).unwrap_or(0) + } +} diff --git a/lib/src/fnc/util/string/mod.rs b/lib/src/fnc/util/string/mod.rs index cee1ce4d..81020072 100644 --- a/lib/src/fnc/util/string/mod.rs +++ b/lib/src/fnc/util/string/mod.rs @@ -1,23 +1,2 @@ -use ascii::any_ascii as ascii; -use once_cell::sync::Lazy; -use regex::Regex; - -static SIMPLES: Lazy = Lazy::new(|| Regex::new("[^a-z0-9-_]").unwrap()); -static HYPHENS: Lazy = Lazy::new(|| Regex::new("-+").unwrap()); - -pub fn slug>(s: S) -> String { - // Get a reference - let s = s.as_ref(); - // Convert unicode to ascii - let mut s = ascii(s); - // Convert string to lowercase - s.make_ascii_lowercase(); - // Replace any non-simple characters - let s = SIMPLES.replace_all(s.as_ref(), "-"); - // Replace any duplicated hyphens - let s = HYPHENS.replace_all(s.as_ref(), "-"); - // Remove any surrounding hyphens - let s = s.trim_matches('-'); - // Return the string - s.to_owned() -} +pub mod fuzzy; +pub mod slug; diff --git a/lib/src/fnc/util/string/slug.rs b/lib/src/fnc/util/string/slug.rs new file mode 100644 index 00000000..cee1ce4d --- /dev/null +++ b/lib/src/fnc/util/string/slug.rs @@ -0,0 +1,23 @@ +use ascii::any_ascii as ascii; +use once_cell::sync::Lazy; +use regex::Regex; + +static SIMPLES: Lazy = Lazy::new(|| Regex::new("[^a-z0-9-_]").unwrap()); +static HYPHENS: Lazy = Lazy::new(|| Regex::new("-+").unwrap()); + +pub fn slug>(s: S) -> String { + // Get a reference + let s = s.as_ref(); + // Convert unicode to ascii + let mut s = ascii(s); + // Convert string to lowercase + s.make_ascii_lowercase(); + // Replace any non-simple characters + let s = 
SIMPLES.replace_all(s.as_ref(), "-"); + // Replace any duplicated hyphens + let s = HYPHENS.replace_all(s.as_ref(), "-"); + // Remove any surrounding hyphens + let s = s.trim_matches('-'); + // Return the string + s.to_owned() +} diff --git a/lib/src/sql/function.rs b/lib/src/sql/function.rs index 91837e65..3cd767f0 100644 --- a/lib/src/sql/function.rs +++ b/lib/src/sql/function.rs @@ -508,6 +508,8 @@ fn function_string(i: &str) -> IResult<&str, &str> { tag("trim"), tag("uppercase"), tag("words"), + preceded(tag("distance::"), alt((tag("hamming"), tag("levenshtein")))), + preceded(tag("similarity::"), alt((tag("fuzzy"), tag("jaro"), tag("smithwaterman")))), ))(i) } diff --git a/lib/src/sql/value/value.rs b/lib/src/sql/value/value.rs index 7da8b193..55aa2e68 100644 --- a/lib/src/sql/value/value.rs +++ b/lib/src/sql/value/value.rs @@ -4,6 +4,7 @@ use crate::ctx::Context; use crate::dbs::{Options, Transaction}; use crate::doc::CursorDoc; use crate::err::Error; +use crate::fnc::util::string::fuzzy::Fuzzy; use crate::sql::array::Uniq; use crate::sql::array::{array, Array}; use crate::sql::block::{block, Block}; @@ -41,8 +42,6 @@ use crate::sql::uuid::{uuid as unique, Uuid}; use async_recursion::async_recursion; use chrono::{DateTime, Utc}; use derive::Store; -use fuzzy_matcher::skim::SkimMatcherV2; -use fuzzy_matcher::FuzzyMatcher; use geo::Point; use nom::branch::alt; use nom::bytes::complete::tag_no_case; @@ -50,7 +49,6 @@ use nom::character::complete::char; use nom::combinator::{map, opt}; use nom::multi::separated_list0; use nom::multi::separated_list1; -use once_cell::sync::Lazy; use rust_decimal::prelude::*; use serde::{Deserialize, Serialize}; use serde_json::Value as Json; @@ -62,8 +60,6 @@ use std::ops::Deref; use std::ops::Neg; use std::str::FromStr; -static MATCHER: Lazy = Lazy::new(|| SkimMatcherV2::default().ignore_case()); - pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Value"; #[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, 
Deserialize, Hash)] @@ -2322,11 +2318,11 @@ impl Value { pub fn fuzzy(&self, other: &Value) -> bool { match self { Value::Uuid(v) => match other { - Value::Strand(w) => MATCHER.fuzzy_match(v.to_raw().as_str(), w.as_str()).is_some(), + Value::Strand(w) => v.to_raw().as_str().fuzzy_match(w.as_str()), _ => false, }, Value::Strand(v) => match other { - Value::Strand(w) => MATCHER.fuzzy_match(v.as_str(), w.as_str()).is_some(), + Value::Strand(w) => v.as_str().fuzzy_match(w.as_str()), _ => false, }, _ => self.equal(other), diff --git a/lib/tests/function.rs b/lib/tests/function.rs index 3c5c86a3..b390871d 100644 --- a/lib/tests/function.rs +++ b/lib/tests/function.rs @@ -3609,6 +3609,70 @@ async fn function_string_reverse() -> Result<(), Error> { Ok(()) } +#[tokio::test] +async fn function_string_similarity_fuzzy() -> Result<(), Error> { + let sql = r#" + RETURN string::similarity::fuzzy("", ""); + RETURN string::similarity::fuzzy("some", "text"); + RETURN string::similarity::fuzzy("text", "TEXT"); + RETURN string::similarity::fuzzy("TEXT", "TEXT"); + RETURN string::similarity::fuzzy("this could be a tricky test", "this test"); + "#; + let dbs = Datastore::new("memory").await?; + let ses = Session::for_kv().with_ns("test").with_db("test"); + let res = &mut dbs.execute(&sql, &ses, None).await?; + assert_eq!(res.len(), 5); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(0)); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(0)); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(83)); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(91)); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(174)); + // + Ok(()) +} + +#[tokio::test] +async fn function_string_similarity_smithwaterman() -> Result<(), Error> { + let sql = r#" + RETURN string::similarity::smithwaterman("", ""); + RETURN string::similarity::smithwaterman("some", "text"); + RETURN 
string::similarity::smithwaterman("text", "TEXT"); + RETURN string::similarity::smithwaterman("TEXT", "TEXT"); + RETURN string::similarity::smithwaterman("this could be a tricky test", "this test"); + "#; + let dbs = Datastore::new("memory").await?; + let ses = Session::for_kv().with_ns("test").with_db("test"); + let res = &mut dbs.execute(&sql, &ses, None).await?; + assert_eq!(res.len(), 5); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(0)); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(0)); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(83)); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(91)); + // + let tmp = res.remove(0).result?; + assert_eq!(tmp, Value::from(174)); + // + Ok(()) +} + #[tokio::test] async fn function_string_slice() -> Result<(), Error> { let sql = r#"