Feature: Add additional functions for comparing strings for similarity. (#1904)
Co-authored-by: Tobie Morgan Hitchcock <tobie@surrealdb.com>
This commit is contained in:
parent
b83cd86f9d
commit
b3a1b39236
13 changed files with 223 additions and 31 deletions
|
@ -289,6 +289,8 @@
|
||||||
"string"
|
"string"
|
||||||
"string::concat("
|
"string::concat("
|
||||||
"string::contains("
|
"string::contains("
|
||||||
|
"string::distance::hamming("
|
||||||
|
"string::distance::levenshtein("
|
||||||
"string::endsWith("
|
"string::endsWith("
|
||||||
"string::join("
|
"string::join("
|
||||||
"string::len("
|
"string::len("
|
||||||
|
@ -296,6 +298,9 @@
|
||||||
"string::repeat("
|
"string::repeat("
|
||||||
"string::replace("
|
"string::replace("
|
||||||
"string::reverse("
|
"string::reverse("
|
||||||
|
"string::similarity::fuzzy("
|
||||||
|
"string::similarity::jaro("
|
||||||
|
"string::similarity::smithwaterman("
|
||||||
"string::slice("
|
"string::slice("
|
||||||
"string::slug("
|
"string::slug("
|
||||||
"string::split("
|
"string::split("
|
||||||
|
@ -338,5 +343,17 @@
|
||||||
"type::string("
|
"type::string("
|
||||||
"type::table("
|
"type::table("
|
||||||
"type::thing("
|
"type::thing("
|
||||||
|
"vector::dotproduct(",
|
||||||
|
"vector::magnitude(",
|
||||||
|
"vector::distance::chebyshev(",
|
||||||
|
"vector::distance::euclidean(",
|
||||||
|
"vector::distance::hamming(",
|
||||||
|
"vector::distance::mahalanobis(",
|
||||||
|
"vector::distance::manhattan(",
|
||||||
|
"vector::distance::minkowski(",
|
||||||
|
"vector::similarity::cosine(",
|
||||||
|
"vector::similarity::jaccard(",
|
||||||
|
"vector::similarity::pearson(",
|
||||||
|
"vector::similarity::spearman(",
|
||||||
# TODO: Add Javascript keywords
|
# TODO: Add Javascript keywords
|
||||||
|
|
||||||
|
|
|
@ -288,6 +288,8 @@
|
||||||
"string"
|
"string"
|
||||||
"string::concat("
|
"string::concat("
|
||||||
"string::contains("
|
"string::contains("
|
||||||
|
"string::distance::hamming("
|
||||||
|
"string::distance::levenshtein("
|
||||||
"string::endsWith("
|
"string::endsWith("
|
||||||
"string::join("
|
"string::join("
|
||||||
"string::len("
|
"string::len("
|
||||||
|
@ -295,6 +297,9 @@
|
||||||
"string::repeat("
|
"string::repeat("
|
||||||
"string::replace("
|
"string::replace("
|
||||||
"string::reverse("
|
"string::reverse("
|
||||||
|
"string::similarity::fuzzy("
|
||||||
|
"string::similarity::jaro("
|
||||||
|
"string::similarity::smithwaterman("
|
||||||
"string::slice("
|
"string::slice("
|
||||||
"string::slug("
|
"string::slug("
|
||||||
"string::split("
|
"string::split("
|
||||||
|
@ -335,5 +340,17 @@
|
||||||
"type::string("
|
"type::string("
|
||||||
"type::table("
|
"type::table("
|
||||||
"type::thing("
|
"type::thing("
|
||||||
|
"vector::dotproduct(",
|
||||||
|
"vector::magnitude(",
|
||||||
|
"vector::distance::chebyshev(",
|
||||||
|
"vector::distance::euclidean(",
|
||||||
|
"vector::distance::hamming(",
|
||||||
|
"vector::distance::mahalanobis(",
|
||||||
|
"vector::distance::manhattan(",
|
||||||
|
"vector::distance::minkowski(",
|
||||||
|
"vector::similarity::cosine(",
|
||||||
|
"vector::similarity::jaccard(",
|
||||||
|
"vector::similarity::pearson(",
|
||||||
|
"vector::similarity::spearman(",
|
||||||
# TODO: Add Javascript keywords
|
# TODO: Add Javascript keywords
|
||||||
|
|
||||||
|
|
|
@ -248,6 +248,11 @@ pub fn synchronous(ctx: &Context<'_>, name: &str, args: Vec<Value>) -> Result<Va
|
||||||
"string::trim" => string::trim,
|
"string::trim" => string::trim,
|
||||||
"string::uppercase" => string::uppercase,
|
"string::uppercase" => string::uppercase,
|
||||||
"string::words" => string::words,
|
"string::words" => string::words,
|
||||||
|
"string::distance::hamming" => string::distance::hamming,
|
||||||
|
"string::distance::levenshtein" => string::distance::levenshtein,
|
||||||
|
"string::similarity::fuzzy" => string::similarity::fuzzy,
|
||||||
|
"string::similarity::jaro" => string::similarity::jaro,
|
||||||
|
"string::similarity::smithwaterman" => string::similarity::smithwaterman,
|
||||||
//
|
//
|
||||||
"time::ceil" => time::ceil,
|
"time::ceil" => time::ceil,
|
||||||
"time::day" => time::day,
|
"time::day" => time::day,
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
use super::run;
|
use super::run;
|
||||||
use crate::fnc::script::modules::impl_module_def;
|
use crate::fnc::script::modules::impl_module_def;
|
||||||
|
|
||||||
|
mod distance;
|
||||||
|
mod similarity;
|
||||||
pub struct Package;
|
pub struct Package;
|
||||||
|
|
||||||
impl_module_def!(
|
impl_module_def!(
|
||||||
|
@ -8,6 +10,7 @@ impl_module_def!(
|
||||||
"string",
|
"string",
|
||||||
"concat" => run,
|
"concat" => run,
|
||||||
"contains" => run,
|
"contains" => run,
|
||||||
|
"distance" => (distance::Package),
|
||||||
"endsWith" => run,
|
"endsWith" => run,
|
||||||
"join" => run,
|
"join" => run,
|
||||||
"len" => run,
|
"len" => run,
|
||||||
|
@ -15,6 +18,7 @@ impl_module_def!(
|
||||||
"repeat" => run,
|
"repeat" => run,
|
||||||
"replace" => run,
|
"replace" => run,
|
||||||
"reverse" => run,
|
"reverse" => run,
|
||||||
|
"similarity" => (similarity::Package),
|
||||||
"slice" => run,
|
"slice" => run,
|
||||||
"slug" => run,
|
"slug" => run,
|
||||||
"split" => run,
|
"split" => run,
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
use super::run;
|
||||||
|
use crate::fnc::script::modules::impl_module_def;
|
||||||
|
|
||||||
|
pub struct Package;
|
||||||
|
|
||||||
|
impl_module_def!(
|
||||||
|
Package,
|
||||||
|
"string::distance",
|
||||||
|
"hamming" => run,
|
||||||
|
"levenshtein" => run
|
||||||
|
);
|
|
@ -0,0 +1,12 @@
|
||||||
|
use super::run;
|
||||||
|
use crate::fnc::script::modules::impl_module_def;
|
||||||
|
|
||||||
|
pub struct Package;
|
||||||
|
|
||||||
|
impl_module_def!(
|
||||||
|
Package,
|
||||||
|
"string::similarity",
|
||||||
|
"fuzzy" => run,
|
||||||
|
"jaro" => run,
|
||||||
|
"smithwaterman" => run
|
||||||
|
);
|
|
@ -110,7 +110,7 @@ pub fn slice((val, beg, lim): (String, Option<isize>, Option<isize>)) -> Result<
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn slug((string,): (String,)) -> Result<Value, Error> {
|
pub fn slug((string,): (String,)) -> Result<Value, Error> {
|
||||||
Ok(string::slug(string).into())
|
Ok(string::slug::slug(string).into())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn split((val, chr): (String, String)) -> Result<Value, Error> {
|
pub fn split((val, chr): (String, String)) -> Result<Value, Error> {
|
||||||
|
@ -133,6 +133,45 @@ pub fn words((string,): (String,)) -> Result<Value, Error> {
|
||||||
Ok(string.split_whitespace().collect::<Vec<&str>>().into())
|
Ok(string.split_whitespace().collect::<Vec<&str>>().into())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub mod distance {
|
||||||
|
|
||||||
|
use crate::err::Error;
|
||||||
|
use crate::sql::Value;
|
||||||
|
|
||||||
|
pub fn hamming((_, _): (String, String)) -> Result<Value, Error> {
|
||||||
|
Err(Error::FeatureNotYetImplemented {
|
||||||
|
feature: "string::distance::hamming() function",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn levenshtein((_, _): (String, String)) -> Result<Value, Error> {
|
||||||
|
Err(Error::FeatureNotYetImplemented {
|
||||||
|
feature: "string::distance::levenshtein() function",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub mod similarity {
|
||||||
|
|
||||||
|
use crate::err::Error;
|
||||||
|
use crate::fnc::util::string::fuzzy::Fuzzy;
|
||||||
|
use crate::sql::Value;
|
||||||
|
|
||||||
|
pub fn fuzzy((a, b): (String, String)) -> Result<Value, Error> {
|
||||||
|
Ok(a.as_str().fuzzy_score(b.as_str()).into())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn jaro((_, _): (String, String)) -> Result<Value, Error> {
|
||||||
|
Err(Error::FeatureNotYetImplemented {
|
||||||
|
feature: "string::similarity::jaro() function",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn smithwaterman((a, b): (String, String)) -> Result<Value, Error> {
|
||||||
|
Ok(a.as_str().fuzzy_score(b.as_str()).into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::{contains, slice};
|
use super::{contains, slice};
|
||||||
|
|
23
lib/src/fnc/util/string/fuzzy.rs
Normal file
23
lib/src/fnc/util/string/fuzzy.rs
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
use fuzzy_matcher::skim::SkimMatcherV2;
|
||||||
|
use fuzzy_matcher::FuzzyMatcher;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
static MATCHER: Lazy<SkimMatcherV2> = Lazy::new(|| SkimMatcherV2::default().ignore_case());
|
||||||
|
|
||||||
|
pub trait Fuzzy {
|
||||||
|
/// Check if this &str matches another &str using a fuzzy algorithm
|
||||||
|
fn fuzzy_match(&self, other: &str) -> bool;
|
||||||
|
/// Retrieve the fuzzy similarity score of this &str compared to another &str (0 when there is no match)
|
||||||
|
fn fuzzy_score(&self, other: &str) -> i64;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Fuzzy for str {
|
||||||
|
/// Check if this &str matches another &str using a fuzzy algorithm
|
||||||
|
fn fuzzy_match(&self, other: &str) -> bool {
|
||||||
|
MATCHER.fuzzy_match(self, other).is_some()
|
||||||
|
}
|
||||||
|
/// Retrieve the fuzzy similarity score of this &str compared to another &str (0 when there is no match)
|
||||||
|
fn fuzzy_score(&self, other: &str) -> i64 {
|
||||||
|
MATCHER.fuzzy_match(self, other).unwrap_or(0)
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,23 +1,2 @@
|
||||||
use ascii::any_ascii as ascii;
|
pub mod fuzzy;
|
||||||
use once_cell::sync::Lazy;
|
pub mod slug;
|
||||||
use regex::Regex;
|
|
||||||
|
|
||||||
static SIMPLES: Lazy<Regex> = Lazy::new(|| Regex::new("[^a-z0-9-_]").unwrap());
|
|
||||||
static HYPHENS: Lazy<Regex> = Lazy::new(|| Regex::new("-+").unwrap());
|
|
||||||
|
|
||||||
pub fn slug<S: AsRef<str>>(s: S) -> String {
|
|
||||||
// Get a reference
|
|
||||||
let s = s.as_ref();
|
|
||||||
// Convert unicode to ascii
|
|
||||||
let mut s = ascii(s);
|
|
||||||
// Convert string to lowercase
|
|
||||||
s.make_ascii_lowercase();
|
|
||||||
// Replace any non-simple characters
|
|
||||||
let s = SIMPLES.replace_all(s.as_ref(), "-");
|
|
||||||
// Replace any duplicated hyphens
|
|
||||||
let s = HYPHENS.replace_all(s.as_ref(), "-");
|
|
||||||
// Remove any surrounding hyphens
|
|
||||||
let s = s.trim_matches('-');
|
|
||||||
// Return the string
|
|
||||||
s.to_owned()
|
|
||||||
}
|
|
||||||
|
|
23
lib/src/fnc/util/string/slug.rs
Normal file
23
lib/src/fnc/util/string/slug.rs
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
use ascii::any_ascii as ascii;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
static SIMPLES: Lazy<Regex> = Lazy::new(|| Regex::new("[^a-z0-9-_]").unwrap());
|
||||||
|
static HYPHENS: Lazy<Regex> = Lazy::new(|| Regex::new("-+").unwrap());
|
||||||
|
|
||||||
|
pub fn slug<S: AsRef<str>>(s: S) -> String {
|
||||||
|
// Get a reference
|
||||||
|
let s = s.as_ref();
|
||||||
|
// Convert unicode to ascii
|
||||||
|
let mut s = ascii(s);
|
||||||
|
// Convert string to lowercase
|
||||||
|
s.make_ascii_lowercase();
|
||||||
|
// Replace any non-simple characters
|
||||||
|
let s = SIMPLES.replace_all(s.as_ref(), "-");
|
||||||
|
// Replace any duplicated hyphens
|
||||||
|
let s = HYPHENS.replace_all(s.as_ref(), "-");
|
||||||
|
// Remove any surrounding hyphens
|
||||||
|
let s = s.trim_matches('-');
|
||||||
|
// Return the string
|
||||||
|
s.to_owned()
|
||||||
|
}
|
|
@ -508,6 +508,8 @@ fn function_string(i: &str) -> IResult<&str, &str> {
|
||||||
tag("trim"),
|
tag("trim"),
|
||||||
tag("uppercase"),
|
tag("uppercase"),
|
||||||
tag("words"),
|
tag("words"),
|
||||||
|
preceded(tag("distance::"), alt((tag("hamming"), tag("levenshtein")))),
|
||||||
|
preceded(tag("similarity::"), alt((tag("fuzzy"), tag("jaro"), tag("smithwaterman")))),
|
||||||
))(i)
|
))(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ use crate::ctx::Context;
|
||||||
use crate::dbs::{Options, Transaction};
|
use crate::dbs::{Options, Transaction};
|
||||||
use crate::doc::CursorDoc;
|
use crate::doc::CursorDoc;
|
||||||
use crate::err::Error;
|
use crate::err::Error;
|
||||||
|
use crate::fnc::util::string::fuzzy::Fuzzy;
|
||||||
use crate::sql::array::Uniq;
|
use crate::sql::array::Uniq;
|
||||||
use crate::sql::array::{array, Array};
|
use crate::sql::array::{array, Array};
|
||||||
use crate::sql::block::{block, Block};
|
use crate::sql::block::{block, Block};
|
||||||
|
@ -41,8 +42,6 @@ use crate::sql::uuid::{uuid as unique, Uuid};
|
||||||
use async_recursion::async_recursion;
|
use async_recursion::async_recursion;
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use derive::Store;
|
use derive::Store;
|
||||||
use fuzzy_matcher::skim::SkimMatcherV2;
|
|
||||||
use fuzzy_matcher::FuzzyMatcher;
|
|
||||||
use geo::Point;
|
use geo::Point;
|
||||||
use nom::branch::alt;
|
use nom::branch::alt;
|
||||||
use nom::bytes::complete::tag_no_case;
|
use nom::bytes::complete::tag_no_case;
|
||||||
|
@ -50,7 +49,6 @@ use nom::character::complete::char;
|
||||||
use nom::combinator::{map, opt};
|
use nom::combinator::{map, opt};
|
||||||
use nom::multi::separated_list0;
|
use nom::multi::separated_list0;
|
||||||
use nom::multi::separated_list1;
|
use nom::multi::separated_list1;
|
||||||
use once_cell::sync::Lazy;
|
|
||||||
use rust_decimal::prelude::*;
|
use rust_decimal::prelude::*;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value as Json;
|
use serde_json::Value as Json;
|
||||||
|
@ -62,8 +60,6 @@ use std::ops::Deref;
|
||||||
use std::ops::Neg;
|
use std::ops::Neg;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
static MATCHER: Lazy<SkimMatcherV2> = Lazy::new(|| SkimMatcherV2::default().ignore_case());
|
|
||||||
|
|
||||||
pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Value";
|
pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Value";
|
||||||
|
|
||||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, Hash)]
|
#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, Hash)]
|
||||||
|
@ -2322,11 +2318,11 @@ impl Value {
|
||||||
pub fn fuzzy(&self, other: &Value) -> bool {
|
pub fn fuzzy(&self, other: &Value) -> bool {
|
||||||
match self {
|
match self {
|
||||||
Value::Uuid(v) => match other {
|
Value::Uuid(v) => match other {
|
||||||
Value::Strand(w) => MATCHER.fuzzy_match(v.to_raw().as_str(), w.as_str()).is_some(),
|
Value::Strand(w) => v.to_raw().as_str().fuzzy_match(w.as_str()),
|
||||||
_ => false,
|
_ => false,
|
||||||
},
|
},
|
||||||
Value::Strand(v) => match other {
|
Value::Strand(v) => match other {
|
||||||
Value::Strand(w) => MATCHER.fuzzy_match(v.as_str(), w.as_str()).is_some(),
|
Value::Strand(w) => v.as_str().fuzzy_match(w.as_str()),
|
||||||
_ => false,
|
_ => false,
|
||||||
},
|
},
|
||||||
_ => self.equal(other),
|
_ => self.equal(other),
|
||||||
|
|
|
@ -3609,6 +3609,70 @@ async fn function_string_reverse() -> Result<(), Error> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn function_string_similarity_fuzzy() -> Result<(), Error> {
|
||||||
|
let sql = r#"
|
||||||
|
RETURN string::similarity::fuzzy("", "");
|
||||||
|
RETURN string::similarity::fuzzy("some", "text");
|
||||||
|
RETURN string::similarity::fuzzy("text", "TEXT");
|
||||||
|
RETURN string::similarity::fuzzy("TEXT", "TEXT");
|
||||||
|
RETURN string::similarity::fuzzy("this could be a tricky test", "this test");
|
||||||
|
"#;
|
||||||
|
let dbs = Datastore::new("memory").await?;
|
||||||
|
let ses = Session::for_kv().with_ns("test").with_db("test");
|
||||||
|
let res = &mut dbs.execute(&sql, &ses, None).await?;
|
||||||
|
assert_eq!(res.len(), 5);
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(0));
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(0));
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(83));
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(91));
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(174));
|
||||||
|
//
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn function_string_similarity_smithwaterman() -> Result<(), Error> {
|
||||||
|
let sql = r#"
|
||||||
|
RETURN string::similarity::smithwaterman("", "");
|
||||||
|
RETURN string::similarity::smithwaterman("some", "text");
|
||||||
|
RETURN string::similarity::smithwaterman("text", "TEXT");
|
||||||
|
RETURN string::similarity::smithwaterman("TEXT", "TEXT");
|
||||||
|
RETURN string::similarity::smithwaterman("this could be a tricky test", "this test");
|
||||||
|
"#;
|
||||||
|
let dbs = Datastore::new("memory").await?;
|
||||||
|
let ses = Session::for_kv().with_ns("test").with_db("test");
|
||||||
|
let res = &mut dbs.execute(&sql, &ses, None).await?;
|
||||||
|
assert_eq!(res.len(), 5);
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(0));
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(0));
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(83));
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(91));
|
||||||
|
//
|
||||||
|
let tmp = res.remove(0).result?;
|
||||||
|
assert_eq!(tmp, Value::from(174));
|
||||||
|
//
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn function_string_slice() -> Result<(), Error> {
|
async fn function_string_slice() -> Result<(), Error> {
|
||||||
let sql = r#"
|
let sql = r#"
|
||||||
|
|
Loading…
Reference in a new issue