Feature: Add additional functions for comparing strings for similarity. (#1904)
Co-authored-by: Tobie Morgan Hitchcock <tobie@surrealdb.com>
This commit is contained in:
parent
b83cd86f9d
commit
b3a1b39236
13 changed files with 223 additions and 31 deletions
|
@ -289,6 +289,8 @@
|
|||
"string"
|
||||
"string::concat("
|
||||
"string::contains("
|
||||
"string::distance::hamming("
|
||||
"string::distance::levenshtein("
|
||||
"string::endsWith("
|
||||
"string::join("
|
||||
"string::len("
|
||||
|
@ -296,6 +298,9 @@
|
|||
"string::repeat("
|
||||
"string::replace("
|
||||
"string::reverse("
|
||||
"string::similarity::fuzzy("
|
||||
"string::similarity::jaro("
|
||||
"string::similarity::smithwaterman("
|
||||
"string::slice("
|
||||
"string::slug("
|
||||
"string::split("
|
||||
|
@ -338,5 +343,17 @@
|
|||
"type::string("
|
||||
"type::table("
|
||||
"type::thing("
|
||||
"vector::dotproduct(",
|
||||
"vector::magnitude(",
|
||||
"vector::distance::chebyshev(",
|
||||
"vector::distance::euclidean(",
|
||||
"vector::distance::hamming(",
|
||||
"vector::distance::mahalanobis(",
|
||||
"vector::distance::manhattan(",
|
||||
"vector::distance::minkowski(",
|
||||
"vector::similarity::cosine(",
|
||||
"vector::similarity::jaccard(",
|
||||
"vector::similarity::pearson(",
|
||||
"vector::similarity::spearman(",
|
||||
# TODO: Add Javascript keywords
|
||||
|
||||
|
|
|
@ -288,6 +288,8 @@
|
|||
"string"
|
||||
"string::concat("
|
||||
"string::contains("
|
||||
"string::distance::hamming("
|
||||
"string::distance::levenshtein("
|
||||
"string::endsWith("
|
||||
"string::join("
|
||||
"string::len("
|
||||
|
@ -295,6 +297,9 @@
|
|||
"string::repeat("
|
||||
"string::replace("
|
||||
"string::reverse("
|
||||
"string::similarity::fuzzy("
|
||||
"string::similarity::jaro("
|
||||
"string::similarity::smithwaterman("
|
||||
"string::slice("
|
||||
"string::slug("
|
||||
"string::split("
|
||||
|
@ -335,5 +340,17 @@
|
|||
"type::string("
|
||||
"type::table("
|
||||
"type::thing("
|
||||
"vector::dotproduct(",
|
||||
"vector::magnitude(",
|
||||
"vector::distance::chebyshev(",
|
||||
"vector::distance::euclidean(",
|
||||
"vector::distance::hamming(",
|
||||
"vector::distance::mahalanobis(",
|
||||
"vector::distance::manhattan(",
|
||||
"vector::distance::minkowski(",
|
||||
"vector::similarity::cosine(",
|
||||
"vector::similarity::jaccard(",
|
||||
"vector::similarity::pearson(",
|
||||
"vector::similarity::spearman(",
|
||||
# TODO: Add Javascript keywords
|
||||
|
||||
|
|
|
@ -248,6 +248,11 @@ pub fn synchronous(ctx: &Context<'_>, name: &str, args: Vec<Value>) -> Result<Va
|
|||
"string::trim" => string::trim,
|
||||
"string::uppercase" => string::uppercase,
|
||||
"string::words" => string::words,
|
||||
"string::distance::hamming" => string::distance::hamming,
|
||||
"string::distance::levenshtein" => string::distance::levenshtein,
|
||||
"string::similarity::fuzzy" => string::similarity::fuzzy,
|
||||
"string::similarity::jaro" => string::similarity::jaro,
|
||||
"string::similarity::smithwaterman" => string::similarity::smithwaterman,
|
||||
//
|
||||
"time::ceil" => time::ceil,
|
||||
"time::day" => time::day,
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
use super::run;
|
||||
use crate::fnc::script::modules::impl_module_def;
|
||||
|
||||
mod distance;
|
||||
mod similarity;
|
||||
pub struct Package;
|
||||
|
||||
impl_module_def!(
|
||||
|
@ -8,6 +10,7 @@ impl_module_def!(
|
|||
"string",
|
||||
"concat" => run,
|
||||
"contains" => run,
|
||||
"distance" => (distance::Package),
|
||||
"endsWith" => run,
|
||||
"join" => run,
|
||||
"len" => run,
|
||||
|
@ -15,6 +18,7 @@ impl_module_def!(
|
|||
"repeat" => run,
|
||||
"replace" => run,
|
||||
"reverse" => run,
|
||||
"similarity" => (similarity::Package),
|
||||
"slice" => run,
|
||||
"slug" => run,
|
||||
"split" => run,
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
use super::run;
|
||||
use crate::fnc::script::modules::impl_module_def;
|
||||
|
||||
/// Marker type that the `impl_module_def!` invocation below is applied to.
pub struct Package;
|
||||

||||
// Registers the "string::distance" module with two entries, "hamming" and
// "levenshtein", both mapped to `run`.
// NOTE(review): `impl_module_def!` and `run` are defined elsewhere; presumably
// `run` dispatches to the built-in function of the same qualified name — confirm.
impl_module_def!(
|
||||
Package,
|
||||
"string::distance",
|
||||
"hamming" => run,
|
||||
"levenshtein" => run
|
||||
);
|
|
@ -0,0 +1,12 @@
|
|||
use super::run;
|
||||
use crate::fnc::script::modules::impl_module_def;
|
||||
|
||||
/// Marker type that the `impl_module_def!` invocation below is applied to.
pub struct Package;
|
||||

||||
// Registers the "string::similarity" module with three entries, "fuzzy",
// "jaro" and "smithwaterman", all mapped to `run`.
// NOTE(review): `impl_module_def!` and `run` are defined elsewhere; presumably
// `run` dispatches to the built-in function of the same qualified name — confirm.
impl_module_def!(
|
||||
Package,
|
||||
"string::similarity",
|
||||
"fuzzy" => run,
|
||||
"jaro" => run,
|
||||
"smithwaterman" => run
|
||||
);
|
|
@ -110,7 +110,7 @@ pub fn slice((val, beg, lim): (String, Option<isize>, Option<isize>)) -> Result<
|
|||
}
|
||||
|
||||
pub fn slug((string,): (String,)) -> Result<Value, Error> {
|
||||
Ok(string::slug(string).into())
|
||||
Ok(string::slug::slug(string).into())
|
||||
}
|
||||
|
||||
pub fn split((val, chr): (String, String)) -> Result<Value, Error> {
|
||||
|
@ -133,6 +133,45 @@ pub fn words((string,): (String,)) -> Result<Value, Error> {
|
|||
Ok(string.split_whitespace().collect::<Vec<&str>>().into())
|
||||
}
|
||||
|
||||
/// String distance functions (`string::distance::*`).
///
/// Both functions are currently placeholders: they ignore their arguments and
/// return `Error::FeatureNotYetImplemented`.
pub mod distance {
|
||||

||||
use crate::err::Error;
|
||||
use crate::sql::Value;
|
||||

||||
/// Placeholder for the Hamming distance between two strings.
/// Always returns `Error::FeatureNotYetImplemented`; both arguments are ignored.
pub fn hamming((_, _): (String, String)) -> Result<Value, Error> {
|
||||
Err(Error::FeatureNotYetImplemented {
|
||||
feature: "string::distance::hamming() function",
|
||||
})
|
||||
}
|
||||

||||
/// Placeholder for the Levenshtein distance between two strings.
/// Always returns `Error::FeatureNotYetImplemented`; both arguments are ignored.
pub fn levenshtein((_, _): (String, String)) -> Result<Value, Error> {
|
||||
Err(Error::FeatureNotYetImplemented {
|
||||
feature: "string::distance::levenshtein() function",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// String similarity functions (`string::similarity::*`).
///
/// `fuzzy` and `smithwaterman` both return the score produced by
/// `Fuzzy::fuzzy_score`; `jaro` is a placeholder that always errors.
pub mod similarity {
|
||||

||||
use crate::err::Error;
|
||||
use crate::fnc::util::string::fuzzy::Fuzzy;
|
||||
use crate::sql::Value;
|
||||

||||
/// Returns the fuzzy score of `a` compared against `b`, converted into a `Value`.
pub fn fuzzy((a, b): (String, String)) -> Result<Value, Error> {
|
||||
Ok(a.as_str().fuzzy_score(b.as_str()).into())
|
||||
}
|
||||

||||
/// Placeholder for the Jaro similarity between two strings.
/// Always returns `Error::FeatureNotYetImplemented`; both arguments are ignored.
pub fn jaro((_, _): (String, String)) -> Result<Value, Error> {
|
||||
Err(Error::FeatureNotYetImplemented {
|
||||
feature: "string::similarity::jaro() function",
|
||||
})
|
||||
}
|
||||

||||
/// Returns the score of `a` compared against `b`, converted into a `Value`.
/// The body is identical to `fuzzy` above: both call `fuzzy_score`.
// NOTE(review): presumably intentional because the underlying matcher is
// Smith-Waterman based — confirm against the fuzzy_matcher documentation.
pub fn smithwaterman((a, b): (String, String)) -> Result<Value, Error> {
|
||||
Ok(a.as_str().fuzzy_score(b.as_str()).into())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{contains, slice};
|
||||
|
|
23
lib/src/fnc/util/string/fuzzy.rs
Normal file
23
lib/src/fnc/util/string/fuzzy.rs
Normal file
|
@ -0,0 +1,23 @@
|
|||
use fuzzy_matcher::skim::SkimMatcherV2;
|
||||
use fuzzy_matcher::FuzzyMatcher;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
static MATCHER: Lazy<SkimMatcherV2> = Lazy::new(|| SkimMatcherV2::default().ignore_case());
|
||||
|
||||
/// Fuzzy string comparison: a boolean match test and an integer score.
pub trait Fuzzy {
|
||||
/// Check if this &str matches another &str using a fuzzy algorithm
|
||||
fn fuzzy_match(&self, other: &str) -> bool;
|
||||
/// Retrieve the fuzzy similarity score of this &str compared to another &str
|
||||
fn fuzzy_score(&self, other: &str) -> i64;
|
||||
}
|
||||
|
||||
// Both methods use the shared MATCHER, which is built with `ignore_case()`.
impl Fuzzy for str {
|
||||
/// Check if this &str matches another &str using a fuzzy algorithm
|
||||
fn fuzzy_match(&self, other: &str) -> bool {
|
||||
MATCHER.fuzzy_match(self, other).is_some()
|
||||
}
|
||||
/// Retrieve the fuzzy similarity score of this &str compared to another &str, or 0 when the matcher reports no match
|
||||
fn fuzzy_score(&self, other: &str) -> i64 {
|
||||
MATCHER.fuzzy_match(self, other).unwrap_or(0)
|
||||
}
|
||||
}
|
|
@ -1,23 +1,2 @@
|
|||
use ascii::any_ascii as ascii;
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
static SIMPLES: Lazy<Regex> = Lazy::new(|| Regex::new("[^a-z0-9-_]").unwrap());
|
||||
static HYPHENS: Lazy<Regex> = Lazy::new(|| Regex::new("-+").unwrap());
|
||||
|
||||
pub fn slug<S: AsRef<str>>(s: S) -> String {
|
||||
// Get a reference
|
||||
let s = s.as_ref();
|
||||
// Convert unicode to ascii
|
||||
let mut s = ascii(s);
|
||||
// Convert string to lowercase
|
||||
s.make_ascii_lowercase();
|
||||
// Replace any non-simple characters
|
||||
let s = SIMPLES.replace_all(s.as_ref(), "-");
|
||||
// Replace any duplicated hyphens
|
||||
let s = HYPHENS.replace_all(s.as_ref(), "-");
|
||||
// Remove any surrounding hyphens
|
||||
let s = s.trim_matches('-');
|
||||
// Return the string
|
||||
s.to_owned()
|
||||
}
|
||||
pub mod fuzzy;
|
||||
pub mod slug;
|
||||
|
|
23
lib/src/fnc/util/string/slug.rs
Normal file
23
lib/src/fnc/util/string/slug.rs
Normal file
|
@ -0,0 +1,23 @@
|
|||
use ascii::any_ascii as ascii;
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
static SIMPLES: Lazy<Regex> = Lazy::new(|| Regex::new("[^a-z0-9-_]").unwrap());
|
||||
static HYPHENS: Lazy<Regex> = Lazy::new(|| Regex::new("-+").unwrap());
|
||||
|
||||
/// Converts a string into a slug.
///
/// The input is converted to ASCII and lowercased, every character matched by
/// `SIMPLES` (anything other than `a-z`, `0-9`, `-` and `_`) is replaced with a
/// hyphen, runs of hyphens are collapsed to one, and leading and trailing
/// hyphens are trimmed. Returns a newly allocated `String`.
pub fn slug<S: AsRef<str>>(s: S) -> String {
|
||||
// Get a reference
|
||||
let s = s.as_ref();
|
||||
// Convert unicode to ascii
|
||||
let mut s = ascii(s);
|
||||
// Convert string to lowercase
|
||||
s.make_ascii_lowercase();
|
||||
// Replace any non-simple characters
|
||||
let s = SIMPLES.replace_all(s.as_ref(), "-");
|
||||
// Replace any duplicated hyphens
|
||||
let s = HYPHENS.replace_all(s.as_ref(), "-");
|
||||
// Remove any surrounding hyphens
|
||||
let s = s.trim_matches('-');
|
||||
// Return the string
|
||||
s.to_owned()
|
||||
}
|
|
@ -508,6 +508,8 @@ fn function_string(i: &str) -> IResult<&str, &str> {
|
|||
tag("trim"),
|
||||
tag("uppercase"),
|
||||
tag("words"),
|
||||
preceded(tag("distance::"), alt((tag("hamming"), tag("levenshtein")))),
|
||||
preceded(tag("similarity::"), alt((tag("fuzzy"), tag("jaro"), tag("smithwaterman")))),
|
||||
))(i)
|
||||
}
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ use crate::ctx::Context;
|
|||
use crate::dbs::{Options, Transaction};
|
||||
use crate::doc::CursorDoc;
|
||||
use crate::err::Error;
|
||||
use crate::fnc::util::string::fuzzy::Fuzzy;
|
||||
use crate::sql::array::Uniq;
|
||||
use crate::sql::array::{array, Array};
|
||||
use crate::sql::block::{block, Block};
|
||||
|
@ -41,8 +42,6 @@ use crate::sql::uuid::{uuid as unique, Uuid};
|
|||
use async_recursion::async_recursion;
|
||||
use chrono::{DateTime, Utc};
|
||||
use derive::Store;
|
||||
use fuzzy_matcher::skim::SkimMatcherV2;
|
||||
use fuzzy_matcher::FuzzyMatcher;
|
||||
use geo::Point;
|
||||
use nom::branch::alt;
|
||||
use nom::bytes::complete::tag_no_case;
|
||||
|
@ -50,7 +49,6 @@ use nom::character::complete::char;
|
|||
use nom::combinator::{map, opt};
|
||||
use nom::multi::separated_list0;
|
||||
use nom::multi::separated_list1;
|
||||
use once_cell::sync::Lazy;
|
||||
use rust_decimal::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value as Json;
|
||||
|
@ -62,8 +60,6 @@ use std::ops::Deref;
|
|||
use std::ops::Neg;
|
||||
use std::str::FromStr;
|
||||
|
||||
static MATCHER: Lazy<SkimMatcherV2> = Lazy::new(|| SkimMatcherV2::default().ignore_case());
|
||||
|
||||
pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Value";
|
||||
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, Hash)]
|
||||
|
@ -2322,11 +2318,11 @@ impl Value {
|
|||
pub fn fuzzy(&self, other: &Value) -> bool {
|
||||
match self {
|
||||
Value::Uuid(v) => match other {
|
||||
Value::Strand(w) => MATCHER.fuzzy_match(v.to_raw().as_str(), w.as_str()).is_some(),
|
||||
Value::Strand(w) => v.to_raw().as_str().fuzzy_match(w.as_str()),
|
||||
_ => false,
|
||||
},
|
||||
Value::Strand(v) => match other {
|
||||
Value::Strand(w) => MATCHER.fuzzy_match(v.as_str(), w.as_str()).is_some(),
|
||||
Value::Strand(w) => v.as_str().fuzzy_match(w.as_str()),
|
||||
_ => false,
|
||||
},
|
||||
_ => self.equal(other),
|
||||
|
|
|
@ -3609,6 +3609,70 @@ async fn function_string_reverse() -> Result<(), Error> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
// Runs five string::similarity::fuzzy() queries against an in-memory datastore
// and checks the integer score returned by each, in statement order.
#[tokio::test]
|
||||
async fn function_string_similarity_fuzzy() -> Result<(), Error> {
|
||||
let sql = r#"
|
||||
RETURN string::similarity::fuzzy("", "");
|
||||
RETURN string::similarity::fuzzy("some", "text");
|
||||
RETURN string::similarity::fuzzy("text", "TEXT");
|
||||
RETURN string::similarity::fuzzy("TEXT", "TEXT");
|
||||
RETURN string::similarity::fuzzy("this could be a tricky test", "this test");
|
||||
"#;
|
||||
let dbs = Datastore::new("memory").await?;
|
||||
let ses = Session::for_kv().with_ns("test").with_db("test");
|
||||
let res = &mut dbs.execute(&sql, &ses, None).await?;
|
||||
assert_eq!(res.len(), 5);
|
||||
// Two empty strings score 0
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(0));
|
||||
// "some" compared with "text" scores 0
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(0));
|
||||
// "text" compared with "TEXT" (differing case) scores 83
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(83));
|
||||
// "TEXT" compared with "TEXT" (identical) scores 91
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(91));
|
||||
// The longer sentence compared with "this test" scores 174
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(174));
|
||||
//
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Runs five string::similarity::smithwaterman() queries against an in-memory
// datastore and checks the integer score returned by each, in statement order.
// The expected scores are the same values asserted for string::similarity::fuzzy().
#[tokio::test]
|
||||
async fn function_string_similarity_smithwaterman() -> Result<(), Error> {
|
||||
let sql = r#"
|
||||
RETURN string::similarity::smithwaterman("", "");
|
||||
RETURN string::similarity::smithwaterman("some", "text");
|
||||
RETURN string::similarity::smithwaterman("text", "TEXT");
|
||||
RETURN string::similarity::smithwaterman("TEXT", "TEXT");
|
||||
RETURN string::similarity::smithwaterman("this could be a tricky test", "this test");
|
||||
"#;
|
||||
let dbs = Datastore::new("memory").await?;
|
||||
let ses = Session::for_kv().with_ns("test").with_db("test");
|
||||
let res = &mut dbs.execute(&sql, &ses, None).await?;
|
||||
assert_eq!(res.len(), 5);
|
||||
// Two empty strings score 0
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(0));
|
||||
// "some" compared with "text" scores 0
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(0));
|
||||
// "text" compared with "TEXT" (differing case) scores 83
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(83));
|
||||
// "TEXT" compared with "TEXT" (identical) scores 91
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(91));
|
||||
// The longer sentence compared with "this test" scores 174
|
||||
let tmp = res.remove(0).result?;
|
||||
assert_eq!(tmp, Value::from(174));
|
||||
//
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn function_string_slice() -> Result<(), Error> {
|
||||
let sql = r#"
|
||||
|
|
Loading…
Reference in a new issue