Feature: Add additional functions for comparing strings for similarity. (#1904)

Co-authored-by: Tobie Morgan Hitchcock <tobie@surrealdb.com>
This commit is contained in:
Eduardo Pereira de Sousa 2023-07-14 17:37:52 -03:00 committed by GitHub
parent b83cd86f9d
commit b3a1b39236
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 223 additions and 31 deletions

View file

@ -289,6 +289,8 @@
"string"
"string::concat("
"string::contains("
"string::distance::hamming("
"string::distance::levenshtein("
"string::endsWith("
"string::join("
"string::len("
@ -296,6 +298,9 @@
"string::repeat("
"string::replace("
"string::reverse("
"string::similarity::fuzzy("
"string::similarity::jaro("
"string::similarity::smithwaterman("
"string::slice("
"string::slug("
"string::split("
@ -338,5 +343,17 @@
"type::string("
"type::table("
"type::thing("
"vector::dotproduct("
"vector::magnitude("
"vector::distance::chebyshev("
"vector::distance::euclidean("
"vector::distance::hamming("
"vector::distance::mahalanobis("
"vector::distance::manhattan("
"vector::distance::minkowski("
"vector::similarity::cosine("
"vector::similarity::jaccard("
"vector::similarity::pearson("
"vector::similarity::spearman("
# TODO: Add Javascript keywords

View file

@ -288,6 +288,8 @@
"string"
"string::concat("
"string::contains("
"string::distance::hamming("
"string::distance::levenshtein("
"string::endsWith("
"string::join("
"string::len("
@ -295,6 +297,9 @@
"string::repeat("
"string::replace("
"string::reverse("
"string::similarity::fuzzy("
"string::similarity::jaro("
"string::similarity::smithwaterman("
"string::slice("
"string::slug("
"string::split("
@ -335,5 +340,17 @@
"type::string("
"type::table("
"type::thing("
"vector::dotproduct("
"vector::magnitude("
"vector::distance::chebyshev("
"vector::distance::euclidean("
"vector::distance::hamming("
"vector::distance::mahalanobis("
"vector::distance::manhattan("
"vector::distance::minkowski("
"vector::similarity::cosine("
"vector::similarity::jaccard("
"vector::similarity::pearson("
"vector::similarity::spearman("
# TODO: Add Javascript keywords

View file

@ -248,6 +248,11 @@ pub fn synchronous(ctx: &Context<'_>, name: &str, args: Vec<Value>) -> Result<Va
"string::trim" => string::trim,
"string::uppercase" => string::uppercase,
"string::words" => string::words,
"string::distance::hamming" => string::distance::hamming,
"string::distance::levenshtein" => string::distance::levenshtein,
"string::similarity::fuzzy" => string::similarity::fuzzy,
"string::similarity::jaro" => string::similarity::jaro,
"string::similarity::smithwaterman" => string::similarity::smithwaterman,
//
"time::ceil" => time::ceil,
"time::day" => time::day,

View file

@ -1,6 +1,8 @@
use super::run;
use crate::fnc::script::modules::impl_module_def;
mod distance;
mod similarity;
pub struct Package;
impl_module_def!(
@ -8,6 +10,7 @@ impl_module_def!(
"string",
"concat" => run,
"contains" => run,
"distance" => (distance::Package),
"endsWith" => run,
"join" => run,
"len" => run,
@ -15,6 +18,7 @@ impl_module_def!(
"repeat" => run,
"replace" => run,
"reverse" => run,
"similarity" => (similarity::Package),
"slice" => run,
"slug" => run,
"split" => run,

View file

@ -0,0 +1,11 @@
use super::run;
use crate::fnc::script::modules::impl_module_def;
/// Marker type for the `string::distance` script module; it is handed to
/// `impl_module_def!` below together with the module's entries.
pub struct Package;
// Declares the entries `hamming` and `levenshtein` under the
// `string::distance` name, each mapped to the `run` handler imported from the
// parent module.
impl_module_def!(
Package,
"string::distance",
"hamming" => run,
"levenshtein" => run
);

View file

@ -0,0 +1,12 @@
use super::run;
use crate::fnc::script::modules::impl_module_def;
/// Marker type for the `string::similarity` script module; it is handed to
/// `impl_module_def!` below together with the module's entries.
pub struct Package;
// Declares the entries `fuzzy`, `jaro` and `smithwaterman` under the
// `string::similarity` name, each mapped to the `run` handler imported from
// the parent module.
impl_module_def!(
Package,
"string::similarity",
"fuzzy" => run,
"jaro" => run,
"smithwaterman" => run
);

View file

@ -110,7 +110,7 @@ pub fn slice((val, beg, lim): (String, Option<isize>, Option<isize>)) -> Result<
}
pub fn slug((string,): (String,)) -> Result<Value, Error> {
Ok(string::slug(string).into())
Ok(string::slug::slug(string).into())
}
pub fn split((val, chr): (String, String)) -> Result<Value, Error> {
@ -133,6 +133,45 @@ pub fn words((string,): (String,)) -> Result<Value, Error> {
Ok(string.split_whitespace().collect::<Vec<&str>>().into())
}
/// String distance functions (`string::distance::*`).
///
/// Both functions are placeholders for now: they accept two strings and
/// unconditionally report that the feature is not implemented.
pub mod distance {
    use crate::err::Error;
    use crate::sql::Value;

    /// Placeholder for the Hamming distance between two strings.
    ///
    /// # Errors
    ///
    /// Always returns [`Error::FeatureNotYetImplemented`]; the arguments are
    /// not inspected.
    pub fn hamming(_args: (String, String)) -> Result<Value, Error> {
        let feature = "string::distance::hamming() function";
        Err(Error::FeatureNotYetImplemented {
            feature,
        })
    }

    /// Placeholder for the Levenshtein distance between two strings.
    ///
    /// # Errors
    ///
    /// Always returns [`Error::FeatureNotYetImplemented`]; the arguments are
    /// not inspected.
    pub fn levenshtein(_args: (String, String)) -> Result<Value, Error> {
        let feature = "string::distance::levenshtein() function";
        Err(Error::FeatureNotYetImplemented {
            feature,
        })
    }
}
/// String similarity functions (`string::similarity::*`).
pub mod similarity {
    use crate::err::Error;
    use crate::fnc::util::string::fuzzy::Fuzzy;
    use crate::sql::Value;

    /// Scores how well `a` fuzzy-matches `b`.
    ///
    /// The score comes from [`Fuzzy::fuzzy_score`] and is converted into a
    /// [`Value`]. This function itself never returns an error.
    pub fn fuzzy((a, b): (String, String)) -> Result<Value, Error> {
        let score = a.as_str().fuzzy_score(b.as_str());
        Ok(score.into())
    }

    /// Placeholder for the Jaro similarity between two strings.
    ///
    /// # Errors
    ///
    /// Always returns [`Error::FeatureNotYetImplemented`]; the arguments are
    /// not inspected.
    pub fn jaro(_args: (String, String)) -> Result<Value, Error> {
        let feature = "string::similarity::jaro() function";
        Err(Error::FeatureNotYetImplemented {
            feature,
        })
    }

    /// Scores how well the first string matches the second.
    ///
    /// Currently produces exactly the same score as [`fuzzy`], so it simply
    /// forwards its arguments there.
    pub fn smithwaterman(args: (String, String)) -> Result<Value, Error> {
        fuzzy(args)
    }
}
#[cfg(test)]
mod tests {
use super::{contains, slice};

View file

@ -0,0 +1,23 @@
use fuzzy_matcher::skim::SkimMatcherV2;
use fuzzy_matcher::FuzzyMatcher;
use once_cell::sync::Lazy;
/// Shared matcher used by the `Fuzzy` implementation below: a default
/// `SkimMatcherV2` with `ignore_case()` applied, created lazily via `Lazy`.
static MATCHER: Lazy<SkimMatcherV2> = Lazy::new(|| SkimMatcherV2::default().ignore_case());
/// Fuzzy comparison of one string against another.
pub trait Fuzzy {
/// Check if this &str matches another &str using a fuzzy algorithm
fn fuzzy_match(&self, other: &str) -> bool;
/// Retrieve the fuzzy similarity score of this &str compared to another &str
fn fuzzy_score(&self, other: &str) -> i64;
}
impl Fuzzy for str {
    /// Check if this &str matches another &str using a fuzzy algorithm
    ///
    /// Returns `true` exactly when the shared matcher yields a score.
    fn fuzzy_match(&self, other: &str) -> bool {
        matches!(MATCHER.fuzzy_match(self, other), Some(_))
    }

    /// Retrieve the fuzzy similarity score of this &str compared to another &str
    ///
    /// Returns `0` when the shared matcher yields no score.
    fn fuzzy_score(&self, other: &str) -> i64 {
        MATCHER.fuzzy_match(self, other).unwrap_or_default()
    }
}

View file

@ -1,23 +1,2 @@
use ascii::any_ascii as ascii;
use once_cell::sync::Lazy;
use regex::Regex;
static SIMPLES: Lazy<Regex> = Lazy::new(|| Regex::new("[^a-z0-9-_]").unwrap());
static HYPHENS: Lazy<Regex> = Lazy::new(|| Regex::new("-+").unwrap());
pub fn slug<S: AsRef<str>>(s: S) -> String {
// Get a reference
let s = s.as_ref();
// Convert unicode to ascii
let mut s = ascii(s);
// Convert string to lowercase
s.make_ascii_lowercase();
// Replace any non-simple characters
let s = SIMPLES.replace_all(s.as_ref(), "-");
// Replace any duplicated hyphens
let s = HYPHENS.replace_all(s.as_ref(), "-");
// Remove any surrounding hyphens
let s = s.trim_matches('-');
// Return the string
s.to_owned()
}
pub mod fuzzy;
pub mod slug;

View file

@ -0,0 +1,23 @@
use ascii::any_ascii as ascii;
use once_cell::sync::Lazy;
use regex::Regex;
/// Matches any single character other than `a`-`z`, `0`-`9`, `-` or `_`.
static SIMPLES: Lazy<Regex> = Lazy::new(|| Regex::new("[^a-z0-9-_]").unwrap());
/// Matches a run of one or more consecutive hyphens.
static HYPHENS: Lazy<Regex> = Lazy::new(|| Regex::new("-+").unwrap());
/// Turns arbitrary text into a slug.
///
/// The input is transliterated to ASCII and lowercased. Every character that
/// `SIMPLES` matches is replaced by a hyphen, runs of hyphens are collapsed
/// into one, and hyphens at either end are removed.
pub fn slug<S: AsRef<str>>(s: S) -> String {
    // Transliterate to ASCII, then lowercase the owned buffer in place
    let mut text = ascii(s.as_ref());
    text.make_ascii_lowercase();
    // Anything outside the allowed character set becomes a hyphen
    let hyphenated = SIMPLES.replace_all(text.as_str(), "-");
    // Squash consecutive hyphens down to a single one
    let collapsed = HYPHENS.replace_all(hyphenated.as_ref(), "-");
    // Drop leading and trailing hyphens and hand back an owned copy
    collapsed.trim_matches('-').to_owned()
}

View file

@ -508,6 +508,8 @@ fn function_string(i: &str) -> IResult<&str, &str> {
tag("trim"),
tag("uppercase"),
tag("words"),
preceded(tag("distance::"), alt((tag("hamming"), tag("levenshtein")))),
preceded(tag("similarity::"), alt((tag("fuzzy"), tag("jaro"), tag("smithwaterman")))),
))(i)
}

View file

@ -4,6 +4,7 @@ use crate::ctx::Context;
use crate::dbs::{Options, Transaction};
use crate::doc::CursorDoc;
use crate::err::Error;
use crate::fnc::util::string::fuzzy::Fuzzy;
use crate::sql::array::Uniq;
use crate::sql::array::{array, Array};
use crate::sql::block::{block, Block};
@ -41,8 +42,6 @@ use crate::sql::uuid::{uuid as unique, Uuid};
use async_recursion::async_recursion;
use chrono::{DateTime, Utc};
use derive::Store;
use fuzzy_matcher::skim::SkimMatcherV2;
use fuzzy_matcher::FuzzyMatcher;
use geo::Point;
use nom::branch::alt;
use nom::bytes::complete::tag_no_case;
@ -50,7 +49,6 @@ use nom::character::complete::char;
use nom::combinator::{map, opt};
use nom::multi::separated_list0;
use nom::multi::separated_list1;
use once_cell::sync::Lazy;
use rust_decimal::prelude::*;
use serde::{Deserialize, Serialize};
use serde_json::Value as Json;
@ -62,8 +60,6 @@ use std::ops::Deref;
use std::ops::Neg;
use std::str::FromStr;
static MATCHER: Lazy<SkimMatcherV2> = Lazy::new(|| SkimMatcherV2::default().ignore_case());
pub(crate) const TOKEN: &str = "$surrealdb::private::sql::Value";
#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize, Hash)]
@ -2322,11 +2318,11 @@ impl Value {
pub fn fuzzy(&self, other: &Value) -> bool {
match self {
Value::Uuid(v) => match other {
Value::Strand(w) => MATCHER.fuzzy_match(v.to_raw().as_str(), w.as_str()).is_some(),
Value::Strand(w) => v.to_raw().as_str().fuzzy_match(w.as_str()),
_ => false,
},
Value::Strand(v) => match other {
Value::Strand(w) => MATCHER.fuzzy_match(v.as_str(), w.as_str()).is_some(),
Value::Strand(w) => v.as_str().fuzzy_match(w.as_str()),
_ => false,
},
_ => self.equal(other),

View file

@ -3609,6 +3609,70 @@ async fn function_string_reverse() -> Result<(), Error> {
Ok(())
}
#[tokio::test]
async fn function_string_similarity_fuzzy() -> Result<(), Error> {
    // One RETURN statement per entry of `expected` below, in the same order
    let sql = r#"
RETURN string::similarity::fuzzy("", "");
RETURN string::similarity::fuzzy("some", "text");
RETURN string::similarity::fuzzy("text", "TEXT");
RETURN string::similarity::fuzzy("TEXT", "TEXT");
RETURN string::similarity::fuzzy("this could be a tricky test", "this test");
"#;
    let expected = [0, 0, 83, 91, 174];
    let dbs = Datastore::new("memory").await?;
    let ses = Session::for_kv().with_ns("test").with_db("test");
    let res = &mut dbs.execute(&sql, &ses, None).await?;
    assert_eq!(res.len(), expected.len());
    // Pop the responses front to back and compare each with its expected score
    for want in expected {
        let got = res.remove(0).result?;
        assert_eq!(got, Value::from(want));
    }
    Ok(())
}
#[tokio::test]
async fn function_string_similarity_smithwaterman() -> Result<(), Error> {
    // One RETURN statement per entry of `expected` below, in the same order
    let sql = r#"
RETURN string::similarity::smithwaterman("", "");
RETURN string::similarity::smithwaterman("some", "text");
RETURN string::similarity::smithwaterman("text", "TEXT");
RETURN string::similarity::smithwaterman("TEXT", "TEXT");
RETURN string::similarity::smithwaterman("this could be a tricky test", "this test");
"#;
    let expected = [0, 0, 83, 91, 174];
    let dbs = Datastore::new("memory").await?;
    let ses = Session::for_kv().with_ns("test").with_db("test");
    let res = &mut dbs.execute(&sql, &ses, None).await?;
    assert_eq!(res.len(), expected.len());
    // Pop the responses front to back and compare each with its expected score
    for want in expected {
        let got = res.remove(0).result?;
        assert_eq!(got, Value::from(want));
    }
    Ok(())
}
#[tokio::test]
async fn function_string_slice() -> Result<(), Error> {
let sql = r#"