From 98a482e471fa25621de95b0d236b022092b238c6 Mon Sep 17 00:00:00 2001 From: Emmanuel Keller Date: Sun, 16 Jul 2023 14:04:22 +0100 Subject: [PATCH] Implements additional function for vectors (#2266) --- lib/fuzz/fuzz_targets/fuzz_executor.dict | 35 +- lib/fuzz/fuzz_targets/fuzz_sql_parser.dict | 35 +- lib/src/fnc/mod.rs | 10 +- .../modules/surrealdb/functions/vector.rs | 13 +- lib/src/fnc/util/math/deviation.rs | 9 +- lib/src/fnc/util/math/dotproduct.rs | 15 - lib/src/fnc/util/math/euclideandistance.rs | 21 - lib/src/fnc/util/math/magnitude.rs | 12 - lib/src/fnc/util/math/mod.rs | 4 +- lib/src/fnc/util/math/variance.rs | 21 +- lib/src/fnc/util/math/vector.rs | 289 ++++++++++ lib/src/fnc/vector.rs | 106 ++-- lib/src/sql/function.rs | 10 +- lib/src/sql/number.rs | 27 +- lib/tests/function.rs | 528 ++++++++++++++---- 15 files changed, 890 insertions(+), 245 deletions(-) delete mode 100644 lib/src/fnc/util/math/dotproduct.rs delete mode 100644 lib/src/fnc/util/math/euclideandistance.rs delete mode 100644 lib/src/fnc/util/math/magnitude.rs create mode 100644 lib/src/fnc/util/math/vector.rs diff --git a/lib/fuzz/fuzz_targets/fuzz_executor.dict b/lib/fuzz/fuzz_targets/fuzz_executor.dict index 3d1c2d69..79243a9f 100644 --- a/lib/fuzz/fuzz_targets/fuzz_executor.dict +++ b/lib/fuzz/fuzz_targets/fuzz_executor.dict @@ -276,6 +276,9 @@ "uuid" "rand::uuid::v4(" "rand::uuid::v7(" +"search::score(" +"search::highlight(" +"search::offsets(" "session" "session::" "session::db(" @@ -343,17 +346,25 @@ "type::string(" "type::table(" "type::thing(" -"vector::dotproduct(", -"vector::magnitude(", -"vector::distance::chebyshev(", -"vector::distance::euclidean(", -"vector::distance::hamming(", -"vector::distance::mahalanobis(", -"vector::distance::manhattan(", -"vector::distance::minkowski(", -"vector::similarity::cosine(", -"vector::similarity::jaccard(", -"vector::similarity::pearson(", -"vector::similarity::spearman(", +"vector::add(" +"vector::angle(" +"vector::cross(" +"vector::divide(" +"vector::dot(" +"vector::magnitude(" +"vector::multiply(" +"vector::normalize(" +"vector::project(" +"vector::subtract(" +"vector::distance::chebyshev(" +"vector::distance::euclidean(" +"vector::distance::hamming(" +"vector::distance::mahalanobis(" +"vector::distance::manhattan(" +"vector::distance::minkowski(" +"vector::similarity::cosine(" +"vector::similarity::jaccard(" +"vector::similarity::pearson(" +"vector::similarity::spearman(" # TODO: Add Javascript keywords diff --git a/lib/fuzz/fuzz_targets/fuzz_sql_parser.dict b/lib/fuzz/fuzz_targets/fuzz_sql_parser.dict index 0028154f..b9fbb3fa 100644 --- a/lib/fuzz/fuzz_targets/fuzz_sql_parser.dict +++ b/lib/fuzz/fuzz_targets/fuzz_sql_parser.dict @@ -276,6 +276,9 @@ "uuid" "rand::uuid::v4(" "rand::uuid::v7(" +"search::score(" +"search::highlight(" +"search::offsets(" "session" "session::" "session::db(" @@ -340,17 +343,25 @@ "type::string(" "type::table(" "type::thing(" -"vector::dotproduct(", -"vector::magnitude(", -"vector::distance::chebyshev(", -"vector::distance::euclidean(", -"vector::distance::hamming(", -"vector::distance::mahalanobis(", -"vector::distance::manhattan(", -"vector::distance::minkowski(", -"vector::similarity::cosine(", -"vector::similarity::jaccard(", -"vector::similarity::pearson(", -"vector::similarity::spearman(", +"vector::add(" +"vector::angle(" +"vector::cross(" +"vector::divide(" +"vector::dot(" +"vector::magnitude(" +"vector::multiply(" +"vector::normalize(" +"vector::project(" +"vector::subtract(" +"vector::distance::chebyshev(" +"vector::distance::euclidean(" +"vector::distance::hamming(" +"vector::distance::mahalanobis(" +"vector::distance::manhattan(" +"vector::distance::minkowski(" +"vector::similarity::cosine(" +"vector::similarity::jaccard(" +"vector::similarity::pearson(" +"vector::similarity::spearman(" # TODO: Add Javascript keywords diff --git a/lib/src/fnc/mod.rs b/lib/src/fnc/mod.rs index 4ae4a206..98c9d8b2 100644 --- a/lib/src/fnc/mod.rs +++ b/lib/src/fnc/mod.rs @@ -289,8 +289,16 @@ pub fn synchronous(ctx: &Context<'_>, name: &str, args: Vec) -> Result r#type::table, "type::thing" => r#type::thing, // - "vector::dotproduct" => vector::dotproduct, + "vector::add" => vector::add, + "vector::angle" => vector::angle, + "vector::cross" => vector::cross, + "vector::dot" => vector::dot, + "vector::divide" => vector::divide, "vector::magnitude" => vector::magnitude, + "vector::multiply" => vector::multiply, + "vector::normalize" => vector::normalize, + "vector::project" => vector::project, + "vector::subtract" => vector::subtract, "vector::distance::chebyshev" => vector::distance::chebyshev, "vector::distance::euclidean" => vector::distance::euclidean, "vector::distance::hamming" => vector::distance::hamming, diff --git a/lib/src/fnc/script/modules/surrealdb/functions/vector.rs b/lib/src/fnc/script/modules/surrealdb/functions/vector.rs index 661cfebf..8feaf527 100644 --- a/lib/src/fnc/script/modules/surrealdb/functions/vector.rs +++ b/lib/src/fnc/script/modules/surrealdb/functions/vector.rs @@ -3,13 +3,22 @@ use crate::fnc::script::modules::impl_module_def; mod distance; mod similarity; + pub struct Package; impl_module_def!( Package, "vector", "distance" => (distance::Package), - "dotproduct" => run, + "similarity" => (similarity::Package), + "add" => run, + "angle" => run, + "cross" => run, + "divide" => run, + "dot" => run, "magnitude" => run, - "similarity" => (similarity::Package) + "multiply" => run, + "normalize" => run, + "project" => run, + "subtract" => run ); diff --git a/lib/src/fnc/util/math/deviation.rs b/lib/src/fnc/util/math/deviation.rs index 260db06d..d02e081d 100644 --- a/lib/src/fnc/util/math/deviation.rs +++ b/lib/src/fnc/util/math/deviation.rs @@ -1,4 +1,5 @@ -use super::variance::Variance; +use crate::fnc::util::math::mean::Mean; +use crate::fnc::util::math::variance::variance; use crate::sql::number::Number; pub trait Deviation { @@ -8,6 +9,10 @@ pub trait Deviation { impl Deviation for Vec { fn deviation(self, sample: bool) -> f64 { - self.variance(sample).sqrt() + deviation(&self, self.mean(), sample) } } + +pub(super) fn deviation(v: &[Number], mean: f64, sample: bool) -> f64 { + variance(v, mean, sample).sqrt() +} diff --git a/lib/src/fnc/util/math/dotproduct.rs b/lib/src/fnc/util/math/dotproduct.rs deleted file mode 100644 index 4ed130d4..00000000 --- a/lib/src/fnc/util/math/dotproduct.rs +++ /dev/null @@ -1,15 +0,0 @@ -use crate::sql::Number; - -pub trait DotProduct { - /// Dot Product of two vectors - fn dotproduct(&self, other: &Self) -> Option; -} - -impl DotProduct for Vec { - fn dotproduct(&self, other: &Self) -> Option { - if self.len() != other.len() { - return None; - } - Some(self.iter().zip(other.iter()).map(|(a, b)| a * b).sum()) - } -} diff --git a/lib/src/fnc/util/math/euclideandistance.rs b/lib/src/fnc/util/math/euclideandistance.rs deleted file mode 100644 index bc384ac4..00000000 --- a/lib/src/fnc/util/math/euclideandistance.rs +++ /dev/null @@ -1,21 +0,0 @@ -use crate::sql::Number; - -pub trait EuclideanDistance { - /// Euclidean Distance between two vectors (L2 Norm) - fn euclidean_distance(&self, other: &Self) -> Option; -} - -impl EuclideanDistance for Vec { - fn euclidean_distance(&self, other: &Self) -> Option { - if self.len() != other.len() { - return None; - } - Some( - self.iter() - .zip(other.iter()) - .map(|(a, b)| (a - b).pow(Number::Int(2))) - .sum::() - .sqrt(), - ) - } -} diff --git a/lib/src/fnc/util/math/magnitude.rs b/lib/src/fnc/util/math/magnitude.rs deleted file mode 100644 index 0160045d..00000000 --- a/lib/src/fnc/util/math/magnitude.rs +++ /dev/null @@ -1,12 +0,0 @@ -use crate::sql::Number; - -pub trait Magnitude { - /// Calculate the magnitude of a vector - fn magnitude(&self) -> Number; -} - -impl Magnitude for Vec { - fn magnitude(&self) -> Number { - self.iter().map(|a| a.clone().pow(Number::Int(2))).sum::().sqrt() - } -} diff --git a/lib/src/fnc/util/math/mod.rs b/lib/src/fnc/util/math/mod.rs index 6f5c6ebc..9021daa3 100644 --- a/lib/src/fnc/util/math/mod.rs +++ b/lib/src/fnc/util/math/mod.rs @@ -4,10 +4,7 @@ pub mod bottom; pub mod deviation; -pub mod dotproduct; -pub mod euclideandistance; pub mod interquartile; -pub mod magnitude; pub mod mean; pub mod median; pub mod midhinge; @@ -19,3 +16,4 @@ pub mod spread; pub mod top; pub mod trimean; pub mod variance; +pub mod vector; diff --git a/lib/src/fnc/util/math/variance.rs b/lib/src/fnc/util/math/variance.rs index b38f18e8..4132a3fe 100644 --- a/lib/src/fnc/util/math/variance.rs +++ b/lib/src/fnc/util/math/variance.rs @@ -9,15 +9,18 @@ pub trait Variance { impl Variance for Vec { fn variance(self, sample: bool) -> f64 { - match self.len() { - 0 => f64::NAN, - 1 => 0.0, - len => { - let mean = self.mean(); - let len = (len - sample as usize) as f64; - let out = self.iter().map(|x| (x.to_float() - mean).powi(2)).sum::() / len; - out - } + variance(&self, self.mean(), sample) + } +} + +pub(super) fn variance(v: &[Number], mean: f64, sample: bool) -> f64 { + match v.len() { + 0 => f64::NAN, + 1 => 0.0, + len => { + let len = (len - sample as usize) as f64; + let out = v.iter().map(|x| (x.to_float() - mean).powi(2)).sum::() / len; + out } } } diff --git a/lib/src/fnc/util/math/vector.rs b/lib/src/fnc/util/math/vector.rs new file mode 100644 index 00000000..eb8e3a65 --- /dev/null +++ b/lib/src/fnc/util/math/vector.rs @@ -0,0 +1,289 @@ +use crate::err::Error; +use crate::fnc::util::math::deviation::deviation; +use crate::fnc::util::math::mean::Mean; +use crate::sql::Number; +use std::collections::HashSet; + +pub trait Add { + /// Addition of two vectors + fn add(&self, other: &Self) -> Result, Error>; +} + +fn check_same_dimension(fnc: &str, a: &Vec, b: &Vec) -> Result<(), Error> { + if a.len() != b.len() { + Err(Error::InvalidArguments { + name: String::from(fnc), + message: String::from("The two vectors must be of the same dimension."), + }) + } else { + Ok(()) + } +} + +impl Add for Vec { + fn add(&self, other: &Self) -> Result, Error> { + check_same_dimension("vector::add", self, other)?; + Ok(self.iter().zip(other.iter()).map(|(a, b)| a + b).collect()) + } +} + +pub trait Angle { + /// Compute the angle between two vectors + fn angle(&self, other: &Self) -> Result; +} + +impl Angle for Vec { + fn angle(&self, other: &Self) -> Result { + check_same_dimension("vector::angle", self, other)?; + let dp = dot(self, other); + let m = self.magnitude() * other.magnitude(); + let d = vector_div(&dp, &m); + Ok(d.acos()) + } +} + +pub trait CosineSimilarity { + fn cosine_similarity(&self, other: &Self) -> Result; +} + +impl CosineSimilarity for Vec { + fn cosine_similarity(&self, other: &Self) -> Result { + check_same_dimension("vector::similarity::cosine", self, other)?; + let d = dot(self, other); + Ok(d / (self.magnitude() * other.magnitude())) + } +} + +pub trait Divide { + /// Division of two vectors + fn divide(&self, other: &Self) -> Result, Error>; +} + +fn vector_div(a: &Number, b: &Number) -> Number { + if a.is_nan() || b.is_nan() || b.is_zero() { + Number::NAN + } else { + a / b + } +} + +impl Divide for Vec { + fn divide(&self, other: &Self) -> Result, Error> { + check_same_dimension("vector::divide", self, other)?; + Ok(self.iter().zip(other.iter()).map(|(a, b)| vector_div(a, b)).collect()) + } +} + +pub trait HammingDistance { + fn hamming_distance(&self, other: &Self) -> Result; +} + +impl HammingDistance for Vec { + fn hamming_distance(&self, other: &Self) -> Result { + check_same_dimension("vector::distance::hamming", self, other)?; + Ok(self.iter().zip(other.iter()).filter(|&(a, b)| a != b).count().into()) + } +} + +pub trait JaccardSimilarity { + fn jaccard_similarity(&self, other: &Self) -> Result; +} + +impl JaccardSimilarity for Vec { + fn jaccard_similarity(&self, other: &Self) -> Result { + let set_a: HashSet<_> = HashSet::from_iter(self.iter()); + let set_b: HashSet<_> = HashSet::from_iter(other.iter()); + let intersection_size = set_a.intersection(&set_b).count() as f64; + let union_size = set_a.union(&set_b).count() as f64; + Ok((intersection_size / union_size).into()) + } +} + +pub trait PearsonSimilarity { + fn pearson_similarity(&self, other: &Self) -> Result; +} + +impl PearsonSimilarity for Vec { + fn pearson_similarity(&self, other: &Self) -> Result { + check_same_dimension("vector::similarity::pearson", self, other)?; + let m1 = self.mean(); + let m2 = other.mean(); + let covar: f64 = self + .iter() + .zip(other.iter()) + .map(|(x, y)| (x.to_float() - m1) * (y.to_float() - m2)) + .sum(); + let covar = covar / self.len() as f64; + let std_dev1 = deviation(self, m1, false); + let std_dev2 = deviation(other, m2, false); + Ok((covar / (std_dev1 * std_dev2)).into()) + } +} + +pub trait ManhattanDistance { + fn manhattan_distance(&self, other: &Self) -> Result; +} + +impl ManhattanDistance for Vec { + fn manhattan_distance(&self, other: &Self) -> Result { + check_same_dimension("vector::distance::manhattan", self, other)?; + Ok(self.iter().zip(other.iter()).map(|(a, b)| (a - b).abs()).sum()) + } +} + +pub trait MinkowskiDistance { + fn minkowski_distance(&self, other: &Self, order: Number) -> Result; +} + +impl MinkowskiDistance for Vec { + fn minkowski_distance(&self, other: &Self, order: Number) -> Result { + check_same_dimension("vector::distance::minkowski", self, other)?; + let p = order.to_float(); + let dist: f64 = self + .iter() + .zip(other.iter()) + .map(|(a, b)| (a.to_float() - b.to_float()).abs().powf(p)) + .sum(); + Ok(dist.powf(1.0 / p).into()) + } +} + +pub trait Multiply { + /// Multiplication of two vectors + fn multiply(&self, other: &Self) -> Result, Error>; +} + +impl Multiply for Vec { + fn multiply(&self, other: &Self) -> Result, Error> { + check_same_dimension("vector::multiply", self, other)?; + Ok(self.iter().zip(other.iter()).map(|(a, b)| a * b).collect()) + } +} + +pub trait Project { + /// Projection of two vectors + fn project(&self, other: &Self) -> Result, Error>; +} + +impl Project for Vec { + fn project(&self, other: &Self) -> Result, Error> { + check_same_dimension("vector::project", self, other)?; + let d = dot(self, other); + let m = magnitude_squared(other).into(); + let s = vector_div(&d, &m); + Ok(other.iter().map(|x| &s * x).collect()) + } +} + +pub trait ChebyshevDistance { + fn chebyshev_distance(&self, other: &Self) -> Result; +} + +impl ChebyshevDistance for Vec { + fn chebyshev_distance(&self, other: &Self) -> Result { + check_same_dimension("vector::distance::chebyshev", self, other)?; + Ok(self + .iter() + .zip(other.iter()) + .map(|(a, b)| (a.to_float() - b.to_float()).abs()) + .fold(f64::MIN, f64::max) + .into()) + } +} + +pub trait Subtract { + /// Subtraction of two vectors + fn subtract(&self, other: &Self) -> Result, Error>; +} + +impl Subtract for Vec { + fn subtract(&self, other: &Self) -> Result, Error> { + check_same_dimension("vector::subtract", self, other)?; + Ok(self.iter().zip(other.iter()).map(|(a, b)| a - b).collect()) + } +} + +pub trait CrossProduct { + /// Cross product of two vectors + fn cross(&self, other: &Self) -> Result, Error>; +} + +impl CrossProduct for Vec { + fn cross(&self, other: &Self) -> Result, Error> { + if self.len() != 3 || other.len() != 3 { + return Err(Error::InvalidArguments { + name: "vector::cross".to_string(), + message: String::from("Both vectors must have a dimension of 3."), + }); + } + let a0 = &self[0]; + let a1 = &self[1]; + let a2 = &self[2]; + let b0 = &other[0]; + let b1 = &other[1]; + let b2 = &other[2]; + let v = vec![a1 * b2 - a2 * b1, a2 * b0 - a0 * b2, a0 * b1 - a1 * b0]; + Ok(v) + } +} + +pub trait DotProduct { + /// Dot Product of two vectors + fn dot(&self, other: &Self) -> Result; +} + +impl DotProduct for Vec { + fn dot(&self, other: &Self) -> Result { + check_same_dimension("vector::dot", self, other)?; + Ok(dot(self, other)) + } +} + +fn dot(a: &[Number], b: &[Number]) -> Number { + a.iter().zip(b.iter()).map(|(a, b)| a * b).sum() +} + +pub trait EuclideanDistance { + /// Euclidean Distance between two vectors (L2 Norm) + fn euclidean_distance(&self, other: &Self) -> Result; +} + +impl EuclideanDistance for Vec { + fn euclidean_distance(&self, other: &Self) -> Result { + check_same_dimension("vector::distance::euclidean", self, other)?; + Ok(self + .iter() + .zip(other.iter()) + .map(|(a, b)| (a - b).to_float().powi(2)) + .sum::() + .sqrt() + .into()) + } +} + +fn magnitude_squared(v: &[Number]) -> f64 { + v.iter().map(|a| a.to_float().powi(2)).sum::() +} + +pub trait Magnitude { + /// Calculate the magnitude of a vector + fn magnitude(&self) -> Number; +} + +impl Magnitude for Vec { + fn magnitude(&self) -> Number { + magnitude_squared(self).sqrt().into() + } +} + +pub trait Normalize { + /// Normalize a vector + fn normalize(&self) -> Vec; +} + +impl Normalize for Vec { + fn normalize(&self) -> Vec { + let m = self.magnitude(); + self.iter().map(|a| vector_div(a, &m)).collect() + } +} diff --git a/lib/src/fnc/vector.rs b/lib/src/fnc/vector.rs index cf2e0985..2bcec9f6 100644 --- a/lib/src/fnc/vector.rs +++ b/lib/src/fnc/vector.rs @@ -1,48 +1,67 @@ use crate::err::Error; -use crate::fnc::util::math::dotproduct::DotProduct; -use crate::fnc::util::math::magnitude::Magnitude; +use crate::fnc::util::math::vector::{ + Add, Angle, CrossProduct, Divide, DotProduct, Magnitude, Multiply, Normalize, Project, Subtract, +}; use crate::sql::{Number, Value}; -pub fn dotproduct((a, b): (Vec, Vec)) -> Result { - match a.dotproduct(&b) { - None => Err(Error::InvalidArguments { - name: String::from("vector::dotproduct"), - message: String::from("The two vectors must be of the same length."), - }), - Some(dot) => Ok(dot.into()), - } +pub fn add((a, b): (Vec, Vec)) -> Result { + Ok(a.add(&b)?.into()) +} + +pub fn angle((a, b): (Vec, Vec)) -> Result { + Ok(a.angle(&b)?.into()) +} + +pub fn divide((a, b): (Vec, Vec)) -> Result { + Ok(a.divide(&b)?.into()) +} + +pub fn cross((a, b): (Vec, Vec)) -> Result { + Ok(a.cross(&b)?.into()) +} + +pub fn dot((a, b): (Vec, Vec)) -> Result { + Ok(a.dot(&b)?.into()) } pub fn magnitude((a,): (Vec,)) -> Result { Ok(a.magnitude().into()) } +pub fn multiply((a, b): (Vec, Vec)) -> Result { + Ok(a.multiply(&b)?.into()) +} + +pub fn normalize((a,): (Vec,)) -> Result { + Ok(a.normalize().into()) +} + +pub fn project((a, b): (Vec, Vec)) -> Result { + Ok(a.project(&b)?.into()) +} + +pub fn subtract((a, b): (Vec, Vec)) -> Result { + Ok(a.subtract(&b)?.into()) +} + pub mod distance { use crate::err::Error; - use crate::fnc::util::math::euclideandistance::EuclideanDistance; + use crate::fnc::util::math::vector::{ + ChebyshevDistance, EuclideanDistance, HammingDistance, ManhattanDistance, MinkowskiDistance, + }; use crate::sql::{Number, Value}; - pub fn chebyshev((_, _): (Vec, Vec)) -> Result { - Err(Error::FeatureNotYetImplemented { - feature: "vector::distance::chebyshev() function", - }) + pub fn chebyshev((a, b): (Vec, Vec)) -> Result { + Ok(a.chebyshev_distance(&b)?.into()) } pub fn euclidean((a, b): (Vec, Vec)) -> Result { - match a.euclidean_distance(&b) { - None => Err(Error::InvalidArguments { - name: String::from("vector::distance::euclidean"), - message: String::from("The two vectors must be of the same length."), - }), - Some(distance) => Ok(distance.into()), - } + Ok(a.euclidean_distance(&b)?.into()) } - pub fn hamming((_, _): (Vec, Vec)) -> Result { - Err(Error::FeatureNotYetImplemented { - feature: "vector::distance::hamming() function", - }) + pub fn hamming((a, b): (Vec, Vec)) -> Result { + Ok(a.hamming_distance(&b)?.into()) } pub fn mahalanobis((_, _): (Vec, Vec)) -> Result { @@ -51,46 +70,31 @@ pub mod distance { }) } - pub fn manhattan((_, _): (Vec, Vec)) -> Result { - Err(Error::FeatureNotYetImplemented { - feature: "vector::distance::manhattan() function", - }) + pub fn manhattan((a, b): (Vec, Vec)) -> Result { + Ok(a.manhattan_distance(&b)?.into()) } - pub fn minkowski((_, _): (Vec, Vec)) -> Result { - Err(Error::FeatureNotYetImplemented { - feature: "vector::distance::minkowski() function", - }) + pub fn minkowski((a, b, o): (Vec, Vec, Number)) -> Result { + Ok(a.minkowski_distance(&b, o)?.into()) } } pub mod similarity { use crate::err::Error; - use crate::fnc::util::math::dotproduct::DotProduct; - use crate::fnc::util::math::magnitude::Magnitude; + use crate::fnc::util::math::vector::{CosineSimilarity, JaccardSimilarity, PearsonSimilarity}; use crate::sql::{Number, Value}; pub fn cosine((a, b): (Vec, Vec)) -> Result { - match a.dotproduct(&b) { - None => Err(Error::InvalidArguments { - name: String::from("vector::similarity::cosine"), - message: String::from("The two vectors must be of the same length."), - }), - Some(dot) => Ok((dot / (a.magnitude() * b.magnitude())).into()), - } + Ok(a.cosine_similarity(&b)?.into()) } - pub fn jaccard((_, _): (Vec, Vec)) -> Result { - Err(Error::FeatureNotYetImplemented { - feature: "vector::similarity::jaccard() function", - }) + pub fn jaccard((a, b): (Vec, Vec)) -> Result { + Ok(a.jaccard_similarity(&b)?.into()) } - pub fn pearson((_, _): (Vec, Vec)) -> Result { - Err(Error::FeatureNotYetImplemented { - feature: "vector::similarity::pearson() function", - }) + pub fn pearson((a, b): (Vec, Vec)) -> Result { + Ok(a.pearson_similarity(&b)?.into()) } pub fn spearman((_, _): (Vec, Vec)) -> Result { diff --git a/lib/src/sql/function.rs b/lib/src/sql/function.rs index 3cd767f0..081091b1 100644 --- a/lib/src/sql/function.rs +++ b/lib/src/sql/function.rs @@ -555,8 +555,16 @@ fn function_type(i: &str) -> IResult<&str, &str> { fn function_vector(i: &str) -> IResult<&str, &str> { alt(( - tag("dotproduct"), + tag("add"), + tag("angle"), + tag("divide"), + tag("cross"), + tag("dot"), tag("magnitude"), + tag("multiply"), + tag("normalize"), + tag("project"), + tag("subtract"), preceded( tag("distance::"), alt(( diff --git a/lib/src/sql/number.rs b/lib/src/sql/number.rs index d9fd758a..46805c2c 100644 --- a/lib/src/sql/number.rs +++ b/lib/src/sql/number.rs @@ -234,6 +234,14 @@ impl Number { } } + pub fn is_zero(&self) -> bool { + match self { + Number::Int(v) => v == &0, + Number::Float(v) => v == &0.0, + Number::Decimal(v) => v == &Decimal::ZERO, + } + } + pub fn is_zero_or_positive(&self) -> bool { match self { Number::Int(v) => v >= &0, @@ -334,6 +342,10 @@ impl Number { } } + pub fn acos(self) -> Self { + self.to_float().acos().into() + } + pub fn ceil(self) -> Self { match self { Number::Int(v) => v.into(), @@ -637,7 +649,7 @@ impl Sort for Vec { } } -pub fn number(i: &str) -> IResult<&str, Number> { +fn not_nan(i: &str) -> IResult<&str, Number> { let (i, v) = recognize_float(i)?; let (i, suffix) = suffix(i)?; let (i, _) = ending(i)?; @@ -649,6 +661,10 @@ pub fn number(i: &str) -> IResult<&str, Number> { Ok((i, number)) } +pub fn number(i: &str) -> IResult<&str, Number> { + alt((map(tag("NaN"), |_| Number::NAN), not_nan))(i) +} + #[derive(Debug)] enum Suffix { None, @@ -691,6 +707,15 @@ mod tests { assert!(!decimal_is_integer(&Decimal::HALF_PI)); } + #[test] + fn number_nan() { + let sql = "NaN"; + let res = number(sql); + assert!(res.is_ok()); + let out = res.unwrap().1; + assert_eq!("NaN", format!("{}", out)); + } + #[test] fn number_int() { let sql = "123"; diff --git a/lib/tests/function.rs b/lib/tests/function.rs index b390871d..36e990b3 100644 --- a/lib/tests/function.rs +++ b/lib/tests/function.rs @@ -13,16 +13,19 @@ async fn test_queries(sql: &str, desired_responses: &[&str]) -> Result<(), Error let v = r?; if let Some(desired_response) = desired_responses.get(i) { let desired_value = Value::parse(*desired_response); - assert_eq!( - v, - desired_value, - "Recieved responce did not match \ + // If both values are NaN, they are equal from a test PoV + if !desired_value.is_nan() || !v.is_nan() { + assert_eq!( + v, + desired_value, + "Received response did not match \ expected. - Query responce #{}, - Desired responce: {desired_value}, + Query response #{}, + Desired response: {desired_value}, Actual response: {v}", - i + 1 - ); + i + 1 + ); + } } else { panic!("Response index {i} out of bounds of desired responses."); } @@ -30,6 +33,31 @@ async fn test_queries(sql: &str, desired_responses: &[&str]) -> Result<(), Error Ok(()) } +async fn check_test_is_error(sql: &str, expected_errors: &[&str]) -> Result<(), Error> { + let db = Datastore::new("memory").await?; + let session = Session::for_kv().with_ns("test").with_db("test"); + let response = db.execute(sql, &session, None).await?; + if response.len() != expected_errors.len() { + panic!( + "Wrong number of responses {} - expected {}.", + response.len(), + expected_errors.len() + ); + } + for (i, r) in response.into_iter().map(|r| r.result).enumerate() { + if let Some(expected_error) = expected_errors.get(i) { + if let Err(e) = r { + assert_eq!(e.to_string().as_str(), *expected_error) + } else { + panic!("Response index {i} is not an error."); + } + } else { + panic!("Response index {i} out of bounds of expected responses."); + } + } + Ok(()) +} + // -------------------------------------------------- // array // -------------------------------------------------- @@ -4640,123 +4668,417 @@ async fn function_type_thing() -> Result<(), Error> { } #[tokio::test] -async fn function_vector_distance_euclidean() -> Result<(), Error> { - let sql = r#" - RETURN vector::distance::euclidean([1, 2, 3], [1, 2, 3]); - RETURN vector::distance::euclidean([1, 2, 3], [-1, -2, -3]); - RETURN vector::distance::euclidean([1, 2, 3], [4, 5]); - RETURN vector::distance::euclidean([1, 2], [4, 5, 5]); - "#; - - let dbs = Datastore::new("memory").await?; - let ses = Session::for_kv().with_ns("test").with_db("test"); - let res = &mut dbs.execute(&sql, &ses, None).await?; - assert_eq!(res.len(), 4); - // - let tmp = res.remove(0).result?; - let val = Value::from(0); - assert_eq!(tmp, val); - // - let tmp = res.remove(0).result?; - let val = Value::from(7.483314773547883); - assert_eq!(tmp, val); - // - let tmp = res.remove(0).result; - assert!(tmp.is_err()); - // - let tmp = res.remove(0).result; - assert!(tmp.is_err()); +async fn function_vector_add() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::add([1, 2, 3], [1, 2, 3]); + RETURN vector::add([1, 2, 3], [-1, -2, -3]); + "#, + &["[2, 4, 6]", "[0, 0, 0]"], + ) + .await?; + check_test_is_error( + r#" + RETURN vector::add([1, 2, 3], [4, 5]); + RETURN vector::add([1, 2], [4, 5, 5]); + "#, + &[ + "Incorrect arguments for function vector::add(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::add(). The two vectors must be of the same dimension." + ], + ) + .await?; Ok(()) } #[tokio::test] -async fn function_vector_dotproduct() -> Result<(), Error> { - let sql = r#" - RETURN vector::dotproduct([1, 2, 3], [1, 2, 3]); - RETURN vector::dotproduct([1, 2, 3], [-1, -2, -3]); - RETURN vector::dotproduct([1, 2, 3], [4, 5]); - RETURN vector::dotproduct([1, 2], [4, 5, 5]); - "#; +async fn function_vector_angle() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::angle([1,0,0], [0,1,0]); + RETURN vector::angle([5, 10, 15], [10, 5, 20]); + RETURN vector::angle([-3, 2, 5], [4, -1, 2]); + RETURN vector::angle([NaN, 2, 3], [-1, -2, NaN]); + "#, + &["1.5707963267948966", "0.36774908225917935", "1.7128722906354115", "NaN"], + ) + .await?; - let dbs = Datastore::new("memory").await?; - let ses = Session::for_kv().with_ns("test").with_db("test"); - let res = &mut dbs.execute(&sql, &ses, None).await?; - assert_eq!(res.len(), 4); - // - let tmp = res.remove(0).result?; - let val = Value::from(14); - assert_eq!(tmp, val); - // - let tmp = res.remove(0).result?; - let val = Value::from(-14); - assert_eq!(tmp, val); - // - let tmp = res.remove(0).result; - assert!(tmp.is_err()); - // - let tmp = res.remove(0).result; - assert!(tmp.is_err()); + check_test_is_error( + r#" + RETURN vector::angle([1, 2, 3], [4, 5]); + RETURN vector::angle([1, 2], [4, 5, 5]); + "#, + &[ + "Incorrect arguments for function vector::angle(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::angle(). The two vectors must be of the same dimension." + ], + ).await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_cross() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::cross([1, 2, 3], [4, 5, 6]); + RETURN vector::cross([1, 2, 3], [-4, -5, -6]); + RETURN vector::cross([1, NaN, 3], [NaN, -5, -6]); + "#, + &["[-3, 6, -3]", "[3, -6, 3]", "[NaN, NaN, NaN]"], + ) + .await?; + check_test_is_error( + r#" + RETURN vector::cross([1, 2, 3], [4, 5]); + RETURN vector::cross([1, 2], [4, 5, 5]); + "#, + &[ + "Incorrect arguments for function vector::cross(). Both vectors must have a dimension of 3.", + "Incorrect arguments for function vector::cross(). Both vectors must have a dimension of 3." + ], + ) + .await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_dot() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::dot([1, 2, 3], [1, 2, 3]); + RETURN vector::dot([1, 2, 3], [-1, -2, -3]); + "#, + &["14", "-14"], + ) + .await?; + + check_test_is_error( + r#" + RETURN vector::dot([1, 2, 3], [4, 5]); + RETURN vector::dot([1, 2], [4, 5, 5]); + "#, + &[ + "Incorrect arguments for function vector::dot(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::dot(). The two vectors must be of the same dimension." + ], + ).await?; Ok(()) } #[tokio::test] async fn function_vector_magnitude() -> Result<(), Error> { - let sql = r#" + test_queries( + r#" RETURN vector::magnitude([]); RETURN vector::magnitude([1]); RETURN vector::magnitude([5]); RETURN vector::magnitude([1,2,3,3,3,4,5]); - "#; + "#, + &["0", "1", "5", "8.54400374531753"], + ) + .await +} - let dbs = Datastore::new("memory").await?; - let ses = Session::for_kv().with_ns("test").with_db("test"); - let res = &mut dbs.execute(&sql, &ses, None).await?; - assert_eq!(res.len(), 4); - // - let tmp = res.remove(0).result?; - let val = Value::from(0); - assert_eq!(tmp, val); - // - let tmp = res.remove(0).result?; - let val = Value::from(1); - assert_eq!(tmp, val); - // - let tmp = res.remove(0).result?; - let val = Value::from(5); - assert_eq!(tmp, val); - // - let tmp = res.remove(0).result?; - let val = Value::from(8.54400374531753); - assert_eq!(tmp, val); +#[tokio::test] +async fn function_vector_normalize() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::normalize([]); + RETURN vector::normalize([1]); + RETURN vector::normalize([5]); + RETURN vector::normalize([4,3]); + "#, + &["[]", "[1]", "[1]", "[0.8,0.6]"], + ) + .await +} + +#[tokio::test] +async fn function_vector_multiply() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::multiply([1, 2, 3], [1, 2, 3]); + RETURN vector::multiply([1, 2, 3], [-1, -2, -3]); + "#, + &["[1, 4, 9]", "[-1, -4, -9]"], + ) + .await?; + check_test_is_error( + r#" + RETURN vector::multiply([1, 2, 3], [4, 5]); + RETURN vector::multiply([1, 2], [4, 5, 5]); + "#, + &[ + "Incorrect arguments for function vector::multiply(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::multiply(). The two vectors must be of the same dimension." + ], + ) + .await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_project() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::project([1, 2, 3], [4, 5, 6]); + RETURN vector::project([1, -2, 3], [-4, 5, 6]); + RETURN vector::project([NaN, -2, 3], [-4, NaN, NaN]); + "#, + &[ + "[1.6623376623376624, 2.077922077922078, 2.4935064935064934]", + "[-0.2077922077922078, 0.25974025974025977, 0.3116883116883117]", + "[NaN, NaN, NaN]", + ], + ) + .await?; + check_test_is_error( + r#" + RETURN vector::project([1, 2, 3], [4, 5]); + RETURN vector::project([1, 2], [4, 5, 5]); + "#, + &[ + "Incorrect arguments for function vector::project(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::project(). The two vectors must be of the same dimension." + ], + ) + .await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_divide() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::divide([10, NaN, 20, 30, 0], [0, 1, 2, 0, 4]); + RETURN vector::divide([10, -20, 30, 0], [0, -1, 2, -3]); + "#, + &["[NaN, NaN, 10, NaN, 0]", "[NaN, 20, 15, 0]"], + ) + .await?; + check_test_is_error( + r#" + RETURN vector::divide([1, 2, 3], [4, 5]); + RETURN vector::divide([1, 2], [4, 5, 5]); + "#, + &[ + "Incorrect arguments for function vector::divide(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::divide(). The two vectors must be of the same dimension." + ], + ) + .await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_subtract() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::subtract([1, 2, 3], [1, 2, 3]); + RETURN vector::subtract([1, 2, 3], [-1, -2, -3]); + "#, + &["[0, 0, 0]", "[2, 4, 6]"], + ) + .await?; + check_test_is_error( + r#" + RETURN vector::subtract([1, 2, 3], [4, 5]); + RETURN vector::subtract([1, 2], [4, 5, 5]); + "#, + &[ + "Incorrect arguments for function vector::subtract(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::subtract(). The two vectors must be of the same dimension." + ], + ) + .await?; Ok(()) } #[tokio::test] async fn function_vector_similarity_cosine() -> Result<(), Error> { - let sql = r#" + test_queries( + r#" RETURN vector::similarity::cosine([1, 2, 3], [1, 2, 3]); RETURN vector::similarity::cosine([1, 2, 3], [-1, -2, -3]); - RETURN vector::similarity::cosine([1, 2, 3], [4, 5]); - RETURN vector::similarity::cosine([1, 2], [4, 5, 5]); - "#; + RETURN vector::similarity::cosine([NaN, 1, 2, 3], [NaN, 1, 2, 3]); + RETURN vector::similarity::cosine([10, 50, 200], [400, 100, 20]); + "#, + &["1.0", "-1.0", "NaN", "0.15258215962441316"], + ) + .await?; - let dbs = Datastore::new("memory").await?; - let ses = Session::for_kv().with_ns("test").with_db("test"); - let res = &mut dbs.execute(&sql, &ses, None).await?; - assert_eq!(res.len(), 4); - // - let tmp = res.remove(0).result?; - let val = Value::from(1.0); - assert_eq!(tmp, val); - // - let tmp = res.remove(0).result?; - let val = Value::from(-1.0); - assert_eq!(tmp, val); - // - let tmp = res.remove(0).result; - assert!(tmp.is_err()); - // - let tmp = res.remove(0).result; - assert!(tmp.is_err()); + check_test_is_error( + r"RETURN vector::similarity::cosine([1, 2, 3], [4, 5]); + RETURN vector::similarity::cosine([1, 2], [4, 5, 5]);", + &[ + "Incorrect arguments for function vector::similarity::cosine(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::similarity::cosine(). The two vectors must be of the same dimension." + ]).await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_similarity_jaccard() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::similarity::jaccard([1, 2, 3], [3, 2, 1]); + RETURN vector::similarity::jaccard([1, 2, 3], [-3, -2, -1]); + RETURN vector::similarity::jaccard([1, -2, 3, -4], [4, 3, 2, 1]); + RETURN vector::similarity::jaccard([NaN, 1, 2, 3], [NaN, 2, 3, 4]); + RETURN vector::similarity::jaccard([0,1,2,5,6], [0,2,3,4,5,7,9]); + "#, + &["1.0", "0", "0.3333333333333333", "0.6", "0.3333333333333333"], + ) + .await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_similarity_pearson() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::similarity::pearson([1, 2, 3, 4, 5], [1, 2.5, 3.5, 4.2, 5.1]); + RETURN vector::similarity::pearson([NaN, 1, 2, 3, 4, 5], [NaN, 1, 2.5, 3.5, 4.2, 5.1]); + RETURN vector::similarity::pearson([1,2,3], [1,5,7]); + "#, + &["0.9894065340659606", "NaN", "0.9819805060619659"], + ) + .await?; + + check_test_is_error( + r"RETURN vector::similarity::pearson([1, 2, 3], [4, 5]); + RETURN vector::similarity::pearson([1, 2], [4, 5, 5]);", + &[ + "Incorrect arguments for function vector::similarity::pearson(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::similarity::pearson(). The two vectors must be of the same dimension." + ]).await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_distance_euclidean() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::distance::euclidean([1, 2, 3], [1, 2, 3]); + RETURN vector::distance::euclidean([NaN, 2, 3], [-1, NaN, -3]); + RETURN vector::distance::euclidean([1, 2, 3], [-1, -2, -3]); + RETURN vector::distance::euclidean([10, 50, 200], [400, 100, 20]); + RETURN vector::distance::euclidean([10, 20, 15, 10, 5], [12, 24, 18, 8, 7]); + "#, + &["0", "NaN", "7.483314773547883", "432.43496620879307", "6.082762530298219"], + ) + .await?; + check_test_is_error( + r"RETURN vector::distance::euclidean([1, 2, 3], [4, 5]); + RETURN vector::distance::euclidean([1, 2], [4, 5, 5]);", + &[ + "Incorrect arguments for function vector::distance::euclidean(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::distance::euclidean(). The two vectors must be of the same dimension." + ]).await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_distance_manhattan() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::distance::manhattan([1, 2, 3], [4, 5, 6]); + RETURN vector::distance::manhattan([1, 2, 3], [-4, -5, -6]); + RETURN vector::distance::manhattan([1.1, 2, 3.3], [4, 5.5, 6.6]); + RETURN vector::distance::manhattan([NaN, 1, 2, 3], [NaN, 4, 5, 6]); + RETURN vector::distance::manhattan([10, 20, 15, 10, 5], [12, 24, 18, 8, 7]); + "#, + &["9", "21", "9.7", "NaN", "13"], + ) + .await?; + + check_test_is_error( + r"RETURN vector::distance::manhattan([1, 2, 3], [4, 5]); + RETURN vector::distance::manhattan([1, 2], [4, 5, 5]);", + &[ + "Incorrect arguments for function vector::distance::manhattan(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::distance::manhattan(). The two vectors must be of the same dimension." + ]).await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_distance_hamming() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::distance::hamming([1, 2, 2], [1, 2, 3]); + RETURN vector::distance::hamming([-1, -2, -3], [-2, -2, -2]); + RETURN vector::distance::hamming([1.1, 2.2, -3.3], [1.1, 2, -3.3]); + RETURN vector::distance::hamming([NaN, 1, 2, 3], [NaN, 1, 2, 3]); + RETURN vector::distance::hamming([0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 1, 0]); + "#, + &["1", "2", "1", "0", "2"], + ) + .await?; + + check_test_is_error( + r"RETURN vector::distance::hamming([1, 2, 3], [4, 5]); + RETURN vector::distance::hamming([1, 2], [4, 5, 5]);", + &[ + "Incorrect arguments for function vector::distance::hamming(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::distance::hamming(). The two vectors must be of the same dimension." + ]).await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_distance_minkowski() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::distance::minkowski([1, 2, 3], [4, 5, 6], 3); + RETURN vector::distance::minkowski([-1, -2, -3], [-4, -5, -6], 3); + RETURN vector::distance::minkowski([1.1, 2.2, 3], [4, 5.5, 6.6], 3); + RETURN vector::distance::minkowski([NaN, 1, 2, 3], [NaN, 4, 5, 6], 3); + RETURN vector::distance::minkowski([10, 20, 15, 10, 5], [12, 24, 18, 8, 7], 1); + RETURN vector::distance::minkowski([10, 20, 15, 10, 5], [12, 24, 18, 8, 7], 2); + "#, + &[ + "4.3267487109222245", + "4.3267487109222245", + "4.747193170917638", + "NaN", + "13.0", + "6.082762530298219", + ], + ) + .await?; + + check_test_is_error( + r"RETURN vector::distance::minkowski([1, 2, 3], [4, 5], 3); + RETURN vector::distance::minkowski([1, 2], [4, 5, 5], 3);", + &[ + "Incorrect arguments for function vector::distance::minkowski(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::distance::minkowski(). The two vectors must be of the same dimension." + ]).await?; + Ok(()) +} + +#[tokio::test] +async fn function_vector_distance_chebyshev() -> Result<(), Error> { + test_queries( + r#" + RETURN vector::distance::chebyshev([1, 2, 3], [4, 5, 6]); + RETURN vector::distance::chebyshev([-1, -2, -3], [-4, -5, -6]); + RETURN vector::distance::chebyshev([1.1, 2.2, 3], [4, 5.5, 6.6]); + RETURN vector::distance::chebyshev([NaN, 1, 2, 3], [NaN, 4, 5, 6]); + RETURN vector::distance::chebyshev([2, 4, 5, 3, 8, 2], [3, 1, 5, -3, 7, 2]); + "#, + &["3.0", "3.0", "3.5999999999999996", "3.0", "6.0"], + ) + .await?; + + check_test_is_error( + r"RETURN vector::distance::chebyshev([1, 2, 3], [4, 5]); + RETURN vector::distance::chebyshev([1, 2], [4, 5, 5]);", + &[ + "Incorrect arguments for function vector::distance::chebyshev(). The two vectors must be of the same dimension.", + "Incorrect arguments for function vector::distance::chebyshev(). The two vectors must be of the same dimension." + ]).await?; Ok(()) }