search::highlight option to highlight only the searched characters (edgengram, ngram) (#3719)
This commit is contained in:
parent
00bc9db496
commit
45c296a270
10 changed files with 300 additions and 125 deletions
core/src
lib/tests
tests
|
@ -1,6 +1,7 @@
|
|||
use crate::err::Error;
|
||||
use crate::sql::value::Value;
|
||||
use crate::sql::{Array, Bytes, Datetime, Duration, Kind, Number, Object, Regex, Strand, Thing};
|
||||
use std::vec::IntoIter;
|
||||
|
||||
/// Implemented by types that are commonly used, in a certain way, as arguments.
|
||||
pub trait FromArg: Sized {
|
||||
|
@ -186,6 +187,7 @@ impl_tuple!(0,);
|
|||
impl_tuple!(1, A);
|
||||
impl_tuple!(2, A, B);
|
||||
impl_tuple!(3, A, B, C);
|
||||
impl_tuple!(4, A, B, C, D);
|
||||
|
||||
// Some functions take a single, optional argument, or no arguments at all.
|
||||
impl<A: FromArg> FromArgs for (Option<A>,) {
|
||||
|
@ -240,6 +242,34 @@ impl<A: FromArg, B: FromArg> FromArgs for (A, Option<B>) {
|
|||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_arg<T: FromArg, E: Fn() -> Error>(
|
||||
name: &str,
|
||||
pos: usize,
|
||||
args: &mut IntoIter<Value>,
|
||||
err: E,
|
||||
) -> Result<T, Error> {
|
||||
T::from_arg(args.next().ok_or_else(err)?).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument {pos} was the wrong type. {e}"),
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_opt_arg<T: FromArg>(
|
||||
name: &str,
|
||||
pos: usize,
|
||||
args: &mut IntoIter<Value>,
|
||||
) -> Result<Option<T>, Error> {
|
||||
Ok(match args.next() {
|
||||
Some(v) => Some(T::from_arg(v).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument {pos} was the wrong type. {e}"),
|
||||
})?),
|
||||
None => None,
|
||||
})
|
||||
}
|
||||
|
||||
// Some functions take 2 or 3 arguments, so the third argument is optional.
|
||||
impl<A: FromArg, B: FromArg, C: FromArg> FromArgs for (A, B, Option<C>) {
|
||||
fn from_args(name: &str, args: Vec<Value>) -> Result<Self, Error> {
|
||||
|
@ -249,24 +279,11 @@ impl<A: FromArg, B: FromArg, C: FromArg> FromArgs for (A, B, Option<C>) {
|
|||
};
|
||||
// Process the function arguments
|
||||
let mut args = args.into_iter();
|
||||
// Process the first function argument
|
||||
let a = A::from_arg(args.next().ok_or_else(err)?).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 1 was the wrong type. {e}"),
|
||||
})?;
|
||||
// Process the second function argument
|
||||
let b = B::from_arg(args.next().ok_or_else(err)?).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 2 was the wrong type. {e}"),
|
||||
})?;
|
||||
// Process the third function argument
|
||||
let c = match args.next() {
|
||||
Some(c) => Some(C::from_arg(c).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 3 was the wrong type. {e}"),
|
||||
})?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let a: A = get_arg(name, 1, &mut args, err)?;
|
||||
let b: B = get_arg(name, 2, &mut args, err)?;
|
||||
let c: Option<C> = get_opt_arg(name, 3, &mut args)?;
|
||||
|
||||
// Process additional function arguments
|
||||
if args.next().is_some() {
|
||||
// Too many arguments
|
||||
|
@ -276,6 +293,30 @@ impl<A: FromArg, B: FromArg, C: FromArg> FromArgs for (A, B, Option<C>) {
|
|||
}
|
||||
}
|
||||
|
||||
// Some functions take 3 or 4 arguments, so the fourth argument is optional.
|
||||
impl<A: FromArg, B: FromArg, C: FromArg, D: FromArg> FromArgs for (A, B, C, Option<D>) {
|
||||
fn from_args(name: &str, args: Vec<Value>) -> Result<Self, Error> {
|
||||
let err = || Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: String::from("Expected 3 or 4 arguments."),
|
||||
};
|
||||
// Process the function arguments
|
||||
let mut args = args.into_iter();
|
||||
|
||||
let a: A = get_arg(name, 1, &mut args, err)?;
|
||||
let b: B = get_arg(name, 2, &mut args, err)?;
|
||||
let c: C = get_arg(name, 3, &mut args, err)?;
|
||||
let d: Option<D> = get_opt_arg(name, 4, &mut args)?;
|
||||
|
||||
// Process additional function arguments
|
||||
if args.next().is_some() {
|
||||
// Too many arguments
|
||||
return Err(err());
|
||||
}
|
||||
Ok((a, b, c, d))
|
||||
}
|
||||
}
|
||||
|
||||
// Some functions take 0, 1, or 2 arguments, so both arguments are optional.
|
||||
// It is safe to assume that, if the first argument is None, the second argument will also be None.
|
||||
impl<A: FromArg, B: FromArg> FromArgs for (Option<A>, Option<B>) {
|
||||
|
@ -286,22 +327,10 @@ impl<A: FromArg, B: FromArg> FromArgs for (Option<A>, Option<B>) {
|
|||
};
|
||||
// Process the function arguments
|
||||
let mut args = args.into_iter();
|
||||
// Process the first function argument
|
||||
let a = match args.next() {
|
||||
Some(a) => Some(A::from_arg(a).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 1 was the wrong type. {e}"),
|
||||
})?),
|
||||
None => None,
|
||||
};
|
||||
// Process the second function argument
|
||||
let b = match args.next() {
|
||||
Some(b) => Some(B::from_arg(b).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 2 was the wrong type. {e}"),
|
||||
})?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let a: Option<A> = get_opt_arg(name, 1, &mut args)?;
|
||||
let b: Option<B> = get_opt_arg(name, 2, &mut args)?;
|
||||
|
||||
// Process additional function arguments
|
||||
if args.next().is_some() {
|
||||
// Too many arguments
|
||||
|
@ -320,22 +349,10 @@ impl<A: FromArg, B: FromArg> FromArgs for (Option<(A, B)>,) {
|
|||
};
|
||||
// Process the function arguments
|
||||
let mut args = args.into_iter();
|
||||
// Process the first function argument
|
||||
let a = match args.next() {
|
||||
Some(a) => Some(A::from_arg(a).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 1 was the wrong type. {e}"),
|
||||
})?),
|
||||
None => None,
|
||||
};
|
||||
// Process the second function argument
|
||||
let b = match args.next() {
|
||||
Some(b) => Some(B::from_arg(b).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 2 was the wrong type. {e}"),
|
||||
})?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let a: Option<A> = get_opt_arg(name, 1, &mut args)?;
|
||||
let b: Option<B> = get_opt_arg(name, 2, &mut args)?;
|
||||
|
||||
// Process additional function arguments
|
||||
if a.is_some() != b.is_some() || args.next().is_some() {
|
||||
// One argument, or too many arguments
|
||||
|
@ -355,27 +372,11 @@ impl<A: FromArg, B: FromArg, C: FromArg> FromArgs for (A, Option<B>, Option<C>)
|
|||
};
|
||||
// Process the function arguments
|
||||
let mut args = args.into_iter();
|
||||
// Process the first function argument
|
||||
let a = A::from_arg(args.next().ok_or_else(err)?).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 1 was the wrong type. {e}"),
|
||||
})?;
|
||||
// Process the second function argument
|
||||
let b = match args.next() {
|
||||
Some(b) => Some(B::from_arg(b).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 2 was the wrong type. {e}"),
|
||||
})?),
|
||||
None => None,
|
||||
};
|
||||
// Process the third function argument
|
||||
let c = match args.next() {
|
||||
Some(c) => Some(C::from_arg(c).map_err(|e| Error::InvalidArguments {
|
||||
name: name.to_owned(),
|
||||
message: format!("Argument 3 was the wrong type. {e}"),
|
||||
})?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let a: A = get_arg(name, 1, &mut args, err)?;
|
||||
let b: Option<B> = get_opt_arg(name, 2, &mut args)?;
|
||||
let c: Option<C> = get_opt_arg(name, 3, &mut args)?;
|
||||
|
||||
// Process additional function arguments
|
||||
if args.next().is_some() {
|
||||
// Too many arguments
|
||||
|
|
|
@ -51,10 +51,11 @@ pub async fn score(
|
|||
|
||||
pub async fn highlight(
|
||||
(ctx, txn, doc): (&Context<'_>, Option<&Transaction>, Option<&CursorDoc<'_>>),
|
||||
(prefix, suffix, match_ref): (Value, Value, Value),
|
||||
(prefix, suffix, match_ref, partial): (Value, Value, Value, Option<Value>),
|
||||
) -> Result<Value, Error> {
|
||||
if let Some((txn, exe, doc, thg)) = get_execution_context(ctx, txn, doc) {
|
||||
exe.highlight(txn, thg, prefix, suffix, &match_ref, doc.doc.as_ref()).await
|
||||
let partial = partial.map(|p| p.convert_to_bool()).unwrap_or(Ok(false))?;
|
||||
exe.highlight(txn, thg, prefix, suffix, match_ref, partial, doc.doc.as_ref()).await
|
||||
} else {
|
||||
Ok(Value::None)
|
||||
}
|
||||
|
@ -62,10 +63,11 @@ pub async fn highlight(
|
|||
|
||||
pub async fn offsets(
|
||||
(ctx, txn, doc): (&Context<'_>, Option<&Transaction>, Option<&CursorDoc<'_>>),
|
||||
(match_ref,): (Value,),
|
||||
(match_ref, partial): (Value, Option<Value>),
|
||||
) -> Result<Value, Error> {
|
||||
if let Some((txn, exe, _, thg)) = get_execution_context(ctx, txn, doc) {
|
||||
exe.offsets(txn, thg, &match_ref).await
|
||||
let partial = partial.map(|p| p.convert_to_bool()).unwrap_or(Ok(false))?;
|
||||
exe.offsets(txn, thg, match_ref, partial).await
|
||||
} else {
|
||||
Ok(Value::None)
|
||||
}
|
||||
|
|
|
@ -41,7 +41,7 @@ impl Analyzer {
|
|||
txn: &Transaction,
|
||||
t: &Terms,
|
||||
query_string: String,
|
||||
) -> Result<Vec<Option<TermId>>, Error> {
|
||||
) -> Result<Vec<Option<(TermId, u32)>>, Error> {
|
||||
let tokens = self.generate_tokens(ctx, opt, txn, query_string).await?;
|
||||
// We first collect every unique term,
|
||||
// as the tokens can contain duplicates
|
||||
|
@ -54,7 +54,7 @@ impl Analyzer {
|
|||
let mut tx = txn.lock().await;
|
||||
for term in terms {
|
||||
let opt_term_id = t.get_term_id(&mut tx, tokens.get_token_string(term)?).await?;
|
||||
res.push(opt_term_id);
|
||||
res.push(opt_term_id.map(|tid| (tid, term.get_char_len())));
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
@ -233,6 +233,7 @@ mod tests {
|
|||
use super::Analyzer;
|
||||
use crate::ctx::Context;
|
||||
use crate::dbs::{Options, Transaction};
|
||||
use crate::idx::ft::analyzer::tokenizer::{Token, Tokens};
|
||||
use crate::kvs::{Datastore, LockType, TransactionType};
|
||||
use crate::{
|
||||
sql::{statements::DefineStatement, Statement},
|
||||
|
@ -241,7 +242,7 @@ mod tests {
|
|||
use futures::lock::Mutex;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub(super) async fn test_analyzer(def: &str, input: &str, expected: &[&str]) {
|
||||
async fn get_analyzer_tokens(def: &str, input: &str) -> Tokens {
|
||||
let ds = Datastore::new("memory").await.unwrap();
|
||||
let tx = ds.transaction(TransactionType::Read, LockType::Optimistic).await.unwrap();
|
||||
let txn: Transaction = Arc::new(Mutex::new(tx));
|
||||
|
@ -251,11 +252,15 @@ mod tests {
|
|||
panic!()
|
||||
};
|
||||
let a: Analyzer = az.into();
|
||||
|
||||
let tokens = a
|
||||
.generate_tokens(&Context::default(), &Options::default(), &txn, input.to_string())
|
||||
.await
|
||||
.unwrap();
|
||||
tokens
|
||||
}
|
||||
|
||||
pub(super) async fn test_analyzer(def: &str, input: &str, expected: &[&str]) {
|
||||
let tokens = get_analyzer_tokens(def, input).await;
|
||||
let mut res = vec![];
|
||||
for t in tokens.list() {
|
||||
res.push(tokens.get_token_string(t).unwrap());
|
||||
|
|
|
@ -84,26 +84,31 @@ impl TryFrom<Tokens> for Value {
|
|||
#[derive(Clone, Debug, PartialOrd, PartialEq, Eq, Ord, Hash)]
|
||||
pub(super) enum Token {
|
||||
Ref {
|
||||
chars: (Position, Position),
|
||||
chars: (Position, Position, Position),
|
||||
bytes: (Position, Position),
|
||||
len: u32,
|
||||
},
|
||||
String {
|
||||
chars: (Position, Position),
|
||||
chars: (Position, Position, Position),
|
||||
bytes: (Position, Position),
|
||||
term: String,
|
||||
len: u32,
|
||||
},
|
||||
}
|
||||
|
||||
impl Token {
|
||||
fn new_token(&self, term: String) -> Self {
|
||||
let len = term.chars().count() as u32;
|
||||
match self {
|
||||
Token::Ref {
|
||||
chars,
|
||||
bytes,
|
||||
..
|
||||
} => Token::String {
|
||||
chars: *chars,
|
||||
bytes: *bytes,
|
||||
term,
|
||||
len,
|
||||
},
|
||||
Token::String {
|
||||
chars,
|
||||
|
@ -113,6 +118,7 @@ impl Token {
|
|||
chars: *chars,
|
||||
bytes: *bytes,
|
||||
term,
|
||||
len,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
@ -122,11 +128,11 @@ impl Token {
|
|||
Token::Ref {
|
||||
chars,
|
||||
..
|
||||
} => Offset::new(i, chars.0, chars.1),
|
||||
} => Offset::new(i, chars.0, chars.1, chars.2),
|
||||
Token::String {
|
||||
chars,
|
||||
..
|
||||
} => Offset::new(i, chars.0, chars.1),
|
||||
} => Offset::new(i, chars.0, chars.1, chars.2),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -135,7 +141,7 @@ impl Token {
|
|||
Token::Ref {
|
||||
chars,
|
||||
..
|
||||
} => chars.0 == chars.1,
|
||||
} => chars.0 == chars.2,
|
||||
Token::String {
|
||||
term,
|
||||
..
|
||||
|
@ -143,6 +149,19 @@ impl Token {
|
|||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_char_len(&self) -> u32 {
|
||||
match self {
|
||||
Token::Ref {
|
||||
len,
|
||||
..
|
||||
} => *len,
|
||||
Token::String {
|
||||
len,
|
||||
..
|
||||
} => *len,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_str<'a>(&'a self, i: &'a str) -> Result<&str, Error> {
|
||||
match self {
|
||||
Token::Ref {
|
||||
|
@ -207,8 +226,9 @@ impl Tokenizer {
|
|||
// The last pos may be more advanced due to the is_valid process
|
||||
if last_char_pos < current_char_pos {
|
||||
t.push(Token::Ref {
|
||||
chars: (last_char_pos, current_char_pos),
|
||||
chars: (last_char_pos, last_char_pos, current_char_pos),
|
||||
bytes: (last_byte_pos, current_byte_pos),
|
||||
len: current_char_pos - last_char_pos,
|
||||
});
|
||||
}
|
||||
last_char_pos = current_char_pos;
|
||||
|
@ -225,8 +245,9 @@ impl Tokenizer {
|
|||
}
|
||||
if current_char_pos != last_char_pos {
|
||||
t.push(Token::Ref {
|
||||
chars: (last_char_pos, current_char_pos),
|
||||
chars: (last_char_pos, last_char_pos, current_char_pos),
|
||||
bytes: (last_byte_pos, current_byte_pos),
|
||||
len: current_char_pos - last_char_pos,
|
||||
});
|
||||
}
|
||||
Tokens {
|
||||
|
|
|
@ -15,7 +15,13 @@ pub(super) struct Highlighter {
|
|||
}
|
||||
|
||||
impl Highlighter {
|
||||
pub(super) fn new(prefix: Value, suffix: Value, idiom: &Idiom, doc: &Value) -> Self {
|
||||
pub(super) fn new(
|
||||
prefix: Value,
|
||||
suffix: Value,
|
||||
partial: bool,
|
||||
idiom: &Idiom,
|
||||
doc: &Value,
|
||||
) -> Self {
|
||||
let prefix = prefix.to_raw_string().chars().collect();
|
||||
let suffix = suffix.to_raw_string().chars().collect();
|
||||
// Extract the fields we want to highlight
|
||||
|
@ -24,12 +30,12 @@ impl Highlighter {
|
|||
fields,
|
||||
prefix,
|
||||
suffix,
|
||||
offseter: Offseter::default(),
|
||||
offseter: Offseter::new(partial),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn highlight(&mut self, os: Vec<Offset>) {
|
||||
self.offseter.highlight(os);
|
||||
pub(super) fn highlight(&mut self, term_len: u32, os: Vec<Offset>) {
|
||||
self.offseter.highlight(term_len, os);
|
||||
}
|
||||
|
||||
fn extract(val: Value, vals: &mut Vec<String>) {
|
||||
|
@ -104,27 +110,41 @@ impl TryFrom<Highlighter> for Value {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub(super) struct Offseter {
|
||||
partial: bool,
|
||||
offsets: HashMap<u32, BTreeMap<Position, Position>>,
|
||||
}
|
||||
|
||||
impl Offseter {
|
||||
pub(super) fn highlight(&mut self, os: Vec<Offset>) {
|
||||
pub(super) fn new(partial: bool) -> Self {
|
||||
Self {
|
||||
partial,
|
||||
offsets: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn highlight(&mut self, term_len: u32, os: Vec<Offset>) {
|
||||
for o in os {
|
||||
let (start, end) = if self.partial {
|
||||
let start = o.gen_start.min(o.end);
|
||||
let end = (start + term_len).min(o.end);
|
||||
(start, end)
|
||||
} else {
|
||||
(o.start, o.end)
|
||||
};
|
||||
match self.offsets.entry(o.index) {
|
||||
HEntry::Occupied(mut e) => match e.get_mut().entry(o.start) {
|
||||
HEntry::Occupied(mut e) => match e.get_mut().entry(start) {
|
||||
BEntry::Vacant(e) => {
|
||||
e.insert(o.end);
|
||||
e.insert(end);
|
||||
}
|
||||
BEntry::Occupied(mut e) => {
|
||||
if o.end.gt(e.get()) {
|
||||
e.insert(o.end);
|
||||
e.insert(end);
|
||||
}
|
||||
}
|
||||
},
|
||||
HEntry::Vacant(e) => {
|
||||
e.insert(BTreeMap::from([(o.start, o.end)]));
|
||||
e.insert(BTreeMap::from([(start, end)]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -332,7 +332,7 @@ impl FtIndex {
|
|||
opt: &Options,
|
||||
txn: &Transaction,
|
||||
query_string: String,
|
||||
) -> Result<Vec<Option<TermId>>, Error> {
|
||||
) -> Result<Vec<Option<(TermId, u32)>>, Error> {
|
||||
let t = self.terms.read().await;
|
||||
let terms = self.analyzer.extract_terms(ctx, opt, txn, &t, query_string).await?;
|
||||
Ok(terms)
|
||||
|
@ -341,11 +341,11 @@ impl FtIndex {
|
|||
pub(super) async fn get_terms_docs(
|
||||
&self,
|
||||
tx: &mut kvs::Transaction,
|
||||
terms: &Vec<Option<TermId>>,
|
||||
terms: &Vec<Option<(TermId, u32)>>,
|
||||
) -> Result<Vec<Option<(TermId, RoaringTreemap)>>, Error> {
|
||||
let mut terms_docs = Vec::with_capacity(terms.len());
|
||||
for opt_term_id in terms {
|
||||
if let Some(term_id) = opt_term_id {
|
||||
for opt_term in terms {
|
||||
if let Some((term_id, _)) = opt_term {
|
||||
let docs = self.term_docs.get_docs(tx, *term_id).await?;
|
||||
if let Some(docs) = docs {
|
||||
terms_docs.push(Some((*term_id, docs)));
|
||||
|
@ -402,19 +402,20 @@ impl FtIndex {
|
|||
&self,
|
||||
tx: &mut kvs::Transaction,
|
||||
thg: &Thing,
|
||||
terms: &[Option<TermId>],
|
||||
terms: &[Option<(TermId, u32)>],
|
||||
prefix: Value,
|
||||
suffix: Value,
|
||||
partial: bool,
|
||||
idiom: &Idiom,
|
||||
doc: &Value,
|
||||
) -> Result<Value, Error> {
|
||||
let doc_key: Key = thg.into();
|
||||
if let Some(doc_id) = self.doc_ids.read().await.get_doc_id(tx, doc_key).await? {
|
||||
let mut hl = Highlighter::new(prefix, suffix, idiom, doc);
|
||||
for term_id in terms.iter().flatten() {
|
||||
let mut hl = Highlighter::new(prefix, suffix, partial, idiom, doc);
|
||||
for (term_id, term_len) in terms.iter().flatten() {
|
||||
let o = self.offsets.get_offsets(tx, doc_id, *term_id).await?;
|
||||
if let Some(o) = o {
|
||||
hl.highlight(o.0);
|
||||
hl.highlight(*term_len, o.0);
|
||||
}
|
||||
}
|
||||
return hl.try_into();
|
||||
|
@ -426,15 +427,16 @@ impl FtIndex {
|
|||
&self,
|
||||
tx: &mut kvs::Transaction,
|
||||
thg: &Thing,
|
||||
terms: &[Option<TermId>],
|
||||
terms: &[Option<(TermId, u32)>],
|
||||
partial: bool,
|
||||
) -> Result<Value, Error> {
|
||||
let doc_key: Key = thg.into();
|
||||
if let Some(doc_id) = self.doc_ids.read().await.get_doc_id(tx, doc_key).await? {
|
||||
let mut or = Offseter::default();
|
||||
for term_id in terms.iter().flatten() {
|
||||
let mut or = Offseter::new(partial);
|
||||
for (term_id, term_len) in terms.iter().flatten() {
|
||||
let o = self.offsets.get_offsets(tx, doc_id, *term_id).await?;
|
||||
if let Some(o) = o {
|
||||
or.highlight(o.0);
|
||||
or.highlight(*term_len, o.0);
|
||||
}
|
||||
}
|
||||
return or.try_into();
|
||||
|
|
|
@ -59,15 +59,20 @@ impl Offsets {
|
|||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(super) struct Offset {
|
||||
pub(super) index: u32,
|
||||
// Start position of the original term
|
||||
pub(super) start: Position,
|
||||
// Start position of the generated term
|
||||
pub(super) gen_start: Position,
|
||||
// End position of the original term
|
||||
pub(super) end: Position,
|
||||
}
|
||||
|
||||
impl Offset {
|
||||
pub(super) fn new(index: u32, start: Position, end: Position) -> Self {
|
||||
pub(super) fn new(index: u32, start: Position, gen_start: Position, end: Position) -> Self {
|
||||
Self {
|
||||
index,
|
||||
start,
|
||||
gen_start,
|
||||
end,
|
||||
}
|
||||
}
|
||||
|
@ -94,6 +99,7 @@ impl TryFrom<OffsetRecords> for Val {
|
|||
// `starts` and `offsets` are likely to be ascending
|
||||
for o in &offsets.0 {
|
||||
decompressed.push(o.start);
|
||||
decompressed.push(o.gen_start);
|
||||
decompressed.push(o.end);
|
||||
}
|
||||
Ok(bincode::serialize(&decompressed)?)
|
||||
|
@ -109,17 +115,25 @@ impl TryFrom<Val> for OffsetRecords {
|
|||
}
|
||||
let decompressed: Vec<u32> = bincode::deserialize(&val)?;
|
||||
let mut iter = decompressed.iter();
|
||||
let s = *iter.next().ok_or(Error::CorruptedIndex("OffsetRecords::try_from(1)"))?;
|
||||
let mut indexes = Vec::with_capacity(s as usize);
|
||||
for _ in 0..s {
|
||||
let n_offsets = *iter.next().ok_or(Error::CorruptedIndex("OffsetRecords::try_from(1)"))?;
|
||||
// Up to v1.4 an Offset contained only two fields: start and end.
|
||||
// We check the number of integers: if there are only 3 per offset (index, start, end), this is the old format.
|
||||
let without_gen_start = n_offsets * 3 + 1 == (decompressed.len() as u32);
|
||||
let mut indexes = Vec::with_capacity(n_offsets as usize);
|
||||
for _ in 0..n_offsets {
|
||||
let index = *iter.next().ok_or(Error::CorruptedIndex("OffsetRecords::try_from(2)"))?;
|
||||
indexes.push(index);
|
||||
}
|
||||
let mut res = Vec::with_capacity(s as usize);
|
||||
let mut res = Vec::with_capacity(n_offsets as usize);
|
||||
for index in indexes {
|
||||
let start = *iter.next().ok_or(Error::CorruptedIndex("OffsetRecords::try_from(3)"))?;
|
||||
let end = *iter.next().ok_or(Error::CorruptedIndex("OffsetRecords::try_from(4)"))?;
|
||||
res.push(Offset::new(index, start, end));
|
||||
let gen_start = if without_gen_start {
|
||||
start
|
||||
} else {
|
||||
*iter.next().ok_or(Error::CorruptedIndex("OffsetRecords::try_from(4)"))?
|
||||
};
|
||||
let end = *iter.next().ok_or(Error::CorruptedIndex("OffsetRecords::try_from(5)"))?;
|
||||
res.push(Offset::new(index, start, gen_start, end));
|
||||
}
|
||||
Ok(OffsetRecords(res))
|
||||
}
|
||||
|
@ -132,10 +146,28 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_offset_records() {
|
||||
let o =
|
||||
OffsetRecords(vec![Offset::new(0, 1, 2), Offset::new(0, 11, 22), Offset::new(1, 3, 4)]);
|
||||
let o = OffsetRecords(vec![
|
||||
Offset::new(0, 1, 2, 3),
|
||||
Offset::new(0, 11, 13, 22),
|
||||
Offset::new(1, 1, 3, 4),
|
||||
]);
|
||||
let v: Val = o.clone().try_into().unwrap();
|
||||
let o2 = v.try_into().unwrap();
|
||||
assert_eq!(o, o2)
|
||||
}
|
||||
|
||||
/// Checks that records in the old layout (no generated start) can still be
/// decoded, with `gen_start` set to the same value as `start`.
#[test]
fn test_migrate_v1_offset_records() {
	// Old layout: the number of offsets, then every index, then one (start, end) pair per offset
	let legacy: Vec<u32> = vec![3, 0, 0, 1, 1, 3, 11, 22, 1, 4];
	let encoded = bincode::serialize(&legacy).unwrap();
	let migrated: OffsetRecords = encoded.try_into().unwrap();
	let expected = OffsetRecords(vec![
		Offset::new(0, 1, 1, 3),
		Offset::new(0, 11, 11, 22),
		Offset::new(1, 1, 1, 4),
	]);
	assert_eq!(migrated, expected)
}
|
||||
}
|
||||
|
|
|
@ -433,16 +433,18 @@ impl QueryExecutor {
|
|||
None
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn highlight(
|
||||
&self,
|
||||
txn: &Transaction,
|
||||
thg: &Thing,
|
||||
prefix: Value,
|
||||
suffix: Value,
|
||||
match_ref: &Value,
|
||||
match_ref: Value,
|
||||
partial: bool,
|
||||
doc: &Value,
|
||||
) -> Result<Value, Error> {
|
||||
if let Some((e, ft)) = self.get_ft_entry_and_index(match_ref) {
|
||||
if let Some((e, ft)) = self.get_ft_entry_and_index(&match_ref) {
|
||||
let mut run = txn.lock().await;
|
||||
return ft
|
||||
.highlight(
|
||||
|
@ -451,6 +453,7 @@ impl QueryExecutor {
|
|||
&e.0.terms,
|
||||
prefix,
|
||||
suffix,
|
||||
partial,
|
||||
e.0.index_option.id_ref(),
|
||||
doc,
|
||||
)
|
||||
|
@ -463,11 +466,12 @@ impl QueryExecutor {
|
|||
&self,
|
||||
txn: &Transaction,
|
||||
thg: &Thing,
|
||||
match_ref: &Value,
|
||||
match_ref: Value,
|
||||
partial: bool,
|
||||
) -> Result<Value, Error> {
|
||||
if let Some((e, ft)) = self.get_ft_entry_and_index(match_ref) {
|
||||
if let Some((e, ft)) = self.get_ft_entry_and_index(&match_ref) {
|
||||
let mut run = txn.lock().await;
|
||||
return ft.extract_offsets(&mut run, thg, &e.0.terms).await;
|
||||
return ft.extract_offsets(&mut run, thg, &e.0.terms, partial).await;
|
||||
}
|
||||
Ok(Value::None)
|
||||
}
|
||||
|
@ -504,7 +508,7 @@ struct FtEntry(Arc<Inner>);
|
|||
struct Inner {
|
||||
index_option: IndexOption,
|
||||
doc_ids: Arc<RwLock<DocIds>>,
|
||||
terms: Vec<Option<TermId>>,
|
||||
terms: Vec<Option<(TermId, u32)>>,
|
||||
terms_docs: TermsDocs,
|
||||
scorer: Option<BM25Scorer>,
|
||||
}
|
||||
|
|
|
@ -190,6 +190,92 @@ async fn select_where_matches_using_index_and_arrays_with_parallel() -> Result<(
|
|||
select_where_matches_using_index_and_arrays(true).await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn select_where_matches_partial_highlight() -> Result<(), Error> {
|
||||
let sql = r"
|
||||
CREATE blog:1 SET content = 'Hello World!';
|
||||
DEFINE ANALYZER simple TOKENIZERS blank,class FILTERS lowercase,edgengram(2,100);
|
||||
DEFINE INDEX blog_content ON blog FIELDS content SEARCH ANALYZER simple BM25 HIGHLIGHTS;
|
||||
SELECT id, search::highlight('<em>', '</em>', 1) AS content FROM blog WHERE content @1@ 'he';
|
||||
SELECT id, search::highlight('<em>', '</em>', 1, false) AS content FROM blog WHERE content @1@ 'he';
|
||||
SELECT id, search::highlight('<em>', '</em>', 1, true) AS content FROM blog WHERE content @1@ 'he';
|
||||
SELECT id, search::offsets(1) AS content FROM blog WHERE content @1@ 'he';
|
||||
SELECT id, search::offsets(1, false) AS content FROM blog WHERE content @1@ 'he';
|
||||
SELECT id, search::offsets(1, true) AS content FROM blog WHERE content @1@ 'he';
|
||||
";
|
||||
let dbs = new_ds().await?;
|
||||
let ses = Session::owner().with_ns("test").with_db("test");
|
||||
let res = &mut dbs.execute(&sql, &ses, None).await?;
|
||||
assert_eq!(res.len(), 9);
|
||||
// Discard the results of the CREATE and the two DEFINE statements (errors are still propagated)
|
||||
for _ in 0..3 {
|
||||
let _ = res.remove(0).result?;
|
||||
}
|
||||
// search::highlight without the partial flag, or with it set to false, highlights the whole word
|
||||
for i in 0..2 {
|
||||
let tmp = res.remove(0).result?;
|
||||
let val = Value::parse(
|
||||
"[
|
||||
{
|
||||
id: blog:1,
|
||||
content: '<em>Hello</em> World!'
|
||||
}
|
||||
]",
|
||||
);
|
||||
assert_eq!(format!("{:#}", tmp), format!("{:#}", val), "{i}");
|
||||
}
|
||||
// search::highlight with the partial flag set to true highlights only the searched characters
|
||||
let tmp = res.remove(0).result?;
|
||||
let val = Value::parse(
|
||||
"[
|
||||
{
|
||||
id: blog:1,
|
||||
content: '<em>He</em>llo World!'
|
||||
}
|
||||
]",
|
||||
);
|
||||
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
|
||||
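// search::offsets without the partial flag, or with it set to false, returns the offsets of the whole word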
|
||||
for i in 0..2 {
|
||||
let tmp = res.remove(0).result?;
|
||||
let val = Value::parse(
|
||||
"[
|
||||
{
|
||||
content: {
|
||||
0: [
|
||||
{
|
||||
e: 5,
|
||||
s: 0
|
||||
}
|
||||
]
|
||||
},
|
||||
id: blog:1
|
||||
}
|
||||
]",
|
||||
);
|
||||
assert_eq!(format!("{:#}", tmp), format!("{:#}", val), "{i}");
|
||||
}
|
||||
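// search::offsets with the partial flag set to true returns the offsets of the searched characters only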
|
||||
let tmp = res.remove(0).result?;
|
||||
let val = Value::parse(
|
||||
"[
|
||||
{
|
||||
content: {
|
||||
0: [
|
||||
{
|
||||
e: 2,
|
||||
s: 0
|
||||
}
|
||||
]
|
||||
},
|
||||
id: blog:1
|
||||
}
|
||||
]",
|
||||
);
|
||||
assert_eq!(format!("{:#}", tmp), format!("{:#}", val));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn select_where_matches_using_index_and_objects(parallel: bool) -> Result<(), Error> {
|
||||
let p = if parallel {
|
||||
"PARALLEL"
|
||||
|
|
|
@ -160,8 +160,10 @@ mod database_upgrade {
|
|||
];
|
||||
|
||||
// Set of QUERY and RESULT to check for Full Text Search
|
||||
const CHECK_FTS: [Check; 1] =
|
||||
[("SELECT name FROM account WHERE name @@ 'Tobie'", Expected::One("{\"name\":\"Tobie\"}"))];
|
||||
const CHECK_FTS: [Check; 1] = [(
|
||||
"SELECT search::highlight('<em>','</em>', 1) AS name FROM account WHERE name @1@ 'Tobie'",
|
||||
Expected::One("{\"name\":\"<em>Tobie</em>\"}"),
|
||||
)];
|
||||
|
||||
// Set of DATA for VectorSearch and Knn Operator checking
|
||||
const DATA_MTREE: [&str; 4] = [
|
||||
|
|
Loading…
Reference in a new issue