From 89d9e1703b06ad6f84933e1399e70a9d055dfbe8 Mon Sep 17 00:00:00 2001
From: metamuffin
Date: Wed, 11 Mar 2026 05:00:42 +0100
Subject: tokenize lowercase

---
 database/src/kv/index.rs | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'database')

diff --git a/database/src/kv/index.rs b/database/src/kv/index.rs
index 469a6e8..e4f707c 100644
--- a/database/src/kv/index.rs
+++ b/database/src/kv/index.rs
@@ -62,19 +62,19 @@ pub fn update_index(
         SortKey::Text(path) => {
             let mut tokens = HashSet::new();
             for val in path.get_matching_values(ob) {
-                for tok in text_tokenizer(val) {
+                for tok in text_tokenizer(str::from_utf8(val).unwrap()) {
                     tokens.insert(tok);
                 }
             }
-            for &tok in &tokens {
+            for tok in &tokens {
                 for mut k in ks.clone() {
                     k.push(0);
-                    k.extend(tok);
+                    k.extend(tok.as_bytes());
                     index_counter(txn, &k, remove)?;
                 }
                 for mut k in ks.clone() {
                     k.push(1);
-                    k.extend(tok);
+                    k.extend(tok.as_bytes());
                     k.extend(row.to_be_bytes());
                     index_marker(txn, &k, remove)?;
                 }
@@ -181,24 +181,22 @@ pub fn iter_index<'a>(
             )
         }
         Sort::TextSearch(_, text) => {
-            let search_tokens = text_tokenizer(text.as_bytes())
-                .map(|e| e.to_owned())
-                .collect::<Vec<_>>();
-            let mut min_tok = Vec::new();
+            let search_tokens = text_tokenizer(text);
+            let mut min_tok = String::new();
             let mut min_count = u64::MAX;
             for token in &search_tokens {
                 let mut k = prefix.clone();
                 k.push(0);
-                k.extend(token);
+                k.extend(token.as_bytes());
                 let count = read_counter(txn, &k, 0)?;
                 if count < min_count {
                     min_count = count;
-                    min_tok = token.to_owned()
+                    min_tok = token.to_string()
                 }
             }
             let mut min_token_prefix = prefix.clone();
             min_token_prefix.push(1);
-            min_token_prefix.extend(&min_tok);
+            min_token_prefix.extend(min_tok.as_bytes());
             Box::new(
                 PrefixIterator {
                     inner: txn.iter(&min_token_prefix, false)?,
@@ -213,7 +211,7 @@ pub fn iter_index<'a>(
                     for token in &search_tokens {
                         let mut k = prefix.clone();
                         k.push(1);
-                        k.extend(token);
+                        k.extend(token.as_bytes());
                         k.extend(rn.to_be_bytes());
                         let v = match txn.get(&k) {
                             Ok(v) => v,
@@ -230,9 +228,11 @@ pub fn iter_index<'a>(
     })
 }
 
-fn text_tokenizer(text: &[u8]) -> impl Iterator<Item = &[u8]> {
-    text.split(|x| matches!(*x, b' ' | b',' | b':' | b'/' | b'+' | b'&'))
+fn text_tokenizer(text: &str) -> HashSet<String> {
+    text.split(|x| matches!(x, ' ' | ',' | ':' | '/' | '+' | '&'))
         .filter(|x| !x.is_empty())
+        .map(|s| s.to_lowercase())
+        .collect()
 }
 
 fn inc_key(mut k: Vec<u8>) -> Vec<u8> {
--
cgit v1.3
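
For context, here is a minimal, self-contained sketch of the tokenizer as it behaves after this patch. The function body mirrors the '+' lines above; the main function, the example input string, and the expected-set check are illustrative assumptions, not part of the repository.

use std::collections::HashSet;

// Split on the same separator set as the patched text_tokenizer, drop empty
// fragments, lowercase every token, and deduplicate by collecting into a set.
fn text_tokenizer(text: &str) -> HashSet<String> {
    text.split(|x| matches!(x, ' ' | ',' | ':' | '/' | '+' | '&'))
        .filter(|x| !x.is_empty())
        .map(|s| s.to_lowercase())
        .collect()
}

fn main() {
    // Hypothetical input with mixed case and a repeated word.
    let tokens = text_tokenizer("Foo/Bar, foo & BAZ");

    // Tokens come back lowercased and deduplicated.
    let expected: HashSet<String> =
        ["foo", "bar", "baz"].iter().map(|s| s.to_string()).collect();
    assert_eq!(tokens, expected);
}

Since both update_index and iter_index now go through this same lowercasing helper, the keys written at index time and the keys probed at search time agree on case, so the text search effectively becomes case-insensitive.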