diff options
| author | metamuffin <metamuffin@disroot.org> | 2026-03-11 05:00:42 +0100 |
|---|---|---|
| committer | metamuffin <metamuffin@disroot.org> | 2026-03-11 05:00:42 +0100 |
| commit | 89d9e1703b06ad6f84933e1399e70a9d055dfbe8 (patch) | |
| tree | 8d359e07a8e36c9d8205364213277e21cf0e9933 /database | |
| parent | 3115a894fa7008ef4ee657945cf8addaec8f98cb (diff) | |
| download | jellything-89d9e1703b06ad6f84933e1399e70a9d055dfbe8.tar jellything-89d9e1703b06ad6f84933e1399e70a9d055dfbe8.tar.bz2 jellything-89d9e1703b06ad6f84933e1399e70a9d055dfbe8.tar.zst | |
tokenize lowercase
Diffstat (limited to 'database')
| -rw-r--r-- | database/src/kv/index.rs | 28 |
1 file changed, 14 insertions, 14 deletions
diff --git a/database/src/kv/index.rs b/database/src/kv/index.rs index 469a6e8..e4f707c 100644 --- a/database/src/kv/index.rs +++ b/database/src/kv/index.rs @@ -62,19 +62,19 @@ pub fn update_index( SortKey::Text(path) => { let mut tokens = HashSet::new(); for val in path.get_matching_values(ob) { - for tok in text_tokenizer(val) { + for tok in text_tokenizer(str::from_utf8(val).unwrap()) { tokens.insert(tok); } } - for &tok in &tokens { + for tok in &tokens { for mut k in ks.clone() { k.push(0); - k.extend(tok); + k.extend(tok.as_bytes()); index_counter(txn, &k, remove)?; } for mut k in ks.clone() { k.push(1); - k.extend(tok); + k.extend(tok.as_bytes()); k.extend(row.to_be_bytes()); index_marker(txn, &k, remove)?; } @@ -181,24 +181,22 @@ pub fn iter_index<'a>( ) } Sort::TextSearch(_, text) => { - let search_tokens = text_tokenizer(text.as_bytes()) - .map(|e| e.to_owned()) - .collect::<Vec<_>>(); - let mut min_tok = Vec::new(); + let search_tokens = text_tokenizer(text); + let mut min_tok = String::new(); let mut min_count = u64::MAX; for token in &search_tokens { let mut k = prefix.clone(); k.push(0); - k.extend(token); + k.extend(token.as_bytes()); let count = read_counter(txn, &k, 0)?; if count < min_count { min_count = count; - min_tok = token.to_owned() + min_tok = token.to_string() } } let mut min_token_prefix = prefix.clone(); min_token_prefix.push(1); - min_token_prefix.extend(&min_tok); + min_token_prefix.extend(min_tok.as_bytes()); Box::new( PrefixIterator { inner: txn.iter(&min_token_prefix, false)?, @@ -213,7 +211,7 @@ pub fn iter_index<'a>( for token in &search_tokens { let mut k = prefix.clone(); k.push(1); - k.extend(token); + k.extend(token.as_bytes()); k.extend(rn.to_be_bytes()); let v = match txn.get(&k) { Ok(v) => v, @@ -230,9 +228,11 @@ pub fn iter_index<'a>( }) } -fn text_tokenizer(text: &[u8]) -> impl Iterator<Item = &[u8]> { - text.split(|x| matches!(*x, b' ' | b',' | b':' | b'/' | b'+' | b'&')) +fn text_tokenizer(text: &str) -> 
HashSet<String> { + text.split(|x| matches!(x, ' ' | ',' | ':' | '/' | '+' | '&')) .filter(|x| !x.is_empty()) + .map(|s| s.to_lowercase()) + .collect() } fn inc_key(mut k: Vec<u8>) -> Vec<u8> { |