diff options
author | metamuffin <metamuffin@disroot.org> | 2024-11-27 22:34:03 +0000 |
---|---|---|
committer | lialenck <lialenck@noreply.codeberg.org> | 2024-11-27 22:34:03 +0000 |
commit | 2a17ceac1ab5cdee98d20a928795a1aba06c8be7 (patch) | |
tree | 1c5c31e238870776dc324a91ddface2ae15e050a | |
parent | 467674743fb638ea56713aecc719a80505b82a17 (diff) | |
download | embeddings-sort-2a17ceac1ab5cdee98d20a928795a1aba06c8be7.tar embeddings-sort-2a17ceac1ab5cdee98d20a928795a1aba06c8be7.tar.bz2 embeddings-sort-2a17ceac1ab5cdee98d20a928795a1aba06c8be7.tar.zst |
Replace sled with redb (Also replaces serde to bincode.) (#2)
Reviewed-on: https://codeberg.org/lialenck/embeddings-sort/pulls/2
Co-authored-by: metamuffin <metamuffin@disroot.org>
Co-committed-by: metamuffin <metamuffin@disroot.org>
-rw-r--r-- | Cargo.lock | 153 | ||||
-rw-r--r-- | Cargo.toml | 7 | ||||
-rw-r--r-- | src/cache.rs | 45 | ||||
-rw-r--r-- | src/embedders/mod.rs | 4 | ||||
-rw-r--r-- | src/embedders/pure.rs | 4 | ||||
-rw-r--r-- | src/embedders/vecmetric.rs | 8 | ||||
-rw-r--r-- | src/main.rs | 23 |
7 files changed, 91 insertions, 153 deletions
@@ -157,14 +157,24 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bincode" -version = "1.3.3" +version = "2.0.0-rc.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +checksum = "f11ea1a0346b94ef188834a65c068a03aec181c94896d481d7a0a40d85b0ce95" dependencies = [ + "bincode_derive", "serde", ] [[package]] +name = "bincode_derive" +version = "2.0.0-rc.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e30759b3b99a1b802a7a3aa21c85c3ded5c28e1c83170d82d70f08bbf7f3e4c" +dependencies = [ + "virtue", +] + +[[package]] name = "bit-vec" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -520,6 +530,7 @@ version = "0.3.1" dependencies = [ "ahash", "anyhow", + "bincode", "clap", "dirs", "fastembed", @@ -529,11 +540,9 @@ dependencies = [ "partitions", "pathdiff", "rayon", + "redb", "reflink-copy", - "serde", "sha2", - "sled", - "typed-sled", ] [[package]] @@ -663,25 +672,6 @@ dependencies = [ ] [[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - -[[package]] name = "generic-array" version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -960,15 +950,6 @@ dependencies = [ ] [[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", -] - -[[package]] name = "interpolate_name" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1069,7 +1050,7 @@ checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ "bitflags 2.6.0", "libc", - "redox_syscall 0.5.7", + "redox_syscall", ] [[package]] @@ -1085,16 +1066,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] -name = "lock_api" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" -dependencies = [ - "autocfg", - "scopeguard", -] - -[[package]] name = "log" version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1416,31 +1387,6 @@ dependencies = [ ] [[package]] -name = "parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall 0.2.16", - "smallvec", - "winapi", -] - -[[package]] name = "partitions" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1469,26 +1415,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] -name = "pin-project" -version = "1.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be57f64e946e500c8ee36ef6331845d40a93055567ec57e8fae13efd33759b95" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] name = "pin-project-lite" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1707,12 +1633,12 @@ dependencies = [ ] [[package]] -name = "redox_syscall" -version = "0.2.16" +name = "redb" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +checksum = "84b1de48a7cf7ba193e81e078d17ee2b786236eed1d3f7c60f8a09545efc4925" dependencies = [ - "bitflags 1.3.2", + "libc", ] [[package]] @@ -1857,12 +1783,6 @@ dependencies = [ ] [[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] name = "security-framework" version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1959,22 +1879,6 @@ dependencies = [ ] [[package]] -name = "sled" -version = "0.34.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935" -dependencies = [ - "crc32fast", - "crossbeam-epoch", - "crossbeam-utils", - "fs2", - "fxhash", - "libc", - "log", - "parking_lot", -] - -[[package]] name = "smallvec" version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2218,19 +2122,6 @@ dependencies = [ ] [[package]] -name = "typed-sled" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1060f05a4450ec5b758da60951b04f225a93a62079316630e76cf25c4034500d" -dependencies = [ - "bincode", - "pin-project", - "serde", - "sled", - "thiserror", -] - -[[package]] name = "typenum" version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2360,6 +2251,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] +name = "virtue" +version = "0.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dcc60c0624df774c82a0ef104151231d37da4962957d691c011c852b2473314" + +[[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -3,8 +3,6 @@ name = "embeddings-sort" version = "0.3.1" edition = "2021" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] ahash = "0" anyhow = "1" @@ -18,7 +16,6 @@ partitions = "0" pathdiff = "0" rayon = "1" reflink-copy = "0" -serde = "1" sha2 = "0" -sled = "0" -typed-sled = "0" +redb = "2" +bincode = { version = "2.0.0-rc.3", features = ["derive"] } diff --git a/src/cache.rs b/src/cache.rs new file mode 100644 index 0000000..608adb5 --- /dev/null +++ b/src/cache.rs @@ -0,0 +1,45 @@ +use crate::{FileHash, MetricElem}; +use anyhow::Result; +use bincode::config::standard; +use redb::{Database, TableDefinition}; +use std::path::Path; + +const T_ENTRIES: TableDefinition<(&str, FileHash), &[u8]> = TableDefinition::new("entries"); + +pub struct Cache { + db: Database, +} +impl Cache { + pub fn open(path: &Path) -> Result<Self> { + let db = Database::create(path)?; + let txn = db.begin_write()?; + txn.open_table(T_ENTRIES)?; + txn.commit()?; + Ok(Self { db }) + } + pub fn get<E: MetricElem>(&self, type_name: &'static str, hash: FileHash) -> Result<Option<E>> { + let txn = self.db.begin_read()?; + let table = txn.open_table(T_ENTRIES)?; + if let Some(e) = table.get((type_name, hash))? { + Ok(Some(bincode::decode_from_slice(e.value(), standard())?.0)) + } else { + Ok(None) + } + } + pub fn insert<E: MetricElem>( + &self, + type_name: &'static str, + hash: FileHash, + value: &E, + ) -> Result<()> { + let txn = self.db.begin_write()?; + let mut table = txn.open_table(T_ENTRIES)?; + table.insert( + (type_name, hash), + bincode::encode_to_vec(value, standard())?.as_slice(), + )?; + drop(table); + txn.commit()?; + Ok(()) + } +} diff --git a/src/embedders/mod.rs b/src/embedders/mod.rs index 1a1721d..83484a1 100644 --- a/src/embedders/mod.rs +++ b/src/embedders/mod.rs @@ -6,12 +6,12 @@ pub(crate) use pure::*; pub(crate) use vecmetric::*; use anyhow::Result; +use bincode::{Decode, Encode}; use indicatif::{ParallelProgressIterator, ProgressStyle}; use rayon::prelude::*; -use serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; -pub trait MetricElem: Send + Sync + 'static + Serialize + for<'a> Deserialize<'a> { +pub trait MetricElem: Send + Sync + 'static + Encode + Decode { fn dist(&self, _: &Self) -> f64; } diff --git a/src/embedders/pure.rs b/src/embedders/pure.rs index 09c8321..531368c 100644 --- a/src/embedders/pure.rs +++ b/src/embedders/pure.rs @@ -1,5 +1,5 @@ use anyhow::{bail, Result}; -use serde::{Deserialize, Serialize}; +use bincode::{Decode, Encode}; use std::path::Path; use crate::{EmbedderT, MetricElem}; @@ -22,7 +22,7 @@ impl EmbedderT for BrightnessEmbedder { } #[repr(transparent)] -#[derive(Serialize, Deserialize)] +#[derive(Encode, Decode)] pub(crate) struct Hue(f64); impl MetricElem for Hue { fn dist(&self, b: &Hue) -> f64 { diff --git a/src/embedders/vecmetric.rs b/src/embedders/vecmetric.rs index 9f2f143..65d71df 100644 --- a/src/embedders/vecmetric.rs +++ b/src/embedders/vecmetric.rs @@ -1,13 +1,13 @@ use super::MetricElem; -use serde::{Deserialize, Serialize}; +use bincode::{Decode, Encode}; pub trait VecMetric: MetricElem + From<Vec<f32>> {} -#[derive(Deserialize, Serialize)] +#[derive(Decode, Encode)] pub struct AngularDistance(pub Vec<f32>); -#[derive(Deserialize, Serialize)] +#[derive(Decode, Encode)] pub struct EuclidianDistance(pub Vec<f32>); -#[derive(Deserialize, Serialize)] +#[derive(Decode, Encode)] pub struct ManhattenDistance(pub Vec<f32>); impl VecMetric for AngularDistance {} diff --git a/src/main.rs b/src/main.rs index 2e63dd8..45621cf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,5 @@ use anyhow::{anyhow, Result}; +use cache::Cache; use clap::Parser; use sha2::{Digest, Sha512_256}; use std::{ @@ -12,9 +13,12 @@ use std::path::absolute; use embedders::*; use tsp_approx::*; +pub mod cache; mod embedders; mod tsp_approx; +pub type FileHash = [u8; 32]; + #[derive(Debug, Clone, Copy, clap::ValueEnum)] enum Embedder { Brightness, @@ -84,12 +88,12 @@ struct Config { fn get_config() -> Result<Config> { let glob_cache_dir = dirs::cache_dir().ok_or(anyhow!("Could not get cache directory"))?; - Ok(Config { - cache_dir: glob_cache_dir.join("embeddings-sort"), - }) + let cache_dir = glob_cache_dir.join("embeddings-sort"); + std::fs::create_dir_all(&cache_dir)?; + Ok(Config { cache_dir }) } -fn hash_file(p: &PathBuf) -> Result<[u8; 32]> { +fn hash_file(p: &PathBuf) -> Result<FileHash> { let mut f = fs::File::open(p)?; let mut hasher = Sha512_256::new(); io::copy(&mut f, &mut hasher)?; @@ -105,18 +109,13 @@ fn process_embedder<E>(mut e: E, args: &Args, cfg: &Config) -> Result<(Vec<PathB where E: BatchEmbedder, { - let db = sled::open(cfg.cache_dir.join("embeddings.db"))?; - let tree = typed_sled::Tree::<[u8; 32], E::Embedding>::open(&db, E::NAME); + let cache = Cache::open(&cfg.cache_dir.join("embeddings.db-v2"))?; // find cached embeddings let mut embeds = args .images .iter() - .map(|path| { - let h = hash_file(path)?; - let r: Result<Option<E::Embedding>> = tree.get(&h).map_err(|e| e.into()); - r - }) + .map(|path| cache.get(E::NAME, hash_file(path)?)) .collect::<Result<Vec<_>>>()?; // find indices of missing embeddings @@ -148,7 +147,7 @@ where { match emb { Ok(emb) => { - tree.insert(&hash_file(&args.images[idx])?, &emb)?; + cache.insert(E::NAME, hash_file(&args.images[idx])?, &emb)?; embeds[idx] = Some(emb); } Err(e) => { |