aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author metamuffin <metamuffin@disroot.org> 2024-11-27 22:34:03 +0000
committer lialenck <lialenck@noreply.codeberg.org> 2024-11-27 22:34:03 +0000
commit 2a17ceac1ab5cdee98d20a928795a1aba06c8be7 (patch)
tree 1c5c31e238870776dc324a91ddface2ae15e050a
parent 467674743fb638ea56713aecc719a80505b82a17 (diff)
download embeddings-sort-2a17ceac1ab5cdee98d20a928795a1aba06c8be7.tar
embeddings-sort-2a17ceac1ab5cdee98d20a928795a1aba06c8be7.tar.bz2
embeddings-sort-2a17ceac1ab5cdee98d20a928795a1aba06c8be7.tar.zst
Replace sled with redb (Also replaces serde to bincode.) (#2)
Reviewed-on: https://codeberg.org/lialenck/embeddings-sort/pulls/2
Co-authored-by: metamuffin <metamuffin@disroot.org>
Co-committed-by: metamuffin <metamuffin@disroot.org>
-rw-r--r-- Cargo.lock 153
-rw-r--r-- Cargo.toml 7
-rw-r--r-- src/cache.rs 45
-rw-r--r-- src/embedders/mod.rs 4
-rw-r--r-- src/embedders/pure.rs 4
-rw-r--r-- src/embedders/vecmetric.rs 8
-rw-r--r-- src/main.rs 23
7 files changed, 91 insertions, 153 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 23f8ba3..8b439c2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -157,14 +157,24 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bincode"
-version = "1.3.3"
+version = "2.0.0-rc.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
+checksum = "f11ea1a0346b94ef188834a65c068a03aec181c94896d481d7a0a40d85b0ce95"
dependencies = [
+ "bincode_derive",
"serde",
]
[[package]]
+name = "bincode_derive"
+version = "2.0.0-rc.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e30759b3b99a1b802a7a3aa21c85c3ded5c28e1c83170d82d70f08bbf7f3e4c"
+dependencies = [
+ "virtue",
+]
+
+[[package]]
name = "bit-vec"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -520,6 +530,7 @@ version = "0.3.1"
dependencies = [
"ahash",
"anyhow",
+ "bincode",
"clap",
"dirs",
"fastembed",
@@ -529,11 +540,9 @@ dependencies = [
"partitions",
"pathdiff",
"rayon",
+ "redb",
"reflink-copy",
- "serde",
"sha2",
- "sled",
- "typed-sled",
]
[[package]]
@@ -663,25 +672,6 @@ dependencies = [
]
[[package]]
-name = "fs2"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
-dependencies = [
- "libc",
- "winapi",
-]
-
-[[package]]
-name = "fxhash"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
-dependencies = [
- "byteorder",
-]
-
-[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -960,15 +950,6 @@ dependencies = [
]
[[package]]
-name = "instant"
-version = "0.1.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
-dependencies = [
- "cfg-if",
-]
-
-[[package]]
name = "interpolate_name"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1069,7 +1050,7 @@ checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
dependencies = [
"bitflags 2.6.0",
"libc",
- "redox_syscall 0.5.7",
+ "redox_syscall",
]
[[package]]
@@ -1085,16 +1066,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
[[package]]
-name = "lock_api"
-version = "0.4.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
-dependencies = [
- "autocfg",
- "scopeguard",
-]
-
-[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1416,31 +1387,6 @@ dependencies = [
]
[[package]]
-name = "parking_lot"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
-dependencies = [
- "instant",
- "lock_api",
- "parking_lot_core",
-]
-
-[[package]]
-name = "parking_lot_core"
-version = "0.8.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
-dependencies = [
- "cfg-if",
- "instant",
- "libc",
- "redox_syscall 0.2.16",
- "smallvec",
- "winapi",
-]
-
-[[package]]
name = "partitions"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1469,26 +1415,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
-name = "pin-project"
-version = "1.1.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be57f64e946e500c8ee36ef6331845d40a93055567ec57e8fae13efd33759b95"
-dependencies = [
- "pin-project-internal",
-]
-
-[[package]]
-name = "pin-project-internal"
-version = "1.1.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
name = "pin-project-lite"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1707,12 +1633,12 @@ dependencies = [
]
[[package]]
-name = "redox_syscall"
-version = "0.2.16"
+name = "redb"
+version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
+checksum = "84b1de48a7cf7ba193e81e078d17ee2b786236eed1d3f7c60f8a09545efc4925"
dependencies = [
- "bitflags 1.3.2",
+ "libc",
]
[[package]]
@@ -1857,12 +1783,6 @@ dependencies = [
]
[[package]]
-name = "scopeguard"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
-
-[[package]]
name = "security-framework"
version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1959,22 +1879,6 @@ dependencies = [
]
[[package]]
-name = "sled"
-version = "0.34.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935"
-dependencies = [
- "crc32fast",
- "crossbeam-epoch",
- "crossbeam-utils",
- "fs2",
- "fxhash",
- "libc",
- "log",
- "parking_lot",
-]
-
-[[package]]
name = "smallvec"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2218,19 +2122,6 @@ dependencies = [
]
[[package]]
-name = "typed-sled"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1060f05a4450ec5b758da60951b04f225a93a62079316630e76cf25c4034500d"
-dependencies = [
- "bincode",
- "pin-project",
- "serde",
- "sled",
- "thiserror",
-]
-
-[[package]]
name = "typenum"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2360,6 +2251,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
+name = "virtue"
+version = "0.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dcc60c0624df774c82a0ef104151231d37da4962957d691c011c852b2473314"
+
+[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 8def49a..de2391e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,8 +3,6 @@ name = "embeddings-sort"
version = "0.3.1"
edition = "2021"
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
[dependencies]
ahash = "0"
anyhow = "1"
@@ -18,7 +16,6 @@ partitions = "0"
pathdiff = "0"
rayon = "1"
reflink-copy = "0"
-serde = "1"
sha2 = "0"
-sled = "0"
-typed-sled = "0"
+redb = "2"
+bincode = { version = "2.0.0-rc.3", features = ["derive"] }
diff --git a/src/cache.rs b/src/cache.rs
new file mode 100644
index 0000000..608adb5
--- /dev/null
+++ b/src/cache.rs
@@ -0,0 +1,45 @@
+use crate::{FileHash, MetricElem};
+use anyhow::Result;
+use bincode::config::standard;
+use redb::{Database, TableDefinition};
+use std::path::Path;
+
+const T_ENTRIES: TableDefinition<(&str, FileHash), &[u8]> = TableDefinition::new("entries"); // (type name, file hash) -> bincode-encoded bytes
+
+/// Cache backed by a redb `Database`; entries are keyed by `(type_name, hash)`.
+pub struct Cache {
+    db: Database,
+}
+impl Cache {
+    /// Runs `Database::create(path)`, then opens the entries table in a committed write transaction.
+    pub fn open(path: &Path) -> Result<Self> {
+        let db = Database::create(path)?;
+        let setup = db.begin_write()?;
+        setup.open_table(T_ENTRIES)?;
+        setup.commit()?;
+        Ok(Cache { db })
+    }
+    /// Decodes and returns the entry stored under `(type_name, hash)`; `None` if there is none.
+    pub fn get<E: MetricElem>(&self, type_name: &'static str, hash: FileHash) -> Result<Option<E>> {
+        let read_txn = self.db.begin_read()?;
+        let entries = read_txn.open_table(T_ENTRIES)?;
+        match entries.get((type_name, hash))? {
+            Some(guard) => Ok(Some(bincode::decode_from_slice(guard.value(), standard())?.0)),
+            None => Ok(None),
+        }
+    }
+    /// Encodes `value` and stores it under `(type_name, hash)` in its own write transaction.
+    pub fn insert<E: MetricElem>(
+        &self,
+        type_name: &'static str,
+        hash: FileHash,
+        value: &E,
+    ) -> Result<()> {
+        let write_txn = self.db.begin_write()?;
+        let mut entries = write_txn.open_table(T_ENTRIES)?;
+        let encoded = bincode::encode_to_vec(value, standard())?;
+        entries.insert((type_name, hash), encoded.as_slice())?;
+        drop(entries);
+        write_txn.commit().map_err(Into::into)
+    }
+}
diff --git a/src/embedders/mod.rs b/src/embedders/mod.rs
index 1a1721d..83484a1 100644
--- a/src/embedders/mod.rs
+++ b/src/embedders/mod.rs
@@ -6,12 +6,12 @@ pub(crate) use pure::*;
pub(crate) use vecmetric::*;
use anyhow::Result;
+use bincode::{Decode, Encode};
use indicatif::{ParallelProgressIterator, ProgressStyle};
use rayon::prelude::*;
-use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
-pub trait MetricElem: Send + Sync + 'static + Serialize + for<'a> Deserialize<'a> {
+pub trait MetricElem: Send + Sync + 'static + Encode + Decode {
fn dist(&self, _: &Self) -> f64;
}
diff --git a/src/embedders/pure.rs b/src/embedders/pure.rs
index 09c8321..531368c 100644
--- a/src/embedders/pure.rs
+++ b/src/embedders/pure.rs
@@ -1,5 +1,5 @@
use anyhow::{bail, Result};
-use serde::{Deserialize, Serialize};
+use bincode::{Decode, Encode};
use std::path::Path;
use crate::{EmbedderT, MetricElem};
@@ -22,7 +22,7 @@ impl EmbedderT for BrightnessEmbedder {
}
#[repr(transparent)]
-#[derive(Serialize, Deserialize)]
+#[derive(Encode, Decode)]
pub(crate) struct Hue(f64);
impl MetricElem for Hue {
fn dist(&self, b: &Hue) -> f64 {
diff --git a/src/embedders/vecmetric.rs b/src/embedders/vecmetric.rs
index 9f2f143..65d71df 100644
--- a/src/embedders/vecmetric.rs
+++ b/src/embedders/vecmetric.rs
@@ -1,13 +1,13 @@
use super::MetricElem;
-use serde::{Deserialize, Serialize};
+use bincode::{Decode, Encode};
pub trait VecMetric: MetricElem + From<Vec<f32>> {}
-#[derive(Deserialize, Serialize)]
+#[derive(Decode, Encode)]
pub struct AngularDistance(pub Vec<f32>);
-#[derive(Deserialize, Serialize)]
+#[derive(Decode, Encode)]
pub struct EuclidianDistance(pub Vec<f32>);
-#[derive(Deserialize, Serialize)]
+#[derive(Decode, Encode)]
pub struct ManhattenDistance(pub Vec<f32>);
impl VecMetric for AngularDistance {}
diff --git a/src/main.rs b/src/main.rs
index 2e63dd8..45621cf 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
use anyhow::{anyhow, Result};
+use cache::Cache;
use clap::Parser;
use sha2::{Digest, Sha512_256};
use std::{
@@ -12,9 +13,12 @@ use std::path::absolute;
use embedders::*;
use tsp_approx::*;
+pub mod cache;
mod embedders;
mod tsp_approx;
+pub type FileHash = [u8; 32];
+
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
enum Embedder {
Brightness,
@@ -84,12 +88,12 @@ struct Config {
fn get_config() -> Result<Config> {
let glob_cache_dir = dirs::cache_dir().ok_or(anyhow!("Could not get cache directory"))?;
- Ok(Config {
- cache_dir: glob_cache_dir.join("embeddings-sort"),
- })
+ let cache_dir = glob_cache_dir.join("embeddings-sort");
+ std::fs::create_dir_all(&cache_dir)?;
+ Ok(Config { cache_dir })
}
-fn hash_file(p: &PathBuf) -> Result<[u8; 32]> {
+fn hash_file(p: &PathBuf) -> Result<FileHash> {
let mut f = fs::File::open(p)?;
let mut hasher = Sha512_256::new();
io::copy(&mut f, &mut hasher)?;
@@ -105,18 +109,13 @@ fn process_embedder<E>(mut e: E, args: &Args, cfg: &Config) -> Result<(Vec<PathB
where
E: BatchEmbedder,
{
- let db = sled::open(cfg.cache_dir.join("embeddings.db"))?;
- let tree = typed_sled::Tree::<[u8; 32], E::Embedding>::open(&db, E::NAME);
+ let cache = Cache::open(&cfg.cache_dir.join("embeddings.db-v2"))?;
// find cached embeddings
let mut embeds = args
.images
.iter()
- .map(|path| {
- let h = hash_file(path)?;
- let r: Result<Option<E::Embedding>> = tree.get(&h).map_err(|e| e.into());
- r
- })
+ .map(|path| cache.get(E::NAME, hash_file(path)?))
.collect::<Result<Vec<_>>>()?;
// find indices of missing embeddings
@@ -148,7 +147,7 @@ where
{
match emb {
Ok(emb) => {
- tree.insert(&hash_file(&args.images[idx])?, &emb)?;
+ cache.insert(E::NAME, hash_file(&args.images[idx])?, &emb)?;
embeds[idx] = Some(emb);
}
Err(e) => {