diff options
author | Lia Lenckowski <lialenck@protonmail.com> | 2023-09-06 14:33:33 +0200 |
---|---|---|
committer | Lia Lenckowski <lialenck@protonmail.com> | 2023-09-06 14:33:33 +0200 |
commit | 51007f5b8ff6d5960ac034854ceae1ab15237b6a (patch) | |
tree | e7dfecf965c5fd83c38c5f3ab86d140410df58dd /src/main.rs | |
parent | cb67cd7389b68750518b67c7a58beb7659298352 (diff) | |
download | embeddings-sort-51007f5b8ff6d5960ac034854ceae1ab15237b6a.tar embeddings-sort-51007f5b8ff6d5960ac034854ceae1ab15237b6a.tar.bz2 embeddings-sort-51007f5b8ff6d5960ac034854ceae1ab15237b6a.tar.zst |
cache by hash instead of path
Diffstat (limited to 'src/main.rs')
-rw-r--r-- | src/main.rs | 39 |
1 file changed, 23 insertions, 16 deletions
diff --git a/src/main.rs b/src/main.rs index c4a0c26..5caa4ce 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,10 @@ #![feature(iterator_try_collect)] +use anyhow::Result; use clap::Parser; use priority_queue::PriorityQueue; -use std::cmp::Ordering; -use std::collections::HashMap; -use std::path::PathBuf; +use sha2::{Sha512_256, Digest}; +use std::{cmp::Ordering, collections::HashMap, fs, io, path::PathBuf}; use embedders::*; @@ -30,9 +30,8 @@ struct Config { base_dirs: xdg::BaseDirectories, } -fn get_config() -> Result<Config, String> { - let dirs = xdg::BaseDirectories::with_prefix("embeddings-sort") - .map_err(|_| "oh no")?; +fn get_config() -> Result<Config> { + let dirs = xdg::BaseDirectories::with_prefix("embeddings-sort")?; Ok(Config{base_dirs: dirs}) } @@ -113,21 +112,30 @@ fn tsp_from_mst(mst: HashMap<usize, Vec<usize>>) -> Vec<usize> { tsp_path } -fn process_embedder<E>(mut e: E, args: Args, cfg: Config) -> Result<Vec<PathBuf>, String> +fn hash_file(p: &PathBuf) -> Result<[u8; 32]> { + let mut f = fs::File::open(p)?; + let mut hasher = Sha512_256::new(); + io::copy(&mut f, &mut hasher)?; + Ok(hasher.finalize().into_iter().collect::<Vec<u8>>().try_into().unwrap()) +} + +fn process_embedder<E>(mut e: E, args: Args, cfg: Config) -> Result<Vec<PathBuf>> where E: EmbedderT { - if args.images.len() == 0 { + if args.images.is_empty() { return Ok(Vec::new()); } - let db = sled::open(cfg.base_dirs.place_cache_file("embeddings.db") - .map_err(|e| e.to_string())?).map_err(|e| e.to_string())?; - let tree = typed_sled::Tree::<PathBuf, E::Embedding>::open(&db, E::NAME); + let db = sled::open(cfg.base_dirs.place_cache_file("embeddings.db")?)?; + let tree = typed_sled::Tree::<[u8; 32], E::Embedding>::open(&db, E::NAME); - // TODO nicht pfad, sondern hash vom bild als key nehmen let mut embeds: Vec<Option<_>> = args.images .iter() - .map(|p| tree.get(p).map_err(|e| e.to_string())) + .map(|p| { + let h = hash_file(p)?; + let r: Result<Option<E::Embedding>> = 
tree.get(&h).map_err(|e| e.into()); + r + }) .try_collect()?; let missing_embeds_indices: Vec<_> = embeds @@ -145,8 +153,7 @@ fn process_embedder<E>(mut e: E, args: Args, cfg: Config) -> Result<Vec<PathBuf> for (idx, emb) in missing_embeds_indices .into_iter().zip(missing_embeds.into_iter()) { - // TODO hier auch hash statt pfad - tree.insert(&args.images[idx], &emb).map_err(|e| e.to_string())?; + tree.insert(&hash_file(&args.images[idx])?, &emb)?; embeds[idx] = Some(emb); } @@ -156,7 +163,7 @@ fn process_embedder<E>(mut e: E, args: Args, cfg: Config) -> Result<Vec<PathBuf> Ok(tsp_path.iter().map(|i| args.images[*i].clone()).collect()) } -fn main() -> Result<(), String> { +fn main() -> Result<()> { let cfg = get_config()?; let args = Args::parse(); |