#![feature(iterator_try_collect, absolute_path)] use anyhow::Result; use clap::Parser; use priority_queue::PriorityQueue; use sha2::{Sha512_256, Digest}; use std::{cmp::Ordering, collections::HashMap, fs, io, io::Write, path, path::PathBuf}; use embedders::*; use pure_embedders::*; use ai_embedders::*; mod embedders; mod pure_embedders; mod ai_embedders; #[derive(Debug, Clone, Copy, clap::ValueEnum)] enum Embedder { Brightness, Hue, Color, Content, } #[derive(Debug, Parser)] struct Args { /// Characteristic to sort by #[arg(short, long, default_value = "content")] embedder: Embedder, /// Symlink the sorted images into this directory #[arg(short = 's', long)] symlink_dir: Option, /// Copy the sorted images into this directory. Uses COW when available #[arg(short = 'o', long)] copy_dir: Option, /// Write sorted paths into stdout, one per line #[arg(short = 'c', long)] stdout: bool, /// Write sorted paths into stdout, null-separated. Overrides -c #[arg(short = '0', long)] stdout0: bool, images: Vec, } #[derive(Debug)] struct Config { base_dirs: xdg::BaseDirectories, } fn get_config() -> Result { let dirs = xdg::BaseDirectories::with_prefix("embeddings-sort")?; Ok(Config { base_dirs: dirs }) } fn get_mst(embeds: &Vec) -> HashMap> where M: MetricElem { // wrapper struct to // - reverse the ordering // - implement Ord, even though the type is backed by an f64 #[repr(transparent)] #[derive(Debug, PartialEq)] struct DownOrd (f64); impl Eq for DownOrd {} impl PartialOrd for DownOrd { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for DownOrd { fn cmp(&self, other: &Self) -> Ordering { self.0.partial_cmp(&other.0).unwrap().reverse() } } let num_embeds = embeds.len(); let mut possible_edges = PriorityQueue::with_capacity((num_embeds * num_embeds - num_embeds) / 2); let mut mst = HashMap::with_capacity(num_embeds); // here, we start at 0. // we might get a better result in the end if we started with a vertex next // to the lowest-cost edge, but we don't know which one that is (though we // could compute that without changing our asymptotic complexity) mst.insert(0, Vec::new()); for i in 1..num_embeds { possible_edges.push((0, i), DownOrd(embeds[0].dist(&embeds[i]))); } // prims algorithm or something like that while mst.len() < num_embeds { // find the edge with the least cost that connects us to a new vertex let (new, old) = loop { let ((a, b), _) = possible_edges.pop().unwrap(); if !mst.contains_key(&a) { break (a, b); } else if !mst.contains_key(&b) { break (b, a); } }; mst.insert(new, Vec::new()); // insert all the new edges we could take mst.entry(old).and_modify(|v|v.push(new)); for i in 0..num_embeds { // don't consider edges taking us to nodes we already visited if mst.contains_key(&i) { continue; } possible_edges.push((new, i), DownOrd(embeds[new].dist(&embeds[i]))); } } mst } fn tsp_from_mst(mst: HashMap>) -> Vec { fn dfs(cur: usize, t: &HashMap>, into: &mut Vec) { into.push(cur); t.get(&cur).unwrap().iter().for_each(|c| dfs(*c, t, into)); } let mut tsp_path = Vec::with_capacity(mst.len()); dfs(0, &mst, &mut tsp_path); tsp_path } fn hash_file(p: &PathBuf) -> Result<[u8; 32]> { let mut f = fs::File::open(p)?; let mut hasher = Sha512_256::new(); io::copy(&mut f, &mut hasher)?; Ok(hasher.finalize().into_iter().collect::>().try_into().unwrap()) } fn process_embedder(mut e: E, args: &Args, cfg: &Config) -> Result> where E: BatchEmbedder { if args.images.is_empty() { return Ok(Vec::new()); } let db = sled::open(cfg.base_dirs.place_cache_file("embeddings.db")?)?; let tree = typed_sled::Tree::<[u8; 32], E::Embedding>::open(&db, E::NAME); let mut embeds: Vec> = args.images .iter() .map(|p| { let h = hash_file(p)?; let r: Result> = tree.get(&h).map_err(|e| e.into()); r }) .try_collect()?; let missing_embeds_indices: Vec<_> = embeds .iter() .enumerate() .filter_map(|(i, v)| match v { None => Some(i), Some(_) => None, }).collect(); // TODO only run e.embeds if !missing_embeds_indices.is_empty(); this allows // for optimizations in the ai embedde (move pip to ::embeds() instead of ::new()) let missing_embeds = e.embeds(&missing_embeds_indices .iter() .map(|i| args.images[*i].clone()) .collect::>())?; for (idx, emb) in missing_embeds_indices .into_iter().zip(missing_embeds.into_iter()) { tree.insert(&hash_file(&args.images[idx])?, &emb)?; embeds[idx] = Some(emb); } let embeds: Vec<_> = embeds.into_iter().map(|e| e.unwrap()).collect(); let tsp_path = tsp_from_mst(get_mst(&embeds)); Ok(tsp_path.iter().map(|i| args.images[*i].clone()).collect()) } fn symlink_into(tsp: &[PathBuf], target: &PathBuf) -> Result<()> { fs::create_dir_all(target)?; let pad_len = (tsp.len() as f64).log10().ceil() as usize; for (i, p) in tsp.iter().enumerate() { let ext: String = match p.extension() { None => "".to_string(), Some(e) => format!(".{}", e.to_str().unwrap()), }; let tp = target.join(format!("{:0pl$}{ext}", i, pl = pad_len, ext = ext)); let rel_path = pathdiff::diff_paths(path::absolute(p)?, path::absolute(target)?).unwrap(); std::os::unix::fs::symlink(rel_path, tp); } Ok(()) } fn copy_into(tsp: &[PathBuf], target: &PathBuf) -> Result<()> { fs::create_dir_all(target)?; let pad_len = (tsp.len() as f64).log10().ceil() as usize; for (i, p) in tsp.iter().enumerate() { let ext: String = match p.extension() { None => "".to_string(), Some(e) => format!(".{}", e.to_str().unwrap()), }; let tp = target.join(format!("{:0pl$}{ext}", i, pl = pad_len, ext = ext)); reflink_copy::reflink_or_copy(p, tp)?; } Ok(()) } fn main() -> Result<()> { let cfg = get_config()?; let args = Args::parse(); let tsp_path = match args.embedder { Embedder::Brightness => process_embedder(BrightnessEmbedder, &args, &cfg), Embedder::Hue => process_embedder(HueEmbedder, &args, &cfg), Embedder::Color => process_embedder(ColorEmbedder, &args, &cfg), Embedder::Content => process_embedder(ContentEmbedder::new(&cfg), &args, &cfg), }?; if let Some(p) = args.symlink_dir { symlink_into(&tsp_path, &p)? } if let Some(p) = args.copy_dir { copy_into(&tsp_path, &p)? } let path_delim = if args.stdout0 { Some(0) } else if args.stdout { Some(b'\n') } else { None }; path_delim.into_iter().try_for_each(|delim| { let mut o = io::BufWriter::new(io::stdout().lock()); for p in &tsp_path { o.write(p.as_os_str().to_str().unwrap().as_bytes())?; o.write(&[delim])?; } o.flush() })?; Ok(()) }