From 4a17c06f22d3236da6f30c397695ef3771a9d393 Mon Sep 17 00:00:00 2001 From: metamuffin Date: Wed, 20 Sep 2023 17:23:45 +0200 Subject: support for different vector metrics --- src/embedders/ai.rs | 35 +++++++++++++---------------------- src/embedders/mod.rs | 2 ++ src/embedders/vecmetric.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 16 +++++++++++++--- 4 files changed, 71 insertions(+), 25 deletions(-) create mode 100644 src/embedders/vecmetric.rs diff --git a/src/embedders/ai.rs b/src/embedders/ai.rs index 3848674..e772e4a 100644 --- a/src/embedders/ai.rs +++ b/src/embedders/ai.rs @@ -1,39 +1,30 @@ +use crate::{BatchEmbedder, Config}; use anyhow::Result; use indicatif::{ProgressBar, ProgressIterator, ProgressStyle}; -use serde::{Deserialize, Serialize}; use std::{ fs::{remove_file, File}, io::{copy, BufRead, BufReader, Cursor}, + marker::PhantomData, path::PathBuf, process::{Command, Stdio}, }; -use crate::{BatchEmbedder, Config, MetricElem}; +use super::vecmetric::VecMetric; -#[repr(transparent)] -#[derive(Serialize, Deserialize)] -pub(crate) struct Imgbedding(Vec); -impl MetricElem for Imgbedding { - fn dist(&self, other: &Self) -> f64 { - self.0 - .iter() - .zip(other.0.iter()) - .map(|(a, b)| (a - b).powf(2.)) - .sum::() - .sqrt() as f64 - } -} - -pub(crate) struct ContentEmbedder<'a> { +pub(crate) struct ContentEmbedder<'a, Metric> { cfg: &'a Config, + _sim: PhantomData, } -impl<'a> ContentEmbedder<'a> { +impl<'a, Metric> ContentEmbedder<'a, Metric> { pub(crate) fn new(cfg: &'a Config) -> Self { - ContentEmbedder { cfg } + ContentEmbedder { + cfg, + _sim: PhantomData::default(), + } } } -impl<'a> Drop for ContentEmbedder<'a> { +impl<'a, Metric> Drop for ContentEmbedder<'a, Metric> { fn drop(&mut self) { self.cfg .base_dirs @@ -45,8 +36,8 @@ impl<'a> Drop for ContentEmbedder<'a> { } } -impl BatchEmbedder for ContentEmbedder<'_> { - type Embedding = Imgbedding; +impl BatchEmbedder for ContentEmbedder<'_, Metric> { + type Embedding = Metric; const NAME: &'static str = "imgbeddings"; fn embeds(&mut self, paths: &[PathBuf]) -> Result> { diff --git a/src/embedders/mod.rs b/src/embedders/mod.rs index 353222b..5ade40d 100644 --- a/src/embedders/mod.rs +++ b/src/embedders/mod.rs @@ -1,7 +1,9 @@ pub mod ai; pub mod pure; +pub mod vecmetric; pub(crate) use ai::*; pub(crate) use pure::*; +pub(crate) use vecmetric::*; use anyhow::Result; use indicatif::{ParallelProgressIterator, ProgressStyle}; diff --git a/src/embedders/vecmetric.rs b/src/embedders/vecmetric.rs new file mode 100644 index 0000000..474a6d0 --- /dev/null +++ b/src/embedders/vecmetric.rs @@ -0,0 +1,43 @@ +use super::MetricElem; +use serde::{Deserialize, Serialize}; + +pub trait VecMetric: MetricElem + From> {} + +#[derive(Deserialize, Serialize)] +pub struct CosineSimilarity(pub Vec); +#[derive(Deserialize, Serialize)] +pub struct EuclidianDistance(pub Vec); +#[derive(Deserialize, Serialize)] +pub struct ManhattenDistance(pub Vec); + +impl VecMetric for CosineSimilarity {} +impl VecMetric for EuclidianDistance {} +impl VecMetric for ManhattenDistance {} +#[rustfmt::skip] impl From> for CosineSimilarity { fn from(value: Vec) -> Self { Self(value) } } +#[rustfmt::skip] impl From> for EuclidianDistance { fn from(value: Vec) -> Self { Self(value) } } +#[rustfmt::skip] impl From> for ManhattenDistance { fn from(value: Vec) -> Self { Self(value) } } + +impl MetricElem for CosineSimilarity { + fn dist(&self, _other: &Self) -> f64 { + todo!() + } +} +impl MetricElem for EuclidianDistance { + fn dist(&self, other: &Self) -> f64 { + self.0 + .iter() + .zip(other.0.iter()) + .map(|(a, b)| (a - b).powf(2.)) + .sum::() + .sqrt() as f64 + } +} +impl MetricElem for ManhattenDistance { + fn dist(&self, other: &Self) -> f64 { + self.0 + .iter() + .zip(other.0.iter()) + .map(|(a, b)| (a - b).abs()) + .sum::() as f64 + } +} diff --git a/src/main.rs b/src/main.rs index 474532c..2dda2da 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,7 +19,9 @@ enum Embedder { Brightness, Hue, Color, - Content, + ContentEuclidean, + ContentCosineSim, + ContentManhatten, } #[derive(Debug, Clone, Copy, clap::ValueEnum)] @@ -32,7 +34,7 @@ enum TspAlg { #[derive(Debug, Parser)] struct Args { /// Characteristic to sort by - #[arg(short, long, default_value = "content")] + #[arg(short, long, default_value = "content-euclidean")] embedder: Embedder, /// Symlink the sorted images into this directory @@ -175,7 +177,15 @@ fn main() -> Result<()> { Embedder::Brightness => process_embedder(BrightnessEmbedder, &args, &cfg), Embedder::Hue => process_embedder(HueEmbedder, &args, &cfg), Embedder::Color => process_embedder(ColorEmbedder, &args, &cfg), - Embedder::Content => process_embedder(ContentEmbedder::new(&cfg), &args, &cfg), + Embedder::ContentCosineSim => { + process_embedder(ContentEmbedder::::new(&cfg), &args, &cfg) + } + Embedder::ContentEuclidean => { + process_embedder(ContentEmbedder::::new(&cfg), &args, &cfg) + } + Embedder::ContentManhatten => { + process_embedder(ContentEmbedder::::new(&cfg), &args, &cfg) + } }?; if args.benchmark { -- cgit v1.2.3-70-g09d2 From 8b65d87640e7367cd88e72c320e4370f4ba471a6 Mon Sep 17 00:00:00 2001 From: metamuffin Date: Wed, 20 Sep 2023 21:25:54 +0200 Subject: cosine similarity (works poorly) --- src/embedders/vecmetric.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/embedders/vecmetric.rs b/src/embedders/vecmetric.rs index 474a6d0..0c63911 100644 --- a/src/embedders/vecmetric.rs +++ b/src/embedders/vecmetric.rs @@ -18,8 +18,16 @@ impl VecMetric for ManhattenDistance {} #[rustfmt::skip] impl From> for ManhattenDistance { fn from(value: Vec) -> Self { Self(value) } } impl MetricElem for CosineSimilarity { - fn dist(&self, _other: &Self) -> f64 { - todo!() + fn dist(&self, other: &Self) -> f64 { + let x = self + .0 + .iter() + .zip(other.0.iter()) + .map(|(a, b)| *a * *b) + .sum::(); + let mag_a = self.0.iter().map(|x| x.powi(2)).sum::(); + let mag_b = other.0.iter().map(|x| x.powi(2)).sum::(); + (x / (mag_a * mag_b).sqrt()) as f64 } } impl MetricElem for EuclidianDistance { -- cgit v1.2.3-70-g09d2 From d0ce1d9134968a15d37135622138f6b8b7667454 Mon Sep 17 00:00:00 2001 From: metamuffin Date: Wed, 20 Sep 2023 21:40:05 +0200 Subject: replace cosinesim with L2-norm euclidean --- src/embedders/vecmetric.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/embedders/vecmetric.rs b/src/embedders/vecmetric.rs index 0c63911..1bda3a8 100644 --- a/src/embedders/vecmetric.rs +++ b/src/embedders/vecmetric.rs @@ -19,15 +19,14 @@ impl VecMetric for ManhattenDistance {} impl MetricElem for CosineSimilarity { fn dist(&self, other: &Self) -> f64 { - let x = self - .0 + let len_a = self.0.iter().map(|x| x.powi(2)).sum::().sqrt(); + let len_b = other.0.iter().map(|x| x.powi(2)).sum::().sqrt(); + self.0 .iter() .zip(other.0.iter()) - .map(|(a, b)| *a * *b) - .sum::(); - let mag_a = self.0.iter().map(|x| x.powi(2)).sum::(); - let mag_b = other.0.iter().map(|x| x.powi(2)).sum::(); - (x / (mag_a * mag_b).sqrt()) as f64 + .map(|(a, b)| (*a / len_a - *b / len_b).powi(2)) + .sum::() + .sqrt() as f64 } } impl MetricElem for EuclidianDistance { -- cgit v1.2.3-70-g09d2 From 4c7c58f487d0ccb70162421e5a871a4020454022 Mon Sep 17 00:00:00 2001 From: metamuffin Date: Wed, 20 Sep 2023 22:15:48 +0200 Subject: use angular distance instead of cossim replacement --- src/embedders/ai.rs | 2 +- src/embedders/vecmetric.rs | 22 ++++++++++++---------- src/main.rs | 6 +++--- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/embedders/ai.rs b/src/embedders/ai.rs index e772e4a..ac27708 100644 --- a/src/embedders/ai.rs +++ b/src/embedders/ai.rs @@ -1,4 +1,3 @@ -use crate::{BatchEmbedder, Config}; use anyhow::Result; use indicatif::{ProgressBar, ProgressIterator, ProgressStyle}; use std::{ @@ -10,6 +9,7 @@ use std::{ }; use super::vecmetric::VecMetric; +use crate::{BatchEmbedder, Config}; pub(crate) struct ContentEmbedder<'a, Metric> { cfg: &'a Config, diff --git a/src/embedders/vecmetric.rs b/src/embedders/vecmetric.rs index 1bda3a8..9f2f143 100644 --- a/src/embedders/vecmetric.rs +++ b/src/embedders/vecmetric.rs @@ -4,29 +4,31 @@ use serde::{Deserialize, Serialize}; pub trait VecMetric: MetricElem + From> {} #[derive(Deserialize, Serialize)] -pub struct CosineSimilarity(pub Vec); +pub struct AngularDistance(pub Vec); #[derive(Deserialize, Serialize)] pub struct EuclidianDistance(pub Vec); #[derive(Deserialize, Serialize)] pub struct ManhattenDistance(pub Vec); -impl VecMetric for CosineSimilarity {} +impl VecMetric for AngularDistance {} impl VecMetric for EuclidianDistance {} impl VecMetric for ManhattenDistance {} -#[rustfmt::skip] impl From> for CosineSimilarity { fn from(value: Vec) -> Self { Self(value) } } +#[rustfmt::skip] impl From> for AngularDistance { fn from(value: Vec) -> Self { Self(value) } } #[rustfmt::skip] impl From> for EuclidianDistance { fn from(value: Vec) -> Self { Self(value) } } #[rustfmt::skip] impl From> for ManhattenDistance { fn from(value: Vec) -> Self { Self(value) } } -impl MetricElem for CosineSimilarity { +impl MetricElem for AngularDistance { fn dist(&self, other: &Self) -> f64 { - let len_a = self.0.iter().map(|x| x.powi(2)).sum::().sqrt(); - let len_b = other.0.iter().map(|x| x.powi(2)).sum::().sqrt(); - self.0 + let x = self + .0 .iter() .zip(other.0.iter()) - .map(|(a, b)| (*a / len_a - *b / len_b).powi(2)) - .sum::() - .sqrt() as f64 + .map(|(a, b)| *a * *b) + .sum::(); + let mag_a = self.0.iter().map(|x| x.powi(2)).sum::(); + let mag_b = other.0.iter().map(|x| x.powi(2)).sum::(); + let cossim = x / (mag_a * mag_b).sqrt(); + cossim.acos() as f64 } } impl MetricElem for EuclidianDistance { diff --git a/src/main.rs b/src/main.rs index 2dda2da..f9bf4fc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,7 +20,7 @@ enum Embedder { Hue, Color, ContentEuclidean, - ContentCosineSim, + ContentAngularDistance, ContentManhatten, } @@ -177,8 +177,8 @@ fn main() -> Result<()> { Embedder::Brightness => process_embedder(BrightnessEmbedder, &args, &cfg), Embedder::Hue => process_embedder(HueEmbedder, &args, &cfg), Embedder::Color => process_embedder(ColorEmbedder, &args, &cfg), - Embedder::ContentCosineSim => { - process_embedder(ContentEmbedder::::new(&cfg), &args, &cfg) + Embedder::ContentAngularDistance => { + process_embedder(ContentEmbedder::::new(&cfg), &args, &cfg) } Embedder::ContentEuclidean => { process_embedder(ContentEmbedder::::new(&cfg), &args, &cfg) -- cgit v1.2.3-70-g09d2 From bcfc1b328f10188567ec99720a04418dec728868 Mon Sep 17 00:00:00 2001 From: metamuffin Date: Wed, 20 Sep 2023 22:21:33 +0200 Subject: use direct phantomdata constructor --- src/embedders/ai.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embedders/ai.rs b/src/embedders/ai.rs index ac27708..120714c 100644 --- a/src/embedders/ai.rs +++ b/src/embedders/ai.rs @@ -19,7 +19,7 @@ impl<'a, Metric> ContentEmbedder<'a, Metric> { pub(crate) fn new(cfg: &'a Config) -> Self { ContentEmbedder { cfg, - _sim: PhantomData::default(), + _sim: PhantomData, } } } -- cgit v1.2.3-70-g09d2