diff options
Diffstat (limited to 'import/src/vgmdb.rs')
-rw-r--r-- | import/src/vgmdb.rs | 108 |
1 files changed, 108 insertions, 0 deletions
diff --git a/import/src/vgmdb.rs b/import/src/vgmdb.rs new file mode 100644 index 0000000..9ac76d6 --- /dev/null +++ b/import/src/vgmdb.rs @@ -0,0 +1,108 @@ +/* + This file is part of jellything (https://codeberg.org/metamuffin/jellything) + which is licensed under the GNU Affero General Public License (version 3); see /COPYING. + Copyright (C) 2025 metamuffin <metamuffin.org> +*/ + +use crate::USER_AGENT; +use anyhow::Result; +use jellybase::cache::{async_cache_file, async_cache_memory, CachePath}; +use log::info; +use regex::Regex; +use reqwest::{ + header::{HeaderMap, HeaderName, HeaderValue}, + Client, ClientBuilder, +}; +use std::{ + sync::{Arc, LazyLock}, + time::Duration, +}; +use tokio::{ + io::AsyncWriteExt, + sync::Semaphore, + time::{sleep_until, Instant}, +}; + +pub struct Vgmdb { + client: Client, + rate_limit: Arc<Semaphore>, +} + +static RE_IMAGE_URL_FROM_HTML: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r#"href='(?<url>https://media.vgm.io/artists/[-/\w\.]+)'"#).unwrap() +}); + +impl Vgmdb { + pub fn new() -> Self { + let client = ClientBuilder::new() + .default_headers(HeaderMap::from_iter([ + ( + HeaderName::from_static("user-agent"), + HeaderValue::from_static(USER_AGENT), + ), + ( + HeaderName::from_static("x-comment"), + HeaderValue::from_static("Please add an API, thanks!"), + ), + ])) + .build() + .unwrap(); + Self { + client, + rate_limit: Arc::new(Semaphore::new(3)), + } + } + + pub async fn get_artist_image(&self, id: u64) -> Result<Option<CachePath>> { + if let Some(url) = self.get_artist_image_url(id).await? { + Ok(Some( + async_cache_file("api-vgmdb-media", url.clone(), |mut file| async move { + info!("downloading image {url:?}"); + let mut res = self.client.get(url).send().await?.error_for_status()?; + while let Some(chunk) = res.chunk().await? { + file.write_all(&chunk).await?; + } + Ok(()) + }) + .await?, + )) + } else { + Ok(None) + } + } + + pub async fn get_artist_image_url(&self, id: u64) -> Result<Option<String>> { + let html = self.scrape_artist_page(id).await?; + if let Some(cap) = RE_IMAGE_URL_FROM_HTML.captures(&html) { + if let Some(url) = cap.name("url").map(|m| m.as_str()) { + return Ok(Some(url.to_string())); + } + } + return Ok(None); + } + + pub async fn scrape_artist_page(&self, id: u64) -> Result<Arc<String>> { + async_cache_memory("api-vgmdb-artist", id.clone(), || async move { + let _permit = self.rate_limit.clone().acquire_owned().await?; + let permit_drop_ts = Instant::now() + Duration::from_secs(1); + info!("scrape artist: {id}"); + + let resp = self + .client + .get(format!("https://vgmdb.net/artist/{id}")) + .send() + .await? + .error_for_status()? + .text() + .await?; + + tokio::task::spawn(async move { + sleep_until(permit_drop_ts).await; + drop(_permit); + }); + + Ok(resp) + }) + .await + } +} |