aboutsummaryrefslogtreecommitdiff
path: root/import/src/vgmdb.rs
blob: 6278aaab36be5181798cd07a4b70ef615163e0b4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
/*
    This file is part of jellything (https://codeberg.org/metamuffin/jellything)
    which is licensed under the GNU Affero General Public License (version 3); see /COPYING.
    Copyright (C) 2025 metamuffin <metamuffin.org>
*/

use crate::USER_AGENT;
use anyhow::{Context, Result};
use jellycache::{async_cache_file, async_cache_memory, CachePath};
use log::info;
use regex::Regex;
use reqwest::{
    header::{HeaderMap, HeaderName, HeaderValue},
    Client, ClientBuilder,
};
use std::{
    sync::{Arc, LazyLock},
    time::Duration,
};
use tokio::{
    io::AsyncWriteExt,
    sync::Semaphore,
    time::{sleep_until, Instant},
};

/// Scraper for artist data on vgmdb.net.
///
/// Bundles the HTTP client used for all requests with a semaphore that
/// throttles how many artist-page scrapes may be in flight.
pub struct Vgmdb {
    /// HTTP client; `new` configures its default headers.
    client: Client,
    /// Permit pool acquired before each artist-page request (created with 3 permits in `new`).
    rate_limit: Arc<Semaphore>,
}

/// Matches a single-quoted `href` attribute that points at an artist image on
/// `media.vgm.io` and exposes the link through the named group `url`.
///
/// The dots of the host name are escaped so that only the literal host
/// `media.vgm.io` is accepted; an unescaped `.` would match any character and
/// let look-alike hosts through.
static RE_IMAGE_URL_FROM_HTML: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"href='(?<url>https://media\.vgm\.io/artists/[-/\w\.]+)'"#)
        .expect("hard-coded artist image regex is valid")
});

impl Vgmdb {
    pub fn new() -> Self {
        let client = ClientBuilder::new()
            .default_headers(HeaderMap::from_iter([
                (
                    HeaderName::from_static("user-agent"),
                    HeaderValue::from_static(USER_AGENT),
                ),
                (
                    HeaderName::from_static("x-comment"),
                    HeaderValue::from_static("Please add an API, thanks!"),
                ),
            ]))
            .build()
            .unwrap();
        Self {
            client,
            rate_limit: Arc::new(Semaphore::new(3)),
        }
    }

    pub async fn get_artist_image(&self, id: u64) -> Result<Option<CachePath>> {
        if let Some(url) = self.get_artist_image_url(id).await? {
            Ok(Some(
                async_cache_file("api-vgmdb-media", url.clone(), |mut file| async move {
                    info!("downloading image {url:?}");
                    let mut res = self.client.get(url).send().await?.error_for_status()?;
                    while let Some(chunk) = res.chunk().await? {
                        file.write_all(&chunk).await?;
                    }
                    Ok(())
                })
                .await
                .context("vgmdb media download")?,
            ))
        } else {
            Ok(None)
        }
    }

    pub async fn get_artist_image_url(&self, id: u64) -> Result<Option<String>> {
        let html = self.scrape_artist_page(id).await?;
        if let Some(cap) = RE_IMAGE_URL_FROM_HTML.captures(&html) {
            if let Some(url) = cap.name("url").map(|m| m.as_str()) {
                return Ok(Some(url.to_string()));
            }
        }
        return Ok(None);
    }

    pub async fn scrape_artist_page(&self, id: u64) -> Result<Arc<String>> {
        async_cache_memory("api-vgmdb-artist", id.clone(), || async move {
            let _permit = self.rate_limit.clone().acquire_owned().await?;
            let permit_drop_ts = Instant::now() + Duration::from_secs(1);
            info!("scrape artist: {id}");

            let resp = self
                .client
                .get(format!("https://vgmdb.net/artist/{id}"))
                .send()
                .await?
                .error_for_status()?
                .text()
                .await?;

            tokio::task::spawn(async move {
                sleep_until(permit_drop_ts).await;
                drop(_permit);
            });

            Ok(resp)
        })
        .await
        .context("vgmdb artist page scrape")
    }
}