aboutsummaryrefslogtreecommitdiff
path: root/import/src/vgmdb.rs
blob: 402fd9082f38d1f1e3ce4806a73cd307fa226fc6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/*
    This file is part of jellything (https://codeberg.org/metamuffin/jellything)
    which is licensed under the GNU Affero General Public License (version 3); see /COPYING.
    Copyright (C) 2025 metamuffin <metamuffin.org>
*/

use crate::USER_AGENT;
use anyhow::{Context, Result};
use jellycache::{cache, cache_store, HashKey};
use jellycommon::Asset;
use log::info;
use regex::Regex;
use reqwest::{
    header::{HeaderMap, HeaderName, HeaderValue},
    Client, ClientBuilder,
};
use std::{
    sync::{Arc, LazyLock},
    time::Duration,
};
use tokio::{
    runtime::Handle,
    sync::Semaphore,
    time::{sleep_until, Instant},
};

/// Scraping client for vgmdb.net artist pages and the artist images they link to.
pub struct Vgmdb {
    /// HTTP client used for all requests; `new` installs the crate's user agent
    /// as a default header on it.
    client: Client,
    /// Semaphore from which a permit is acquired before each artist page request.
    rate_limit: Arc<Semaphore>,
}

/// Matches a single-quoted `href` attribute that points below
/// `https://media.vgm.io/artists/` and exposes the whole URL as the named
/// capture group `url`.
static RE_IMAGE_URL_FROM_HTML: LazyLock<Regex> = LazyLock::new(|| {
    // The dots of the host name are escaped so that they only match a literal
    // `.`; unescaped they would match any character and accept look-alike hosts.
    Regex::new(r#"href='(?<url>https://media\.vgm\.io/artists/[-/\w\.]+)'"#)
        .expect("hard-coded artist image regex must be valid")
});

impl Default for Vgmdb {
    fn default() -> Self {
        Self::new()
    }
}

impl Vgmdb {
    pub fn new() -> Self {
        let client = ClientBuilder::new()
            .default_headers(HeaderMap::from_iter([
                (
                    HeaderName::from_static("user-agent"),
                    HeaderValue::from_static(USER_AGENT),
                ),
                (
                    HeaderName::from_static("x-comment"),
                    HeaderValue::from_static("Please add an API, thanks!"),
                ),
            ]))
            .build()
            .unwrap();
        Self {
            client,
            rate_limit: Arc::new(Semaphore::new(3)),
        }
    }

    pub fn get_artist_image(&self, id: u64, rt: &Handle) -> Result<Option<Asset>> {
        if let Some(url) = self.get_artist_image_url(id, rt)? {
            cache_store(
                format!("ext/vgmdb/artist-image/{}.image", HashKey(&url)),
                move || {
                    rt.block_on(async {
                        info!("downloading image {url:?}");
                        Ok(self
                            .client
                            .get(url)
                            .send()
                            .await?
                            .error_for_status()?
                            .bytes()
                            .await?
                            .to_vec())
                    })
                },
            )
            .context("vgmdb media download")
            .map(Asset)
            .map(Some)
        } else {
            Ok(None)
        }
    }

    pub fn get_artist_image_url(&self, id: u64, rt: &Handle) -> Result<Option<String>> {
        let html = self.scrape_artist_page(id, rt)?;
        if let Some(cap) = RE_IMAGE_URL_FROM_HTML.captures(&str::from_utf8(&html).unwrap()) {
            if let Some(url) = cap.name("url").map(|m| m.as_str()) {
                return Ok(Some(url.to_string()));
            }
        }
        Ok(None)
    }

    pub fn scrape_artist_page(&self, id: u64, rt: &Handle) -> Result<Vec<u8>> {
        cache(&format!("ext/vgmdb/artist-page/{id}.html"), move || {
            rt.block_on(async {
                let _permit = self.rate_limit.clone().acquire_owned().await?;
                let permit_drop_ts = Instant::now() + Duration::from_secs(1);
                info!("scrape artist: {id}");

                let resp = self
                    .client
                    .get(format!("https://vgmdb.net/artist/{id}"))
                    .send()
                    .await?
                    .error_for_status()?
                    .bytes()
                    .await?
                    .to_vec();

                tokio::task::spawn(async move {
                    sleep_until(permit_drop_ts).await;
                    drop(_permit);
                });

                Ok(resp)
            })
        })
        .context("vgmdb artist page scrape")
    }
}