chore: scraping user data becomes impossible
This commit is contained in:
@@ -6,21 +6,13 @@ authors = ["mokurin000"]
|
||||
description = "CLI tool for SDGB protocol"
|
||||
|
||||
[features]
|
||||
default = ["compio", "fetchall"]
|
||||
default = ["compio"]
|
||||
|
||||
compio = ["dep:compio", "sdgb-api/compio"]
|
||||
tokio = ["dep:tokio", "sdgb-api/tokio"]
|
||||
|
||||
fetchall = [
|
||||
"dep:redb",
|
||||
"dep:futures-util",
|
||||
"dep:parquet",
|
||||
"dep:music-db",
|
||||
"sdgb-api/parquet",
|
||||
]
|
||||
|
||||
[dependencies]
|
||||
sdgb-api = { workspace = true, features = ["bincode"] }
|
||||
sdgb-api = { workspace = true, features = [] }
|
||||
music-db = { workspace = true, optional = true }
|
||||
|
||||
# (de)serialization
|
||||
@@ -32,10 +24,6 @@ strum = { workspace = true }
|
||||
spdlog-rs = { workspace = true }
|
||||
snafu = { workspace = true }
|
||||
|
||||
|
||||
# kv database
|
||||
redb = { workspace = true, optional = true }
|
||||
|
||||
# async runtime
|
||||
tokio = { workspace = true, features = ["macros"], optional = true }
|
||||
compio = { workspace = true, features = ["macros"], optional = true }
|
||||
@@ -49,7 +37,5 @@ ctrlc = { version = "3.4.7", features = ["termination"] }
|
||||
# magic macro
|
||||
crabtime = { workspace = true }
|
||||
|
||||
parquet = { workspace = true, optional = true }
|
||||
|
||||
[build-dependencies]
|
||||
version_check = "0.9.5"
|
||||
|
||||
64
sdgb-cli/src/cache/mod.rs
vendored
64
sdgb-cli/src/cache/mod.rs
vendored
@@ -1,64 +0,0 @@
|
||||
use std::sync::LazyLock;
|
||||
|
||||
use redb::{ReadTransaction, ReadableDatabase as _, Table, TableDefinition, WriteTransaction};
|
||||
|
||||
static DATABASE: LazyLock<redb::Database> = LazyLock::new(|| {
|
||||
let mut db = redb::Database::builder()
|
||||
.create("players.redb")
|
||||
.expect("failed to open database");
|
||||
_ = db.compact();
|
||||
db
|
||||
});
|
||||
|
||||
pub fn write_txn() -> Result<WriteTransaction, redb::Error> {
|
||||
Ok(DATABASE.begin_write()?)
|
||||
}
|
||||
|
||||
pub fn read_txn() -> Result<ReadTransaction, redb::Error> {
|
||||
Ok(DATABASE.begin_read()?)
|
||||
}
|
||||
|
||||
pub fn open_table<'a>(
|
||||
write: &'a WriteTransaction,
|
||||
definition: TableDefinition<'_, u32, Vec<u8>>,
|
||||
) -> Result<Table<'a, u32, Vec<u8>>, redb::Error> {
|
||||
Ok(write.open_table(definition)?)
|
||||
}
|
||||
|
||||
pub fn open_table_ro(
|
||||
read: &ReadTransaction,
|
||||
definition: TableDefinition<'_, u32, Vec<u8>>,
|
||||
) -> Result<redb::ReadOnlyTable<u32, Vec<u8>>, redb::Error> {
|
||||
Ok(read.open_table(definition)?)
|
||||
}
|
||||
|
||||
#[crabtime::function]
|
||||
fn table_definitions_impl(tables: Vec<String>) {
|
||||
let mut defs: Vec<String> = Vec::new();
|
||||
|
||||
for table in tables {
|
||||
let definition = table.to_uppercase();
|
||||
let table_name = format!("\"{table}\"");
|
||||
|
||||
crabtime::output!(
|
||||
pub const {{definition}}: TableDefinition<'_, u32, Vec<u8>> = redb::TableDefinition::new({{table_name}});
|
||||
);
|
||||
|
||||
defs.push(format!("write_txn.open_table({definition})?;"));
|
||||
}
|
||||
|
||||
let init_statements = defs.join("\n");
|
||||
|
||||
crabtime::output!(
|
||||
pub fn init_db() -> Result<(), redb::Error> {
|
||||
let write_txn = DATABASE.begin_write()?;
|
||||
{
|
||||
{ init_statements }
|
||||
}
|
||||
write_txn.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
table_definitions_impl!(["players", "b50", "records", "regions"]);
|
||||
@@ -85,55 +85,6 @@ pub enum Commands {
|
||||
token: Option<String>,
|
||||
},
|
||||
|
||||
/// Scrape all user, read possible id from stdin
|
||||
#[cfg(feature = "fetchall")]
|
||||
ListAllUser {
|
||||
#[arg(short, long, default_value_t = 5)]
|
||||
concurrency: usize,
|
||||
},
|
||||
#[cfg(feature = "fetchall")]
|
||||
/// Scrape B50 data
|
||||
ScrapeAllB50 {
|
||||
#[arg(short, long, default_value_t = 5)]
|
||||
concurrency: usize,
|
||||
|
||||
#[arg(long, default_value_t = 1000)]
|
||||
min_rating: i64,
|
||||
#[arg(long, default_value_t = 16500)]
|
||||
max_rating: i64,
|
||||
},
|
||||
/// Scrape Region data
|
||||
#[cfg(feature = "fetchall")]
|
||||
ScrapeAllRegion {
|
||||
#[arg(short, long, default_value_t = 5)]
|
||||
concurrency: usize,
|
||||
|
||||
#[arg(long, default_value_t = 1000)]
|
||||
min_rating: i64,
|
||||
#[arg(long, default_value_t = 16500)]
|
||||
max_rating: i64,
|
||||
},
|
||||
/// Scrape all player record
|
||||
#[cfg(feature = "fetchall")]
|
||||
ScrapeAllRecord {
|
||||
#[arg(short, long, default_value_t = 5)]
|
||||
concurrency: usize,
|
||||
|
||||
#[arg(long, default_value_t = 10000)]
|
||||
min_rating: i64,
|
||||
#[arg(long, default_value_t = 16400)]
|
||||
max_rating: i64,
|
||||
},
|
||||
|
||||
#[cfg(feature = "fetchall")]
|
||||
ListAllUserDump {},
|
||||
#[cfg(feature = "fetchall")]
|
||||
ScrapeAllB50Dump {},
|
||||
#[cfg(feature = "fetchall")]
|
||||
ScrapeAllRegionDump {},
|
||||
#[cfg(feature = "fetchall")]
|
||||
ScrapeAllRecordDump {},
|
||||
|
||||
Logout {
|
||||
#[arg(short, long)]
|
||||
user_id: u32,
|
||||
|
||||
@@ -32,8 +32,6 @@ use crate::{
|
||||
utils::{human_readable_display, json_display, login_action},
|
||||
};
|
||||
|
||||
#[cfg(feature = "fetchall")]
|
||||
mod cache;
|
||||
mod commands;
|
||||
mod utils;
|
||||
|
||||
@@ -221,216 +219,6 @@ async fn main() -> Result<(), Box<dyn snafu::Error>> {
|
||||
println!("{}", String::from_utf8_lossy(&resp));
|
||||
}
|
||||
|
||||
#[cfg(feature = "fetchall")]
|
||||
Commands::ListAllUser { concurrency } => {
|
||||
use crate::{cache::PLAYERS, utils::helpers::cached_concurrent_fetch};
|
||||
use sdgb_api::title::methods::GetUserPreviewApiExt;
|
||||
use std::io::BufRead as _;
|
||||
|
||||
let mut user_ids = Vec::new();
|
||||
{
|
||||
let mut stdin = std::io::stdin().lock();
|
||||
let mut buf = String::new();
|
||||
|
||||
while stdin.read_line(&mut buf).is_ok_and(|size| size != 0) {
|
||||
if buf.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let user_id: u32 = buf.trim().parse()?;
|
||||
buf.clear();
|
||||
user_ids.push(user_id);
|
||||
}
|
||||
}
|
||||
|
||||
cached_concurrent_fetch::<GetUserPreviewApiExt>(
|
||||
user_ids,
|
||||
&client,
|
||||
concurrency,
|
||||
PLAYERS,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
#[cfg(feature = "fetchall")]
|
||||
Commands::ScrapeAllRecord {
|
||||
concurrency,
|
||||
min_rating,
|
||||
max_rating,
|
||||
} => {
|
||||
use crate::{
|
||||
cache::{PLAYERS, RECORDS},
|
||||
utils::helpers::{cached_concurrent_fetch_userfn, read_cache},
|
||||
};
|
||||
|
||||
let players: Vec<GetUserPreviewApiResp> = read_cache(PLAYERS)?;
|
||||
cached_concurrent_fetch_userfn(
|
||||
players
|
||||
.iter()
|
||||
.filter(|p| p.player_rating >= min_rating && p.player_rating <= max_rating)
|
||||
.map(|p| p.user_id)
|
||||
.collect::<Vec<u32>>(),
|
||||
&client,
|
||||
concurrency,
|
||||
RECORDS,
|
||||
get_user_all_music,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
#[cfg(feature = "fetchall")]
|
||||
Commands::ScrapeAllB50 {
|
||||
concurrency,
|
||||
min_rating,
|
||||
max_rating,
|
||||
} => {
|
||||
use sdgb_api::title::methods::GetUserRatingApiExt;
|
||||
|
||||
use crate::{
|
||||
cache::{B50, PLAYERS},
|
||||
utils::helpers::{cached_concurrent_fetch, read_cache},
|
||||
};
|
||||
|
||||
let players: Vec<GetUserPreviewApiResp> = read_cache(PLAYERS)?;
|
||||
cached_concurrent_fetch::<GetUserRatingApiExt>(
|
||||
players
|
||||
.iter()
|
||||
.filter(|p| p.player_rating >= min_rating && p.player_rating <= max_rating)
|
||||
.map(|p| p.user_id)
|
||||
.collect::<Vec<u32>>(),
|
||||
&client,
|
||||
concurrency,
|
||||
B50,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
#[cfg(feature = "fetchall")]
|
||||
Commands::ScrapeAllRegion {
|
||||
concurrency,
|
||||
min_rating,
|
||||
max_rating,
|
||||
} => {
|
||||
use sdgb_api::title::methods::GetUserRegionApiExt;
|
||||
|
||||
use crate::{
|
||||
cache::{PLAYERS, REGIONS},
|
||||
utils::helpers::{cached_concurrent_fetch, read_cache},
|
||||
};
|
||||
|
||||
let players: Vec<GetUserPreviewApiResp> = read_cache(PLAYERS)?;
|
||||
cached_concurrent_fetch::<GetUserRegionApiExt>(
|
||||
players
|
||||
.iter()
|
||||
.filter(|p| p.player_rating >= min_rating && p.player_rating <= max_rating)
|
||||
.map(|p| p.user_id)
|
||||
.collect::<Vec<u32>>(),
|
||||
&client,
|
||||
concurrency,
|
||||
REGIONS,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
#[cfg(feature = "fetchall")]
|
||||
Commands::ListAllUserDump {} => {
|
||||
use crate::{
|
||||
cache::PLAYERS,
|
||||
utils::helpers::{dump_parquet, read_cache},
|
||||
};
|
||||
|
||||
let players: Vec<GetUserPreviewApiResp> = read_cache(PLAYERS)?;
|
||||
dump_parquet(players, "players.parquet")?;
|
||||
}
|
||||
#[cfg(feature = "fetchall")]
|
||||
Commands::ScrapeAllRegionDump {} => {
|
||||
use crate::{
|
||||
cache::REGIONS,
|
||||
utils::helpers::{dump_parquet, read_cache},
|
||||
};
|
||||
use sdgb_api::title::model::{GetUserRegionApiResp, UserRegionFlatten};
|
||||
|
||||
let regions: Vec<GetUserRegionApiResp> = read_cache(REGIONS)?;
|
||||
let regions_flat = regions
|
||||
.into_iter()
|
||||
.map(Vec::<UserRegionFlatten>::from)
|
||||
.flatten()
|
||||
.collect::<Vec<_>>();
|
||||
dump_parquet(regions_flat, "regions.parquet")?;
|
||||
}
|
||||
#[cfg(feature = "fetchall")]
|
||||
Commands::ScrapeAllRecordDump {} => {
|
||||
use crate::{
|
||||
cache::RECORDS,
|
||||
utils::helpers::{dump_parquet, read_cache},
|
||||
};
|
||||
use sdgb_api::title::model::GetUserMusicApiResp;
|
||||
use sdgb_api::title::model::UserMusicDetailFlatten;
|
||||
|
||||
let records: Vec<GetUserMusicApiResp> = read_cache(RECORDS)?;
|
||||
dump_parquet(
|
||||
records
|
||||
.into_iter()
|
||||
.map(|resp| {
|
||||
resp.user_music_list
|
||||
.into_iter()
|
||||
.map(|music| music.user_music_detail_list)
|
||||
.flatten()
|
||||
.map(move |detail| UserMusicDetailFlatten::new(resp.user_id, detail))
|
||||
})
|
||||
.flatten()
|
||||
.collect::<Vec<UserMusicDetailFlatten>>(),
|
||||
"records.parquet",
|
||||
)?;
|
||||
}
|
||||
#[cfg(feature = "fetchall")]
|
||||
Commands::ScrapeAllB50Dump {} => {
|
||||
use sdgb_api::title::model::{MusicRating, MusicRatingFlatten};
|
||||
|
||||
use crate::{
|
||||
cache::B50,
|
||||
utils::helpers::{dump_parquet, read_cache},
|
||||
};
|
||||
|
||||
let records: Vec<GetUserRatingApiResp> = read_cache(B50)?;
|
||||
dump_parquet::<MusicRatingFlatten>(
|
||||
records
|
||||
.into_iter()
|
||||
.map(
|
||||
|GetUserRatingApiResp {
|
||||
user_id,
|
||||
user_rating,
|
||||
}| {
|
||||
user_rating
|
||||
.rating_list
|
||||
.into_iter()
|
||||
.chain(user_rating.next_rating_list)
|
||||
.filter_map(
|
||||
move |MusicRating {
|
||||
music_id,
|
||||
level,
|
||||
rom_version,
|
||||
achievement,
|
||||
}| {
|
||||
let (_rank, dx_rating) =
|
||||
music_db::query_music_level(music_id, level)?
|
||||
.dx_rating(achievement);
|
||||
Some(MusicRatingFlatten {
|
||||
user_id,
|
||||
music_id,
|
||||
level,
|
||||
rom_version,
|
||||
achievement,
|
||||
dx_rating,
|
||||
})
|
||||
},
|
||||
)
|
||||
},
|
||||
)
|
||||
.flatten()
|
||||
.collect::<Vec<_>>(),
|
||||
"b50.parquet",
|
||||
)?;
|
||||
}
|
||||
|
||||
Commands::Userdata {
|
||||
user_id,
|
||||
skip_login,
|
||||
|
||||
@@ -1,183 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
use std::{fs::OpenOptions, io::BufWriter};
|
||||
use std::{path::Path, sync::atomic::Ordering};
|
||||
|
||||
use futures_util::StreamExt;
|
||||
use nyquest_preset::nyquest::AsyncClient;
|
||||
|
||||
use parquet::basic::BrotliLevel;
|
||||
use parquet::file::properties::WriterProperties;
|
||||
use parquet::file::writer::SerializedFileWriter;
|
||||
use parquet::record::RecordWriter;
|
||||
use redb::ReadableTable;
|
||||
use redb::TableDefinition;
|
||||
use spdlog::{error, info};
|
||||
|
||||
use sdgb_api::title::MaiVersionExt;
|
||||
use sdgb_api::title::{Sdgb1_53, methods::APIExt};
|
||||
use sdgb_api::{ApiError, bincode};
|
||||
|
||||
use bincode::{BorrowDecode, Encode, borrow_decode_from_slice};
|
||||
|
||||
use crate::{EARLY_QUIT, cache};
|
||||
|
||||
#[allow(unused)]
|
||||
pub fn read_cache_keys(
|
||||
definition: TableDefinition<'_, u32, Vec<u8>>,
|
||||
) -> Result<Vec<u32>, Box<dyn snafu::Error>> {
|
||||
let txn = cache::read_txn()?;
|
||||
let table = cache::open_table_ro(&txn, definition)?;
|
||||
|
||||
Ok(table
|
||||
.iter()?
|
||||
.flatten()
|
||||
.map(|(value, _)| value.value())
|
||||
.collect::<Vec<u32>>())
|
||||
}
|
||||
|
||||
pub fn read_cache<D>(
|
||||
definition: TableDefinition<'_, u32, Vec<u8>>,
|
||||
) -> Result<Vec<D>, Box<dyn snafu::Error>>
|
||||
where
|
||||
D: for<'d> BorrowDecode<'d, ()>,
|
||||
{
|
||||
let txn = cache::read_txn()?;
|
||||
let table = cache::open_table_ro(&txn, definition)?;
|
||||
|
||||
let config =
|
||||
bincode::config::Configuration::<bincode::config::LittleEndian>::default().with_no_limit();
|
||||
|
||||
Ok(table
|
||||
.iter()?
|
||||
.flatten()
|
||||
.map(|d| borrow_decode_from_slice(&d.1.value(), config))
|
||||
.flatten()
|
||||
.map(|(value, _)| value)
|
||||
.collect::<Vec<D>>())
|
||||
}
|
||||
|
||||
pub fn dump_parquet<D>(
|
||||
data: impl Into<Vec<D>>,
|
||||
output_path: impl AsRef<Path>,
|
||||
) -> Result<(), Box<dyn snafu::Error>>
|
||||
where
|
||||
for<'a> &'a [D]: RecordWriter<D>,
|
||||
{
|
||||
let data = data.into();
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.write(true)
|
||||
.open(output_path)?;
|
||||
|
||||
#[cfg(file_lock_ready)]
|
||||
file.try_lock()?;
|
||||
|
||||
let writer = BufWriter::new(file);
|
||||
let schema = data.as_slice().schema()?;
|
||||
let props = Arc::new(
|
||||
WriterProperties::builder()
|
||||
.set_compression(parquet::basic::Compression::BROTLI(BrotliLevel::try_new(
|
||||
6,
|
||||
)?))
|
||||
.build(),
|
||||
);
|
||||
|
||||
let mut writer = SerializedFileWriter::new(writer, schema, props).unwrap();
|
||||
let mut row_group = writer.next_row_group().unwrap();
|
||||
|
||||
data.as_slice().write_to_row_group(&mut row_group)?;
|
||||
row_group.close()?;
|
||||
|
||||
writer.close().unwrap();
|
||||
info!("dumped {} records", data.len());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn cached_concurrent_fetch<A: APIExt>(
|
||||
user_ids: impl Into<Vec<u32>>,
|
||||
client: &AsyncClient,
|
||||
concurrency: usize,
|
||||
definition: TableDefinition<'_, u32, Vec<u8>>,
|
||||
) -> Result<(), Box<dyn snafu::Error>>
|
||||
where
|
||||
A::Payload: From<u32>,
|
||||
A::Response: Encode + for<'a> BorrowDecode<'a, ()>,
|
||||
{
|
||||
cached_concurrent_fetch_userfn(
|
||||
user_ids,
|
||||
client,
|
||||
concurrency,
|
||||
definition,
|
||||
async |client, user_id| {
|
||||
Sdgb1_53::request_ext::<A>(client, A::Payload::from(user_id), user_id).await
|
||||
},
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn cached_concurrent_fetch_userfn<R>(
|
||||
user_ids: impl Into<Vec<u32>>,
|
||||
client: &AsyncClient,
|
||||
concurrency: usize,
|
||||
definition: TableDefinition<'_, u32, Vec<u8>>,
|
||||
scrape: impl AsyncFn(&AsyncClient, u32) -> Result<R, ApiError>,
|
||||
) -> Result<(), Box<dyn snafu::Error>>
|
||||
where
|
||||
R: Encode + for<'a> BorrowDecode<'a, ()>,
|
||||
{
|
||||
let _ = cache::init_db();
|
||||
|
||||
let user_ids = user_ids.into();
|
||||
let read = cache::read_txn()?;
|
||||
let write = cache::write_txn()?;
|
||||
let config = sdgb_api::bincode::config::Configuration::<
|
||||
sdgb_api::bincode::config::LittleEndian,
|
||||
>::default()
|
||||
.with_no_limit();
|
||||
|
||||
info!("number of user_id: {}", user_ids.len());
|
||||
|
||||
let collect = futures_util::stream::iter(user_ids)
|
||||
.map(async |user_id| {
|
||||
{
|
||||
let cache_table = cache::open_table_ro(&read, definition)?;
|
||||
let data = cache_table.get(user_id)?;
|
||||
if data.is_some() {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
if EARLY_QUIT.load(Ordering::Relaxed) {
|
||||
return Err("early skip due to ctrl-c")?;
|
||||
}
|
||||
|
||||
let resp = scrape(&client, user_id).await;
|
||||
match &resp {
|
||||
Ok(resp) => {
|
||||
use sdgb_api::bincode::encode_to_vec;
|
||||
|
||||
if let Ok(mut table) = cache::open_table(&write, definition)
|
||||
&& let Ok(encoded) = encode_to_vec(resp, config)
|
||||
{
|
||||
info!("encode length for {user_id}: {}", encoded.len());
|
||||
_ = table.insert(user_id, encoded);
|
||||
}
|
||||
}
|
||||
Err(sdgb_api::ApiError::JSON { .. }) => {}
|
||||
Err(e) => {
|
||||
error!("fetch failed: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
Result::<_, Box<dyn snafu::Error>>::Ok(())
|
||||
})
|
||||
.buffer_unordered(concurrency) // slower to avoid being banned
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
drop(collect);
|
||||
|
||||
let _ = write.commit();
|
||||
Ok(())
|
||||
}
|
||||
@@ -65,6 +65,3 @@ pub fn human_readable_display(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "fetchall")]
|
||||
pub mod helpers;
|
||||
|
||||
Reference in New Issue
Block a user