Files
sdgb-utils-rs/sdgb-cli/src/utils/helpers/mod.rs
2025-09-19 03:18:20 +08:00

209 lines
5.7 KiB
Rust

use std::sync::Arc;
use std::{fs::OpenOptions, io::BufWriter};
use std::{path::Path, sync::atomic::Ordering};
use futures_util::StreamExt;
use nyquest_preset::nyquest::AsyncClient;
use parquet::basic::BrotliLevel;
use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;
use parquet::record::RecordWriter;
use redb::ReadableTable;
use redb::TableDefinition;
use serde::Serialize;
use spdlog::{error, info};
use sdgb_api::title::MaiVersionExt;
use sdgb_api::title::{Sdgb1_50, methods::APIExt};
use sdgb_api::{ApiError, bincode};
use bincode::{BorrowDecode, Encode, borrow_decode_from_slice};
use crate::{EARLY_QUIT, cache};
#[allow(unused)]
pub fn read_cache_keys(
definition: TableDefinition<'_, u32, Vec<u8>>,
) -> Result<Vec<u32>, Box<dyn snafu::Error>> {
let txn = cache::read_txn()?;
let table = cache::open_table_ro(&txn, definition)?;
Ok(table
.iter()?
.flatten()
.map(|(value, _)| value.value())
.collect::<Vec<u32>>())
}
pub fn read_cache<D>(
definition: TableDefinition<'_, u32, Vec<u8>>,
) -> Result<Vec<D>, Box<dyn snafu::Error>>
where
D: for<'d> BorrowDecode<'d, ()>,
{
let txn = cache::read_txn()?;
let table = cache::open_table_ro(&txn, definition)?;
let config =
bincode::config::Configuration::<bincode::config::LittleEndian>::default().with_no_limit();
Ok(table
.iter()?
.flatten()
.map(|d| borrow_decode_from_slice(&d.1.value(), config))
.flatten()
.map(|(value, _)| value)
.collect::<Vec<D>>())
}
pub fn dump_parquet<D>(
data: impl Into<Vec<D>>,
output_path: impl AsRef<Path>,
) -> Result<(), Box<dyn snafu::Error>>
where
for<'a> &'a [D]: RecordWriter<D>,
{
let data = data.into();
let file = OpenOptions::new()
.create(true)
.truncate(true)
.write(true)
.open(output_path)?;
#[cfg(file_lock_ready)]
file.try_lock()?;
let writer = BufWriter::new(file);
let schema = data.as_slice().schema()?;
let props = Arc::new(
WriterProperties::builder()
.set_compression(parquet::basic::Compression::BROTLI(BrotliLevel::try_new(
6,
)?))
.build(),
);
let mut writer = SerializedFileWriter::new(writer, schema, props).unwrap();
let mut row_group = writer.next_row_group().unwrap();
data.as_slice().write_to_row_group(&mut row_group)?;
row_group.close()?;
writer.close().unwrap();
info!("dumped {} records", data.len());
Ok(())
}
pub fn dump_json<D>(
output_path: impl AsRef<Path>,
definition: TableDefinition<'_, u32, Vec<u8>>,
) -> Result<(), Box<dyn snafu::Error>>
where
D: for<'d> BorrowDecode<'d, ()> + Serialize,
{
let file = OpenOptions::new()
.create(true)
.truncate(true)
.write(true)
.open(output_path)?;
#[cfg(file_lock_ready)]
file.try_lock()?;
let data = read_cache::<D>(definition)?;
let writer = BufWriter::new(file);
serde_json::to_writer_pretty(writer, &data)?;
info!("dumped {} records", data.len());
Ok(())
}
pub async fn cached_concurrent_fetch<A: APIExt>(
user_ids: impl Into<Vec<u32>>,
client: &AsyncClient,
concurrency: usize,
definition: TableDefinition<'_, u32, Vec<u8>>,
) -> Result<(), Box<dyn snafu::Error>>
where
A::Payload: From<u32>,
A::Response: Encode + for<'a> BorrowDecode<'a, ()>,
{
cached_concurrent_fetch_userfn(
user_ids,
client,
concurrency,
definition,
async |client, user_id| {
Sdgb1_50::request_ext::<A>(client, A::Payload::from(user_id), user_id).await
},
)
.await
}
pub async fn cached_concurrent_fetch_userfn<R>(
user_ids: impl Into<Vec<u32>>,
client: &AsyncClient,
concurrency: usize,
definition: TableDefinition<'_, u32, Vec<u8>>,
scrape: impl AsyncFn(&AsyncClient, u32) -> Result<R, ApiError>,
) -> Result<(), Box<dyn snafu::Error>>
where
R: Encode + for<'a> BorrowDecode<'a, ()>,
{
let _ = cache::init_db();
let user_ids = user_ids.into();
let read = cache::read_txn()?;
let write = cache::write_txn()?;
let config = sdgb_api::bincode::config::Configuration::<
sdgb_api::bincode::config::LittleEndian,
>::default()
.with_no_limit();
info!("number of user_id: {}", user_ids.len());
let collect = futures_util::stream::iter(user_ids)
.map(async |user_id| {
{
let cache_table = cache::open_table_ro(&read, definition)?;
let data = cache_table.get(user_id)?;
if data.is_some() {
return Ok(());
}
}
if EARLY_QUIT.load(Ordering::Relaxed) {
return Err("early skip due to ctrl-c")?;
}
let resp = scrape(&client, user_id).await;
match &resp {
Ok(resp) => {
use sdgb_api::bincode::encode_to_vec;
if let Ok(mut table) = cache::open_table(&write, definition)
&& let Ok(encoded) = encode_to_vec(resp, config)
{
info!("encode length for {user_id}: {}", encoded.len());
_ = table.insert(user_id, encoded);
}
}
Err(sdgb_api::ApiError::JSON { .. }) => {}
Err(e) => {
error!("fetch failed: {e}");
}
}
Result::<_, Box<dyn snafu::Error>>::Ok(())
})
.buffer_unordered(concurrency) // slower to avoid being banned
.collect::<Vec<_>>()
.await;
drop(collect);
let _ = write.commit();
Ok(())
}