refactor: dedup clean uid

This commit is contained in:
mokurin000
2025-08-11 13:12:11 +08:00
parent b0942e2af4
commit 32cf576b88
3 changed files with 11 additions and 16 deletions

11
utils/hash_userid.py Normal file
View File

@@ -0,0 +1,11 @@
from sys import argv
import polars as pl
import polars_hash as pl_hash
# Usage: <script> <input.parquet>
#
# Reads the given parquet file, replaces the `user_id` column with the first
# 16 characters of the SHA-256 of (user_id as string + fixed suffix), and
# writes the result to a sibling file whose name ends in `_pub.parquet`.
if len(argv) < 2:
    raise SystemExit(f"usage: {argv[0]} <input.parquet>")
file = argv[1]

# Only rewrite the trailing extension. A plain str.replace would also touch
# ".parquet" occurring earlier in the path (e.g. a directory name), and would
# be a no-op for inputs without that extension -- making the output path equal
# to the input path and overwriting the source data with the hashed copy.
out_file = file.removesuffix(".parquet") + "_pub.parquet"

# NOTE(review): the string appended below looks like a salt/pepper and is
# hard-coded in source control -- confirm whether it should be loaded from
# configuration instead. It is left unchanged so existing hashes stay stable.
pl.scan_parquet(file).with_columns(
    # Stringify the id and append the fixed suffix before hashing.
    pl.col("user_id").cast(pl.String).add("Lt2N5xgjJOqRsT5qVt7wWYw6SqOPZDI7")
).with_columns(
    # Hash the salted value and keep only the first 16 characters.
    pl_hash.col("user_id").chash.sha2_256().str.head(16)
).collect().write_parquet(out_file)