"""Sync Arknights character voice-line metadata to HuggingFace datasets.

Downloads the ``charword_table.json`` game-data dumps (Kengxxiao's GitHub
mirrors) for a given language, joins each voice line with its voice actor,
and uploads the result as a parquet table plus a generated README to a
``deepghs/arknights_voices_<lang>`` dataset repository.
"""
import os
from functools import lru_cache

import pandas as pd
import requests
from ditk import logging
from hbutils.string import plural_word
from hbutils.system import TemporaryDirectory
from hfutils.operate import get_hf_fs, get_hf_client, upload_directory_as_directory
from hfutils.utils import number_to_tag


@lru_cache()
def get_raw_meta(lang):
    """Download and cache the raw ``charword_table.json`` for *lang*.

    :param lang: one of ``'zh'``, ``'jp'``, ``'en'``, ``'kr'``.
    :return: parsed JSON dict of the game's char-word table.
    :raises KeyError: if *lang* is not a supported language code.
    :raises requests.HTTPError: if the download fails.
    """
    if lang == 'zh':
        resp = requests.get(
            'https://raw.githubusercontent.com/Kengxxiao/ArknightsGameData/master/zh_CN/gamedata/excel/charword_table.json')
    else:
        # Non-zh dumps live in the YoStar mirror under locale-specific dirs.
        prefix = {
            'jp': 'ja_JP',
            'en': 'en_US',
            'kr': 'ko_KR',
        }[lang]
        resp = requests.get(
            f'https://raw.githubusercontent.com/Kengxxiao/ArknightsGameData_YoStar/main/{prefix}/gamedata/excel/charword_table.json')

    # Fail loudly on HTTP errors instead of attempting to JSON-decode an
    # error page (the original called resp.json() unconditionally).
    resp.raise_for_status()
    return resp.json()


@lru_cache()
def get_cv_list(lang):
    """Return ``(char_id, cv_name)`` pairs for characters voiced in *lang*.

    The per-language CV mapping is only present in the zh table, hence
    ``get_raw_meta('zh')`` regardless of *lang* (matches the original code).

    NOTE: the original was a *generator* decorated with ``@lru_cache()`` —
    the cache then held a generator that was exhausted after its first full
    iteration, so any repeated call for the same lang silently yielded
    nothing.  Returning a concrete list keeps the cache safe while staying
    iterable-of-pairs for callers.
    """
    d = get_raw_meta('zh')
    mt = {
        'zh': 'CN_MANDARIN',
        'en': 'EN',
        'jp': 'JP',
        'kr': 'KR',
    }

    pairs = []
    for key, value in d["voiceLangDict"].items():
        # Keep only real character entries that actually have a CV recorded
        # for the requested language.
        if value['charId'] == key and key.startswith('char_') and mt[lang] in value['dict']:
            pairs.append((key, value['dict'][mt[lang]]['cvName'][0]))
    return pairs


@lru_cache()
def get_text_for_lang(lang):
    """Build a DataFrame of all voice lines (with text and audio URL) for *lang*.

    Only lines whose character has a known voice actor in *lang* (per
    :func:`get_cv_list`) are included.

    :param lang: one of ``'zh'``, ``'jp'``, ``'en'``, ``'kr'``.
    :return: ``pd.DataFrame`` with one row per voice line.
    """
    d = get_raw_meta(lang)
    cvmap = {key: cvname for key, cvname in get_cv_list(lang)}
    rows = []
    # Maps our language code to the audio directory name on the prts asset host.
    vmap = {
        'jp': 'voice',
        'en': 'voice_en',
        'zh': 'voice_cn',
        'kr': 'voice_kr',
    }
    for key, value in d["charWords"].items():
        if value['charId'] in cvmap:
            rows.append({
                'id': key,
                'char_id': value['charId'],
                'voice_actor_name': cvmap[value['charId']],
                'char_word_id': value['charWordId'],
                'lock_description': value['lockDescription'],
                'place_type': value['placeType'],
                'unlock_param': value['unlockParam'],
                'unlock_type': value['unlockType'],
                'voice_asset': value['voiceAsset'],
                'voice_id': value['voiceId'],
                'voice_index': value['voiceIndex'],
                'voice_text': value['voiceText'],
                'voice_title': value['voiceTitle'],
                'voice_type': value['voiceType'],
                'word_key': value['wordKey'],
                'file_url': f'https://torappu.prts.wiki/assets/audio/{vmap[lang]}/{value["charId"]}/{value["voiceId"].lower()}.mp3',
            })

    return pd.DataFrame(rows)


def sync(lang):
    """Upload the *lang* voice-text table to its HuggingFace dataset repo.

    Creates the (private) repository on first run, then uploads a parquet
    table and a generated README card.

    :param lang: one of ``'zh'``, ``'jp'``, ``'en'``, ``'kr'``.
    """
    repository = f'deepghs/arknights_voices_{lang}'
    hf_fs = get_hf_fs()
    hf_client = get_hf_client()

    if not hf_client.repo_exists(repo_id=repository, repo_type='dataset'):
        # First-time setup: create a private repo and track json/csv via LFS.
        # NOTE(review): the source's indentation was lost in transit; this
        # setup is assumed to run only on repo creation — confirm upstream.
        hf_client.create_repo(repo_id=repository, repo_type='dataset', private=True)
        hf_client.update_repo_visibility(repo_id=repository, repo_type='dataset', private=True)
        attr_lines = hf_fs.read_text(f'datasets/{repository}/.gitattributes').splitlines(keepends=False)
        # Avoid stacking duplicate rules if the default attributes already
        # contain them.
        for rule in ('*.json filter=lfs diff=lfs merge=lfs -text',
                     '*.csv filter=lfs diff=lfs merge=lfs -text'):
            if rule not in attr_lines:
                attr_lines.append(rule)
        hf_fs.write_text(
            f'datasets/{repository}/.gitattributes',
            os.linesep.join(attr_lines),
        )

    df = get_text_for_lang(lang)

    with TemporaryDirectory() as upload_dir:
        parquet_file = os.path.join(upload_dir, 'table.parquet')
        df.to_parquet(parquet_file, engine='pyarrow', index=False)

        with open(os.path.join(upload_dir, 'README.md'), 'w', encoding='utf-8') as f:
            # YAML front matter for the HuggingFace dataset card.
            print('---', file=f)
            print('license: other', file=f)
            print('task_categories:', file=f)
            print('- automatic-speech-recognition', file=f)
            print('- audio-classification', file=f)
            print('language:', file=f)
            print(f'- {lang}', file=f)
            print('tags:', file=f)
            print('- voice', file=f)
            print('- anime', file=f)
            print('modality:', file=f)
            print('- audio', file=f)
            print('- text', file=f)
            print('size_categories:', file=f)
            print(f'- {number_to_tag(len(df))}', file=f)
            print('---', file=f)
            print('', file=f)

            print(f'# {lang.upper()} Voice-Text Dataset for Arknights Waifus', file=f)
            print('', file=f)
            print(f'{plural_word(len(df), "record")} in total.', file=f)
            print('', file=f)
            # Preview table: first 50 rows, key columns only.
            df_shown = df[:50][
                ['id', 'char_id', 'voice_actor_name', 'voice_title', 'voice_text', 'file_url']
            ]
            print(df_shown.to_markdown(index=False), file=f)
            print('', file=f)

        upload_directory_as_directory(
            repo_id=repository,
            repo_type='dataset',
            local_directory=upload_dir,
            path_in_repo='.',
            message=f'Sync {plural_word(len(df), "record")} for arknights {lang} voices'
        )


if __name__ == '__main__':
    logging.try_init_root(logging.INFO)
    sync(
        lang='jp',
    )