Update: PyInstaller ビルド後の本体に組み込む辞書を ZStandard で圧縮し、インストールサイズを 90MB 程度削減する
Aivis-Project · Nov 8, 2024 · c683a04 · c683a04
1 parent a5b2d3e
commit c683a04
Show file tree

Hide file tree

Showing 9 changed files with 193 additions and 20 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,6 +6,7 @@ typos = "typos"
 test = "pytest"
 update-snapshots = "pytest --snapshot-update"
 update-licenses = "bash tools/create_venv_and_generate_licenses.bash"
+compress-dictionaries = "poetry run python tools/compress_dictionaries.py"
 build = "poetry run task update-licenses && pyinstaller --noconfirm run.spec"
 
 [tool.pysen]
@@ -81,6 +82,7 @@ pydantic = "^2.7.3"
 starlette = "^0.38.4"
 jaconv = "^0.3.4"
 httpx = "^0.27.0"
+zstandard = "^0.23.0"
 # aivmlib は AIVMX ファイルのメタデータ読み取りに必要
 aivmlib = { git = "https://x-access-token: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@github.com/Aivis-Project/aivmlib.git", rev = "9731dc6f20c2282e09fa870790043a20b2662c16" }
 # AivisSpeech-Engine にはカスタマイズされた Style-Bert-VITS2 が必要

diff --git a/resources/dictionaries/01_default.csv.zst b/resources/dictionaries/01_default.csv.zst
diff --git a/resources/dictionaries/02_tdmelodic.csv.zst b/resources/dictionaries/02_tdmelodic.csv.zst
diff --git a/resources/dictionaries/03_tdmelodic.csv.zst b/resources/dictionaries/03_tdmelodic.csv.zst
diff --git a/resources/engine_manifest_assets/dependency_licenses.json b/resources/engine_manifest_assets/dependency_licenses.json
diff --git a/run.spec b/run.spec
@@ -6,7 +6,9 @@ import re
 import sys
 
 datas = [
-    ('resources', 'resources'),
+    ('resources/dictionaries/*.csv.zst', 'resources/dictionaries'),
+    ('resources/engine_manifest_assets', 'resources/engine_manifest_assets'),
+    ('resources/setting_ui_template.html', 'resources'),
     ('engine_manifest.json', '.'),
     ('presets.yaml', '.'),
 ]

diff --git a/tools/compress_dictionaries.py b/tools/compress_dictionaries.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+import pathlib
+import sys
+
+import zstandard
+
+
+def CompressDictionaries() -> None:
+    """
+    ../resources/dictionaries/ 以下の csv ファイルを ZStandard で圧縮し、
+    同じディレクトリに .csv.zst として保存する
+    """
+
+    # 圧縮レベルは 1-22 まで指定可能
+    # 数値が大きいほど圧縮率が高くなるが、圧縮・解凍に時間がかかる
+    # 5 は圧縮率と解凍速度のバランスが良い値
+    compression_level = 5
+
+    # ../resources/dictionaries/ のパスを取得
+    dictionaries_path = (
+        pathlib.Path(__file__).parent.parent / "resources" / "dictionaries"
+    )
+    if not dictionaries_path.exists():
+        print("Error: ../resources/dictionaries/ does not exist")
+        return
+
+    # csv ファイルを列挙
+    csv_files = list(dictionaries_path.glob("**/*.csv"))
+    if len(csv_files) == 0:
+        print("Error: No csv files found")
+        return
+
+    # ZStandard の圧縮器を初期化
+    compressor = zstandard.ZstdCompressor(level=compression_level)
+
+    # csv ファイルを圧縮
+    for csv_file in csv_files:
+        # 出力先のパスを生成
+        output_path = csv_file.with_suffix(".csv.zst")
+
+        # 圧縮を実行
+        with open(csv_file, "rb") as input_file:
+            with open(output_path, "wb") as output_file:
+                compressor.copy_stream(input_file, output_file)
+
+        print(f"Compressed: {csv_file.name} -> {output_path.name}")
+
+
+# Run the compression only when this file is executed directly as a script
+if __name__ == "__main__":
+    CompressDictionaries()
diff --git a/voicevox_engine/user_dict/user_dict_manager.py b/voicevox_engine/user_dict/user_dict_manager.py
@@ -9,6 +9,7 @@
 from uuid import UUID, uuid4
 
 import pyopenjtalk
+import zstandard
 from pydantic import TypeAdapter
 
 from ..logging import logger
@@ -129,12 +130,18 @@ def update_dict(self) -> None:
                 default_dict_files = [default_dict_dir_path / "01_default.csv"]
                 logger.info("Using only default dictionary for pytest.")
             else:
-                default_dict_files = sorted(default_dict_dir_path.glob("*.csv"))
+                default_dict_files = sorted(default_dict_dir_path.glob("*.csv.zst"))
             if len(default_dict_files) == 0:
                 logger.warning("Cannot find default dictionary.")
                 return
+
+            # ZStandard デコーダーの初期化
+            decompressor = zstandard.ZstdDecompressor()
+
             for file_path in default_dict_files:
-                default_dict_content = file_path.read_text(encoding="utf-8")
+                with file_path.open("rb") as f:
+                    with decompressor.stream_reader(f) as reader:
+                        default_dict_content = reader.read().decode("utf-8")
                 if not default_dict_content.endswith("\n"):
                     default_dict_content += "\n"
                 csv_text += default_dict_content