From 86598bed225daa195f35152cfec563271fc3f31f Mon Sep 17 00:00:00 2001
From: qaz
Date: Sun, 14 Jun 2020 02:52:19 +0800
Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5=E7=AC=A6=E5=8F=B7=E8=AF=8D?=
 =?UTF-8?q?=E6=B1=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer note: the hunks below were amended after the original commit
(the symbol pipeline now ends in "|| exit"; symbols.txt is read through a
context manager). The post-image ids on the "index" lines still refer to
the original revision and are informational only.

 build               | 19 ++++++++++++-------
 src/clover-dict-gen | 11 +++++++++++
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/build b/build
index 6b0444b..53204e6 100755
--- a/build
+++ b/build
@@ -123,13 +123,6 @@ extract 360万中文词库+词性+词频.zip || exit
 ln -sf rime-essay/essay.txt essay.txt || exit
 ln -sf rime-pinyin-simp/pinyin_simp.dict.yaml pinyin_simp.dict.yaml || exit
 
-# 开始生成词典
-../src/clover-dict-gen --minfreq=$minfreq || exit
-for i in THUOCL/data/THUOCL_*; do
-  echo "转换 $i"
-  ../src/thuocl2rime $i || exit
-done
-
 # 生成符号列表
 cd rime-symbols || exit
 mkdir -p opencc || exit
@@ -137,6 +130,18 @@ cd opencc || exit
 ../rime-symbols-gen || exit
 cd ../.. || exit
 
+# Generate the symbol vocabulary: concatenate the OpenCC symbol tables,
+# convert them with the t2s (Traditional-to-Simplified) config, and drop
+# adjacent duplicate lines. Without pipefail, "|| exit" only sees uniq's status.
+cat */opencc/*.txt | opencc -c t2s.json | uniq > symbols.txt || exit
+
+# Start generating the dictionaries (runs after symbols.txt exists)
+../src/clover-dict-gen --minfreq=$minfreq || exit
+for i in THUOCL/data/THUOCL_*; do
+  echo "转换 $i"
+  ../src/thuocl2rime $i || exit
+done
+
 # 生成 data 目录
 mkdir -p ../data || exit
 cp ../src/*.yaml ../data || exit
diff --git a/src/clover-dict-gen b/src/clover-dict-gen
index e634955..5a6f17e 100755
--- a/src/clover-dict-gen
+++ b/src/clover-dict-gen
@@ -265,6 +265,17 @@ def main(args):
             100000, PrintProcess('正在合并袖珍简化字拼音的词库 (%s/%s)').process)
     print('成功合并袖珍简化字拼音 %s 个汉字, %s 个词组。' % r)
 
+    # Merge the converted symbol vocabulary. symbols.txt is written by the
+    # build script into the working directory before this script runs.
+    with open('symbols.txt', 'r', encoding='utf-8') as symbols_file:
+        symbols_text = symbols_file.read()
+    r = generator.mergeDict(
+            symbols_text,
+            10000,
+            0,
+            100000, PrintProcess('正在合并符号词汇 (%s/%s)').process)
+    print('成功合并符号词汇 %s 个汉字, %s 个词组。' % r)
+
     word_dict_name = 'clover.base'
     parse_dict_name = 'clover.phrase'
 