Skip to content

Commit

Permalink
Build new 廣韻 data for Qieyun.js v0.15
Browse files Browse the repository at this point in the history
  • Loading branch information
syimyuzya committed Jul 7, 2024
1 parent 4e11afc commit c2838d5
Show file tree
Hide file tree
Showing 5 changed files with 29,411 additions and 25,415 deletions.
14 changes: 11 additions & 3 deletions DEVELOP.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
# Develop

The data of 廣韻 is originally extracted from [廣韻字音表](https://zhuanlan.zhihu.com/p/20430939), created by poem.
## Sources

Build
- 廣韻(20170209).csv: From [廣韻字音表](https://zhuanlan.zhihu.com/p/20430939), created by poem.
- rime-table-bfa9b50.tsv: From [切韻新韻圖](https://phesoca.com/rime-table/) by unt, built from git commit `bfa9b50`.
- split.csv: Maintained in this repository; its content is ultimately also derived from 切韻新韻圖.

## Build

```sh
git restore -Ws source '廣韻(20170209).csv' 'v2音韻地位.csv'
python build.py
python check.py
```

## Remarks

- poem 表註「應補」者,給出 Unicode 字頭者均可見於原表末尾(小韻內字序號帶 .5),未給出者(以 IDS 或文字描述字頭)則仍未錄
- poem 表註「應換序」及「順序應爲」者,均未修正,且釋義補充字段亦有問題(似乎源自早先有女同車《廣韻全字表》底本差異)
248 changes: 170 additions & 78 deletions build.py
Original file line number Diff line number Diff line change
@@ -1,82 +1,174 @@
import csv


# Lookup table built from 'v2音韻地位.csv': the first column of each data row
# (parsed as an int) maps to the list of that row's remaining columns.
v2_dict = {}
# The encoding is given explicitly so the result does not depend on the
# platform locale; newline='' is what the csv module asks for on its input.
with open('v2音韻地位.csv', encoding='utf-8', newline='') as fin:
    next(fin)  # skip the header row
    for row in csv.reader(fin):
        v2_dict[int(row[0])] = row[1:]

小韻細分override = {
409: ['㘋', ''],
3521: ['', '訐'],
3708: ['癔', ''],
}
for v in 小韻細分override.values():
v[0] = set(v[0])
v[1] = set(v[1])


def get小韻細分override(小韻號: int, 字頭: str) -> int | None:
if 小韻號 not in 小韻細分override:
return None
for i, chs in enumerate(小韻細分override[小韻號]):
if 字頭 in chs:
return i
return None


# Row-by-row conversion of '廣韻(20170209).csv' into '韻書/廣韻.csv'.
# NOTE(review): this span has lost its indentation and is cut off at the
# end (the if/else chain below is not closed here); read it as a fragment.
#
# 小韻 numbers for which a row matching the second listed head character has
# already been seen; later rows of such a 小韻 take the second description.
use第二地位 = set()
with open('廣韻(20170209).csv') as f, open('韻書/廣韻.csv', 'w') as g:
next(f) # skip header

# Header line of the output file.
print('小韻號,小韻內字序,韻目原貌,最簡描述,反切覈校前,反切,字頭覈校前,字頭,釋義,釋義補充,圖片id', file=g)

for line in f:
# Plain comma split: fields are picked by fixed column index.
xs = line.rstrip('\n').split(',')
反切覈校前, 反切, 字頭覈校前, 字頭, 釋義, 釋義補充, 韻目原貌, 圖片id, 小韻號, 小韻內字序 = xs[19], xs[
20], xs[23], xs[24], xs[25], xs[26], xs[39], xs[56], xs[58], xs[59]

小韻號 = int(小韻號)

# Variant-character adjustment
if 韻目原貌 == '真':
韻目原貌 = '眞'

# 小韻 without a fanqie: any value that is not exactly two characters long
# is blanked.
if len(反切覈校前) != 2:
反切覈校前 = ''
if len(反切) != 2:
反切 = ''

# v2_dict rows unpack into four values; the last one is unused here.
最簡描述, v2字頭, v2反切, _ = v2_dict[小韻號]
# NOTE: 小韻內字序 in poem's table may contain .5; it is not always an integer
if 小韻內字序.strip() == '1':
# The first character of a 小韻 must be one of the '/'-separated v2 heads.
assert 字頭 in v2字頭.split('/'), f'{字頭} not in {v2字頭}'

# A '/' in v2字頭 means the 小韻 carries several '/'-separated descriptions;
# pick one: explicit override first, then the "second already started" set,
# then a match against the second listed head character.
if '/' in v2字頭:
最簡描述 = 最簡描述.split('/')
if (細分 := get小韻細分override(小韻號, 字頭)) is not None:
最簡描述 = 最簡描述[細分]
elif 小韻號 in use第二地位:
最簡描述 = 最簡描述[1]
else:
v2字頭 = v2字頭.split('/')
if 字頭 == v2字頭[1]:
use第二地位.add(小韻號)
最簡描述 = 最簡描述[1]
# Initials used by process_音韻地位 to decide the A/B/C suffix of 三等.
幫組 = tuple('幫滂並明')
幫見影組 = tuple('幫滂並明見溪羣疑影曉匣云')


def process_音韻地位(row: list[str]) -> str:
    """Build the 音韻地位 string for one row of the rime table.

    Columns 10-14 of ``row`` are read as 母, 呼, 等類, 韻, 聲 and concatenated
    in that order after normalisation.

    NOTE(review): the single-character variable names were missing from the
    text this block was recovered from. The assignment of 母/呼/韻/聲 below
    follows from how each value is compared, but two choices are inferred
    and should be confirmed against the upstream file: the emptiness check
    uses 母, and the '→' stripping is applied to 韻.

    Args:
        row: One rime-table row; must have at least 15 columns.

    Returns:
        The concatenated description, or ``''`` when 母 is empty.
    """
    母, 呼, 等類, 韻, 聲 = row[10:15]
    if not 母:
        return ''

    # Keep only the part after an arrow (presumably "original→corrected").
    if (pos := 韻.find('→')) != -1:
        韻 = 韻[pos + 1 :]

    # A questionable 四等 is reassigned: 三 for 脂/麻, otherwise 二.
    if 等類 == '四(?)':
        等類 = '三' if 韻 in ('脂', '麻') else '二'
    # Drop parentheses, e.g. '三(A)' becomes '三A'.
    等類 = 等類.replace('(', '')
    等類 = 等類.replace(')', '')

    # A bare 三 with an initial from 幫見影組 gets an explicit A/B/C class.
    if 母 in 幫見影組 and 等類 == '三':
        if 韻 == '麻' or (韻 == '幽' and 母 not in 幫組):
            等類 += 'A'
        elif 韻 == '幽' or (韻 == '蒸' and (呼 == '合' or 母 in 幫組)):
            等類 += 'B'
        else:
            等類 += 'C'

    return 母 + 呼 + 等類 + 韻 + 聲


def main():
    """Build ``韻書/廣韻.csv`` from the source tables under ``src/``.

    Steps:
      1. Load ``src/rime-table-bfa9b50.tsv`` keyed by its first column
         (小韻號) and derive a 音韻地位 string per row with
         ``process_音韻地位``.
      2. Load ``src/split.csv`` (sub-divided 小韻; the id ends in a letter)
         and cross-check its 反切 column against the rime table.
      3. Read ``src/廣韻(20170209).csv``, verify the header columns that are
         used, skip rows marked ``應刪`` and resolve each row's 小韻號.
      4. Verify that every character listed in ``split.csv`` was seen.
      5. Sort by (original 小韻 number, original in-小韻 order) and write the
         output, renumbering 小韻內字序 from 1 inside each original 小韻.

    Raises:
        SystemExit: With status 2 when an expected header column differs.
        ValueError: When a character of a sub-divided 小韻 is not listed in
            any of its sub-divisions.
        AssertionError: When ``split.csv`` disagrees with the rime table or
            lists characters that never occur in the main table.
    """
    # Encodings are explicit so the build does not depend on the locale.
    小韻_data: dict[str, list[str]] = {}
    with open('src/rime-table-bfa9b50.tsv', encoding='utf-8') as fin:
        next(fin)  # skip header
        for line in fin:
            row = line.rstrip('\n').split('\t')
            小韻_data[row[0]] = row

    音韻地位_data: dict[str, str] = {
        key: process_音韻地位(row) for key, row in 小韻_data.items()
    }

    # has_細分: base 小韻 number -> list of sub-division letters.
    # 小韻細分_data: full id (number + letter) -> split.csv row.
    has_細分: dict[str, list[str]] = {}
    小韻細分_data: dict[str, list[str]] = {}
    with open('src/split.csv', encoding='utf-8', newline='') as fin:
        next(fin)  # skip header
        for row in csv.reader(fin):
            小韻號 = row[0]
            assert 小韻號[-1].isalpha()
            反切 = row[1]
            assert (
                小韻_data[小韻號][2] == 反切
            ), f'反切 mismatch in 小韻 #{小韻號}, 小韻_data: {小韻_data[小韻號][2]}, 小韻細分_data: {反切}'
            has_細分.setdefault(小韻號[:-1], []).append(小韻號[-1])
            小韻細分_data[小韻號] = row

    # Characters actually matched to each sub-division while reading rows.
    小韻細分_coverage: dict[str, set[str]] = {}
    廣韻_data: list[tuple[tuple[int, float], list[str]]] = []
    with open('src/廣韻(20170209).csv', encoding='utf-8', newline='') as fin:
        rows = csv.reader(fin)

        # Fields are read by fixed column index, so make sure the header
        # still has the expected name at each index that is relied upon.
        header = next(rows)
        success = True
        for idx, field in (
            (18, '字頭-補'),
            (19, '廣韻反切原貌(覈校前)'),
            (20, '廣韻反切(覈校後)'),
            (23, '廣韻字頭原貌(覈校前)'),
            (24, '廣韻字頭(覈校後)'),
            (25, '廣韻釋義'),
            (26, '釋義補充'),
            (39, '廣韻韻部原貌(調整前)'),
            (56, '廣韻頁序'),
            (58, '小韻序'),
            (59, '小韻內字序'),
        ):
            if header[idx] != field:
                success = False
                print(
                    f'[Error] header mismatch: expected {repr(field)}, got header[{idx}] = {repr(header[idx])}'
                )
        if not success:
            print()
            print(list(enumerate(header)))
            # Same exit status as before, without relying on the `exit`
            # helper that only exists when the site module is loaded.
            raise SystemExit(2)

        for row in rows:
            增刪說明 = row[18]
            字頭 = row[24]
            釋義 = row[25]
            釋義補充 = row[26]
            韻目原貌 = row[39]
            小韻號原貌 = row[58]
            小韻內字序 = row[59]

            if 增刪說明 == '應刪':
                continue

            # 小韻內字序 may be fractional (e.g. ".5" for inserted entries),
            # hence float rather than int.
            order_key = (int(小韻號原貌), float(小韻內字序))

            # Resolve the 小韻號: a sub-divided 小韻 takes the first
            # sub-division whose character list contains this character.
            if 小韻號原貌 in has_細分:
                for 細分 in has_細分[小韻號原貌]:
                    小韻號 = 小韻號原貌 + 細分
                    if 字頭 in 小韻細分_data[小韻號][2]:
                        小韻細分_coverage.setdefault(小韻號, set()).add(字頭)
                        break
                else:
                    raise ValueError(
                        f'cannot determine 小韻細分 for {字頭} (小韻 #{小韻號原貌})'
                    )
            else:
                小韻號 = 小韻號原貌

            音韻地位 = 音韻地位_data[小韻號]
            反切 = 小韻_data[小韻號][2]
            if 反切 == '無':
                反切 = ''

            字頭又作 = {
                '𩏑': '韓',
                '𧖴': '脈',
            }.get(字頭, '')

            廣韻_data.append(
                (
                    order_key,
                    [
                        小韻號,
                        小韻內字序,
                        韻目原貌,
                        音韻地位,
                        反切,
                        字頭,
                        字頭又作,
                        釋義,
                        釋義補充,
                    ],
                )
            )

    # Check every split.csv entry, including those that matched nothing at
    # all: iterating only over the coverage dict would skip such entries.
    for 小韻號, split_row in 小韻細分_data.items():
        specified = set(split_row[2])
        diff = specified - 小韻細分_coverage.get(小韻號, set())
        assert not diff, f'字頭 listed in 小韻細分_data but not seen: {"".join(sorted(diff))} (小韻 #{小韻號})'

    # The sort is stable, so rows with equal keys keep their input order.
    廣韻_data.sort(key=lambda x: x[0])

    last_原小韻號 = 0
    小韻內字序 = 0
    # NOTE: fields are joined with ',' without any quoting, so this assumes
    # no field contains a comma, a double quote or a line break.
    with open('韻書/廣韻.csv', 'w', encoding='utf-8') as fout:
        print(
            '小韻號,小韻內字序,韻目原貌,音韻地位,反切,字頭,字頭又作,釋義,釋義補充',
            file=fout,
        )
        for (原小韻號, _), row in 廣韻_data:
            # Restart the running index whenever a new original 小韻 begins.
            if 原小韻號 != last_原小韻號:
                last_原小韻號 = 原小韻號
                小韻內字序 = 0
            小韻內字序 += 1
            row[1] = 小韻內字序
            print(*row, sep=',', file=fout)


if __name__ == '__main__':
    main()
Loading

0 comments on commit c2838d5

Please sign in to comment.