#!/usr/bin/env python3

# Alphabet Soup gives language learners easily digestible chunks for practice.
# Copyright 2019-2020 Yorwba
#
# Alphabet Soup is free software: you can redistribute it and/or
# modify it under the terms of the GNU Affero General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
#
# Alphabet Soup is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Alphabet Soup. If not, see <https://www.gnu.org/licenses/>.

import argparse
import gzip
import os
import sqlite3
from collections import defaultdict

import lxml.etree as etree
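
# The SQLite connection is created in convert() and stored in this
# module-level global, which the helpers below all use. Declared here
# only for clarity.
conn = None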


def create_tables():
    c = conn.cursor()
    c.execute(
        '''
        CREATE TABLE entry (
            ent_seq integer,
            variant integer,
            lemma text,
            pos text,
            PRIMARY KEY (ent_seq, variant))
        ''')
    c.execute(
        '''
        CREATE INDEX entry_lemma_pos_index ON entry (lemma, pos)
        ''')
    c.execute(
        '''
        CREATE TABLE gloss (
            ent_seq integer,
            variant integer,
            lang text,
            gloss text,
            PRIMARY KEY (ent_seq, variant, lang))
        ''')
    c.execute(
        '''
        CREATE TABLE disambiguator_to_pos (
            disambiguator text,
            pos text)
        ''')
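

# Example lookup against the schema above (a sketch, meant for the sqlite3
# shell; the lemma is hypothetical). entry and gloss share the columns
# ent_seq and variant, so NATURAL JOIN pairs each lemma with its glosses:
#
#     SELECT gloss.gloss
#     FROM entry NATURAL JOIN gloss
#     WHERE entry.lemma = '食べる' AND gloss.lang = 'eng';


# Parse a gzipped JMdict/JMnedict XML file and yield one
# (ent_seq, variant, lemma, pos, lang, gloss) row per lemma/part-of-speech
# variant and language.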
def read_dictionary(jmdict):
    with gzip.open(jmdict) as f:
        for event, node in etree.iterparse(f, tag='entry'):
            children = node.getchildren()
            ent_seq, = (child.text for child in children if child.tag == 'ent_seq')
            kanji_elements = [child for child in children if child.tag == 'k_ele']
            reading_elements = [child for child in children if child.tag == 'r_ele']
            senses = [child for child in children if child.tag == 'sense']
            transes = [child for child in children if child.tag == 'trans']
            kanjis = [child.text
                      for k_ele in kanji_elements
                      for child in k_ele.iterchildren()
                      if child.tag == 'keb']
            if not kanjis:
                kanjis = [None]
            kanji_readings = []
            for r_ele in reading_elements:
                restrictions = set()
                for child in r_ele.iterchildren():
                    if child.tag == 're_restr':
                        restrictions.add(child.text)
                    elif child.tag == 'reb':
                        reading = child.text
                        for kanji in kanjis:
                            if not kanji:
                                kanji = reading
                            if not restrictions or kanji in restrictions:
                                kanji_readings.append((kanji, reading))
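            # kanji_readings now holds every admissible (kanji, reading) pair;
            # kana-only entries use the reading itself in the kanji slot.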
            # (readings, miscellaneous) by [(kanji, pos)][lang][gloss]
            rm_by_kplg = \
                defaultdict(
                    lambda: defaultdict(
                        lambda: defaultdict(
                            lambda: (set(), set()))))
            parts_of_speech = frozenset()
            miscellanea = frozenset()
            for sense in senses:
                kanji_restrictions = set()
                reading_restrictions = set()
                current_parts_of_speech = set()
                current_miscellanea = set()
                glosses = defaultdict(list)
                for child in sense.iterchildren():
                    if child.tag == 'stagk':
                        kanji_restrictions.add(child.text)
                    elif child.tag == 'stagr':
                        reading_restrictions.add(child.text)
                    elif child.tag == 'pos':
                        current_parts_of_speech.add(child.text)
                    elif child.tag == 'misc':
                        current_miscellanea.add(child.text)
                    elif child.tag == 'gloss':
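                        # lxml reports the xml:lang attribute under the full
                        # XML namespace URI rather than the xml: prefix.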
                        language = child.get('{http://www.w3.org/XML/1998/namespace}lang')
                        if not language:
                            language = 'eng'
                        if child.text:  # XXX who adds a gloss without text???
                            glosses[language].append(child.text)
                if current_parts_of_speech:
                    parts_of_speech = frozenset(current_parts_of_speech)
                if current_miscellanea:
                    miscellanea = frozenset(current_miscellanea)
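                # In JMdict, a sense without its own <pos>/<misc> elements
                # inherits them from the preceding sense, hence the
                # carry-over above.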
                for kanji, reading in kanji_readings:
                    if ((not kanji_restrictions
                         or kanji in kanji_restrictions)
                        and
                        (not reading_restrictions
                         or reading in reading_restrictions)):
                        if (kanji != reading and
                                'word usually written using kana alone' in miscellanea):
                            lemma_options = [kanji, reading]
                        else:
                            lemma_options = [kanji]
                        for lemma in lemma_options:
                            for pos in parts_of_speech:
                                rm_by_lg = rm_by_kplg[(lemma, pos)]
                                for lang, gloss in glosses.items():
                                    rm_by_g = rm_by_lg[lang]
                                    for glos in gloss:
                                        readings, misc = rm_by_g[glos]
                                        readings.add(reading)
                                        misc.update(miscellanea)
            # Name translations in JMnedict
            for trans in transes:
                name_types = set()
                glosses = defaultdict(list)
                for child in trans.iterchildren():
                    if child.tag == 'name_type':
                        name_types.add(child.text)
                    elif child.tag == 'trans_det':
                        language = child.get('{http://www.w3.org/XML/1998/namespace}lang')
                        if not language:
                            language = 'eng'
                        glosses[language].append(child.text)
                    elif child.tag == 'xref':
                        pass
                    else:
                        # Unexpected child tag: drop into the debugger.
                        import pdb; pdb.set_trace()
                for kanji, reading in kanji_readings:
                    for lemma in [kanji, reading]:
                        for pos in name_types:
                            rm_by_lg = rm_by_kplg[(lemma, pos)]
                            for lang, gloss in glosses.items():
                                rm_by_g = rm_by_lg[lang]
                                for glos in gloss:
                                    readings, misc = rm_by_g[glos]
                                    readings.add(reading)
            # gloss by [(kanji, pos)][lang][{reading}][{misc}]
            g_by_kplrm = \
                defaultdict(
                    lambda: defaultdict(
                        lambda: defaultdict(
                            lambda: defaultdict(
                                list))))
            for kp, rm_by_lg in rm_by_kplg.items():
                g_by_lrm = g_by_kplrm[kp]
                for lang, rm_by_g in rm_by_lg.items():
                    g_by_rm = g_by_lrm[lang]
                    for gloss, (readings, misc) in rm_by_g.items():
                        readings = frozenset(readings)
                        misc = frozenset(misc)
                        g_by_rm[readings][misc].append(gloss)
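            # Glosses sharing the same reading set and misc tags are now
            # grouped so they can be rendered as a single text block below.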
            # gloss by [(kanji, pos)][lang]
            g_by_kpl = {
                kp: {
                    lang:
                        '\n\n'.join(
                            '\n'.join(
                                [', '.join(
                                    f'[{reading}]'
                                    for reading in readings)
                                 + ':']
                                + ['\n'.join(
                                    [f'\n({", ".join(misc)})' if misc else '']
                                    + gloss)
                                   for misc, gloss in g_by_m.items()])
                            for readings, g_by_m in g_by_rm.items())
                    for lang, g_by_rm in g_by_lrm.items()}
                for kp, g_by_lrm in g_by_kplrm.items()}
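            # Each rendered block looks roughly like
            #     [reading1], [reading2]:
            #
            #     (misc tags, if any)
            #     gloss
            #     gloss
            # with blocks for different reading sets separated by blank lines.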
            for variant_number, ((kanji, pos), glosses) in enumerate(g_by_kpl.items()):
                for lang, gloss in glosses.items():
                    yield ent_seq, variant_number, kanji, pos, lang, gloss
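

# Map the disambiguator strings recorded in the sentence database onto the
# part-of-speech tags used by the dictionary entries.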
def associate_disambiguator_and_pos(args):
    c = conn.cursor()
    c.execute('ATTACH DATABASE ? AS sentences', (args.sentence_database,))
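    # Repeatedly collect disambiguators that no known POS covers yet and
    # resolve them in three passes of decreasing certainty: unique POS first,
    # then intersections across lemmas, finally all remaining combinations.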
    while True:
        disambiguator_pos_mappings = [
            (set(disambiguators.split('\t')), set(pos.split('\t')), frequency)
            for disambiguators, pos, frequency
            in c.execute(
                '''
                WITH
                unmatched_disambiguators AS (
                    SELECT
                        sentences.lemma.text AS lemma,
                        group_concat(disambiguator, "\t") AS disambiguators
                    FROM sentences.lemma
                    WHERE disambiguator NOT IN (
                        SELECT disambiguator
                        FROM disambiguator_to_pos NATURAL JOIN entry
                        WHERE entry.lemma = sentences.lemma.text)
                    GROUP BY sentences.lemma.text),
                possible_pos AS (
                    SELECT
                        lemma,
                        group_concat(pos, "\t") AS pos
                    FROM entry
                    GROUP BY lemma)
                SELECT disambiguators, pos, count(*) AS frequency
                FROM unmatched_disambiguators NATURAL JOIN possible_pos
                GROUP BY disambiguators, pos
                ORDER BY frequency
                ''')]
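        # Pass 1: disambiguator sets whose lemma admits exactly one POS.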
        easy_cases = [
            (disambiguator, next(iter(pos)))
            for disambiguators, pos, frequency
            in disambiguator_pos_mappings
            if len(pos) == 1
            for disambiguator in disambiguators]
        c.executemany(
            '''
            INSERT INTO disambiguator_to_pos (disambiguator, pos)
            VALUES (?, ?)
            ''',
            easy_cases)
        if not easy_cases:
            # Pass 2: intersect the candidate POS sets of each disambiguator
            # across all lemmas it appears with.
            intersections = {}
            for disambiguators, pos, frequency in disambiguator_pos_mappings:
                for disambiguator in disambiguators:
                    if disambiguator in intersections:
                        intersections[disambiguator] = \
                            intersections[disambiguator] & pos
                    else:
                        intersections[disambiguator] = pos
            intersected_cases = [
                (disambiguator, po)
                for disambiguator, pos in intersections.items()
                for po in pos]
            c.executemany(
                '''
                INSERT INTO disambiguator_to_pos (disambiguator, pos)
                VALUES (?, ?)
                ''',
                intersected_cases)
            if not intersected_cases:
                # Pass 3: record every remaining disambiguator/POS combination.
                # TODO: maybe solve set cover instead?
                remaining_cases = [
                    (disambiguator, po)
                    for disambiguators, pos, frequency
                    in disambiguator_pos_mappings
                    for disambiguator in disambiguators
                    for po in pos]
                c.executemany(
                    '''
                    INSERT INTO disambiguator_to_pos (disambiguator, pos)
                    VALUES (?, ?)
                    ''',
                    remaining_cases)
                if not remaining_cases:
                    # Nothing left to resolve.
                    break
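

# Rebuild the dictionary database from scratch: drop any existing file,
# create the schema, load JMnedict and JMdict, then link disambiguators
# to parts of speech.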
def convert(args):
    try:
        os.remove(args.database)
    except FileNotFoundError:
        pass
    global conn
    conn = sqlite3.connect(args.database)
    create_tables()
    c = conn.cursor()
    for d in (args.jmnedict, args.jmdict):
        for (ent_seq, variant, kanji, pos, lang, gloss) in read_dictionary(d):
            c.execute(
                '''
                INSERT OR IGNORE INTO entry (ent_seq, variant, lemma, pos)
                VALUES (?, ?, ?, ?)
                ''',
                (ent_seq, variant, kanji, pos))
            c.execute(
                '''
                INSERT OR IGNORE INTO gloss (ent_seq, variant, lang, gloss)
                VALUES (?, ?, ?, ?)
                ''',
                (ent_seq, variant, lang, gloss))
    associate_disambiguator_and_pos(args)
    conn.commit()


def main(argv):
    parser = argparse.ArgumentParser(
        description='JMdict XML to SQLite converter')
    parser.add_argument('command', nargs=1, choices={'convert'})
    parser.add_argument('--jmdict', type=str, default='data/jmdict/JMdict.gz')
    parser.add_argument('--jmnedict', type=str, default='data/jmdict/JMnedict.xml.gz')
    parser.add_argument('--database', type=str, default='data/jpn_dictionary.sqlite')
    parser.add_argument('--sentence-database', type=str, default='data/jpn_sentences.sqlite')
    args = parser.parse_args(argv[1:])
    # Dispatch to the module-level function named by the command.
    globals()[args.command[0].replace('-', '_')](args)


if __name__ == '__main__':
    import sys
    main(sys.argv)
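
# Example invocation (a sketch; the paths are the defaults above, so they can
# be omitted when the data files live in those locations):
#     ./jmdict_data.py convert \
#         --jmdict data/jmdict/JMdict.gz \
#         --jmnedict data/jmdict/JMnedict.xml.gz \
#         --database data/jpn_dictionary.sqlite \
#         --sentence-database data/jpn_sentences.sqlite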