-
-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
199 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
*.sqlite3-journal | ||
__pycache__ | ||
|
||
/res/*.sqlite3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
#!/usr/bin/python3 | ||
# encoding: utf-8 | ||
|
||
# -- ; {{{1 | ||
# | ||
# File : jiten/kanji.py | ||
# Maintainer : Felix C. Stegerman <[email protected]> | ||
# Date : 2020-06-15 | ||
# | ||
# Copyright : Copyright (C) 2020 Felix C. Stegerman | ||
# Version : v0.0.1 | ||
# License : GPLv3+ | ||
# | ||
# -- ; }}}1 | ||
|
||
# {{{1 | ||
r""" | ||
>>> kanjidic = parse_kanjidic() | ||
>>> len(kanjidic) | ||
13108 | ||
>>> len([ x for x in kanjidic if x.level == "常用1" ]) | ||
80 | ||
>>> len([ x for x in kanjidic if x.level == "常用2" ]) | ||
160 | ||
>>> len([ x for x in kanjidic if x.level == "常用3" ]) | ||
200 | ||
>>> len([ x for x in kanjidic if x.level == "常用4" ]) | ||
197 | ||
>>> len([ x for x in kanjidic if x.level == "常用5" ]) | ||
197 | ||
>>> len([ x for x in kanjidic if x.level == "常用6" ]) | ||
192 | ||
>>> len([ x for x in kanjidic if x.level == "常用" ]) | ||
1110 | ||
>>> len([ x for x in kanjidic if x.level == "人名" ]) | ||
650 | ||
>>> len([ x for x in kanjidic if x.level == "人名(常用)" ]) | ||
212 | ||
>>> len([ x for x in kanjidic if x.level is None ]) | ||
10110 | ||
>>> len([ x for x in kanjidic if x.freq is not None ]) | ||
2501 | ||
>>> len([ x for x in kanjidic if x.jlpt is not None ]) | ||
2230 | ||
>>> len([ x for x in kanjidic if x.skip is None ]) | ||
952 | ||
>>> len([ x for x in kanjidic if x.skip is None and x.cat == "KANJI" ]) | ||
403 | ||
>>> len([ x for x in kanjidic if x.strokes > 25 ]) | ||
94 | ||
>>> len([ x for x in kanjidic if x.cat == "KANJI" ]) | ||
12559 | ||
>>> len([ x for x in kanjidic if x.cat == "CJK COMPATIBILITY IDEOGRAPH" ]) | ||
82 | ||
>>> len([ x for x in kanjidic if x.cat == "CJK UNIFIED IDEOGRAPH" ]) | ||
467 | ||
>>> len([ x for x in kanjidic if len(x.nanori) ]) | ||
1353 | ||
>>> len([ x for x in kanjidic if not len(x.on) ]) | ||
953 | ||
>>> len([ x for x in kanjidic if not len(x.kun) ]) | ||
3289 | ||
>>> len([ x for x in kanjidic if not len(x.meaning) ]) | ||
2753 | ||
""" # }}}1 | ||
|
||
import gzip, re, sys | ||
import xml.etree.ElementTree as ET | ||
|
||
from collections import namedtuple | ||
|
||
import click | ||
|
||
from . import misc as M | ||
from .sql import sqlite_do | ||
|
||
# Default output database written by kanjidic2sqldb().
SQLITE_FILE   = "res/kanji.sqlite3"
# Default gzip-compressed KANJIDIC2 XML input read by parse_kanjidic().
KANJIDIC_FILE = "res/jmdict/kanjidic2.xml.gz"
|
||
# One parsed kanji.  on, kun, nanori and meaning are tuples of strings;
# level, freq, jlpt and skip are None when the source has no such data.
Entry = namedtuple("Entry", """char cat level strokes freq jlpt
                   skip on kun nanori meaning""".split())
|
||
def level(l):
  """
  Map a numeric grade (as passed by parse_kanjidic() from the <grade>
  element) to its level label.

  1-6 give "常用1".."常用6", 8 gives "常用", 9 gives "人名" and 10
  gives "人名(常用)".

  Raises ValueError for any other value.
  """
  if 1 <= l <= 6: return "常用" + str(l)
  if l == 8     : return "常用"
  if l == 9     : return "人名"
  if l == 10    : return "人名(常用)"
  # l is an int here; convert it so we raise the intended ValueError
  # rather than a TypeError from str + int.
  raise ValueError("unexpected level: " + str(l))
|
||
def category(c):
  """
  Return the category label for character c.

  The misc predicates are tried in order (iskanji, iscompat,
  isuniext); the first one that holds decides the label.

  Raises ValueError when none of them holds.
  """
  if M.iskanji(c) : return "KANJI"
  if M.iscompat(c): return "CJK COMPATIBILITY IDEOGRAPH"
  if M.isuniext(c): return "CJK UNIFIED IDEOGRAPH"
  raise ValueError("unexpected category for: " + c)
|
||
def maybe(x, f, d = None):
  """
  Apply f to x unless x is None, in which case return the default d.

  Only None counts as "missing"; other falsy values (0, "", ...) are
  still passed to f.
  """
  if x is None:
    return d
  return f(x)
|
||
def _stripped_texts(elems):
  """Return a tuple with the whitespace-stripped text of each element."""
  return tuple( x.text.strip() for x in elems )

def _parse_character(e):                                        # {{{1
  """
  Build an Entry from a single <character> element.

  Optional sub-elements (grade, freq, jlpt, SKIP code) become None
  when absent; for every field only the first matching sub-element is
  used, except for the readings, nanori and meanings, which are
  collected into tuples.  Only <meaning> elements without an m_lang
  attribute are kept.

  The assertions are sanity checks on the data file: a single
  character per entry, on readings in katakana (plus "." and "-"), kun
  readings either in hiragana (plus ".", "-", "ー") or entirely in
  katakana, and no newlines in any value (kanjidic2sqldb() joins these
  tuples with newlines).
  """
  char    = e.find("literal").text.strip()
  lvl     = maybe(e.find(".//grade"), lambda x: level(int(x.text)))
  strokes = int(e.find(".//stroke_count").text)
  freq    = maybe(e.find(".//freq"), lambda x: int(x.text))
  jlpt    = maybe(e.find(".//jlpt"), lambda x: int(x.text))     # *OLD* JLPT (1-4)
  skip    = maybe(e.find(".//q_code[@qc_type='skip']"),
                  lambda x: x.text.strip())
  on      = _stripped_texts(e.findall(".//reading[@r_type='ja_on']"))
  kun     = _stripped_texts(e.findall(".//reading[@r_type='ja_kun']"))
  nanori  = _stripped_texts(e.findall(".//nanori"))
  meaning = _stripped_texts( m for m in e.findall(".//meaning")
                             if "m_lang" not in m.attrib )
  assert len(char) == 1
  assert all( M.iskatakana(c) or c in ".-" for x in on for c in x )
  assert all( all( M.ishiragana(c) or c in ".-ー" for c in x ) or
              all( M.iskatakana(c) for c in x ) for x in kun )
  assert all( "\n" not in x for x in on )
  assert all( "\n" not in x for x in kun )
  assert all( "\n" not in x for x in nanori )
  assert all( "\n" not in x for x in meaning )
  return Entry(char, category(char), lvl, strokes, freq, jlpt, skip,
               on, kun, nanori, meaning)
                                                                # }}}1

# TODO
# * rmgroup?!
def parse_kanjidic(file = KANJIDIC_FILE):                       # {{{1
  """
  Parse a gzip-compressed KANJIDIC2 XML file into a list of Entry.

  Children of the root element other than <character> are skipped.  A
  click progress bar is shown while iterating.
  """
  with gzip.open(file) as f:
    # The whole tree is parsed up front; the bar then tracks iteration
    # over the root's children.
    with click.progressbar(ET.parse(f).getroot(), width = 0,
                           label = "parsing kanjidic") as bar:
      return [ _parse_character(e) for e in bar if e.tag == "character" ]
                                                                # }}}1
|
||
def kanjidic2sqldb(data, file = SQLITE_FILE):                   # {{{1
  """
  Write the parsed entries in data to the SQLite database at file.

  The entry table is dropped and recreated first (see
  KANJIDIC_CREATE_SQL), so any existing contents are replaced.  The
  row key is the character's code point; the on, kun, nanori and
  meaning tuples are each stored as one newline-joined string.
  """
  # Loop-invariant: 12 placeholders, one per column of the entry table.
  insert = "INSERT INTO entry VALUES ({})".format(",".join(["?"]*12))
  with sqlite_do(file) as c:
    c.executescript(KANJIDIC_CREATE_SQL)
    with click.progressbar(data, width = 0, label = "writing kanjidic") as bar:
      for e in bar:
        c.execute(insert,
                  (ord(e.char), e.char, e.cat, e.level, e.strokes,
                   e.freq, e.jlpt, e.skip, "\n".join(e.on),
                   "\n".join(e.kun), "\n".join(e.nanori),
                   "\n".join(e.meaning)))
                                                                # }}}1
|
||
# {{{1 | ||
KANJIDIC_CREATE_SQL = """ | ||
DROP TABLE IF EXISTS entry; | ||
CREATE TABLE entry( | ||
code INTEGER PRIMARY KEY ASC, | ||
char TEXT, | ||
cat TEXT, | ||
level TEXT, | ||
strokes INTEGER, | ||
freq INTEGER, | ||
jlpt INTEGER, | ||
skip TEXT, | ||
on_ TEXT, | ||
kun TEXT, | ||
nanori TEXT, | ||
meaning TEXT | ||
); | ||
""" # }}}1 | ||
|
||
def setup():
  """
  Parse the default KANJIDIC2 file and write it to the default SQLite
  database, replacing the existing entry table.
  """
  kanjidic = parse_kanjidic()
  kanjidic2sqldb(kanjidic)
|
||
# TODO
def search(q, max_results = None, file = SQLITE_FILE):          # {{{1
  """
  Placeholder: not implemented yet; currently does nothing and returns
  None regardless of its arguments.
  """
  ...
                                                                # }}}1
|
||
if __name__ == "__main__":
  # With --doctest, run this module's doctests verbosely and exit with
  # status 1 if any of them failed.
  if "--doctest" in sys.argv:
    import doctest
    if doctest.testmod(verbose = True)[0]: sys.exit(1)
|
||
# vim: set tw=70 sw=2 sts=2 et fdm=marker : |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters