Skip to content

Commit

Permalink
+ kanji
Browse files Browse the repository at this point in the history
  • Loading branch information
obfusk committed Jun 15, 2020
1 parent 485d166 commit 249582b
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.sqlite3-journal
__pycache__

/res/*.sqlite3
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ SHELL = /bin/bash
test:
export PYTHONPATH=$$PWD/src ;\
python3 -m jiten.jmdict --doctest ;\
python3 -m jiten.kanji --doctest ;\
python3 -m jiten.misc --doctest
2 changes: 2 additions & 0 deletions src/jiten/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import click

from . import jmdict as J
from . import kanji as K

@click.group()
@click.option("-v", "--verbose", is_flag = True, help = "Be verbose.")
Expand Down Expand Up @@ -98,6 +99,7 @@ def serve(ctx, host, port):
@cli.command(help = "Create sqlite databases from XML files.")
def setup():
  # Build each dictionary database in turn: JMdict first, then kanji.
  for module in (J, K):
    module.setup()

if __name__ == "__main__":
cli()
Expand Down
188 changes: 188 additions & 0 deletions src/jiten/kanji.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
#!/usr/bin/python3
# encoding: utf-8

# -- ; {{{1
#
# File : jiten/kanji.py
# Maintainer : Felix C. Stegerman <[email protected]>
# Date : 2020-06-15
#
# Copyright : Copyright (C) 2020 Felix C. Stegerman
# Version : v0.0.1
# License : GPLv3+
#
# -- ; }}}1

# {{{1
r"""
>>> kanjidic = parse_kanjidic()
>>> len(kanjidic)
13108
>>> len([ x for x in kanjidic if x.level == "常用1" ])
80
>>> len([ x for x in kanjidic if x.level == "常用2" ])
160
>>> len([ x for x in kanjidic if x.level == "常用3" ])
200
>>> len([ x for x in kanjidic if x.level == "常用4" ])
197
>>> len([ x for x in kanjidic if x.level == "常用5" ])
197
>>> len([ x for x in kanjidic if x.level == "常用6" ])
192
>>> len([ x for x in kanjidic if x.level == "常用" ])
1110
>>> len([ x for x in kanjidic if x.level == "人名" ])
650
>>> len([ x for x in kanjidic if x.level == "人名(常用)" ])
212
>>> len([ x for x in kanjidic if x.level is None ])
10110
>>> len([ x for x in kanjidic if x.freq is not None ])
2501
>>> len([ x for x in kanjidic if x.jlpt is not None ])
2230
>>> len([ x for x in kanjidic if x.skip is None ])
952
>>> len([ x for x in kanjidic if x.skip is None and x.cat == "KANJI" ])
403
>>> len([ x for x in kanjidic if x.strokes > 25 ])
94
>>> len([ x for x in kanjidic if x.cat == "KANJI" ])
12559
>>> len([ x for x in kanjidic if x.cat == "CJK COMPATIBILITY IDEOGRAPH" ])
82
>>> len([ x for x in kanjidic if x.cat == "CJK UNIFIED IDEOGRAPH" ])
467
>>> len([ x for x in kanjidic if len(x.nanori) ])
1353
>>> len([ x for x in kanjidic if not len(x.on) ])
953
>>> len([ x for x in kanjidic if not len(x.kun) ])
3289
>>> len([ x for x in kanjidic if not len(x.meaning) ])
2753
""" # }}}1

import gzip, re, sys
import xml.etree.ElementTree as ET

from collections import namedtuple

import click

from . import misc as M
from .sql import sqlite_do

# Default path of the sqlite database written by kanjidic2sqldb().
# NOTE(review): relative path -- presumably resolved against the current
# working directory; confirm against how the tool is launched.
SQLITE_FILE = "res/kanji.sqlite3"
# Default path of the gzip-compressed kanjidic2 XML read by parse_kanjidic().
KANJIDIC_FILE = "res/jmdict/kanjidic2.xml.gz"

# One parsed kanjidic character.  `on`, `kun`, `nanori` and `meaning` are
# tuples of strings; `level`, `freq`, `jlpt` and `skip` may be None (see
# parse_kanjidic, which builds these records).
Entry = namedtuple("Entry", [
  "char", "cat", "level", "strokes", "freq", "jlpt",
  "skip", "on", "kun", "nanori", "meaning",
])

def level(l):
  """Translate a numeric kanjidic grade into its display label.

  Parameters:
    l -- the grade as an int (parse_kanjidic passes int(<grade> text)).

  Returns "常用1".."常用6" for 1..6, "常用" for 8, "人名" for 9 and
  "人名(常用)" for 10.

  Raises ValueError for any other number.
  """
  if 1 <= l <= 6: return "常用" + str(l)
  if l == 8     : return "常用"
  if l == 9     : return "人名"
  if l == 10    : return "人名(常用)"
  # l is an int here; concatenating it directly to the message would raise
  # TypeError and mask the intended ValueError, so convert it explicitly.
  raise ValueError("unexpected level: " + str(l))

def category(c):
  """Classify the character `c` by the first matching misc predicate.

  Returns "KANJI", "CJK COMPATIBILITY IDEOGRAPH" or "CJK UNIFIED
  IDEOGRAPH"; raises ValueError when none of the predicates accepts `c`.
  """
  # Checked in order; the first predicate that accepts `c` wins.
  checks = (
    (M.iskanji , "KANJI"),
    (M.iscompat, "CJK COMPATIBILITY IDEOGRAPH"),
    (M.isuniext, "CJK UNIFIED IDEOGRAPH"),
  )
  for accepts, name in checks:
    if accepts(c):
      return name
  raise ValueError("unexpected category for: " + c)

def maybe(x, f, d = None):
  """Apply `f` to `x` unless `x` is None, in which case return `d`.

  Only None counts as missing; other falsy values (0, "", ...) are
  passed to `f` as usual.
  """
  if x is not None:
    return f(x)
  return d

# TODO
# * rmgroup?!
def parse_kanjidic(file = KANJIDIC_FILE):                       # {{{1
  """Parse the gzipped kanjidic XML `file` into a list of Entry records.

  Every child of the XML root whose tag is "character" yields one Entry;
  other children are skipped.  Optional elements (grade, freq, jlpt,
  SKIP code) become None when absent.  Readings, nanori and meanings are
  tuples of whitespace-stripped strings; only <meaning> elements without
  an "m_lang" attribute are kept.

  A click progress bar is shown while parsing.  Sanity checks on the
  data are done with `assert`, so malformed input raises AssertionError.
  """
  def int_text(elem):
    return int(elem.text)

  def stripped(elems):
    return tuple( x.text.strip() for x in elems )

  def kun_ok(reading):
    # either hiragana (plus ".", "-", "ー") throughout, or katakana throughout
    return all( M.ishiragana(c) or c in ".-ー" for c in reading ) or \
           all( M.iskatakana(c) for c in reading )

  entries = []
  with gzip.open(file) as fh:
    root = ET.parse(fh).getroot()
    with click.progressbar(root, width = 0,
                           label = "parsing kanjidic") as bar:
      for elem in bar:
        if elem.tag != "character": continue
        char    = elem.find("literal").text.strip()
        grade   = maybe(elem.find(".//grade"), lambda g: level(int(g.text)))
        strokes = int_text(elem.find(".//stroke_count"))
        freq    = maybe(elem.find(".//freq"), int_text)
        jlpt    = maybe(elem.find(".//jlpt"), int_text)   # *OLD* JLPT (1-4)
        skip    = maybe(elem.find(".//q_code[@qc_type='skip']"),
                        lambda q: q.text.strip())
        on      = stripped(elem.findall(".//reading[@r_type='ja_on']"))
        kun     = stripped(elem.findall(".//reading[@r_type='ja_kun']"))
        nanori  = stripped(elem.findall(".//nanori"))
        meaning = stripped( m for m in elem.findall(".//meaning")
                            if "m_lang" not in m.attrib )
        assert len(char) == 1
        assert all( M.iskatakana(c) or c in ".-" for c in "".join(on) )
        assert all( kun_ok(r) for r in kun )
        # kanjidic2sqldb joins these with "\n", so no item may contain one
        for group in (on, kun, nanori, meaning):
          assert all( "\n" not in s for s in group )
        entries.append(Entry(
          char = char, cat = category(char), level = grade,
          strokes = strokes, freq = freq, jlpt = jlpt, skip = skip,
          on = on, kun = kun, nanori = nanori, meaning = meaning,
        ))
  return entries
                                                                # }}}1

def kanjidic2sqldb(data, file = SQLITE_FILE):                   # {{{1
  """Write the Entry records in `data` to the sqlite database `file`.

  Runs KANJIDIC_CREATE_SQL first (which drops and recreates the `entry`
  table), then inserts one row per record while showing a click progress
  bar.  The row key is the character's code point; the tuple-valued
  fields (on, kun, nanori, meaning) are stored as newline-joined text.
  """
  # one placeholder per column of the `entry` table
  insert = "INSERT INTO entry VALUES ({})".format(",".join("?" * 12))

  def row(e):
    joined = [ "\n".join(xs) for xs in (e.on, e.kun, e.nanori, e.meaning) ]
    return (ord(e.char), e.char, e.cat, e.level, e.strokes,
            e.freq, e.jlpt, e.skip, *joined)

  with sqlite_do(file) as c:
    c.executescript(KANJIDIC_CREATE_SQL)
    with click.progressbar(data, width = 0,
                           label = "writing kanjidic") as bar:
      for entry in bar:
        c.execute(insert, row(entry))
                                                                # }}}1

# {{{1
# Schema script executed by kanjidic2sqldb() before inserting: drops any
# existing `entry` table and recreates it.  The column order must match the
# 12-value tuple that kanjidic2sqldb() inserts positionally.
# NOTE(review): `on_` presumably carries the trailing underscore because ON
# is an SQL keyword -- confirm before renaming.
KANJIDIC_CREATE_SQL = """
DROP TABLE IF EXISTS entry;
CREATE TABLE entry(
code INTEGER PRIMARY KEY ASC,
char TEXT,
cat TEXT,
level TEXT,
strokes INTEGER,
freq INTEGER,
jlpt INTEGER,
skip TEXT,
on_ TEXT,
kun TEXT,
nanori TEXT,
meaning TEXT
);
""" # }}}1

def setup():
  """Parse the default kanjidic file and write it to the default database."""
  kanjidic2sqldb(parse_kanjidic())

# TODO
def search(q, max_results = None, file = SQLITE_FILE):          # {{{1
  """Search the kanji database -- placeholder, not implemented yet.

  The parameters are accepted but ignored; every call returns None.
  """
  return None
                                                                # }}}1

if __name__ == "__main__":
  # `python3 -m jiten.kanji --doctest` runs this module's doctests and
  # exits with status 1 if any of them failed.
  if "--doctest" in sys.argv:
    import doctest
    failed, _attempted = doctest.testmod(verbose = True)
    if failed:
      sys.exit(1)

# vim: set tw=70 sw=2 sts=2 et fdm=marker :
11 changes: 7 additions & 4 deletions src/jiten/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@

OKPUNC = "々"

ispunc = lambda c: 0x3000 <= ord(c) <= 0x303f
ishiragana = lambda c: 0x3040 <= ord(c) <= 0x309f
iskatakana = lambda c: 0x30a0 <= ord(c) <= 0x30ff
iskanji = lambda c: 0x4e00 <= ord(c) <= 0x9faf
def ispunc(c):
  """True if the code point of `c` lies in U+3000..U+303F."""
  return 0x3000 <= ord(c) <= 0x303f

def ishiragana(c):
  """True if the code point of `c` lies in U+3040..U+309F."""
  return 0x3040 <= ord(c) <= 0x309f

def iskatakana(c):
  """True if the code point of `c` lies in U+30A0..U+30FF."""
  return 0x30a0 <= ord(c) <= 0x30ff

def iskanji(c):
  """True if the code point of `c` lies in U+4E00..U+9FAF."""
  return 0x4e00 <= ord(c) <= 0x9faf

def iscompat(c):
  """True if the code point of `c` lies in U+F900..U+FAFF."""
  return 0xf900 <= ord(c) <= 0xfaff

def isuniext(c):
  """True if `c` lies in U+3400..U+4DBF or U+20000..U+2EBEF."""
  code = ord(c)
  return 0x3400 <= code <= 0x4dbf or 0x20000 <= code <= 0x2ebef

def iskana(c):
  """True if `c` is hiragana or katakana (see the predicates above)."""
  return ishiragana(c) or iskatakana(c)

def isjap(c):                                                   # probably
  """True if `c` is kanji or kana, as judged by iskanji and iskana."""
  return iskanji(c) or iskana(c)
Expand Down

0 comments on commit 249582b

Please sign in to comment.