Skip to content

Commit

Permalink
More work on de-vendoring pykakasi
Browse files Browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed Nov 12, 2024
1 parent ec8c06c commit 04d5728
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 128 deletions.
7 changes: 0 additions & 7 deletions COPYRIGHT
Original file line number Diff line number Diff line change
Expand Up @@ -323,13 +323,6 @@ License: GPL-3
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-3 on Debian systems.

Files: src/calibre/ebooks/unihandecode/pykakasi/*
Copyright: 2011, Hiroshi Miura <[email protected]>
Copyright: 1992, Hironobu Takahashi
License: GPL-2+
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL on Debian systems.

Files: src/calibre/ebooks/unihandecode/*
Copyright: 2010-2011, Hiroshi Miura <[email protected]>
Copyright: 2009, John Schember
Expand Down
5 changes: 3 additions & 2 deletions bypy/linux/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,12 @@ def ignore_in_lib(base, items, ignored_dirs=None):
ignored_dirs = {'.svn', '.bzr', '.git', 'test', 'tests', 'testing'}
for name in items:
path = j(base, name)
is_kakasi = 'pykakasi' in path
if os.path.isdir(path):
if name != 'plugins' and (name in ignored_dirs or not is_package_dir(path)):
if name != 'plugins' and (name in ignored_dirs or not is_package_dir(path)) and not (is_kakasi and name == 'data'):
ans.append(name)
else:
if name.rpartition('.')[-1] not in ('so', 'py'):
if name.rpartition('.')[-1] not in ('so', 'py') and not (is_kakasi and name.endswith('.db')):
ans.append(name)
return ans

Expand Down
7 changes: 5 additions & 2 deletions bypy/macos/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,12 +614,15 @@ def add_packages_from_dir(self, src):

@flush
def add_package_dir(self, x, dest=None):
is_kakasi = 'pykakasi' in x
allowed_exts = ('', '.py', '.so')
if is_kakasi:
allowed_exts += ('.db',)
def ignore(root, files):
ans = []
for y in files:
ext = os.path.splitext(y)[1]
if ext not in ('', '.py', '.so') or \
(not ext and not os.path.isdir(join(root, y))):
if ext not in allowed_exts or (not ext and not os.path.isdir(join(root, y))):
ans.append(y)

return ans
Expand Down
18 changes: 18 additions & 0 deletions bypy/sources.json
Original file line number Diff line number Diff line change
Expand Up @@ -1043,6 +1043,24 @@
}
},

{
"name": "wrapt",
"unix": {
"filename": "wrapt-1.16.0-py3-none-any.whl",
"hash": "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1",
"urls": ["pypi"]
}
},

{
"name": "deprecated",
"unix": {
"filename": "Deprecated-1.2.14-py2.py3-none-any.whl",
"hash": "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c",
"urls": ["pypi"]
}
},

{
"name": "pykakasi",
"unix": {
Expand Down
5 changes: 2 additions & 3 deletions setup/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
'gui',
'git_version',
'develop', 'install',
'kakasi', 'rapydscript', 'cacerts', 'recent_uas', 'resources',
'rapydscript', 'cacerts', 'recent_uas', 'resources',
'check', 'test', 'test_rs', 'upgrade_source_code',
'sdist', 'bootstrap', 'extdev',
'manual', 'tag_release',
Expand Down Expand Up @@ -90,10 +90,9 @@
test = Test()
test_rs = TestRS()

from setup.resources import CACerts, Kakasi, RapydScript, RecentUAs, Resources
from setup.resources import CACerts, RapydScript, RecentUAs, Resources

resources = Resources()
kakasi = Kakasi()
cacerts = CACerts()
recent_uas = RecentUAs()
rapydscript = RapydScript()
Expand Down
117 changes: 3 additions & 114 deletions setup/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@
import glob
import json
import os
import re
import shutil
import zipfile
from zlib import compress

from polyglot.builtins import codepoint_to_chr, iteritems, itervalues, only_unicode_recursive
from setup import Command, __appname__, basenames, download_securely, dump_json
from polyglot.builtins import iteritems, itervalues, only_unicode_recursive
from setup import Command, basenames, download_securely, dump_json


def get_opts_from_parser(parser):
Expand All @@ -29,113 +27,6 @@ def do_opt(opt):
yield from do_opt(o)


class Kakasi(Command):  # {{{
    """Compile the pykakasi text dictionaries into msgpack resource files.

    Three ``*.utf8`` sources under ``KAKASI_PATH`` are converted into
    ``*.calibre_msgpack`` files under ``RESOURCES/localization/pykakasi``.
    Each output is regenerated only when ``self.newer(dest, src)`` is true.
    """

    description = 'Compile resources for unihandecode'

    KAKASI_PATH = os.path.join(Command.SRC, __appname__,
                               'ebooks', 'unihandecode', 'pykakasi')

    def run(self, opts):
        """Build the kanwa, itaiji and kana dictionaries that are out of date."""
        self.records = {}
        src = self.j(self.KAKASI_PATH, 'kakasidict.utf8')
        dest = self.j(self.RESOURCES, 'localization',
                      'pykakasi', 'kanwadict2.calibre_msgpack')
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(os.path.dirname(dest), exist_ok=True)

        if self.newer(dest, src):
            self.info('\tGenerating Kanwadict')
            # Use a context manager so the source handle is closed promptly
            # instead of being left to the garbage collector.
            with open(src, 'rb') as f:
                for line in f:
                    self.parsekdict(line)
            self.kanwaout(dest)

        src = self.j(self.KAKASI_PATH, 'itaijidict.utf8')
        dest = self.j(self.RESOURCES, 'localization',
                      'pykakasi', 'itaijidict2.calibre_msgpack')

        if self.newer(dest, src):
            self.info('\tGenerating Itaijidict')
            self.mkitaiji(src, dest)

        src = self.j(self.KAKASI_PATH, 'kanadict.utf8')
        dest = self.j(self.RESOURCES, 'localization',
                      'pykakasi', 'kanadict2.calibre_msgpack')

        if self.newer(dest, src):
            self.info('\tGenerating kanadict')
            self.mkkanadict(src, dest)

    @staticmethod
    def _data_lines(path):
        """Yield the decoded, stripped lines of *path*, skipping blank lines and ``;;`` comments."""
        with open(path, 'rb') as f:
            for raw in f:
                line = raw.decode('utf-8').strip()
                if line and not line.startswith(';;'):
                    yield line

    @staticmethod
    def _write_msgpack(obj, dst):
        """Serialize *obj* with msgpack and write the result to *dst*."""
        from calibre.utils.serialize import msgpack_dumps
        # Serialize before opening so a failure does not leave an empty file.
        data = msgpack_dumps(obj)
        with open(dst, 'wb') as f:
            f.write(data)

    def mkitaiji(self, src, dst):
        """Write a mapping of variant character -> replacement character.

        ``\\uXXXX`` escapes in each source line are expanded first; the first
        two characters of the expanded line form the key/value pair.
        """
        dic = {}
        for line in self._data_lines(src):
            pair = re.sub(r'\\u([0-9a-fA-F]{4})', lambda x: codepoint_to_chr(int(x.group(1), 16)), line)
            dic[pair[0]] = pair[1]
        self._write_msgpack(dic, dst)

    def mkkanadict(self, src, dst):
        """Write a mapping of kana -> alphabetic form from ``alpha kana`` lines."""
        dic = {}
        for line in self._data_lines(src):
            (alpha, kana) = line.split(' ')
            dic[kana] = alpha
        self._write_msgpack(dic, dst)

    def parsekdict(self, line):
        """Parse one raw (bytes) ``yomi kanji`` line and record it in ``self.records``.

        A trailing character of the reading that sorts at or below ``'z'`` is
        split off and stored separately as ``tail``.
        """
        line = line.decode('utf-8').strip()
        # Skip comments, and also blank lines (which previously raised
        # ValueError on unpacking), matching the other two parsers.
        if not line or line.startswith(';;'):
            return
        (yomi, kanji) = line.split(' ')
        if ord(yomi[-1:]) <= ord('z'):
            tail = yomi[-1:]
            yomi = yomi[:-1]
        else:
            tail = ''
        self.updaterec(kanji, yomi, tail)

    def updaterec(self, kanji, yomi, tail):
        """Append ``(yomi, tail)`` to the readings of *kanji*.

        Records are bucketed by the 4-digit hex code point of the first
        character of *kanji*.
        """
        key = "%04x" % ord(kanji[0])
        self.records.setdefault(key, {}).setdefault(kanji, []).append((yomi, tail))

    def kanwaout(self, out):
        """Write ``self.records`` to *out*, with each bucket msgpack-serialized then zlib-compressed."""
        from calibre.utils.serialize import msgpack_dumps
        dic = {k: compress(msgpack_dumps(v)) for k, v in iteritems(self.records)}
        self._write_msgpack(dic, out)

    def clean(self):
        """Remove the generated ``localization/pykakasi`` resource directory, if present."""
        kakasi = self.j(self.RESOURCES, 'localization', 'pykakasi')
        if os.path.exists(kakasi):
            shutil.rmtree(kakasi)
# }}}


class CACerts(Command): # {{{

description = 'Get updated mozilla CA certificate bundle'
Expand Down Expand Up @@ -213,7 +104,7 @@ def run(self, opts):
class Resources(Command): # {{{

description = 'Compile various needed calibre resources'
sub_commands = ['kakasi', 'liberation_fonts', 'mathjax', 'rapydscript', 'hyphenation', 'piper_voices']
sub_commands = ['liberation_fonts', 'mathjax', 'rapydscript', 'hyphenation', 'piper_voices']

def run(self, opts):
from calibre.utils.serialize import msgpack_dumps
Expand Down Expand Up @@ -337,8 +228,6 @@ def clean(self):
x = self.j(self.RESOURCES, x+'.pickle')
if os.path.exists(x):
os.remove(x)
from setup.commands import kakasi
kakasi.clean()
for x in ('builtin_recipes.xml', 'builtin_recipes.zip',
'template-functions.json', 'user-manual-translation-stats.json'):
x = self.j(self.RESOURCES, x)
Expand Down
33 changes: 33 additions & 0 deletions src/calibre/ebooks/unihandecode/jadecoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,48 @@
Copyright (c) 2010 Hiroshi Miura
'''

import pickle
import re
from importlib.resources import files

from pykakasi import kakasi
from pykakasi.kanji import Itaiji, Kanwa
from pykakasi.properties import Configurations
from pykakasi.scripts import Jisyo

from calibre.ebooks.unihandecode.jacodepoints import CODEPOINTS as JACODES
from calibre.ebooks.unihandecode.unicodepoints import CODEPOINTS
from calibre.ebooks.unihandecode.unidecoder import Unidecoder


# pykakasi locates its dictionaries via filesystem paths rather than the
# Traversable API of importlib.resources, so the initializers below are
# replaced with versions that read the data through importlib.resources, sigh.
def dictdata(dbfile: str):
    """Return the raw bytes of *dbfile* from the ``data`` directory of the pykakasi package."""
    package_root = files('pykakasi')
    return package_root.joinpath('data').joinpath(dbfile).read_bytes()


def jisyo_init(self, dbname):
    """Initializer that unpickles the pykakasi data file *dbname* into ``self._dict``."""
    # The pickle comes from the installed pykakasi package data, not user input.
    raw = dictdata(dbname)
    self._dict = pickle.loads(raw)


def itaiji_init(self):
    """Initializer that loads the itaiji dictionary into ``self._itaijidict`` if it is still ``None``.

    Uses double-checked locking on ``self._lock``; both attributes are
    expected to be provided by pykakasi (not visible here).
    """
    if self._itaijidict is not None:
        return  # already loaded, no need to take the lock
    with self._lock:
        # Re-check under the lock: another thread may have loaded it meanwhile.
        if self._itaijidict is None:
            payload = dictdata(Configurations.jisyo_itaiji)
            self._itaijidict = pickle.loads(payload)

def kanwa_init(self):
    """Initializer that loads the kanwa dictionary into ``self._jisyo_table`` if it is still ``None``.

    Uses double-checked locking on ``self._lock``; both attributes are
    expected to be provided by pykakasi (not visible here).
    """
    if self._jisyo_table is not None:
        return  # already loaded, no need to take the lock
    with self._lock:
        # Re-check under the lock: another thread may have loaded it meanwhile.
        if self._jisyo_table is None:
            payload = dictdata(Configurations.jisyo_kanwa)
            self._jisyo_table = pickle.loads(payload)

# Monkeypatch pykakasi: swap the initializers of its dictionary classes for the
# replacements defined in this module, which read the dictionary data via
# importlib.resources.
Jisyo.__init__ = jisyo_init
Itaiji.__init__ = itaiji_init
Kanwa.__init__ = kanwa_init

class Jadecoder(Unidecoder):

def __init__(self):
Expand Down

0 comments on commit 04d5728

Please sign in to comment.