Skip to content

Commit

Permalink
core: added multi-language support, #98
Browse files Browse the repository at this point in the history
  • Loading branch information
matze-dd authored Nov 6, 2020
1 parent a9e909f commit 42499ff
Show file tree
Hide file tree
Showing 7 changed files with 265 additions and 66 deletions.
7 changes: 7 additions & 0 deletions yalafi/defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,13 @@ class VoidToken(TextToken):
def __init__(self, pos):
super().__init__(pos, '')

class LanguageToken(TextToken):
    """Zero-width token that marks a language switch in the token stream."""

    def __init__(self, pos, lang='', back=False, hard=False):
        # carries no text: a language switch is invisible in the plain output
        super().__init__(pos, '')
        # lang: code of the language to switch to (e.g. 'de')
        # back: pop back to the previous language context
        # hard: replace the current context instead of pushing a new one
        self.lang, self.back, self.hard = lang, back, hard

class MathBeginToken(TextToken):
def __init__(self, pos, text, env):
super().__init__(pos, text)
Expand Down
16 changes: 10 additions & 6 deletions yalafi/mathparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def expand_display_math(self, buf, tok, env):
['&', '\\\\', '$$', '\\]'], env.name)
tokens = self.detect_math_parts(tokens)
sec, next_repl = self.replace_section(False, tokens,
first_section, next_repl,
self.parser.parms.math_repl_display)
first_section, next_repl,
self.parser.parms.lang_context().math_repl_display)
out += sec
if end and end.txt == '&':
out.append(defs.SpaceToken(out[-1].pos, ' ', pos_fix=True))
Expand All @@ -97,8 +97,9 @@ def expand_display_math(self, buf, tok, env):
txt = self.parser.get_text_direct(out).strip()
out = [defs.ActionToken(start_simple),
defs.SpaceToken(start_simple, ' ', pos_fix=True),
defs.TextToken(start_simple,
self.parser.parms.math_repl_display[0], pos_fix=True)]
defs.TextToken(start_simple, self.parser.parms.
lang_context().math_repl_display[0],
pos_fix=True)]
if txt and txt[-1] in self.parser.parms.math_punctuation:
out.append(defs.TextToken(out[-1].pos, txt[-1],
pos_fix=True))
Expand All @@ -113,7 +114,7 @@ def expand_inline_math(self, buf, tok):
tokens = self.detect_math_parts(tokens)
out = [defs.ActionToken(tok.pos)]
t, x = self.replace_section(True, tokens, True, True,
self.parser.parms.math_repl_inline)
self.parser.parms.lang_context().math_repl_inline)
out += t
out.append(defs.ActionToken(out[-1].pos))
return out
Expand Down Expand Up @@ -177,6 +178,8 @@ def special(t):
out.append(tok)
elif tok.txt in parms.math_ignore:
pass
elif type(tok) is defs.LanguageToken:
pass
elif tok.txt in parms.math_space:
out.append(defs.MathSpaceToken(tok.pos, ' '))
elif tok.txt in parms.math_operators:
Expand Down Expand Up @@ -238,7 +241,8 @@ def replace_section(self, inline, tokens, first_section,
op = tok.leading_op()
elem = tok.has_elem(parms)
if not inline and first_part and op:
s = parms.math_op_text.get(op.txt, parms.math_op_text[None])
s = parms.lang_context().math_op_text.get(
op.txt, parms.lang_context().math_op_text[None])
out.append(defs.SpaceToken(tok.pos, ' ', pos_fix=True))
out.append(defs.TextToken(op.pos, s, pos_fix=True))
out.append(defs.SpaceToken(op.pos, ' ', pos_fix=True))
Expand Down
140 changes: 96 additions & 44 deletions yalafi/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,43 +158,64 @@ def labs_itemize(level):

]

# set language-dependent parameters
# set language-dependent scanner parameters
#
def init_language(self, language):
if language == 'de':
# German
self.special_tokens.update(self.special_tokens_de)
self.proof_name = 'Beweis'
self.math_repl_inline = ['B-B-B', 'C-C-C', 'D-D-D',
'E-E-E', 'F-F-F', 'G-G-G']
self.math_repl_display = ['U-U-U', 'V-V-V', 'W-W-W',
'X-X-X', 'Y-Y-Y', 'Z-Z-Z']
self.math_op_text = {'+': 'plus', '-': 'minus',
def init_scanner_language(self, lang):
    """Adjust the scanner's special-token table for language *lang*.

    Only the two-letter prefix of *lang* is considered; currently just
    German ('de') adds extra tokens, all other languages leave the
    table unchanged.
    """
    code = lang[:2].lower()
    if code != 'de':
        return
    german_tokens = {
        '"-': '',
        '"=': '-',
        '"`': '\N{DOUBLE LOW-9 QUOTATION MARK}',    # \glqq
        '"\'': '\N{LEFT DOUBLE QUOTATION MARK}',    # \grqq
    }
    self.special_tokens.update(german_tokens)


# set language-dependent parser parameters
# - settings for 'en' are taken as fall back
#
def init_parser_languages(self, lang):
    """Build the per-language parser settings table and the language stack.

    The scraped diff interleaved removed old-code lines into this body;
    this is the reconstructed added-side code.  Settings for 'en' are
    taken as fall back (see check_parser_lang()).
    """
    settings = self.parser_lang_settings = {}

    settings['en'] = ParserLanguageSettings(
        proof_name = 'Proof',
        math_repl_inline = ['B-B-B', 'C-C-C', 'D-D-D',
                            'E-E-E', 'F-F-F', 'G-G-G'],
        math_repl_display = ['U-U-U', 'V-V-V', 'W-W-W',
                             'X-X-X', 'Y-Y-Y', 'Z-Z-Z'],
        math_op_text = {'+': 'plus', '-': 'minus',
                        '\\cdot': 'times', '\\times': 'times',
                        '/': 'over',
                        None: 'equal'},     # default value
        lang_change_repl = ['K-K-K', 'L-L-L', 'M-M-M', 'N-N-N']
    )
    settings['de'] = ParserLanguageSettings(
        proof_name = 'Beweis',
        math_repl_inline = ['B-B-B', 'C-C-C', 'D-D-D',
                            'E-E-E', 'F-F-F', 'G-G-G'],
        math_repl_display = ['U-U-U', 'V-V-V', 'W-W-W',
                             'X-X-X', 'Y-Y-Y', 'Z-Z-Z'],
        math_op_text = {'+': 'plus', '-': 'minus',
                        '\\cdot': 'mal', '\\times': 'mal',
                        '/': 'durch',
                        None: 'gleich'},    # default value
        lang_change_repl = ['K-K-K', 'L-L-L', 'M-M-M', 'N-N-N']
    )
    settings['ru'] = ParserLanguageSettings(
        proof_name = 'Доказательство',
        math_repl_inline = ['Б-Б-Б', 'В-В-В', 'Г-Г-Г',
                            'Д-Д-Д', 'Е-Е-Е', 'Ж-Ж-Ж'],
        math_repl_display = ['Ц-Ц-Ц', 'Ч-Ч-Ч', 'Ш-Ш-Ш',
                             'Ы-Ы-Ы', 'Э-Э-Э', 'Ю-Ю-Ю'],
        math_op_text = {'+': 'плюс', '-': 'минус',
                        '\\cdot': 'раз', '\\times': 'раз',
                        '/': 'на',
                        None: 'равно'},     # default value
        lang_change_repl = ['К-К-К', 'Л-Л-Л', 'М-М-М', 'Н-Н-Н']
    )

    # the stack records nested language changes; the bottom entry holds
    # the settings and name of the document's main language
    self.parser_lang_stack = [(settings[self.check_parser_lang(lang)],
                               lang)]

# set misc collections
#
Expand Down Expand Up @@ -314,17 +335,6 @@ def init_collections(self):

}

# "special" tokens for German
#
self.special_tokens_de = {

'"-': '',
'"=': '-',
'"`': '\N{DOUBLE LOW-9 QUOTATION MARK}', # \glqq
'"\'': '\N{LEFT DOUBLE QUOTATION MARK}', # \grqq

}

# set math collections
#
def init_math_collections(self):
Expand Down Expand Up @@ -399,11 +409,53 @@ def init_math_collections(self):
def macro_character(self, c):
return c >= 'a' and c <= 'z' or c >= 'A' and c <= 'Z' or c == '@'

# transform given lang into valid key for dictionary
# self.parser_lang_settings
#
def check_parser_lang(self, lang):
    """Map *lang* to a key of self.parser_lang_settings.

    Only the two-letter lower-case prefix is considered; unknown
    languages fall back to 'en'.
    """
    key = lang[:2].lower()
    if key in self.parser_lang_settings:
        return key
    return 'en'

# switch current parser language settings to new language
#
def change_parser_lang(self, tok):
    """Update the language stack according to LanguageToken *tok*.

    tok.back pops the previous context (the last entry is never
    removed); tok.hard replaces the current context in place;
    otherwise a new context is pushed.
    """
    if tok.back:
        # keep at least one entry: the main language
        if len(self.parser_lang_stack) > 1:
            self.parser_lang_stack.pop()
        return
    entry = (self.parser_lang_settings[self.check_parser_lang(tok.lang)],
             tok.lang)
    if tok.hard:
        self.parser_lang_stack[-1] = entry
    else:
        self.parser_lang_stack.append(entry)

def lang_context(self):
    """Return the settings object of the current language context."""
    settings, _ = self.parser_lang_stack[-1]
    return settings

def lang_context_lang(self):
    """Return the language name of the current language context."""
    _, name = self.parser_lang_stack[-1]
    return name

def __init__(self, language: str = 'en'):
    """Initialize all parameter tables for main language *language*.

    The call order matters: the token/macro collections must exist
    before the language setup, and the scanner is created only after
    the scanner language tweaks have been applied.
    """
    self.init_collections()
    self.init_math_collections()
    # multi-language mode is off by default; tex2txt() may switch it on
    self.multi_language = False
    # NOTE(review): presumably the threshold used when deciding whether
    # to continue a language region in multi-language mode — confirm
    self.ml_continue_thresh = 3
    # per-language parser settings and the language stack
    self.init_parser_languages(language)
    # language-dependent scanner tokens (e.g. German '"`')
    self.init_scanner_language(language)
    self.scanner = scanner.Scanner(self)
    self.init_macros()
    self.init_environments()


class ParserLanguageSettings:
    """Plain container for the language-dependent parser replacement texts.

    Holds the proof environment name, the inline/display math
    replacements, the math operator texts, and the replacements
    emitted at language changes.
    """

    def __init__(self, proof_name, math_repl_inline, math_repl_display,
                 math_op_text, lang_change_repl):
        # store each argument under the attribute of the same name
        vars(self).update(
            proof_name=proof_name,
            math_repl_inline=math_repl_inline,
            math_repl_display=math_repl_display,
            math_op_text=math_op_text,
            lang_change_repl=lang_change_repl,
        )

18 changes: 13 additions & 5 deletions yalafi/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,10 @@ def expand_sequence(self, buf, env_stop=None):
else:
out.append(defs.ActionToken(tok.pos))
out.append(defs.TextToken(tok.pos, tok.txt))
elif type(tok) is defs.LanguageToken:
if self.parms.multi_language:
self.parms.change_parser_lang(tok)
out.append(tok)
elif type(tok) is defs.CommentToken:
pass
else:
Expand Down Expand Up @@ -341,8 +345,10 @@ def expand_arguments(self, buf, mac, start):
arguments_extr.append(arg_extr)

if mac.extract:
toks = self.generate_replacements(arguments_extr,
mac.extract, start)
toks = ([defs.LanguageToken(start,
lang=self.parms.lang_context_lang(), hard=True)]
+ self.generate_replacements(arguments_extr,
mac.extract, start))
self.extracted.append(self.expand_sequence(scanner.Buffer(toks)))
out = [defs.ActionToken(start)]
if callable(mac.repl):
Expand Down Expand Up @@ -485,7 +491,8 @@ def eval(t):
t.can_end = '\n' in txt and not txt[:txt.find('\n')].strip()
return t

tokens = [t for t in tokens if t.txt or type(t) is defs.ActionToken]
tokens = [t for t in tokens if t.txt or
type(t) in (defs.ActionToken, defs.LanguageToken)]
tokens = [eval(t) for t in tokens]
tok = eval(defs.TextToken(0, ''))
tok.can_start = True
Expand Down Expand Up @@ -514,6 +521,7 @@ def eval(t):
break
if (can_remove and len(buf) > 1
and any(type(t) is defs.ActionToken for t in buf)):
lang_toks = [t for t in buf if type(t) is defs.LanguageToken]
t1 = copy.copy(buf[0])
t2 = copy.copy(buf[-1])
# in t1, we remove all behind the last newline
Expand All @@ -531,7 +539,7 @@ def eval(t):
else:
t2.txt = ''
t2.pos += len(txt)
buf = [t1]
buf = [t1] + lang_toks
tokens.append(eval(t2))
# NB: we deleted a line break
tok = eval(defs.TextToken(t2.pos, ''))
Expand All @@ -541,7 +549,7 @@ def eval(t):
tokens.append(eval(buf.pop()))
out += buf

return [t for t in out if t.txt]
return [t for t in out if t.txt or type(t) is defs.LanguageToken]

# \item: if [...] label is specified, look back in text and append
# a possible previous punctuation mark
Expand Down
2 changes: 1 addition & 1 deletion yalafi/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,5 +219,5 @@ def look_ahead(self):
#
def is_space(self, tok):
    """Tell whether *tok* counts as blank space for the scanner."""
    # exact type comparison (not isinstance), as in the original:
    # subclasses of these token types would not match here
    space_types = (defs.SpaceToken, defs.CommentToken, defs.ActionToken,
                   defs.VoidToken, defs.LanguageToken)
    return type(tok) in space_types

40 changes: 30 additions & 10 deletions yalafi/tex2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,19 @@

from . import parameters, parser, utils

def tex2txt(latex, opts):
def tex2txt(latex, opts, multi_language=False, modify_parms=None):
def read(file):
try:
with open(file, encoding=opts.ienc) as f:
return True, f.read()
except:
return False, ''
parms = parameters.Parameters(opts.lang)

parms = parameters.Parameters(opts.lang or '')
parms.multi_language = multi_language
packages = get_packages(opts.dcls, parms.class_modules)
packages.extend(get_packages(opts.pack, parms.package_modules))

if opts.defs:
packages.append(('', utils.get_latex_handler(opts.defs)))
if opts.extr:
Expand All @@ -46,16 +49,32 @@ def read(file):
extr = []
if opts.seqs:
parms.math_displayed_simple = True

if modify_parms:
modify_parms(parms)
p = parser.Parser(parms, packages, read_macros=read)
toks = p.parse(latex, extract=extr)
txt, pos = utils.get_txt_pos(toks)
if opts.repl:
txt, pos = utils.replace_phrases(txt, pos, opts.repl)
if opts.unkn:
txt = '\n'.join(p.get_unknowns()) + '\n'
pos = [0 for n in range(len(txt))]
pos = [n + 1 for n in pos]
return txt, pos

if not multi_language:
txt, pos = utils.get_txt_pos(toks)
if opts.repl:
txt, pos = utils.replace_phrases(txt, pos, opts.repl)
if opts.unkn:
txt = '\n'.join(p.get_unknowns()) + '\n'
pos = [0 for n in range(len(txt))]
pos = [n + 1 for n in pos]
return txt, pos

main_lang = opts.lang or ''
ml = utils.get_txt_pos_ml(toks, main_lang, parms)
if opts.repl and main_lang in ml:
for part in ml[main_lang]:
part[0], part[1] = utils.replace_phrases(part[0], part[1],
opts.repl)
for lang in ml:
for part in ml[lang]:
part[1]= list(n + 1 for n in part[1])
return ml

def get_packages(packs, prefix):
ret = []
Expand All @@ -72,6 +91,7 @@ def get_packages(packs, prefix):
ret.append((p, utils.get_module_handler(p, prefix)))
return ret


#########################################################
#
# the rest is copied from tex2txt/tex2txt.py
Expand Down
Loading

0 comments on commit 42499ff

Please sign in to comment.