-
Notifications
You must be signed in to change notification settings - Fork 3
/
indextools.py
90 lines (71 loc) · 2.44 KB
/
indextools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
Adapt an index generated by XeLaTeX so that words containing diacritics are sorted correctly.
The way the *dx files are generated varies between versions of TeX Live.
"""
import re, sys
try:
from langsci.delatex import dediacriticize
except ImportError:
from delatex import dediacriticize
try:
from langsci.asciify import asciify
except ImportError:
from asciify import asciify
# Each LaTeX index entry consists of the string used for sorting (before the
# "@") and the string to be displayed (after the "@").
# The pattern captures everything between "\indexentry {" and the last
# "|hyper" on the line (the ".*" is greedy). It is applied with match(), so a
# line only matches if it starts with "\indexentry {".
p = re.compile(r"\\indexentry \{(.*)\|hyper")
# Sort strings whose conversion has already been printed by processline, so
# that each distinct conversion is reported only once.
ignoredic={}
def processline(s):
    r"""Return an index line whose sort key has been reduced to plain ASCII.

    The sort key is the part of the entry before the first "@", or the whole
    entry if it contains no "@". It is passed through ``dediacriticize``
    (LaTeX diacritics such as {\'{e}}) and then through ``asciify`` (Unicode
    to ASCII). If that changes the key, the line is rewritten so that the
    converted key is used for sorting; each distinct conversion is printed
    once.

    Args:
        s (str): one line of an index file, normally of the form
            ``\indexentry {...|hyperpage}{...}``

    Returns:
        str: ``s`` unchanged if it is blank or its sort key needs no
        conversion; the empty string if the line does not match the expected
        ``\indexentry {...|hyper`` form (a message is printed and the line is
        thereby dropped from the output); otherwise the rewritten line.

    Example:
        Assuming the helpers turn "Čéplö" into "Ceplo", the line
        ``\indexentry {Čéplö|hyperpage}{5}`` becomes
        ``\indexentry {Ceplo@Čéplö|hyperpage}{5}``.
    """
    if s.strip() == "":
        return s

    # Match once and test the result explicitly instead of relying on an
    # AttributeError from calling .group() on None.
    m = p.match(s)
    if m is None:
        print("%s could not be parsed" % repr(s))
        return ""

    # find the substring used for sorting
    items = m.group(1).split("@")
    sortstring = items[0]
    has_at = len(items) > 1

    processedstring = asciify(dediacriticize(sortstring))
    if sortstring == processedstring:
        return s

    # ignoredic is only mutated, never rebound, so no ``global`` is needed.
    if sortstring not in ignoredic:
        print("%s => %s" % (sortstring, processedstring))
        ignoredic[sortstring] = True

    if has_at:
        # The entry already has an explicit sort key: swap in the converted one.
        return s.replace("%s@" % sortstring, "%s@" % processedstring)
    # No explicit sort key yet: prepend the converted key, keep the original
    # text as the displayed string.
    return s.replace(sortstring, "%s@%s" % (processedstring, sortstring))
def processfile(filename):
    """Read an index file and write the fixed lines to a separate "mod" file.

    Every line of the input is passed through ``processline``. The output path
    is the input path with "mod" inserted directly before the file extension,
    e.g. ``main.adx`` -> ``mainmod.adx``. Dots elsewhere in the path (such as
    in ``./main.adx`` or ``my.book.adx``) are left alone, and a path without
    an extension gets "mod" appended, so the input file is never overwritten.

    Args:
        filename (str): the path to the file

    Returns:
        None
    """
    # Local import: only this function needs os.path, to split off the
    # extension.
    import os.path

    print("Reading", filename)
    with open(filename, encoding="utf-8") as indexfile:
        lines = indexfile.readlines()
    print("Found %i lines" % len(lines))

    # Process everything before opening the output, so a failure while
    # processing does not leave a truncated output file behind.
    processedlines = [processline(line) for line in lines]

    root, ext = os.path.splitext(filename)
    with open(root + "mod" + ext, "w", encoding="utf-8") as out:
        out.write("".join(processedlines))