-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutil.py
172 lines (143 loc) · 4.78 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import json
import collections
import re
import mwparserfromhell as mw
from typing import *
# Splits a parameter name into its base name (group 1) and an optional
# trailing run of digits (group 2), e.g. "image2" -> ("image", "2").
# Group 2 is None when the name does not end in a digit.
VERSION_EXTRACTOR = re.compile(r"(.*?)([0-9]+)?$")
# Matches one [[...]] link. Group 1 is the text before the link, group 2 the
# optional "xxx|" part up to the first pipe, group 3 the remaining text inside
# the brackets and group 4 the rest of the line after the closing brackets.
LINK_PATTERN = re.compile(r"(.*?)\[\[(.+?\|)?(.+?)\]\](.*)")
# Same group layout as LINK_PATTERN, but for {{...}} embeds.
EMBED_PATTERN = re.compile(r"(.*?)\{\{(.+?\|)?(.+?)\}\}(.*)")
def each_row(rowName: str, code):
    """
    each_row is a generator that yields one dict per template in code whose
    name matches rowName. Each dict maps str(param.name) to str(param.value)
    for every parameter of that template; nothing is yielded when no
    template matches.

    code only needs a filter_templates(matches=...) method
    (presumably a mwparserfromhell Wikicode - confirm against callers).
    """
    for template in code.filter_templates(matches=lambda t: t.name.matches(rowName)):
        # A parameter name that appears twice keeps its last value.
        yield {str(param.name): str(param.value) for param in template.params}
def each_version(template_name: str, code, include_base: bool = False,
                 mergable_keys: List[str] = None) -> Iterator[Tuple[int, Dict[str, Any]]]:
    """
    each_version is a generator that yields each version of an infobox
    with variants, such as {{Infobox Item}} on [[Ring of charos]]

    Parameters whose name ends in digits (e.g. "name2") belong to the version
    with that number; every other parameter belongs to the shared base.
    Values are stored as param.value, without conversion to str.

    For every template in code whose name matches template_name:
      - if there are no versions, or every versioned key is listed in
        mergable_keys, a single (-1, base) pair is yielded;
      - otherwise (-1, base) is yielded only when include_base is true,
        followed by (version, {**base, **version_params}) per version.

    mergable_keys defaults to ["version", "image", "caption"].
    Raises AssertionError if a parameter name does not match
    VERSION_EXTRACTOR.
    """
    if mergable_keys is None:
        mergable_keys = ["version", "image", "caption"]

    for infobox in code.filter_templates(matches=lambda t: t.name.matches(template_name)):
        base: Dict[str, str] = {}
        versions: Dict[int, Dict[str, str]] = {}
        for param in infobox.params:
            matcher = VERSION_EXTRACTOR.match(str(param.name).strip())
            if matcher is None:
                raise AssertionError()
            primary, suffix = matcher.groups()
            if suffix is None:
                target = base
            else:
                target = versions.setdefault(int(suffix), {})
            target[primary] = param.value

        # all() over no versions is True, so this also covers the
        # "infobox has no variants" case.
        only_mergable = all(
            key in mergable_keys
            for version_params in versions.values()
            for key in version_params
        )
        if only_mergable:
            yield (-1, base)
            continue

        if include_base:
            yield (-1, base)
        for version_id, version_params in versions.items():
            yield (version_id, {**base, **version_params})
def write_json(name: str, minName: str, docs: Dict[Any, Dict[str, Any]]):
    """
    write_json dumps docs to two JSON files, keeping the iteration order
    of docs.

    Keys starting with "__" are removed from every doc. A doc that has
    nothing left besides an optional "name" key is skipped entirely.
    The file at name receives the remaining docs (including "name"),
    indented by 2; the file at minName receives the same docs without
    "name", using compact separators.
    """
    withNames = collections.OrderedDict()
    withoutNames = collections.OrderedDict()
    for doc_id, doc in docs.items():
        public = {key: val for key, val in doc.items() if not key.startswith("__")}
        trimmed = {key: val for key, val in public.items() if key != "name"}
        if not trimmed:
            # Nothing but a name (or nothing at all) - not worth emitting.
            continue
        withNames[doc_id] = public
        withoutNames[doc_id] = trimmed

    with open(name, "w+") as fi:
        json.dump(withNames, fi, indent=2)
    with open(minName, "w+") as fi:
        json.dump(withoutNames, fi, separators=(",", ":"))
def write_list_json(name: str, minName: str, docs: List[Dict[str, Any]], skipSort: bool = False):
    """
    write_list_json dumps docs unchanged to two JSON files: the file at
    name is indented by 2, the file at minName uses compact separators.

    skipSort is accepted but not used by this function.
    """
    outputs = (
        (name, {"indent": 2}),
        (minName, {"separators": (",", ":")}),
    )
    for path, dump_options in outputs:
        with open(path, "w+") as fi:
            json.dump(docs, fi, **dump_options)
def get_doc_for_id_string(source: str, version: Dict[str, str], docs: Dict[str, Dict],
                          allow_duplicates: bool = False) -> Optional[Dict]:
    """
    get_doc_for_id_string creates a new doc for the comma separated ids in
    version["id"] and registers it in docs under each of those ids.

    Only entries that are purely digits after trimming whitespace count as
    ids. The new doc starts out as {"__source__": source}.

    Returns the new doc, or None (after printing a message) when version
    has no "id", when no usable id is found, or when allow_duplicates is
    false and one of the ids is already present in docs. In the duplicate
    case the new doc still replaces the existing entries in docs.
    """
    if "id" not in version:
        print("page {} is missing an id".format(source))
        return None

    candidates = (part.strip() for part in str(version["id"]).split(","))
    # isdigit() is False for the empty string, so blank entries drop out too.
    ids = [candidate for candidate in candidates if candidate.isdigit()]
    if not ids:
        print("page {} is has an empty id".format(source))
        return None

    doc = {"__source__": source}
    clashed = False
    for doc_id in ids:
        if not allow_duplicates and doc_id in docs:
            print("page {} is has the same id as {}".format(source, docs[doc_id]["__source__"]))
            clashed = True
        # Registered unconditionally, even when the id clashed.
        docs[doc_id] = doc

    return None if clashed else doc
def copy(name: Union[str, Tuple[str, str]],
         doc: Dict,
         version: Dict[str, Any],
         convert: Callable[[Any], Any] = lambda x: x) -> bool:
    """
    copy transfers one value from version into doc and reports whether it
    did so.

    name is either a single key used on both sides, or a
    (source key, destination key) pair. The source value is turned into a
    str and trimmed, then passed through convert. Nothing is copied, and
    False is returned, when the key is missing from version, when the
    trimmed text is empty, or when the converted value is falsy.
    """
    if isinstance(name, str):
        src_name = dst_name = name
    else:
        src_name, dst_name = name[0], name[1]

    if src_name not in version:
        return False

    text = str(version[src_name]).strip()
    if not text:
        return False

    converted = convert(text)
    if converted:
        doc[dst_name] = converted
        return True
    # Falsy results (0, "", None, ...) are treated as "no value".
    return False
def has_template(name: str, code) -> bool:
    """
    has_template reports whether code contains at least one template
    whose name matches name
    """
    found = code.filter_templates(matches=lambda t: t.name.matches(name))
    return len(found) > 0
def strip(input: str) -> str:
    """
    strip removes wiki markup from input and returns the plain text.

    Surrounding whitespace and every "===" / "==" are removed first. Then
    each [[...]] link and, afterwards, each {{...}} embed is replaced by
    the text inside it (the part after the first "|" when there is one).
    Finally any leftover "[[" or "]]" is dropped.

    NOTE: LINK_PATTERN and EMBED_PATTERN do not match across newlines, so
    once a link/embed is found in multi-line input only the line containing
    it is kept; single-line input is unaffected.
    """
    text = input.strip()
    for heading_marker in ("===", "=="):
        text = text.replace(heading_marker, "")

    # Links are unwrapped before embeds; each pass consumes the string from
    # left to right, keeping the text before a match and its inner text.
    for pattern in (LINK_PATTERN, EMBED_PATTERN):
        pieces = []
        found = pattern.search(text)
        while found is not None:
            before, _, inner, text = found.groups()
            pieces.extend((before, inner))
            found = pattern.search(text)
        pieces.append(text)
        text = "".join(pieces)

    for bracket in ("[[", "]]"):
        text = text.replace(bracket, "")
    return text