wikipedia_duplicated_citations_merger.py
#! /usr/bin/env python3
# Merge duplicated Wikipedia references/citations. Written by davidhcefx, 2022.4.10.
from typing import Dict, List, Tuple, Set, Optional, Pattern, Match, Any
from string import digits
import re
import json
from hashlib import md5
import requests
try:
    import readline  # for filename autocompletion
    readline.parse_and_bind('tab: complete')  # type: ignore[attr-defined]
except ModuleNotFoundError:
    pass
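
# Matches a full <ref ...>payload</ref> pair; the (?<!/) lookbehind rejects
# self-closing tags like <ref name="n" />, so only refs carrying a payload match.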
REF_PATTERN = re.compile(r'<ref\b(?P<name>[^>]*)(?<!/)>(?P<pay>.*?)</ref>', re.DOTALL)
NAME_ATTR_PATTERN = re.compile(r'name\s*=\s*(\w+|"[^"]*?")', re.DOTALL) # name attribute in <ref>
# https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
URL_PATTERN = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
template_names: Optional[Pattern[str]] = None
template_params: Optional[Pattern[str]] = None


class CitationDatabase:
    """
    Associate citation-payloads with their short-names.
    A citation-payload is the part enclosed by <ref></ref>.
    """
    def __init__(self) -> None:
        self.shortnames: Dict[str, str] = dict()  # map a payload to its short name

    def add(self, payload: str, shortname: str) -> None:
        assert shortname not in self.shortnames.values(), \
            f"Name collision '{shortname}'. Check the article or try adding more hash digits."
        self.shortnames[payload] = shortname

    def has(self, payload: str) -> bool:
        # we simply query with the exact text here, but one can extend this to other
        # logic, such as URL/DOI detection, or to ignore whitespace differences.
        return payload in self.shortnames

    def get_shortname(self, payload: str) -> str:
        return self.shortnames[payload] if self.has(payload) else ''
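
# A minimal usage sketch (hypothetical payload and shortname):
#   db = CitationDatabase()
#   db.add('{{cite web |title=Foo}}', 'Foo1a2b3c4d')
#   db.has('{{cite web |title=Foo}}')            # -> True
#   db.get_shortname('{{cite web |title=Foo}}')  # -> 'Foo1a2b3c4d'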


def generate_short_name(payload: str) -> str:
    """Generate wiki-accepted short-names from citation-payloads."""
    global template_names, template_params
    if template_names is None:
        temp_names = 'book|arXiv|AV media|AV media notes|bioRxiv|conference|encyclopedia|episode|interview|magazine|mailing list|journal|map|news|newsgroup|podcast|press release|report|serial|sign|speech|techreport|thesis|web'
        template_names = re.compile(r'[cC]ite ({})'.format(temp_names))
    if template_params is None:
        temp_params = 'access-date|agency|archive-date|archive-url|arxiv|asin|asin-tld|at|author|author-link|author-link1|author-link2|author-link3|author-link4|author-link5|author-mask|author-mask1|author-mask2|author-mask3|author-mask4|author-mask5|author2|authors|bibcode|bibcode-access|biorxiv|book-title|cartography|chapter|chapter-format|chapter-url|chapter-url-access|citeseerx|class|conference|conference-url|credits|date|department|display-authors|display-editors|display-translators|docket|doi|doi-access|doi-broken-date|edition|editor|editor-first|editor-first1|editor-first2|editor-first3|editor-first4|editor-first5|editor-last|editor-last1|editor-last2|editor-last3|editor-last4|editor-last5|editor-link|editor-link1|editor-link2|editor-link3|editor-link4|editor-link5|editor-mask1|editor-mask2|editor-mask3|editor-mask4|editor-mask5|editor1-first|editor1-last|editor1-link|editor2-first|editor2-last|editor2-link|editor3-first|editor3-last|editor3-link|editor4-first|editor4-last|editor4-link|editor5-first|editor5-last|editor5-link|editors|eissn|encyclopedia|episode|episode-link|eprint|event|first|first1|first2|first3|first4|first5|format|hdl|hdl-access|host|id|inset|institution|interviewer|isbn|ismn|issn|issue|jfm|journal|jstor|jstor-access|language|last|last1|last2|last3|last4|last5|lccn|location|magazine|mailing-list|map|map-url|medium|message-id|minutes|mode|mr|name-list-style|network|newsgroup|no-pp|number|oclc|ol|ol-access|orig-date|osti|osti-access|others|page|pages|people|pmc|pmc-embargo-date|pmid|postscript|publication-date|publication-place|publisher|quote|quote-page|quote-pages|ref|registration|rfc|s2cid|s2cid-access|sbn|scale|script-chapter|script-quote|script-title|season|section|sections|series|series-link|series-no|ssrn|station|subject|subject-link|subject-link2|subject-link3|subject-link4|subject2|subject3|subject4|time|title|title-link|trans-chapter|trans-quote|trans-title|transcript|transcript-url|translator-first1|translator-first2|translator-first3|translator-first4|translator-first5|translator-last1|translator-last2|translator-last3|translator-last4|translator-last5|translator-link1|translator-link2|translator-link3|translator-link4|translator-link5|translator-mask1|translator-mask2|translator-mask3|translator-mask4|translator-mask5|type|url|url-access|url-status|via|volume|website|work|year|zbl'
        template_params = re.compile(r'\|\s*({})\s*='.format(temp_params))
    # remove urls, wiki template names and template parameters
    s = template_params.sub('', template_names.sub('', URL_PATTERN.sub('', payload)))
    words = ''.join(re.findall(r'\w', s))
    # compute md5 to avoid collisions (11 - 8 = 3 hash digits)
    name = words[:8] + md5(payload.encode()).hexdigest()
    return '{}{}'.format('_' if name[0] in digits else '', name[:11])
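
# Example (hypothetical citation; hash digits are illustrative):
#   generate_short_name('{{cite web |url=https://example.com |title=Foo Bar}}')
#   drops the URL, the template name and the '|url=' / '|title=' parameters,
#   keeps the word characters 'FooBar', then pads to 11 chars with md5 hex
#   digits of the full payload, yielding something like 'FooBar1a2b3'.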


def get_duplicated_refs(article: str, db: CitationDatabase) -> Set[str]:
    """Return a set of duplicated citation-payloads; update db along the way."""
    result = set()
    idx = 0
    while ref := REF_PATTERN.search(article, idx):
        name_str, payload = ref.group('name', 'pay')
        if db.has(payload):
            result.add(payload)
        else:
            # parse or generate shortname
            shortname = n.group(1).strip('"') if (n := NAME_ATTR_PATTERN.search(name_str)) \
                else generate_short_name(payload)
            db.add(payload, shortname)
        idx = ref.end()
    return result


def has_name_attribute(ref: Match[str]) -> bool:
    return NAME_ATTR_PATTERN.search(ref.group('name')) is not None


def merge(article: str) -> Tuple[str, int, List[str]]:
    """
    Merge duplicated refs in wiki source code.
    Return the new article, the merge count, and a list of duplicated refs.
    There are five kinds of refs (we aim to merge the 2nd and 5th):
    1) no name, unique payload:         <ref>a</ref>
    2) no name, duplicated payload:     <ref>a</ref><ref>a</ref>
    3) has name, no payload (SKIPPED):  <ref name="n" />
    4) has name, unique payload:        <ref name="n">a</ref>
    5) has name, duplicated payload:    <ref name="n">a</ref><ref name="n">a</ref>
    """
    new_article: List[str] = []
    db = CitationDatabase()
    duplicated_refs: Set[str] = get_duplicated_refs(article, db)
    seen: Set[str] = set()
    merge_count = 0
    idx = 0
    while ref := REF_PATTERN.search(article, idx):
        new_article.append(article[idx:ref.start()])  # output the previous segment
        idx = ref.end()
        payload = ref.group('pay')
        if payload not in duplicated_refs:
            # no modification needed if it's unique
            new_article.append(ref.group())
            continue
        if payload in seen:
            # drop the duplicated payload body; keep a self-closing named ref
            new_article.append('<ref name="{}" />'.format(db.get_shortname(payload)))
            merge_count += 1
        else:
            if not has_name_attribute(ref):
                new_article.append(
                    '<ref name="{}">{}</ref>'.format(db.get_shortname(payload), payload)
                )
            else:
                new_article.append(ref.group())
            seen.add(payload)
    new_article.append(article[idx:])  # the last segment
    return (''.join(new_article), merge_count, list(duplicated_refs))
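
# A small end-to-end sketch (generated hash digits are illustrative):
#   merge('x<ref>{{cite a}}</ref> y<ref>{{cite a}}</ref>')
#   -> ('x<ref name="citea123456">{{cite a}}</ref> y<ref name="citea123456" />',
#       1, ['{{cite a}}'])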


def extract_wikitext(url: str) -> str:
    """Extract wikitext from a wiki URL."""
    if not (match := re.fullmatch(r'(?P<host>.+)/wiki/(?P<page>.+)', url)):
        raise SystemExit('Invalid or unrecognized wiki URL.')
    query = {
        'action': 'parse',
        'prop': 'wikitext',
        'format': 'json',
        'formatversion': '2',
        'page': requests.utils.unquote(match.group('page')),  # type: ignore
    }
    if r := requests.get(match.group('host') + '/w/api.php', query, timeout=5):
        return str(json.loads(r.text)['parse']['wikitext'])
    return ''
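
# e.g. extract_wikitext('https://en.wikipedia.org/wiki/Example') issues
#   GET https://en.wikipedia.org/w/api.php?action=parse&prop=wikitext&page=Example...
# via the MediaWiki Action API and returns the page's raw wikitext.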


def menu(prompt: str, default: int, options: List[str]) -> int:
    """Return the choice, where 1 <= choice <= len(options), or raise SystemExit."""
    print(prompt)
    for i, op in enumerate(options):
        print('[{}] {}{}'.format(i + 1, op, ' (default)' if i + 1 == default else ''))
    if not 1 <= (choice := int(input('> ') or default)) <= len(options):
        raise SystemExit('Invalid choice!')
    return choice
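
# e.g. menu('Pick one:', 2, ['foo', 'bar']) prints '[1] foo' and
# '[2] bar (default)', then returns the validated 1-based choice
# (pressing Enter on an empty line selects the default).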


def main() -> None:
    print('{:=<40}\n{:^40}\n{:=<40}'.format('', 'Wikipedia Duplicated Citations Merger', ''))
    ch = menu('\nHow do you wish to provide the input?', 3,
              ['Fetch from Wikipedia.', 'Load from file ...', 'Paste it here directly.'])
    src_input: Tuple[str, Any] = ('u', input('URL of the wiki page: ')) if ch == 1 \
        else ('f', input('Please provide the file name to load: ')) if ch == 2 \
        else ('0', 0)
    ch = menu('\nHow do you wish to get the result?', 3,
              ['Save the result to \'result.txt\'.',
               'Save the result to ...',
               'Display it here directly.'])
    file_output = 'result.txt' if ch == 1 \
        else input('Please provide the file name to save: ') if ch == 2 \
        else ''
    if src_input[0] == '0':
        print('\nPaste your wiki article source code here:')
        print('Press CTRL + D (or CTRL + Z plus Enter) when completed.\n{:=<40}'.format(''))
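    # note: when pasting directly, src_input[1] is 0, so open(0) below reads
    # the article from standard input (file descriptor 0) until EOF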
    article = extract_wikitext(src_input[1]) if src_input[0] == 'u' else open(src_input[1]).read()
    new_article, merge_count, duplicated_refs = merge(article)
    if file_output:
        open(file_output, 'w').write(new_article)
    else:
        print('\n\nHere\'s the result:\n{:=<40}'.format(''))
        print(new_article)
        print('{:=<40}'.format(''))
    if merge_count == 0:
        print('No duplicated references detected.')
    else:
        print('Successfully merged {} references. Duplicated ones are:'.format(merge_count))
        print("- '{}'".format("'\n- '".join(duplicated_refs)))


if __name__ == '__main__':
    main()
## Old pattern:
# REF_PATTERN = re.compile(r'<ref\b([^>]*)(?<!/)>(([^<]|<(?!/ref>))*)</ref>', re.DOTALL)