-
Notifications
You must be signed in to change notification settings - Fork 3
/
org.py
executable file
·531 lines (466 loc) · 19.8 KB
/
org.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Process (extract, filter, merge) Vossantos in an org mode file.
#
# Usage: Without any arguments, extracts all Vossanto candidates from
# the given org file.
#
# Author: rja
#
# Changes:
# 2019-12-18 (rja)
# - added field sourceImageLicense
# 2019-12-16 (rja)
# - added option "--images" to enrich URLs for Wikipedia Commons images
# 2019-12-14 (rja)
# - added help message for --fields
# - renamed field "wikidata" to "sourceUrl"
# - added field aUrlId
# - cleaned up JSON output
# - added options --desks and --authors
# 2019-12-13 (rja)
# - refactored iteration over parts from array to dict
# - changed handling of command line parameters for selecting columns
# - fixed source phrase and modifier extraction
# - added JSON export
# - reorganised help messages
# - added generation of unique id (useful for JSON)
# - re-added original line (as we need it for conversion to markdown using pandoc)
# 2019-12-13 (ms)
# - moved file to parent dir
# - updated sourcephrase and modifier extraction (see extract_sourcephrase / extract_modifier) -> generalization from theof
# - stripped original line that is added to "parts"
# 2019-02-15 (rja)
# - added "-g" option to output original line and "-H" to print year headings
# - bumped version from 0.0.6 to 0.7.0 for semantic versioning
# 2018-10-24 (rja)
# - added option "--ignore-source-ids" to ignore candidates where the
# source id is contained in the given file
# 2018-10-08 (rja)
# - added option "-c" to output classification (True/False) and renamed
# existing "-c" to "-C"
# 2018-09-11 (rja)
# - normalising "None" to "" in output
# 2018-08-22 (rja)
# - added extraction of status that explains false positives
# 2018-08-21 (rja)
# - added extraction of source phrase as it appears in the text
# - key uses source phrase instead of source label and ignores modifier markup ('/')
# - key now uses all characters from sentence (not just the first 40)
# 2018-08-16 (rja)
# - added date extraction
# 2018-08-15 (rja)
# - improved modifier extraction
# 2018-08-09 (rja)
# - added option -o to output modifier
# 2018-03-02 (rja)
# - added option -U to include article URLs from file
# - added option -u to extract article URLs
# - annotated line regexp
# 2017-05-17 (rja)
# - simplified file parameters to support reading from STDIN
# 2017-05-14 (rja)
# - added options for selection
# 2017-05-13 (rja)
# - renamed from mergeorg.py and extended for extraction and filtering
# - migrated to Python3
# 2017-05-11 (rja)
# - initial version
import re
import argparse
import sys
import json
from collections import OrderedDict, Counter
# version string reported by the -v/--version command line option
version = "0.8.6"
# Example of a line matched by the expression below:
# 1. [[https://www.wikidata.org/wiki/Q83484][Anthony Quinn]] (1987/01/02/0000232) ''I sometimes feel like *the Anthony Quinn of* my set.''
#
# The pattern is compiled with re.VERBOSE: unescaped whitespace and "#"
# comments inside it are ignored, so every literal space must be written
# as "\ " (or placed inside a character class). A raw string is used so
# that regex escapes such as "\." and "\d" are not interpreted as (invalid)
# Python string escapes.
line_re_str = r"""
    ^                              # beginning of string
    (?P<newmark>>\ ?)?             # new candidates are marked with "> "
    (?P<id>[0-9]+)\.               # all candidates are numbered
    [ !]+                          # space and/or !
    \+?                            # modifier for false positive
    \[\[.+/                        # start of Wikidata URL
    (?P<wdid>[^/]+)                # Wikidata id
    \]\[                           # separators
    (?P<wdlabel>.+)                # Wikidata label
    \]\]                           # end of Wikidata URL
    \                              # space
    \(                             # beginning of file id
    (?P<article>                   # beginning of full article part
      (\[\[)?                      # opening markup for article URL
      (?P<aurl>http.+?)?           # article URL
      (\]\[)?                      # separators for article URL
      (?P<fid>                     # full file id
        (?P<year>\d{4})            # year
        /                          # separator
        (?P<month>\d{2})           # month
        /                          # separator
        (?P<day>\d{2})             # day
        /                          # separator
        (?P<aid>\d+)               # article id
      )                            # end of full file id
      (\]\])?                      # closing markup for article URL
    )                              # end of full article part
    \)                             # end of file id
    \                              # space
    (?P<sentence>.+?[^+])          # sentence
    (?P<truefalse>\+)?             # false positive indicator
    (\ \(                          # beginning of status token explaining false positives
      (?P<status>[WD]+)            # a combination of characters
    \))?                           # end of (optional) token
    $                              # end of string
"""
re_line = re.compile(line_re_str, re.VERBOSE)
# Extracts the modifier from a sentence: the text enclosed in /.../ that
# directly follows "* " (and optional quote characters) and is followed by
# a non-alphanumeric character or the end of the string (group 1).
re_modifier = re.compile("\\* ['\"]*/(.+?)/([^0-9A-Za-z]|$)")
# Extracts the exact source phrase from a sentence: the text between the
# first and the last word of a *...* span, e.g. "Anthony Quinn" in
# "*the Anthony Quinn of*" (group 1).
re_sourcephrase = re.compile("\\*\\w+ (.+?) \\w+\\*")
# Matches the characters removed from sentences when building keys
# (see get_key): "*", "/", "." and any whitespace.
re_clean = re.compile(r"[*/.\s]")
# Matches runs of line breaks and tabs (replaced by a space in gen_rm_ctrl).
re_ws = re.compile('[\n\t\r]+')
# Extracts the article URL id: everything after "res=" in a
# "http://query.nytimes.com/gst/fullpage.html?res=..." URL (group 1).
re_aurlid = re.compile(r'http://query\.nytimes\.com/gst/fullpage\.html\?res=(.+)')
def read_file(flines):
    """Split an org file into its leading part and an index of Vossanto lines.

    All lines up to and including the "* results" heading are collected
    verbatim. Of the lines after the heading, only those recognised by
    match_line are kept; they are stored per year under the key computed
    by get_key.

    Returns a tuple (lines, index) where index maps year -> key -> line.
    index is None when no "* results" heading was encountered.
    """
    preamble = []
    index = None
    for line in flines:
        if line.startswith("* results"):
            # the heading starts a fresh index and is itself kept verbatim
            index = dict()
            preamble.append(line)
            continue
        if index is None:
            # still before the heading: keep the line unchanged
            preamble.append(line)
            continue
        # after the heading: only Vossanto lines are indexed
        parts = match_line(line)
        if parts:
            year, key = get_key(parts)
            index.setdefault(year, dict())[key] = line
    return preamble, index
def read_dict(flines, sep='\t', comment='#'):
    """Read lines with two sep-separated columns into a dict.

    The first column becomes the key, the remainder of the line (after the
    first separator) the value. Lines starting with the comment prefix are
    ignored, and so are lines that cannot be split into two columns after
    surrounding whitespace has been stripped. A key that occurs more than
    once keeps the value of its last occurrence.
    """
    mapping = {}
    for raw in flines:
        if raw.startswith(comment):
            continue
        try:
            key, val = raw.strip().split(sep, 1)
        except ValueError:
            # fewer than two columns: skip the line
            continue
        mapping[key] = val
    return mapping
def gen_truefalse(candidates, true_positive, false_positive):
    """Filter candidates by their "classification" value.

    When both flags are equal, every candidate is passed through. Otherwise
    a candidate is dropped only if its classification differs from
    true_positive and equals false_positive.
    """
    for cand in candidates:
        if true_positive != false_positive:
            label = cand["classification"]
            if true_positive != label and false_positive == label:
                continue
        yield cand
def gen_enrich(parts, key, f, sep='\t', missing=''):
    """Add one extra field to every Vossanto.

    f is read with read_dict: its first column is the article id, the
    second column the value to store under key. Vossantos whose "aId" is
    not listed get the value of missing, so that the key is always present
    (otherwise CSV columns get messed up).
    """
    lookup = read_dict(f, sep=sep)
    for part in parts:
        part[key] = lookup.get(part["aId"], missing)
        yield part
def gen_enrich_images(parts, f, sep='\t', missing=''):
    """Add Wikimedia Commons image information to every Vossanto.

    f is read with read_dict, keyed by its first column (the source id).
    The rest of each line is split by sep into exactly three values: the
    URL of the image page, the URL of the image itself and the image
    license. Example for the two URLs:
    https://commons.wikimedia.org/wiki/File:RodneyDangerfield1978.jpg https://upload.wikimedia.org/wikipedia/commons/b/bf/RodneyDangerfield1978.jpgx

    The fields "sourceImId", "sourceImThumb" and "sourceImLicense" are
    always set; Vossantos without an image get the value of missing
    (otherwise CSV columns get messed up).
    """
    # number of characters of the common URL prefixes that are cut off
    page_prefix_len = len("https://commons.wikimedia.org/wiki/File:")
    thumb_prefix_len = len("https://upload.wikimedia.org/wikipedia/commons/")
    images = read_dict(f, sep=sep)
    for part in parts:
        source_id = part["sourceId"]
        if source_id not in images:
            image_id = missing
            image_thumb = missing
            image_license = missing
        else:
            # further split the value into its three columns
            page_url, image_url, image_license = images[source_id].split(sep)
            image_id = page_url[page_prefix_len:]
            image_thumb = image_url[thumb_prefix_len:]
        part["sourceImId"] = image_id
        part["sourceImThumb"] = image_thumb
        part["sourceImLicense"] = image_license
        yield part
def gen_filter_sources(candidates, sourcefile):
    """Skip all candidates whose source id is listed in sourcefile.

    sourcefile is read with read_dict, i.e. it must contain one Wikidata id
    per line, followed by the name; lines starting with # are ignored.
    Without a sourcefile all candidates are passed through.
    """
    if not sourcefile:
        yield from candidates
        return
    ignored = read_dict(sourcefile)
    for cand in candidates:
        if cand["sourceId"] in ignored:
            continue
        yield cand
def gen_candidates(lines):
    """Yield the parsed fields of every line that match_line recognises."""
    for line in lines:
        parsed = match_line(line)
        if not parsed:
            # not a Vossanto line
            continue
        yield parsed
def gen_rm_ctrl(parts):
    """Clean whitespace in the string values of every part.

    In each string value, runs of line breaks and tabs (re_ws) are replaced
    by a single space and surrounding whitespace is stripped. Values that
    are not strings (e.g. None or booleans) are passed through unchanged.

    Yields a new mapping per part with the same keys in the same order, so
    that the result can still be consumed by print_csv and print_json.
    """
    for part in parts:
        cleaned = OrderedDict()
        for key, value in part.items():
            if isinstance(value, str):
                value = re_ws.sub(' ', value).strip()
            cleaned[key] = value
        yield cleaned
def get_key(parts):
    """Generate a key for a Vossanto.

    Returns a tuple (year, key). The key joins year, article id, source
    phrase and the sentence (with the characters matched by re_clean
    removed) using "|".
    """
    # match_line stores the sentence under "text"; "sentence" (the key name
    # previously read here) is still accepted as a fallback
    sentence = parts["text"] if "text" in parts else parts["sentence"]
    year = parts["year"]
    key = "|".join([year, parts["aId"], parts["sourcePhrase"], re_clean.sub('', sentence)])
    return year, key
def select_parts(parts, fields):
    """Reduce every part to the requested fields, in the requested order.

    When fields is empty or contains "ALL", the parts are passed through
    unchanged. Otherwise an OrderedDict is yielded per part that contains
    the requested fields present in the part. The field "id" is generated
    (unless the part already has it) from the article id and a running
    number per article id. Other unknown fields are omitted.
    """
    if len(fields) == 0 or "ALL" in fields:
        # when nothing has been selected, return everything
        yield from parts
        return
    seen = Counter()
    for part in parts:
        selected = OrderedDict()
        for name in fields:
            if name in part:
                selected[name] = part[name]
            elif name == "id":
                # generate (hopefully unique) id
                article_id = part["aId"]
                selected["id"] = article_id + "_" + str(seen[article_id])
                seen[article_id] += 1
        yield selected
def match_line(line):
    """Check whether the line is a Vossanto line and parse it.

    Returns a dict with the extracted fields, or None when the stripped
    line does not match re_line.
    """
    stripped = line.strip()
    # detect the Vossanto lines
    match = re_line.match(stripped)
    if not match:
        return None
    groups = match.groupdict()
    wdid = groups["wdid"]
    wdlabel = groups["wdlabel"]
    sentence = groups["sentence"]
    aurl = groups["aurl"]
    # a "+" after the sentence marks a false positive
    is_true = groups["truefalse"] != "+"
    source_phrase = extract_sourcephrase(sentence, is_true)
    modifier = extract_modifier(sentence, is_true)
    return {
        "year": groups["year"],
        "date": groups["year"] + "-" + groups["month"] + "-" + groups["day"],
        "aId": groups["aid"],
        "fId": groups["fid"],
        "sourceId": wdid,
        "sourceLabel": wdlabel,
        "sourcePhrase": source_phrase,
        "sourceUrl": "[[https://www.wikidata.org/wiki/" + wdid + "][" + wdlabel + "]]",
        "modifier": modifier,
        "text": sentence,
        "aUrl": aurl,
        "aUrlId": get_article_url_id(aurl),
        "classification": is_true,
        "line": stripped,
        "newVoss": groups["newmark"],  # FIXME: where is this used?
        "status": groups["status"]  # FIXME: where is this used?
    }
def extract_modifier(sentence, trueVoss):
    """Extract the modifier (enclosed in /.../) from the sentence.

    Returns "" for non-Vossantos and when re_modifier finds no modifier.
    """
    if not trueVoss:
        # ignore non-Vossantos
        return ""
    found = re_modifier.search(sentence)
    return found.group(1) if found else ""
def extract_sourcephrase(sentence, trueVoss):
    """Extract the source phrase (enclosed in *the ... of*) from the sentence.

    Returns "" for non-Vossantos and when re_sourcephrase finds no phrase.
    """
    if not trueVoss:
        return ""
    found = re_sourcephrase.search(sentence)
    return found.group(1) if found else ""
def get_article_url_id(aurl):
    """Return the id of a normalised article URL.

    A normalised URL has the form
    "http://query.nytimes.com/gst/fullpage.html?res=<HEXSTRING>"; for such
    a URL, HEXSTRING is returned. Returns None for any other URL and when
    no URL is given (aurl is None or empty, as is the case for lines
    without article URL markup).
    """
    if not aurl:
        # lines without an article URL have no id
        return None
    match = re_aurlid.match(aurl)
    if match:
        return match.group(1)
    return None
def set_article_url(line, urls):
    """Add the article URL to a Vossanto line or change an existing one.

    urls maps file ids to URLs. The article part of the line is replaced
    by "[[URL][FILE ID]]".

    Returns the changed line. If the line is not a Vossanto line or no URL
    is known for its file id, a warning is printed to STDERR and the line
    is returned unchanged, so that callers printing the result do not lose
    the line.
    """
    # detect Vossanto line
    match = re_line.match(line.strip())
    if not match:
        print("WARN: line did not match", line[:50], file=sys.stderr)
        return line
    d = match.groupdict()
    fid = d["fid"]
    if fid not in urls:
        print("WARN: URL for", fid, "not found", file=sys.stderr)
        return line
    return line.replace(d["article"], "[[" + urls[fid] + "][" + fid + "]]")
def insert(index, line, string_new = '> '):
    """Insert a Vossanto line into the index.

    index maps year -> key -> line (as built by read_file). The line is
    added, prefixed with string_new, unless its key (see get_key) is
    already present for its year. Years that are not yet part of the index
    are created on demand.

    Lines that are not Vossanto lines are ignored; a warning is printed to
    STDERR for them unless they are year headings ("** 1987").
    """
    # extract key for this line
    parts = match_line(line)
    if not parts:
        # print warning only if not a year heading
        if not re.match(r"^\*{2,3} [0-9]{4}$", line.strip()):
            print("WARN: line did not match", line[:50], file=sys.stderr)
        return
    # add new Vossanto
    year, key = get_key(parts)
    by_key = index.setdefault(year, dict())
    if key not in by_key:
        by_key[key] = string_new + line
def part_to_string(p):
    """Convert a value to a string, mapping None to the empty string."""
    return "" if p is None else str(p)
def print_csv(parts, sep):
    """Print one line per part with its values joined by sep (CSV/TSV)."""
    for part in parts:
        cells = []
        for name in part:
            cells.append(part_to_string(part[name]))
        print(sep.join(cells))
def print_json(parts):
    """Print all parts as a JSON array, one element per line.

    The elements are written while iterating, so parts may be a generator.
    """
    print("[")
    for position, part in enumerate(parts):
        if position > 0:
            # separator between the previous and this element
            print(",")
        # FIXME: ignore keys with empty values
        print(json.dumps(part), end='')
    print("\n]")
def print_heading(parts):
    """Print a heading whenever the year changes and pass the parts through.

    Must be called before select_parts, such that the year information is
    available. Works as a generator: the heading is printed just before the
    first part of a new year is yielded, which interleaves the headings
    with the final print statements of the consumer.
    """
    current = None
    for part in parts:
        year = part["year"]
        if year != current:
            current = year
            print("\n**", year)
        yield part
def parse_fields(s):
    """Split the comma-separated value of --fields into a list of names."""
    names = s.split(",")
    return names
# Command line entry point. Depending on the options, the script either
# merges two org files (--merge), adds article URLs to an org file (--urls),
# lists the available fields (--list-fields) or (default) extracts the
# Vossantos and prints them as CSV or JSON.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Manipulate Vossantos in and extracts them from org files.')
    parser.add_argument('file', type=argparse.FileType('r', encoding='utf-8'), nargs='?', default=sys.stdin, help='org mode file to process (default: STDIN)')
    # filtering options
    filtering = parser.add_argument_group('filter arguments')
    filtering.add_argument('-T', '--true', action="store_true", help="output only true Vossantos")
    filtering.add_argument('-F', '--false', action="store_true", help="output only false positives")
    filtering.add_argument('--ignore-source-ids', type=argparse.FileType('r', encoding='utf-8'), metavar="FILE", help='ignore candidates with a source id contained in FILE')
    # output format options
    output = parser.add_argument_group('output arguments')
    output.add_argument('-f', '--fields', type=parse_fields, metavar="FDS", default="ALL", help="fields to be included (default: '%(default)s')")
    parser.add_argument('-l', '--list-fields', action="store_true", help="list available fields for --fields")
    output.add_argument('-o', '--output', type=str, metavar="FMT", help="output format (default: '%(default)s')", default="csv", choices=["csv", "json"])
    output.add_argument('-s', '--sep', type=str, metavar="SEP", help="output separator for csv (default: '\\t')", default='\t')
    output.add_argument('-n', '--new', type=str, metavar="NEW", help="string to mark new entries (default: '%(default)s')", default='> ')
    output.add_argument('-c', '--clean', action="store_true", help="clean whitespace")
    output.add_argument('-H', '--heading', action="store_true", help="print year heading (only csv)")
    enrich = parser.add_argument_group('enrichment arguments', "Expect TSV files with article (or source) id in first column.")
    enrich.add_argument('-u', '--urls', type=argparse.FileType('r', encoding='utf-8'), metavar="F", help='add article URLs (prints org file!)')
    enrich.add_argument('-a', '--authors', type=argparse.FileType('r', encoding='utf-8'), metavar="F", help='add article authors')
    enrich.add_argument('-i', '--images', type=argparse.FileType('r', encoding='utf-8'), metavar="F", help='add source images')
    enrich.add_argument('-d', '--desks', type=argparse.FileType('r', encoding='utf-8'), metavar="F", help='add article desks')
    # special options
    special = parser.add_argument_group('special arguments')
    special.add_argument('-m', '--merge', type=argparse.FileType('r', encoding='utf-8'), metavar="F", help='file to merge')
    special.add_argument('-v', '--version', action="version", version="%(prog)s " + version)

    args = parser.parse_args()

    if args.merge:
        # read file into which the other file shall be merged
        lines, index = read_file(args.file)
        if index is None:
            # read_file only builds an index after the "* results" heading
            parser.error("no '* results' heading found in the input file")
        # read new file and insert Vossantos
        for line in args.merge:
            insert(index, line, args.new)
        # print first (unchanged) part of original file
        for line in lines:
            print(line, end='')
        # print Vossanto lines
        for year in sorted(index):
            print()
            print("**", year)
            for line in sorted(index[year]):
                print(index[year][line], end='')
    elif args.urls:
        # read URL file (the option --urls is stored as args.urls)
        urls = read_dict(args.urls)
        # read file
        lines, index = read_file(args.file)
        if index is None:
            # read_file only builds an index after the "* results" heading
            parser.error("no '* results' heading found in the input file")
        # print first (unchanged) part of original file
        for line in lines:
            print(line, end='')
        # print Vossanto lines
        for year in sorted(index):
            print()
            print("**", year)
            for line in sorted(index[year]):
                # add URL to line
                print(set_article_url(index[year][line], urls), end='')
    elif args.list_fields:
        print("""
The following values are allowed for the --fields (-f) option:
aId article id in the Sandhaus corpus
aUrl article URL
aUrlId article id in the URL (for "fullpage.html" URLs only)
author author (requires --authors)
classification 'True' for true Vossantos, 'False' otherwise
date date in format YYYY-MM-DD
desk desk (requires --desks)
fId file id (YYYY/MM/DD/article id)
id unique id (generated using the article id)
line original line from the input file
modifier modifier
sourceId Wikidata id of the source
sourceLabel Wikidata label of the source
sourcePhrase name of the source as it appears in the text
sourceUrl Wikidata URL of the source
sourceImId Wikimedia Commons id for source image (requires --images)
sourceImThumb Wikimedia Commons thumbnail path for source image (requires --images)
sourceImLicense Wikimedia Commons license for image (requires --images)
text text (typically a sentence) containing the Vossanto
year publication year of the article
Several fields can be concatenated by ",", their order is taken into
account.
Special/obsolete keywords:
ALL print all available fields
newVoss whether the Vossanto has been marked as new
status additional information (D=duplicate, W=wrong detected)
""")
    else:
        # default: extract and print Vossantos; every step wraps the
        # previous generator, so the file is processed line by line
        parts = gen_candidates(args.file)
        parts = gen_truefalse(parts, args.true, args.false)
        parts = gen_filter_sources(parts, args.ignore_source_ids)
        if args.authors:
            parts = gen_enrich(parts, "author", args.authors)
        if args.desks:
            parts = gen_enrich(parts, "desk", args.desks)
        if args.images:
            parts = gen_enrich_images(parts, args.images)
        if args.heading:
            # interleaving the headings works by yielding the parts in a loop
            parts = print_heading(parts)
        parts = select_parts(parts, args.fields)
        if args.clean:
            parts = gen_rm_ctrl(parts)
        if args.output == "json":
            print_json(parts)
        else:
            print_csv(parts, args.sep)