# starter-code.py -- forked from bennokr/wdps (42 lines, 32 loc, 1.09 KB).
# Name of the WARC header whose value identifies a record.
KEYNAME = "WARC-TREC-ID"


def find_labels(payload, labels):
    """Yield ``(record_id, label, freebase_id)`` for every label found in a record.

    The record id is the value of the first ``KEYNAME`` header line in
    *payload*. A label counts as found when it occurs anywhere in *payload*
    as a plain, case-sensitive substring (header lines included).

    Args:
        payload: Text of a single record.
        labels: Mapping of label text to Freebase id.

    Yields:
        ``(record_id, label, freebase_id)`` tuples in the iteration order of
        *labels*. Nothing is yielded when the record has no ``KEYNAME``
        header or the header value is empty.
    """
    key = None
    for line in payload.splitlines():
        # partition() never raises on a malformed header and, unlike
        # split(': ')[1], keeps values that themselves contain ": ".
        name, sep, value = line.partition(':')
        if sep and name == KEYNAME:
            key = value.strip()
            break

    # Without an id there is nothing to report; skip the substring scans.
    if not key:
        return

    for label, freebase_id in labels.items():
        if label in payload:
            yield key, label, freebase_id
def split_records(stream):
    """Split a stream of text lines into one string per WARC record.

    A line whose stripped content is exactly ``WARC/1.0`` starts a new
    record. That marker line itself is not included in any payload.

    Args:
        stream: Iterable of text lines (for example an open text file).

    Yields:
        The concatenated lines of each record, line endings preserved.
        Empty payloads (such as the one before the very first marker) are
        skipped, and the final record is emitted when the stream ends.
    """
    # Collect lines in a list and join once; repeated ``+=`` on a string
    # can be quadratic for large records.
    chunks = []
    for line in stream:
        if line.strip() == "WARC/1.0":
            if chunks:
                yield ''.join(chunks)
                chunks = []
        else:
            chunks.append(line)

    # The last record has no following marker line, so flush it here.
    if chunks:
        yield ''.join(chunks)
import sys

# Default location of the tab-separated label -> Freebase id file.
LABELS_PATH = 'data/sample-labels-cheat.txt'


def load_labels(path=LABELS_PATH):
    """Read a tab-separated ``label<TAB>freebase_id`` file into a dict.

    Lines with fewer than two tab-separated fields (blank lines included)
    are skipped, and fields beyond the second are ignored, so a slightly
    malformed file no longer aborts the run. When a label occurs more than
    once, the last occurrence wins.

    Args:
        path: Path of the labels file.

    Returns:
        Dict mapping label text to Freebase id.
    """
    labels = {}
    # ``with`` guarantees the file handle is closed.
    with open(path) as fh:
        for line in fh:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 2:
                continue
            labels[fields[0]] = fields[1]
    return labels


def main(argv=None):
    """Print ``record_id<TAB>label<TAB>freebase_id`` for every label hit.

    Args:
        argv: Argument vector as ``[program, INPUT]``; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 2 when the arguments are wrong.
    """
    argv = sys.argv if argv is None else argv
    try:
        _, input_path = argv
    except ValueError:
        # Wrong argument count: report on stderr and signal failure so
        # shell pipelines can detect the misuse.
        print('Usage: python starter-code.py INPUT', file=sys.stderr)
        return 2

    cheats = load_labels()
    # Diagnostic dump goes to stderr to keep stdout pure tab-separated output.
    print(cheats, file=sys.stderr)

    # errors='ignore' drops undecodable bytes instead of aborting the run.
    with open(input_path, errors='ignore') as fo:
        for record in split_records(fo):
            for key, label, freebase_id in find_labels(record, cheats):
                print(key + '\t' + label + '\t' + freebase_id)
    return 0


if __name__ == '__main__':
    sys.exit(main())