#!/home/mrim/galuscap/tools/environments/reranking/.reranking/bin/python3
# transform-msmarco-v2-passages-to-trec.py
# Converts gzipped MS MARCO v2 passage files (one JSON object per line) into
# TREC-style <DOC> records with normalized passage text.
import re
import os
import gzip
import json
import sys, getopt
#"/home/mrim/galuscap/experiments/msmarco/bm25/run.msmarco-v2-passage-2021"
msmarco_directory = "/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage/"
trec_directory = "/home/mrim/data/collection/msmarco_v2/passage/msmarco_v2_passage_trec_normalized/"

# Matches every character that is neither a space, a word character (\w),
# nor a literal "+". Inside a character class "+" is not a quantifier, so
# plus signs survive normalization.
# NOTE(review): keeping "+" may be unintentional (r'[^ \w]+' intended?) —
# left unchanged so the generated collection stays byte-identical.
_NON_WORD_CHARS = re.compile(r'[^ \w+]')


def _normalize_passage(passage):
    """Strip punctuation/control characters and collapse runs of whitespace."""
    return ' '.join(_NON_WORD_CHARS.sub('', passage).split())


def _format_trec_doc(pid, passage):
    """Return one TREC <DOC> record for the given passage id and text."""
    return "".join([
        "<DOC>\n<DOCNO>", pid, "</DOCNO>\n<DOCID>", pid,
        "</DOCID>\n<TEXT>\n", passage, "\n</TEXT>\n</DOC>\n",
    ])


def _convert_file(input_path, output_path):
    """Convert one gzipped JSON-lines passage file into a TREC text file.

    Each input line must be a JSON object with string fields ``pid`` and
    ``passage``. Passages that are empty after normalization are skipped.
    Records are streamed to the output instead of being accumulated in
    memory, and both files are closed even if a line fails to parse.
    """
    with gzip.open(input_path, 'rt', encoding='utf-8') as msmarco_file:
        with open(output_path, 'w', encoding='utf-8') as output_file:
            for json_line in msmarco_file:
                if not json_line.strip():
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                data = json.loads(json_line)
                passage = _normalize_passage(data['passage'])
                if passage:
                    output_file.write(_format_trec_doc(data['pid'], passage))
            # The previous implementation emitted the text through print(),
            # which appended one extra newline; keep it so output files
            # remain byte-for-byte compatible.
            output_file.write("\n")


def main(input_directory=msmarco_directory, output_directory=trec_directory):
    """Convert every file in ``input_directory`` to TREC format.

    Each output file is written to ``output_directory`` under the same file
    name as its input (the output itself is plain, uncompressed text). The
    name of each input file is printed as it is processed.
    """
    os.makedirs(output_directory, exist_ok=True)
    for msmarco_filename in os.listdir(input_directory):
        print(msmarco_filename)
        _convert_file(
            os.path.join(input_directory, msmarco_filename),
            os.path.join(output_directory, msmarco_filename),
        )


if __name__ == "__main__":
    main()