-
Notifications
You must be signed in to change notification settings - Fork 6
/
export.py
78 lines (62 loc) · 3.01 KB
/
export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
Export matched variants to literature.json
"""
import sys
import datetime
import json
import argparse
import sqlite3
import pandas as pd
def top_papers(mentions, pyhgvs):
    """Return (pmid, points) pairs for one variant, sorted by descending points.

    mentions: DataFrame with at least the columns
        pyhgvs_Genomic_Coordinate_38, pmid and points.
    pyhgvs: genomic coordinate string identifying the variant.

    Returns a numpy array of [pmid, total_points] rows, highest total first.
    """
    # Use the string alias "sum" rather than the Python builtin `sum`:
    # passing the builtin to .agg is deprecated in modern pandas
    # (FutureWarning) and the alias selects the fast cython path.
    top = mentions[mentions.pyhgvs_Genomic_Coordinate_38 == pyhgvs] \
        .groupby(["pyhgvs_Genomic_Coordinate_38", "pmid"]) \
        .agg({"points": "sum"}) \
        .sort_values("points", ascending=False)
    return top.reset_index()[["pmid", "points"]].values
def top_snippets(mentions, pyhgvs, pmid):
    """Return up to 3 snippets for this variant/paper, best-scoring rows first.

    Each row's ``snippets`` column holds several snippets joined by ``|``;
    rows are flattened in descending-points order before truncating to 3.
    """
    selected = mentions[
        (mentions.pyhgvs_Genomic_Coordinate_38 == pyhgvs)
        & (mentions.pmid == pmid)
    ]
    ranked = selected.sort_values("points", ascending=False)
    flattened = []
    for joined in ranked.snippets.values:
        flattened.extend(joined.split("|"))
    return flattened[:3]
def top_papers_and_snippets(mentions, pyhgvs):
    """Build the per-variant export records: one dict per paper, best first.

    Each record carries the paper's pmid (as str), its summed points
    (as int, for JSON serialization) and its top snippets.
    """
    records = []
    for pmid, points in top_papers(mentions, pyhgvs):
        records.append({
            "pmid": str(pmid),
            "points": int(points),
            "mentions": top_snippets(mentions, pyhgvs, pmid),
        })
    return records
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Export literature.json")
    parser.add_argument("path", help="Path to input artifacts")
    args = parser.parse_args()

    # Open the articles database read-only (URI mode) so the export can
    # never modify it, and close the connection once the table is loaded
    # (the original leaked it).
    connection = sqlite3.connect(
        "file:{}/articles.db?mode=ro".format(args.path), uri=True)
    try:
        articles = pd.read_sql_query("SELECT * FROM articles", connection)
    finally:
        connection.close()
    articles.pmid = articles.pmid.astype(str)
    print("{} articles loaded from the articles sqlite database".format(articles.shape[0]))

    mentions = pd.read_csv(
        "{}/mentions-matched.tsv".format(args.path), sep="\t", encoding="utf-8")
    mentions.pmid = mentions.pmid.astype(str)
    print("Total matched mentions: {}".format(mentions.shape[0]))

    # The same snippet text can be attributed to a variant/paper pair
    # more than once upstream; keep one copy.
    mentions = mentions.drop_duplicates(
        ["pyhgvs_Genomic_Coordinate_38", "pmid", "snippets"])
    print("After dropping duplicates: {}".format(mentions.shape[0]))

    # NOTE: the original called mentions.head() here — a no-op notebook
    # leftover — removed.
    variants = {}
    remaining = mentions.pyhgvs_Genomic_Coordinate_38.unique().shape[0]
    print(datetime.datetime.now())
    for pyhgvs in mentions.pyhgvs_Genomic_Coordinate_38.unique():
        # Progress counter on one terminal line (carriage return, no newline).
        sys.stdout.write("Remaining: {}\r".format(remaining))
        sys.stdout.flush()
        variants[pyhgvs] = top_papers_and_snippets(mentions, pyhgvs)
        remaining -= 1

    # Read the crawl date with a context manager so the handle is closed
    # promptly (the original relied on GC to close it).
    with open("{}/pubs-date.txt".format(args.path)) as date_file:
        pubs_date = date_file.read().strip()

    lit = {
        "date": pubs_date,
        # Only papers actually mentioned survive into the export; keyed by
        # pmid, with pmid also kept as a column (drop=False).
        "papers": articles[articles.pmid.isin(mentions.pmid)].set_index(
            "pmid", drop=False).to_dict(orient="index"),
        "variants": variants
    }
    with open("{}/literature.json".format(args.path), "w") as output:
        output.write(json.dumps(lit, sort_keys=True))
    print("Exported {} variants in {} papers".format(
        len(lit["variants"]), len(lit["papers"])))

    # Round-trip sanity check: re-parse the file just written to confirm
    # it is valid JSON on disk.
    with open("{}/literature.json".format(args.path)) as f:
        lit = json.loads(f.read())
    print("{} Papers and {} Variants exported".format(len(lit["papers"]), len(lit["variants"])))