# facet-to-tsv.py
import pandas as pd
import numpy as np
import argparse
import json
import os, ssl
import sys
import time
def getArguments(argv=None):
    """Parse the command line options of this script.

    Parameters:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads ``sys.argv[1:]``, which is what the
            script entry point relies on.

    Returns:
        argparse.Namespace with the attributes ``adc_analysis_json``
        (required positional), ``field_file`` and ``output_file`` (both
        None when the option is not supplied).

    Raises:
        SystemExit: raised by argparse when the arguments are invalid or
            when ``--help`` is requested.
    """
    # Set up the command line parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            "Convert an ADC analysis JSON file into a tab separated table "
            "with one row per repertoire facet result."
        ),
    )

    # A JSON file as output by adc_search.py
    parser.add_argument("adc_analysis_json")

    # Field file
    parser.add_argument(
        "--field_file",
        dest="field_file",
        default=None,
        help="File that contains a list of AIRR fields in dot notation (subject.subject_id). These fields are output in every repertoire query output in a 'Repertoire' object. If no file is provided then an empty repertoire object is created."
    )

    # Output file
    parser.add_argument(
        "--output_file",
        dest="output_file",
        default=None,
        help="The output file to use. If none supplied, uses stdout."
    )

    # Parse the command line arguments. Passing argv=None makes argparse
    # fall back to sys.argv[1:].
    return parser.parse_args(argv)
def build_facet_record(repository, result):
    """Flatten one entry of a repository's "results" list into an output row.

    Parameters:
        repository: Value stored in the row's "repository" column.
        result: Dict for one query result. The keys "Facet" (a list of dicts
            carrying "count" and "repertoire_id") and "Repertoire" (a dict of
            field name to value) are both optional.

    Returns:
        Dict mapping column name to value for this result.

    Raises:
        KeyError: if the first facet entry lacks "count" or "repertoire_id".
    """
    record = {"repository": repository}
    if "Facet" in result:
        facets = result["Facet"]
        if len(facets) >= 1:
            # Only the first facet entry is reported.
            record["count"] = facets[0]["count"]
            record["repertoire_id"] = facets[0]["repertoire_id"]
        else:
            # An empty facet list is reported as a zero count with a blank id.
            record["count"] = 0
            record["repertoire_id"] = ""
    if "Repertoire" in result:
        # Every repertoire field becomes a column of the same name.
        record.update(result["Repertoire"])
    return record


def build_analysis_dataframe(analysis, field_names=(), log_handle=None):
    """Build the output table from the parsed analysis JSON.

    Parameters:
        analysis: Iterable of dicts, each with a "repository" value and a
            "results" list (see build_facet_record for the result layout).
        field_names: Extra column names that must be present in the table
            even if no result carries them.
        log_handle: Optional writable text stream. When given, each record is
            echoed to it as an IR-INFO line.

    Returns:
        pandas.DataFrame whose columns are "repository", "repertoire_id",
        "count", then field_names, then any other repertoire field seen in
        the results, in order of first appearance.
    """
    base_columns = ["repository", "repertoire_id", "count", *field_names]

    # Collect plain dicts and build the frame once; concatenating a new
    # one-row frame per result copies the whole table every time.
    records = []
    for repository_dict in analysis:
        repository = repository_dict["repository"]
        for result in repository_dict["results"]:
            record = build_facet_record(repository, result)
            if log_handle is not None:
                print("IR-INFO: Record = " + str(record), file=log_handle)
            records.append(record)

    # dict.fromkeys keeps first-seen order and drops duplicate names.
    seen_keys = [key for record in records for key in record]
    columns = list(dict.fromkeys(base_columns + seen_keys))

    # dtype=object keeps integer counts as integers when other rows have no
    # count, so they are written as "5" rather than "5.0".
    return pd.DataFrame(records, columns=columns, dtype=object)


def main():
    """Convert the analysis JSON named on the command line to a TSV table.

    Exits with status 1 and an IR-ERROR message when the field file, the
    analysis file or the output file cannot be used.
    """
    # Get the command line arguments.
    options = getArguments()

    # With no output file the table itself goes to stdout, so diagnostics are
    # moved to stderr to keep the TSV clean. Otherwise they stay on stdout.
    log_handle = sys.stderr if options.output_file is None else sys.stdout

    # Read in the repertoire field file
    field_names = []
    if options.field_file is not None:
        try:
            repertoire_field_df = pd.read_csv(options.field_file, sep='\t',
                                              engine='python', encoding='utf-8-sig')
        except Exception as err:
            print("IR-ERROR: Unable to open file %s - %s" % (options.field_file, err),
                  file=log_handle)
            sys.exit(1)
        if "Fields" not in repertoire_field_df.columns:
            print("IR-ERROR: Field file %s has no 'Fields' column" % (options.field_file),
                  file=log_handle)
            sys.exit(1)
        field_names = repertoire_field_df["Fields"].tolist()

    # Open the analysis file.
    try:
        with open(options.adc_analysis_json) as f:
            analysis_dict = json.load(f)
    except (OSError, ValueError) as err:
        # json.JSONDecodeError is a ValueError subclass.
        print("IR-ERROR: Unable to read analysis file %s - %s" % (options.adc_analysis_json, err),
              file=log_handle)
        sys.exit(1)

    # Set up the data frame
    columns = ["repository", "repertoire_id", "count"] + field_names
    print("IR-INFO: Empty Dataframe ", pd.DataFrame(columns=columns), sep='\n',
          file=log_handle)

    analysis_df = build_analysis_dataframe(analysis_dict, field_names, log_handle)
    print("IR-INFO: Dataframe ", analysis_df, sep='\n', file=log_handle)

    # Write the table, closing the output file once it has been written.
    if options.output_file is None:
        analysis_df.to_csv(sys.stdout, sep='\t', index=False)
    else:
        try:
            output_handle = open(options.output_file, "w")
        except OSError as err:
            print("IR-ERROR: Unable to open output file %s - %s" % (options.output_file, err),
                  file=log_handle)
            sys.exit(1)
        with output_handle:
            analysis_df.to_csv(output_handle, sep='\t', index=False)


if __name__ == "__main__":
    main()