Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Scripts to retrieve data from Nextstrain #24

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
33 changes: 33 additions & 0 deletions fetch_nextstrain/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Fetch variation data from Nextstrain API

First draft script to retrieve and parse the mutations reported in raw data from [Nextstrain](http://data.nextstrain.org/ncov.json).

I include all the variation data (amino acid and nucleotide) and all the metadata (we could decide later which fields are necessary).

Annotations describing all distinct amino acid one letter codes or all mutation
events occuring at a certain location can be fetched from the generated
nextstrain_data.csv and prepared for the SWISS-MODEL annotation system by
the annotate_variation.py script.

To run it, you require the the root directory of the github
repository in your PYTHONPATH. On Linux/Mac execute the following:

```
export PYTHONPATH=<path_to_covid_repo>:$PYTHONPATH
```

Usage and available options of the script can be displayed by executing

```
python annotate_variation.py --help
```

- Example annotation of all distinct amino acids is available [here](https://beta.swissmodel.expasy.org/repository/covid_annotation_project/LPRPYc)
- Example annotation of all observed mutation events is available [here](https://beta.swissmodel.expasy.org/repository/covid_annotation_project/vtvJJN)

TODO:

- Give format to the output (done).
- Select the fields of interest.
- Replace protein names with UniProtKB AC (done).

136 changes: 136 additions & 0 deletions fetch_nextstrain/annotate_variation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import csv
import matplotlib.pyplot as plt
from argparse import ArgumentParser
from utils.uniprot import seq_from_ac
from utils.sm_annotations import Annotation


def _parse_args():
parser = ArgumentParser(
description="Loads the csv generated by get_nextstrain_data.py and "
"collects annotations that can be displayed in the SWISS-MODEL "
"annotation system. Annotations can either refer to the distinct "
"amino acids or mutation events observed at a certain location in the "
"Nextstrain data (see mutevents flag to control that behaviour)."
)
parser.add_argument(
"--csv",
required=True,
type=str,
dest="csv",
help="CSV file generated by get_nextstrain_data.py",
)
parser.add_argument(
"--post",
dest="post",
help="If set, the annotations "
"are directly uploaded to the SWISS-MODEL annotation system and an "
"accessible URL is printed to stdout. By default, the annotations are "
"printed to stdout to be manually copy pasted in the annotation upload "
"form in SWISS-MODEL",
action="store_true",
)
parser.add_argument(
"--cm",
dest="cm",
help="String that specifies a matplotlib color map to encode number of "
"distinct amino acids / mutation events at a certain location. "
"Default: cool. Available maps:"
"(https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html).",
default="cool",
)
parser.add_argument(
"--mutevents",
dest="mutevents",
help="If set, annotations refer to all mutation events observed at a "
"certain location in the Nexstrain data. Mutations are relative to an "
"underlying phylogenetic tree. By default, the annotations consider "
"all distinct amino acids ever observed at a certain location.",
action="store_true",
)

args = parser.parse_args()
# Directly assign matplotlib colormap to args. If the specified map is not
# available, matplotlib raises an expressive error message.
args.cm = plt.get_cmap(args.cm)
return args


def _parse_mutations(csv_file):
mutations = list()
with open(csv_file, newline="") as fh:
reader = csv.DictReader(fh)
for row in reader:
mutations.append((row["Protein"], row["Mutation"]))
return mutations


def _annotate(mutations, mutevents, cm):
"""
The nextstrain data is relative to a phylogenetic tree that they build.
So a mutation at location x can either be a mutation from reference
to something else, a mutation from something else back to the reference
or even something that has nothing to do with the reference at all.
The variations we're reporting are either all observed amino acid one letter
codes (olc) at a certain location => the unique set of olcs containing the
the reference and all corresponding olcs reported in the nextstrain
mutations, or the mutation events themselves (controlled by mutevents flag).
"""

# lets fetch all reference sequences
acs = [m[0] for m in mutations]
sequences = dict()
for ac in set(acs):
sequences[ac] = seq_from_ac(ac)

# key: (uniprot_ac, rnum), value: [olc1, olc2, ..., olcx]
variations = dict()
for m in mutations:
uniprot_ac = m[0]
rnum = int(m[1][1 : len(m[1]) - 1])
key = (uniprot_ac, rnum)
if key not in variations:
variations[key] = set()
if mutevents:
variations[key].add(m[1][0]+"->"+m[1][-1]) # formatted as from->to
else:
variations[key].add(sequences[uniprot_ac][rnum - 1])
variations[key].add(m[1][0]) # first letter -> mutation from
variations[key].add(m[1][-1]) # last letter -> mutation to

# find min/max numbers of distinc amino acids to scale color gradient
sizes = [len(item) for item in variations.values()]
min_mut = min(sizes)
max_mut = max(sizes)

# Let's not directly add it to the annotation object but rather do an
# intermediate list so we can sort it by uniprot accession code / residue
# number
annotation_data = list()
for var_key in variations.keys():
if max_mut == min_mut:
color = cm(1.0)
else:
color = cm(float(len(variations[var_key]) - min_mut) / (max_mut - min_mut))
annotation_data.append(
[var_key[0], var_key[1], color, ",".join(variations[var_key])]
)
annotation = Annotation()
for d in sorted(annotation_data):
annotation.add(d[0], d[1], d[2], d[3])

return annotation


def main():
args = _parse_args()
mutations = _parse_mutations(args.csv)
annotation = _annotate(mutations, args.mutevents, args.cm)
if args.post:
print(annotation.post())
else:
print(annotation)


if __name__ == "__main__":
main()
84 changes: 84 additions & 0 deletions fetch_nextstrain/get_nextstrain_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python3

import json
import requests

def parse_json(input_file):
'''
Wrapper to colect the data from parse_node. Currently, results
are organized as a list for each isolate. First field contains
isolate name, second field mutation data and third field all
the metadata.
'''

data = []

def parse_node(input_file, array):
'''
Search recursively across a json dictionary and return the
keys corresponding to name, branch_attrs (mutations info),
node_attrs (metadata, e.g. authors).
'''
root = input_file
if 'history' not in root['name'] and 'NODE' not in root['name']:
isolate = root['name']
mutations_node = root['branch_attrs']['mutations']
# mutations = [(mutation, mutations_node[mutation]) for mutation in mutations_node if mutation != 'nuc']
author = root['node_attrs']['author']['value']
gisaig = root['node_attrs']['gisaid_epi_isl']['value']
if mutations_node:
for key in mutations_node:
if key != 'nuc':
protein = key
mutation = mutations_node[key][0]
array.append((isolate, protein, mutation, author, gisaig))
if 'children' in root:
for record in root['children']:
parse_node(record, array)
return array

results = parse_node(input_file, data)
return results

def get_nextstrain_data():
''' Fetch data from nextstrain raw data site.'''

url = 'https://data.nextstrain.org/ncov.json'
r = requests.get(url)
nextstrain_json = r.json()
results = parse_json(nextstrain_json['tree'])
return results

def give_format_output(data):
''' Transform the raw data from nextstrain into a .csv file. ORFs names aer replaced by UniProtKB identifiers.'''

identifiers = {'ORF1b': 'P0DTD1', 'ORF1a': 'P0DTD1', 'S': 'P0DTC2', 'ORF3a': 'P0DTC3', 'E': 'P0DTC4', 'M': 'P0DTC5', 'ORF6': 'P0DTC6', 'ORF7a': 'P0DTC7', 'ORF7b': 'P0DTD8', 'ORF8': 'P0DTC8', 'N': 'P0DTC9', 'ORF14': 'P0DTD3', 'ORF9b': 'P0DTD2', 'ORF10': 'A0A663DJA2'}

with open('nextstrain_data.csv', 'w') as fh:
headers = f'Protein,ORF,Mutation,Isolate,Author,GISAID\n'
fh.write(headers)
for record in data:
if record[1] in identifiers:
protein = identifiers[record[1]]
else:
protein = record[1]
orf = record[1]
if orf == 'ORF1b':
original = record[2][0]
mutant = record[2][-1]
position = int(record[2][1:-1]) + 4401
mutation = original + str(position) + mutant
else:
mutation = record[2]
author = record[3]
isolate = record[0]
gisaid = record[4]
output = f'{protein},{orf},{mutation},{isolate},{author},{gisaid}\n'
fh.write(output)

def main():
results = get_nextstrain_data()
give_format_output(results)

if __name__ == '__main__':
main()
Loading