SWISS-MODEL · tomasMasson · Apr 7, 2020 · Apr 7, 2020 · Apr 7, 2020 · Apr 8, 2020
diff --git a/fetch_nextstrain/README.md b/fetch_nextstrain/README.md
@@ -0,0 +1,33 @@
+# Fetch variation data from Nextstrain API
+
+First draft script to retrieve and parse the mutations reported in raw data from [Nextstrain](http://data.nextstrain.org/ncov.json). 
+
+I include all the variation data (amino acid and nucleotide) and all the metadata (we could decide later which fields are necessary).
+
+Annotations describing all distinct amino acid one letter codes or all mutation 
+events occuring at a certain location can be fetched from the generated 
+nextstrain_data.csv and prepared for the SWISS-MODEL annotation system by 
+the annotate_variation.py script.
+
+To run it, you require the the root directory of the github 
+repository in your PYTHONPATH. On Linux/Mac execute the following:
+
+```
+export PYTHONPATH=<path_to_covid_repo>:$PYTHONPATH
+```
+
+Usage and available options of the script can be displayed by executing
+
+```
+python annotate_variation.py --help
+```
+
+- Example annotation of all distinct amino acids is available [here](https://beta.swissmodel.expasy.org/repository/covid_annotation_project/LPRPYc)
+- Example annotation of all observed mutation events is available [here](https://beta.swissmodel.expasy.org/repository/covid_annotation_project/vtvJJN)
+
+TODO:
+
+- Give format to the output (done).
+- Select the fields of interest.
+- Replace protein names with UniProtKB AC (done).
+
diff --git a/fetch_nextstrain/annotate_variation.py b/fetch_nextstrain/annotate_variation.py
@@ -0,0 +1,136 @@
+import csv
+import matplotlib.pyplot as plt
+from argparse import ArgumentParser
+from utils.uniprot import seq_from_ac
+from utils.sm_annotations import Annotation
+
+
+def _parse_args():
+    parser = ArgumentParser(
+        description="Loads the csv generated by get_nextstrain_data.py and "
+        "collects annotations that can be displayed in the SWISS-MODEL "
+        "annotation system. Annotations can either refer to the distinct "
+        "amino acids or mutation events observed at a certain location in the "
+        "Nextstrain data (see mutevents flag to control that behaviour)."
+    )
+    parser.add_argument(
+        "--csv",
+        required=True,
+        type=str,
+        dest="csv",
+        help="CSV file generated by get_nextstrain_data.py",
+    )
+    parser.add_argument(
+        "--post",
+        dest="post",
+        help="If set, the annotations "
+        "are directly uploaded to the SWISS-MODEL annotation system and an "
+        "accessible URL is printed to stdout. By default, the annotations are "
+        "printed to stdout to be manually copy pasted in the annotation upload "
+        "form in SWISS-MODEL",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--cm",
+        dest="cm",
+        help="String that specifies a matplotlib color map to encode number of "
+        "distinct amino acids / mutation events at a certain location. "
+        "Default: cool. Available maps:"
+        "(https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html).",
+        default="cool",
+    )
+    parser.add_argument(
+        "--mutevents",
+        dest="mutevents",
+        help="If set, annotations refer to all mutation events observed at a "
+        "certain location in the Nexstrain data. Mutations are relative to an "
+        "underlying phylogenetic tree. By default, the annotations consider "
+        "all distinct amino acids ever observed at a certain location.",
+        action="store_true",
+    )
+
+    args = parser.parse_args()
+    # Directly assign matplotlib colormap to args. If the specified map is not
+    # available, matplotlib raises an expressive error message.
+    args.cm = plt.get_cmap(args.cm)
+    return args
+
+
+def _parse_mutations(csv_file):
+    mutations = list()
+    with open(csv_file, newline="") as fh:
+        reader = csv.DictReader(fh)
+        for row in reader:
+            mutations.append((row["Protein"], row["Mutation"]))
+    return mutations
+
+
+def _annotate(mutations, mutevents, cm):
+    """
+    The nextstrain data is relative to a phylogenetic tree that they build.
+    So a mutation at location x can either be a mutation from reference 
+    to something else, a mutation from something else back to the reference
+    or even something that has nothing to do with the reference at all.
+    The variations we're reporting are either all observed amino acid one letter 
+    codes (olc) at a certain location => the unique set of olcs containing the 
+    the reference and all corresponding olcs reported in the nextstrain 
+    mutations, or the mutation events themselves (controlled by mutevents flag).
+    """
+
+    # lets fetch all reference sequences
+    acs = [m[0] for m in mutations]
+    sequences = dict()
+    for ac in set(acs):
+        sequences[ac] = seq_from_ac(ac)
+
+    # key: (uniprot_ac, rnum), value: [olc1, olc2, ..., olcx]
+    variations = dict()
+    for m in mutations:
+        uniprot_ac = m[0]
+        rnum = int(m[1][1 : len(m[1]) - 1])
+        key = (uniprot_ac, rnum)
+        if key not in variations:
+            variations[key] = set()
+        if mutevents:
+            variations[key].add(m[1][0]+"->"+m[1][-1]) # formatted as from->to
+        else:
+            variations[key].add(sequences[uniprot_ac][rnum - 1])
+            variations[key].add(m[1][0])  # first letter -> mutation from
+            variations[key].add(m[1][-1])  # last letter -> mutation to
+
+    # find min/max numbers of distinc amino acids to scale color gradient
+    sizes = [len(item) for item in variations.values()]
+    min_mut = min(sizes)
+    max_mut = max(sizes)
+
+    # Let's not directly add it to the annotation object but rather do an
+    # intermediate list so we can sort it by uniprot accession code / residue
+    # number
+    annotation_data = list()
+    for var_key in variations.keys():
+        if max_mut == min_mut:
+            color = cm(1.0)
+        else:
+            color = cm(float(len(variations[var_key]) - min_mut) / (max_mut - min_mut))
+        annotation_data.append(
+            [var_key[0], var_key[1], color, ",".join(variations[var_key])]
+        )
+    annotation = Annotation()
+    for d in sorted(annotation_data):
+        annotation.add(d[0], d[1], d[2], d[3])
+
+    return annotation
+
+
+def main():
+    args = _parse_args()
+    mutations = _parse_mutations(args.csv)
+    annotation = _annotate(mutations, args.mutevents, args.cm)
+    if args.post:
+        print(annotation.post())
+    else:
+        print(annotation)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fetch_nextstrain/get_nextstrain_data.py b/fetch_nextstrain/get_nextstrain_data.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+import json
+import requests
+
+def parse_json(input_file):
+    '''
+    Wrapper to colect the data from parse_node. Currently, results
+    are organized as a list for each isolate. First field contains
+    isolate name, second field mutation data and third field all
+    the metadata.
+    '''
+
+    data = []
+
+    def parse_node(input_file, array):    
+        '''
+        Search recursively across a json dictionary and return the
+        keys corresponding to name, branch_attrs (mutations info),
+        node_attrs (metadata, e.g. authors).
+        ''' 
+        root = input_file
+        if 'history' not in root['name'] and 'NODE' not in root['name']:
+            isolate = root['name']
+            mutations_node = root['branch_attrs']['mutations']
+#            mutations = [(mutation, mutations_node[mutation]) for mutation in mutations_node if mutation != 'nuc']
+            author = root['node_attrs']['author']['value']
+            gisaig = root['node_attrs']['gisaid_epi_isl']['value']
+            if mutations_node:
+                for key in mutations_node:
+                    if key != 'nuc':
+                        protein = key
+                        mutation = mutations_node[key][0]
+                        array.append((isolate, protein, mutation, author, gisaig))
+        if 'children' in root:
+            for record in root['children']:
+                parse_node(record, array)
+        return array
+
+    results = parse_node(input_file, data)
+    return results
+
+def get_nextstrain_data():
+    ''' Fetch data from nextstrain raw data site.'''
+
+    url = 'https://data.nextstrain.org/ncov.json'
+    r = requests.get(url)
+    nextstrain_json = r.json()
+    results = parse_json(nextstrain_json['tree'])
+    return results
+
+def give_format_output(data):
+    ''' Transform the raw data from nextstrain into a .csv file. ORFs names aer replaced by UniProtKB identifiers.'''
+
+    identifiers = {'ORF1b': 'P0DTD1', 'ORF1a': 'P0DTD1', 'S': 'P0DTC2', 'ORF3a': 'P0DTC3', 'E': 'P0DTC4', 'M': 'P0DTC5', 'ORF6': 'P0DTC6', 'ORF7a': 'P0DTC7', 'ORF7b': 'P0DTD8', 'ORF8': 'P0DTC8', 'N': 'P0DTC9', 'ORF14': 'P0DTD3', 'ORF9b': 'P0DTD2', 'ORF10': 'A0A663DJA2'}
+
+    with open('nextstrain_data.csv', 'w') as fh:
+        headers = f'Protein,ORF,Mutation,Isolate,Author,GISAID\n'
+        fh.write(headers)
+        for record in data:
+            if record[1] in identifiers:
+                protein = identifiers[record[1]]
+            else:
+                protein = record[1]
+            orf = record[1]
+            if orf == 'ORF1b':
+                original = record[2][0]
+                mutant = record[2][-1]
+                position = int(record[2][1:-1]) + 4401
+                mutation = original + str(position) + mutant
+            else:
+                mutation = record[2]
+            author = record[3]
+            isolate = record[0]
+            gisaid = record[4]
+            output = f'{protein},{orf},{mutation},{isolate},{author},{gisaid}\n'
+            fh.write(output)
+
+def main():
+    results = get_nextstrain_data()
+    give_format_output(results)
+
+if __name__ == '__main__':
+    main()