Merge pull request #407 from apriltuesday/output-ontology
Add ability to use alternate target ontologies
apriltuesday authored Dec 20, 2023
2 parents 2b968b4 + d49c887 commit 29e89bf
Showing 45 changed files with 8,307 additions and 191 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/tests.yml
@@ -43,7 +43,11 @@ jobs:
run: python -m pytest --cov=cmat --cov-append tests -k integration

- name: End-to-end test of evidence string generation pipeline
run: bash tests/output_generation/test_pipeline.sh
run: bash tests/pipelines/test_annotation_pipeline.sh

# TODO Takes too long to run regularly, maybe just on tags?
# - name: End-to-end test of curation pipelines
# run: bash tests/pipelines/test_curation_pipelines.sh

- name: Upload the coverage data to Coveralls
env:
4 changes: 4 additions & 0 deletions README.md
@@ -38,6 +38,10 @@ export CODE_ROOT=
export LATEST_MAPPINGS=${CODE_ROOT}/mappings/latest_mappings.tsv
```

If this is your first time running the pipelines with a specific target ontology (i.e. you don't yet have a latest mappings file to use),
you can use an empty TSV file containing just the header `#ontology=<code>`, where `<code>` is taken from [this list](https://www.ebi.ac.uk/ols4/ontologies) of supported ontologies.
This file will be populated with automated and manually curated mappings as processing continues.
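
For example, a starter file targeting MONDO could contain nothing but the header line (the ontology code here is illustrative; check the OLS list for the exact identifier to use):
```
#ontology=mondo
```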

To confirm everything is set up properly, you can run the annotation pipeline on the small dataset included with the tests.
It should take a couple minutes to run and generate a file `annotated_clinvar.xml.gz` in the test directory.
8 changes: 4 additions & 4 deletions bin/evaluation/check_latest_mappings.py
@@ -3,25 +3,25 @@
import csv
import multiprocessing

from cmat.output_generation.clinvar_to_evidence_strings import load_efo_mapping
from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping
from cmat.output_generation.evaluation.ols_utils import fetch_eval_data


def main(mapping_file, output_file):
"""Load mapping file, map identifiers to synonyms in OLS, and dump results to TSV."""
mappings = load_efo_mapping(mapping_file)
mappings, target_ontology = load_ontology_mapping(mapping_file)
all_uris = [uri for v in mappings.values() for uri, _ in v]
process_pool = multiprocessing.Pool(processes=24)
annotated_traits = [
process_pool.apply(fetch_eval_data, kwds={'uri': uri, 'include_neighbors': False})
process_pool.apply(fetch_eval_data, kwds={'uri': uri, 'include_neighbors': False, 'target_ontology': target_ontology})
for uri in all_uris
]
with open(output_file, 'w+') as f:
csv.writer(f, delimiter="\t").writerows(annotated_traits)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Script to check if mappings are obsolete in EFO and find synonyms')
parser = argparse.ArgumentParser(description='Script to check if mappings are obsolete and find synonyms')
parser.add_argument('--latest-mappings', required=True, help='Latest mappings file')
parser.add_argument('--output-file', required=True, help='File to output dataframe')
args = parser.parse_args()
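
The renamed `load_ontology_mapping` now returns the target ontology alongside the mappings. Its implementation is not part of this hunk, but a minimal sketch of how the `#ontology=<code>` header described in the README could be read (an assumption, not the actual code):
```python
def read_target_ontology(mapping_file, default='EFO'):
    """Return the ontology code declared in a '#ontology=<code>' header line, or a default."""
    with open(mapping_file) as f:
        first_line = f.readline().strip()
    if first_line.lower().startswith('#ontology='):
        return first_line.split('=', 1)[1].strip()
    return default
```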
4 changes: 2 additions & 2 deletions bin/generate_annotated_xml.py
@@ -5,7 +5,7 @@

parser = argparse.ArgumentParser('Generates annotated ClinVar XML from ClinVar data and trait mappings')
parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True)
parser.add_argument('--efo-mapping', help='Disease string to ontology mappings', required=True)
parser.add_argument('--trait-mapping', help='Disease string to ontology mappings', required=True)
parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
parser.add_argument('--output-xml', help='Output XML file', required=True)
parser.add_argument('--eval-gene-file', help='Gene mappings for evaluation', required=False)
@@ -16,6 +16,6 @@
if __name__ == '__main__':
args = parser.parse_args()
generate_annotated_clinvar_xml(
clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
clinvar_xml_file=args.clinvar_xml, trait_mapping_file=args.trait_mapping, gene_mapping_file=args.gene_mapping,
output_xml_file=args.output_xml, eval_gene_file=args.eval_gene_file, eval_xref_file=args.eval_xref_file,
eval_latest_file=args.eval_latest_file)
4 changes: 2 additions & 2 deletions bin/trait_mapping/create_efo_table.py
@@ -4,7 +4,7 @@
import re
import requests

from cmat.trait_mapping.ols import OLS_EFO_SERVER
from cmat.trait_mapping.ols import OLS_SERVER
from requests import HTTPError
from retry import retry

@@ -31,7 +31,7 @@

def ols_url_template(ontology, term):
# OLS url to query for a term details
return f'{OLS_EFO_SERVER}/api/ontologies/{ontology}/terms?iri={term}'
return f'{OLS_SERVER}/api/ontologies/{ontology}/terms?iri={term}'


def oxo_url_template(curie):
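
With `OLS_EFO_SERVER` generalised to `OLS_SERVER`, the `ols_url_template` helper above can query any ontology. A rough illustration of the URL it produces (the server value and term IRI are assumptions for the example; the real constant lives in `cmat.trait_mapping.ols`):
```python
OLS_SERVER = 'https://www.ebi.ac.uk/ols4'  # assumed value, for illustration only

def ols_url_template(ontology, term):
    # Mirrors the helper above: ask OLS for one term's details within a single ontology
    return f'{OLS_SERVER}/api/ontologies/{ontology}/terms?iri={term}'

print(ols_url_template('mondo', 'http://purl.obolibrary.org/obo/MONDO_0007739'))
# https://www.ebi.ac.uk/ols4/api/ontologies/mondo/terms?iri=http://purl.obolibrary.org/obo/MONDO_0007739
```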
33 changes: 17 additions & 16 deletions bin/trait_mapping/create_table_for_manual_curation.py
@@ -4,29 +4,29 @@

import pandas as pd

from cmat.output_generation.clinvar_to_evidence_strings import load_efo_mapping
from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping
from cmat.trait_mapping.ols import (
get_ontology_label_from_ols, is_current_and_in_efo, is_in_efo, get_replacement_term,
get_ontology_label_from_ols, is_current_and_in_ontology, is_in_ontology, get_replacement_term,
)


def previous_and_replacement_mappings(trait_name, previous_mappings):
def previous_and_replacement_mappings(trait_name, previous_mappings, ontology):
if trait_name not in previous_mappings:
yield '', ''
return
for uri, label in previous_mappings[trait_name]:
trait_status = get_trait_status(uri)
trait_status = get_trait_status(uri, ontology)
trait_string = '|'.join([uri, label, 'NOT_SPECIFIED', 'previously-used', trait_status])
replacement_string = find_replacement_mapping(uri)
replacement_string = find_replacement_mapping(uri, ontology)
yield trait_string, replacement_string


def find_replacement_mapping(previous_uri):
replacement_uri = get_replacement_term(previous_uri)
def find_replacement_mapping(previous_uri, ontology):
replacement_uri = get_replacement_term(previous_uri, ontology)
if not replacement_uri:
return ''
label = get_ontology_label(replacement_uri)
trait_status = get_trait_status(replacement_uri)
trait_status = get_trait_status(replacement_uri, ontology)
trait_string = '|'.join([replacement_uri, label, 'NOT_SPECIFIED', 'replacement', trait_status])
return trait_string

@@ -43,11 +43,11 @@ def get_ontology_label(uri):
return label if label is not None else ''


def get_trait_status(uri):
uri_is_current_and_in_efo = is_current_and_in_efo(uri)
uri_in_efo = is_in_efo(uri)
if uri_in_efo:
trait_status = 'EFO_CURRENT' if uri_is_current_and_in_efo else 'EFO_OBSOLETE'
def get_trait_status(uri, ontology):
uri_is_current_and_in_ontology = is_current_and_in_ontology(uri, ontology)
uri_in_ontology = is_in_ontology(uri, ontology)
if uri_in_ontology:
trait_status = f'{ontology.upper()}_CURRENT' if uri_is_current_and_in_ontology else f'{ontology.upper()}_OBSOLETE'
else:
trait_status = 'NOT_CONTAINED'
return trait_status
@@ -71,13 +71,13 @@ def get_trait_status(uri):
args = parser.parse_args()

# Load all previous mappings: ClinVar trait name to ontology URI
previous_mappings = load_efo_mapping(args.previous_mappings)
previous_mappings, target_ontology = load_ontology_mapping(args.previous_mappings)

# Load previous curator comments: ClinVar trait name to comment string
try:
previous_comments = pd.read_csv(args.previous_comments, sep='\t', header=None)
previous_comments = dict(zip(previous_comments[0], previous_comments[1]))
except pd.errors.EmptyDataError:
except (FileNotFoundError, pd.errors.EmptyDataError):
previous_comments = {}

# Process all mappings which require manual curation
@@ -92,7 +92,8 @@ def get_trait_status(uri):
# Use maximum of 50 mappings to improve Google Sheets performance
mappings = fields[3:53]
exact_mapping = find_exact_mapping(trait_name, mappings)
for previous_mapping, replacement_mapping in previous_and_replacement_mappings(trait_name, previous_mappings):
for previous_mapping, replacement_mapping in previous_and_replacement_mappings(trait_name, previous_mappings,
target_ontology):
rows.append([trait_name, trait_freq, notes, previous_mapping, exact_mapping, replacement_mapping]
+ mappings)

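
The net effect of the `get_trait_status` change above is that curation-table status labels are prefixed with the target ontology rather than hard-coded EFO. A minimal sketch with the two OLS lookups stubbed out as booleans:
```python
def sketch_trait_status(in_ontology: bool, is_current: bool, ontology: str = 'mondo') -> str:
    # Mirrors get_trait_status, with is_in_ontology / is_current_and_in_ontology replaced by flags
    if in_ontology:
        return f'{ontology.upper()}_CURRENT' if is_current else f'{ontology.upper()}_OBSOLETE'
    return 'NOT_CONTAINED'

print(sketch_trait_status(True, True))    # MONDO_CURRENT
print(sketch_trait_status(True, False))   # MONDO_OBSOLETE
print(sketch_trait_status(False, False))  # NOT_CONTAINED
```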
2 changes: 1 addition & 1 deletion bin/trait_mapping/export_curation_table.py
@@ -20,7 +20,7 @@ def export_table(input_filepath, done_filepath, import_filepath, comments_filepa

# Comments column
comment_rows = curation_table[curation_table['Comment'].notna() & curation_table['Status'].notna()]
comment_rows = comment_rows[['ClinVar label', 'Comment']]
comment_rows = comment_rows[['ClinVar label', 'Comment']].astype(str)
# Remove double quotes as they just cause problems
comment_rows['Comment'] = comment_rows['Comment'].str.replace('"', '')
comment_rows.to_csv(comments_filepath, sep='\t', header=False, index=False)
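
The added `.astype(str)` presumably guards against comment columns that pandas has parsed as numeric, where the `.str` accessor would otherwise raise; a small sketch of that failure mode under this assumption:
```python
import pandas as pd

comments = pd.DataFrame({'ClinVar label': ['some trait'], 'Comment': [42]})  # purely numeric comment column
# comments['Comment'].str.replace('"', '')  # would raise: .str accessor only works with string values
cleaned = comments[['ClinVar label', 'Comment']].astype(str)
cleaned['Comment'] = cleaned['Comment'].str.replace('"', '')  # now safe
```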
11 changes: 7 additions & 4 deletions bin/trait_mapping/process_traits.py
@@ -9,7 +9,7 @@ def launch():

main.process_traits(parser.input_traits_filepath, parser.output_mappings_filepath,
parser.output_curation_filepath, parser.filters, parser.zooma_host,
parser.oxo_target_list, parser.oxo_distance)
parser.oxo_target_list, parser.oxo_distance, parser.target_ontology)


class ArgParser:
@@ -27,8 +27,8 @@ def __init__(self, argv):
help="path to output file for mappings")
parser.add_argument("-c", dest="output_curation_filepath", required=True,
help="path to output file for curation")
parser.add_argument("-n", dest="ontologies", default="efo,ordo,hp,mondo",
help="ontologies to use in query")
parser.add_argument("-n", dest="zooma_ontologies", default="efo,ordo,hp,mondo",
help="ontologies to use in zooma query")
parser.add_argument("-r", dest="required", default="cttv,eva-clinvar,clinvar-xrefs,gwas",
help="data sources to use in query.")
parser.add_argument("-p", dest="preferred", default="eva-clinvar,cttv,gwas,clinvar-xrefs",
@@ -39,20 +39,23 @@ def __init__(self, argv):
help="target ontologies to use with OxO")
parser.add_argument("-d", dest="oxo_distance", default=3,
help="distance to use to query OxO.")
parser.add_argument('--target-ontology', help='ID of target ontology (default EFO, for allowable values see '
'https://www.ebi.ac.uk/ols/ontologies)', default='EFO')

args = parser.parse_args(args=argv[1:])

self.input_traits_filepath = args.input_traits_filepath
self.output_mappings_filepath = args.output_mappings_filepath
self.output_curation_filepath = args.output_curation_filepath

self.filters = {"ontologies": args.ontologies,
self.filters = {"ontologies": args.zooma_ontologies,
"required": args.required,
"preferred": args.preferred}

self.zooma_host = args.zooma_host
self.oxo_target_list = [target.strip() for target in args.oxo_target_list.split(",")]
self.oxo_distance = args.oxo_distance
self.target_ontology = args.target_ontology


if __name__ == '__main__':
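
A hypothetical invocation of the updated parser (file names and ontology values are placeholders), showing how `--target-ontology` is picked up while the Zooma and OxO options keep their defaults:
```python
from process_traits import ArgParser  # assumes bin/trait_mapping is on the import path

parser = ArgParser(['process_traits.py',
                    '-i', 'traits_for_curation.csv',
                    '-o', 'automated_mappings.tsv',
                    '-c', 'curation_table.tsv',
                    '-t', 'EFO,MONDO',              # OxO target list, placeholder values
                    '--target-ontology', 'MONDO'])
print(parser.target_ontology)        # MONDO
print(parser.filters['ontologies'])  # efo,ordo,hp,mondo  (default Zooma ontologies)
```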
72 changes: 39 additions & 33 deletions cmat/output_generation/annotated_clinvar.py
@@ -6,27 +6,29 @@

from cmat.clinvar_xml_io import ClinVarTrait, ClinVarRecordMeasure, ClinVarDataset, ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml
from cmat.output_generation.clinvar_to_evidence_strings import load_efo_mapping, get_consequence_types
from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping, get_consequence_types
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.evaluation.set_metrics import SetComparisonMetrics

PROCESSOR = 'CMAT'


class AnnotatingClinVarDataset(ClinVarDataset):
"""This class provides the ability to parse ClinVar records (RCVs) and annotate them with EFO mappings and
"""This class provides the ability to parse ClinVar records (RCVs) and annotate them with ontology mappings and
consequence mappings on the fly."""

def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings,
def __init__(self, clinvar_xml, string_to_ontology_mappings, variant_to_gene_mappings, target_ontology,
eval_gene_mappings=None, eval_xref_mappings=None, eval_latest_mappings=None):
super().__init__(clinvar_xml)
self.header_attr['ProcessedBy'] = PROCESSOR
self.string_to_efo_mappings = string_to_efo_mappings
self.string_to_ontology_mappings = string_to_ontology_mappings
self.variant_to_gene_mappings = variant_to_gene_mappings
self.target_ontology = target_ontology

self.eval_gene_mappings = eval_gene_mappings
self.eval_xref_mappings = eval_xref_mappings
self.eval_latest_mappings = eval_latest_mappings
self.do_eval = (self.eval_gene_mappings and self.eval_xref_mappings and self.eval_latest_mappings)
self.overall_counts = {}
self.obsolete_counts = {}
self.gene_metrics = None
@@ -61,8 +63,9 @@ def __iter__(self):
self.complex_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics())

self.trait_metrics = SetComparisonMetrics()
self.mismatches_file = open('mismatches.tsv', 'w+')
self.mismatches_file.write('RCV\tCV\tCMAT\n')
if self.do_eval:
self.mismatches_file = open('mismatches.tsv', 'w+')
self.mismatches_file.write('RCV\tCV\tCMAT\n')

for rcv in iterate_rcv_from_xml(self.clinvar_xml):
record = AnnotatedClinVarRecord(rcv)
Expand All @@ -75,7 +78,8 @@ def __iter__(self):
self.trait_metrics.finalise()
for metrics in self.simple_variant_metrics + self.repeat_variant_metrics + self.complex_variant_metrics:
metrics.finalise()
self.mismatches_file.close()
if self.do_eval:
self.mismatches_file.close()

def annotate(self, record):
self.overall_counts['total'] += 1
@@ -117,20 +121,20 @@ def annotate_and_count_measure(self, record):

def annotate_and_count_traits(self, record):
for trait in record.traits_with_valid_names:
# Get current EFO ids
# Get current EFO ids - only used for evaluation
existing_efo_ids = set()
for db, iden, _ in trait.current_efo_aligned_xrefs:
curie = OntologyUri(iden, db).curie
if curie:
existing_efo_ids.add(curie)

# Add annotations - only based on preferred name
efo_ids = [
EfoMappedClinVarTrait.format_efo_id(efo_id)
for efo_id, efo_label
in self.string_to_efo_mappings.get(trait.preferred_or_other_valid_name.lower(), [])
target_ontology_ids = [
OntologyMappedClinVarTrait.format_ontology_id(ontology_id)
for ontology_id, ontology_label
in self.string_to_ontology_mappings.get(trait.preferred_or_other_valid_name.lower(), [])
]
trait.add_efo_mappings(efo_ids)
trait.add_ontology_mappings(target_ontology_ids, self.target_ontology)

# Evaluation
if self.eval_xref_mappings and self.eval_latest_mappings:
@@ -145,7 +149,7 @@ def annotate_and_count_traits(self, record):
existing_current_efo_ids.add(cv_id)

annotated_current_efo_ids = set()
for efo_id in efo_ids:
for efo_id in target_ontology_ids:

# Check whether annotated ID is obsolete
self.obsolete_counts['cmat_total'] += 1
@@ -210,25 +214,25 @@ def print_counter(counter):
class AnnotatedClinVarRecord(ClinVarRecord):

def __init__(self, rcv):
super().__init__(rcv, trait_class=EfoMappedClinVarTrait, measure_class=EnsemblAnnotatedClinVarMeasure)
super().__init__(rcv, trait_class=OntologyMappedClinVarTrait, measure_class=EnsemblAnnotatedClinVarMeasure)


class EfoMappedClinVarTrait(ClinVarTrait):
class OntologyMappedClinVarTrait(ClinVarTrait):

def add_efo_mappings(self, efo_ids):
efo_elts = []
for efo_id in efo_ids:
efo_id = self.format_efo_id(efo_id)
def add_ontology_mappings(self, ontology_ids, target_ontology):
ontology_elts = []
for ontology_id in ontology_ids:
ontology_id = self.format_ontology_id(ontology_id)
# Include Status attribute so this isn't included among current xrefs
efo_elts.append(ET.Element('XRef', attrib={
'ID': efo_id, 'DB': 'EFO', 'Status': 'annotated', 'providedBy': PROCESSOR}))
self.trait_xml.extend(efo_elts)
ontology_elts.append(ET.Element('XRef', attrib={
'ID': ontology_id, 'DB': target_ontology, 'Status': 'annotated', 'providedBy': PROCESSOR}))
self.trait_xml.extend(ontology_elts)

@staticmethod
def format_efo_id(efo_id):
if efo_id.startswith('http'):
return efo_id.split('/')[-1].replace('_', ':')
return efo_id
def format_ontology_id(ontology_id):
if ontology_id.startswith('http'):
return ontology_id.split('/')[-1].replace('_', ':')
return ontology_id


class EnsemblAnnotatedClinVarMeasure(ClinVarRecordMeasure):
@@ -308,22 +312,24 @@ def string_to_set(s):
return set(x for x in re.sub(r"{|}|'", '', s).split(', ') if x)


def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file,
def generate_annotated_clinvar_xml(clinvar_xml_file, trait_mapping_file, gene_mapping_file, output_xml_file,
eval_gene_file=None, eval_xref_file=None, eval_latest_file=None):
"""Generate an annotated XML file of ClinVar RCVs based on EFO mappings file and gene mapping file (as documented in
"""Generate an annotated XML file of ClinVar RCVs based on trait mapping and gene mapping files (as documented in
clinvar_to_evidence_strings)."""
string_to_efo_mappings = load_efo_mapping(efo_mapping_file)
string_to_ontology_mappings, target_ontology = load_ontology_mapping(trait_mapping_file)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)
# Need both files to do an evaluation
# Need all files to do an evaluation
if eval_gene_file and eval_xref_file and eval_latest_file:
eval_gene_mappings = load_evaluation_gene_mappings(eval_gene_file)
eval_xref_mappings = load_evaluation_xref_mappings(eval_xref_file)
eval_latest_mappings = load_evaluation_latest(eval_latest_file)
dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings,
dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_ontology_mappings, variant_to_gene_mappings,
target_ontology,
eval_gene_mappings=eval_gene_mappings,
eval_xref_mappings=eval_xref_mappings,
eval_latest_mappings=eval_latest_mappings)
else:
dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings)
dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_ontology_mappings, variant_to_gene_mappings,
target_ontology)
dataset.write(output_xml_file)
dataset.report()
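
To make the end result of trait annotation concrete, a small worked example combining `format_ontology_id` with the `XRef` construction from `add_ontology_mappings` (the MONDO identifier and `DB` value are illustrative):
```python
import xml.etree.ElementTree as ET

from cmat.output_generation.annotated_clinvar import OntologyMappedClinVarTrait, PROCESSOR

ontology_id = OntologyMappedClinVarTrait.format_ontology_id('http://purl.obolibrary.org/obo/MONDO_0007739')
print(ontology_id)  # MONDO:0007739

xref = ET.Element('XRef', attrib={'ID': ontology_id, 'DB': 'MONDO', 'Status': 'annotated', 'providedBy': PROCESSOR})
print(ET.tostring(xref).decode())
# <XRef ID="MONDO:0007739" DB="MONDO" Status="annotated" providedBy="CMAT" />
```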