Merge pull request #407 from apriltuesday/output-ontology
Add ability to use alternate target ontologies
apriltuesday authored Dec 20, 2023
2 parents 2b968b4 + d49c887 commit 29e89bf
Showing 45 changed files with 8,307 additions and 191 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/tests.yml
@@ -43,7 +43,11 @@ jobs:
run: python -m pytest --cov=cmat --cov-append tests -k integration

- name: End-to-end test of evidence string generation pipeline
run: bash tests/output_generation/test_pipeline.sh
run: bash tests/pipelines/test_annotation_pipeline.sh

# TODO Takes too long to run regularly, maybe just on tags?
# - name: End-to-end test of curation pipelines
# run: bash tests/pipelines/test_curation_pipelines.sh

- name: Upload the coverage data to Coveralls
env:
4 changes: 4 additions & 0 deletions README.md
@@ -38,6 +38,10 @@ export CODE_ROOT=
export LATEST_MAPPINGS=${CODE_ROOT}/mappings/latest_mappings.tsv
```

If this is your first time running the pipelines with a specific target ontology (i.e. you don't yet have a latest mappings file to use),
you can use an empty TSV file containing just the header `#ontology=<code>`, where `<code>` is taken from [this list](https://www.ebi.ac.uk/ols4/ontologies) of supported ontologies.
This file will be populated with automated and manually curated mappings as processing continues.
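
For example, a starter file targeting MONDO could contain nothing but the header line (the ontology code here is illustrative; check the OLS list for the exact identifier to use):
```
#ontology=mondo
```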

To confirm everything is set up properly, you can run the annotation pipeline on the small dataset included with the tests.
It should take a couple minutes to run and generate a file `annotated_clinvar.xml.gz` in the test directory.
8 changes: 4 additions & 4 deletions bin/evaluation/check_latest_mappings.py
@@ -3,25 +3,25 @@
import csv
import multiprocessing

from cmat.output_generation.clinvar_to_evidence_strings import load_efo_mapping
from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping
from cmat.output_generation.evaluation.ols_utils import fetch_eval_data


def main(mapping_file, output_file):
"""Load mapping file, map identifiers to synonyms in OLS, and dump results to TSV."""
mappings = load_efo_mapping(mapping_file)
mappings, target_ontology = load_ontology_mapping(mapping_file)
all_uris = [uri for v in mappings.values() for uri, _ in v]
process_pool = multiprocessing.Pool(processes=24)
annotated_traits = [
process_pool.apply(fetch_eval_data, kwds={'uri': uri, 'include_neighbors': False})
process_pool.apply(fetch_eval_data, kwds={'uri': uri, 'include_neighbors': False, 'target_ontology': target_ontology})
for uri in all_uris
]
with open(output_file, 'w+') as f:
csv.writer(f, delimiter="\t").writerows(annotated_traits)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Script to check if mappings are obsolete in EFO and find synonyms')
parser = argparse.ArgumentParser(description='Script to check if mappings are obsolete and find synonyms')
parser.add_argument('--latest-mappings', required=True, help='Latest mappings file')
parser.add_argument('--output-file', required=True, help='File to output dataframe')
args = parser.parse_args()
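
The renamed `load_ontology_mapping` now returns the target ontology alongside the mappings. Its implementation is not part of this hunk, but a minimal sketch of how the `#ontology=<code>` header described in the README could be read (an assumption, not the actual code):
```python
def read_target_ontology(mapping_file, default='EFO'):
    """Return the ontology code declared in a '#ontology=<code>' header line, or a default."""
    with open(mapping_file) as f:
        first_line = f.readline().strip()
    if first_line.lower().startswith('#ontology='):
        return first_line.split('=', 1)[1].strip()
    return default
```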
4 changes: 2 additions & 2 deletions bin/generate_annotated_xml.py
@@ -5,7 +5,7 @@

parser = argparse.ArgumentParser('Generates annotated ClinVar XML from ClinVar data and trait mappings')
parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True)
parser.add_argument('--efo-mapping', help='Disease string to ontology mappings', required=True)
parser.add_argument('--trait-mapping', help='Disease string to ontology mappings', required=True)
parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
parser.add_argument('--output-xml', help='Output XML file', required=True)
parser.add_argument('--eval-gene-file', help='Gene mappings for evaluation', required=False)
@@ -16,6 +16,6 @@
if __name__ == '__main__':
args = parser.parse_args()
generate_annotated_clinvar_xml(
clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
clinvar_xml_file=args.clinvar_xml, trait_mapping_file=args.trait_mapping, gene_mapping_file=args.gene_mapping,
output_xml_file=args.output_xml, eval_gene_file=args.eval_gene_file, eval_xref_file=args.eval_xref_file,
eval_latest_file=args.eval_latest_file)
4 changes: 2 additions & 2 deletions bin/trait_mapping/create_efo_table.py
@@ -4,7 +4,7 @@
import re
import requests

from cmat.trait_mapping.ols import OLS_EFO_SERVER
from cmat.trait_mapping.ols import OLS_SERVER
from requests import HTTPError
from retry import retry

@@ -31,7 +31,7 @@

def ols_url_template(ontology, term):
# OLS url to query for a term details
return f'{OLS_EFO_SERVER}/api/ontologies/{ontology}/terms?iri={term}'
return f'{OLS_SERVER}/api/ontologies/{ontology}/terms?iri={term}'


def oxo_url_template(curie):
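
With `OLS_EFO_SERVER` generalised to `OLS_SERVER`, the `ols_url_template` helper above can query any ontology. A rough illustration of the URL it produces (the server value and term IRI are assumptions for the example; the real constant lives in `cmat.trait_mapping.ols`):
```python
OLS_SERVER = 'https://www.ebi.ac.uk/ols4'  # assumed value, for illustration only

def ols_url_template(ontology, term):
    # Mirrors the helper above: ask OLS for one term's details within a single ontology
    return f'{OLS_SERVER}/api/ontologies/{ontology}/terms?iri={term}'

print(ols_url_template('mondo', 'http://purl.obolibrary.org/obo/MONDO_0007739'))
# https://www.ebi.ac.uk/ols4/api/ontologies/mondo/terms?iri=http://purl.obolibrary.org/obo/MONDO_0007739
```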
33 changes: 17 additions & 16 deletions bin/trait_mapping/create_table_for_manual_curation.py
@@ -4,29 +4,29 @@

import pandas as pd

from cmat.output_generation.clinvar_to_evidence_strings import load_efo_mapping
from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping
from cmat.trait_mapping.ols import (
get_ontology_label_from_ols, is_current_and_in_efo, is_in_efo, get_replacement_term,
get_ontology_label_from_ols, is_current_and_in_ontology, is_in_ontology, get_replacement_term,
)


def previous_and_replacement_mappings(trait_name, previous_mappings):
def previous_and_replacement_mappings(trait_name, previous_mappings, ontology):
if trait_name not in previous_mappings:
yield '', ''
return
for uri, label in previous_mappings[trait_name]:
trait_status = get_trait_status(uri)
trait_status = get_trait_status(uri, ontology)
trait_string = '|'.join([uri, label, 'NOT_SPECIFIED', 'previously-used', trait_status])
replacement_string = find_replacement_mapping(uri)
replacement_string = find_replacement_mapping(uri, ontology)
yield trait_string, replacement_string


def find_replacement_mapping(previous_uri):
replacement_uri = get_replacement_term(previous_uri)
def find_replacement_mapping(previous_uri, ontology):
replacement_uri = get_replacement_term(previous_uri, ontology)
if not replacement_uri:
return ''
label = get_ontology_label(replacement_uri)
trait_status = get_trait_status(replacement_uri)
trait_status = get_trait_status(replacement_uri, ontology)
trait_string = '|'.join([replacement_uri, label, 'NOT_SPECIFIED', 'replacement', trait_status])
return trait_string

@@ -43,11 +43,11 @@ def get_ontology_label(uri):
return label if label is not None else ''


def get_trait_status(uri):
uri_is_current_and_in_efo = is_current_and_in_efo(uri)
uri_in_efo = is_in_efo(uri)
if uri_in_efo:
trait_status = 'EFO_CURRENT' if uri_is_current_and_in_efo else 'EFO_OBSOLETE'
def get_trait_status(uri, ontology):
uri_is_current_and_in_ontology = is_current_and_in_ontology(uri, ontology)
uri_in_ontology = is_in_ontology(uri, ontology)
if uri_in_ontology:
trait_status = f'{ontology.upper()}_CURRENT' if uri_is_current_and_in_ontology else f'{ontology.upper()}_OBSOLETE'
else:
trait_status = 'NOT_CONTAINED'
return trait_status
@@ -71,13 +71,13 @@ def get_trait_status(uri):
args = parser.parse_args()

# Load all previous mappings: ClinVar trait name to ontology URI
previous_mappings = load_efo_mapping(args.previous_mappings)
previous_mappings, target_ontology = load_ontology_mapping(args.previous_mappings)

# Load previous curator comments: ClinVar trait name to comment string
try:
previous_comments = pd.read_csv(args.previous_comments, sep='\t', header=None)
previous_comments = dict(zip(previous_comments[0], previous_comments[1]))
except pd.errors.EmptyDataError:
except (FileNotFoundError, pd.errors.EmptyDataError):
previous_comments = {}

# Process all mappings which require manual curation
@@ -92,7 +92,8 @@ def get_trait_status(uri):
# Use maximum of 50 mappings to improve Google Sheets performance
mappings = fields[3:53]
exact_mapping = find_exact_mapping(trait_name, mappings)
for previous_mapping, replacement_mapping in previous_and_replacement_mappings(trait_name, previous_mappings):
for previous_mapping, replacement_mapping in previous_and_replacement_mappings(trait_name, previous_mappings,
target_ontology):
rows.append([trait_name, trait_freq, notes, previous_mapping, exact_mapping, replacement_mapping]
+ mappings)

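
The net effect of the `get_trait_status` change above is that curation-table status labels are prefixed with the target ontology rather than hard-coded EFO. A minimal sketch with the two OLS lookups stubbed out as booleans:
```python
def sketch_trait_status(in_ontology: bool, is_current: bool, ontology: str = 'mondo') -> str:
    # Mirrors get_trait_status, with is_in_ontology / is_current_and_in_ontology replaced by flags
    if in_ontology:
        return f'{ontology.upper()}_CURRENT' if is_current else f'{ontology.upper()}_OBSOLETE'
    return 'NOT_CONTAINED'

print(sketch_trait_status(True, True))    # MONDO_CURRENT
print(sketch_trait_status(True, False))   # MONDO_OBSOLETE
print(sketch_trait_status(False, False))  # NOT_CONTAINED
```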
2 changes: 1 addition & 1 deletion bin/trait_mapping/export_curation_table.py
@@ -20,7 +20,7 @@ def export_table(input_filepath, done_filepath, import_filepath, comments_filepa

# Comments column
comment_rows = curation_table[curation_table['Comment'].notna() & curation_table['Status'].notna()]
comment_rows = comment_rows[['ClinVar label', 'Comment']]
comment_rows = comment_rows[['ClinVar label', 'Comment']].astype(str)
# Remove double quotes as they just cause problems
comment_rows['Comment'] = comment_rows['Comment'].str.replace('"', '')
comment_rows.to_csv(comments_filepath, sep='\t', header=False, index=False)
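
The added `.astype(str)` presumably guards against comment columns that pandas has parsed as numeric, where the `.str` accessor would otherwise raise; a small sketch of that failure mode under this assumption:
```python
import pandas as pd

comments = pd.DataFrame({'ClinVar label': ['some trait'], 'Comment': [42]})  # purely numeric comment column
# comments['Comment'].str.replace('"', '')  # would raise: .str accessor only works with string values
cleaned = comments[['ClinVar label', 'Comment']].astype(str)
cleaned['Comment'] = cleaned['Comment'].str.replace('"', '')  # now safe
```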
11 changes: 7 additions & 4 deletions bin/trait_mapping/process_traits.py
@@ -9,7 +9,7 @@ def launch():

main.process_traits(parser.input_traits_filepath, parser.output_mappings_filepath,
parser.output_curation_filepath, parser.filters, parser.zooma_host,
parser.oxo_target_list, parser.oxo_distance)
parser.oxo_target_list, parser.oxo_distance, parser.target_ontology)


class ArgParser:
@@ -27,8 +27,8 @@ def __init__(self, argv):
help="path to output file for mappings")
parser.add_argument("-c", dest="output_curation_filepath", required=True,
help="path to output file for curation")
parser.add_argument("-n", dest="ontologies", default="efo,ordo,hp,mondo",
help="ontologies to use in query")
parser.add_argument("-n", dest="zooma_ontologies", default="efo,ordo,hp,mondo",
help="ontologies to use in zooma query")
parser.add_argument("-r", dest="required", default="cttv,eva-clinvar,clinvar-xrefs,gwas",
help="data sources to use in query.")
parser.add_argument("-p", dest="preferred", default="eva-clinvar,cttv,gwas,clinvar-xrefs",
@@ -39,20 +39,23 @@ def __init__(self, argv):
help="target ontologies to use with OxO")
parser.add_argument("-d", dest="oxo_distance", default=3,
help="distance to use to query OxO.")
parser.add_argument('--target-ontology', help='ID of target ontology (default EFO, for allowable values see '
'https://www.ebi.ac.uk/ols/ontologies)', default='EFO')

args = parser.parse_args(args=argv[1:])

self.input_traits_filepath = args.input_traits_filepath
self.output_mappings_filepath = args.output_mappings_filepath
self.output_curation_filepath = args.output_curation_filepath

self.filters = {"ontologies": args.ontologies,
self.filters = {"ontologies": args.zooma_ontologies,
"required": args.required,
"preferred": args.preferred}

self.zooma_host = args.zooma_host
self.oxo_target_list = [target.strip() for target in args.oxo_target_list.split(",")]
self.oxo_distance = args.oxo_distance
self.target_ontology = args.target_ontology


if __name__ == '__main__':
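
A hypothetical invocation of the updated parser (file names and ontology values are placeholders), showing how `--target-ontology` is picked up while the Zooma and OxO options keep their defaults:
```python
from process_traits import ArgParser  # assumes bin/trait_mapping is on the import path

parser = ArgParser(['process_traits.py',
                    '-i', 'traits_for_curation.csv',
                    '-o', 'automated_mappings.tsv',
                    '-c', 'curation_table.tsv',
                    '-t', 'EFO,MONDO',              # OxO target list, placeholder values
                    '--target-ontology', 'MONDO'])
print(parser.target_ontology)        # MONDO
print(parser.filters['ontologies'])  # efo,ordo,hp,mondo  (default Zooma ontologies)
```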
72 changes: 39 additions & 33 deletions cmat/output_generation/annotated_clinvar.py
@@ -6,27 +6,29 @@

from cmat.clinvar_xml_io import ClinVarTrait, ClinVarRecordMeasure, ClinVarDataset, ClinVarRecord
from cmat.clinvar_xml_io.xml_parsing import iterate_rcv_from_xml
from cmat.output_generation.clinvar_to_evidence_strings import load_efo_mapping, get_consequence_types
from cmat.output_generation.clinvar_to_evidence_strings import load_ontology_mapping, get_consequence_types
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.evaluation.set_metrics import SetComparisonMetrics

PROCESSOR = 'CMAT'


class AnnotatingClinVarDataset(ClinVarDataset):
"""This class provides the ability to parse ClinVar records (RCVs) and annotate them with EFO mappings and
"""This class provides the ability to parse ClinVar records (RCVs) and annotate them with ontology mappings and
consequence mappings on the fly."""

def __init__(self, clinvar_xml, string_to_efo_mappings, variant_to_gene_mappings,
def __init__(self, clinvar_xml, string_to_ontology_mappings, variant_to_gene_mappings, target_ontology,
eval_gene_mappings=None, eval_xref_mappings=None, eval_latest_mappings=None):
super().__init__(clinvar_xml)
self.header_attr['ProcessedBy'] = PROCESSOR
self.string_to_efo_mappings = string_to_efo_mappings
self.string_to_ontology_mappings = string_to_ontology_mappings
self.variant_to_gene_mappings = variant_to_gene_mappings
self.target_ontology = target_ontology

self.eval_gene_mappings = eval_gene_mappings
self.eval_xref_mappings = eval_xref_mappings
self.eval_latest_mappings = eval_latest_mappings
self.do_eval = (self.eval_gene_mappings and self.eval_xref_mappings and self.eval_latest_mappings)
self.overall_counts = {}
self.obsolete_counts = {}
self.gene_metrics = None
@@ -61,8 +63,9 @@ def __iter__(self):
self.complex_variant_metrics = (SetComparisonMetrics(), SetComparisonMetrics())

self.trait_metrics = SetComparisonMetrics()
self.mismatches_file = open('mismatches.tsv', 'w+')
self.mismatches_file.write('RCV\tCV\tCMAT\n')
if self.do_eval:
self.mismatches_file = open('mismatches.tsv', 'w+')
self.mismatches_file.write('RCV\tCV\tCMAT\n')

for rcv in iterate_rcv_from_xml(self.clinvar_xml):
record = AnnotatedClinVarRecord(rcv)
Expand All @@ -75,7 +78,8 @@ def __iter__(self):
self.trait_metrics.finalise()
for metrics in self.simple_variant_metrics + self.repeat_variant_metrics + self.complex_variant_metrics:
metrics.finalise()
self.mismatches_file.close()
if self.do_eval:
self.mismatches_file.close()

def annotate(self, record):
self.overall_counts['total'] += 1
@@ -117,20 +121,20 @@ def annotate_and_count_measure(self, record):

def annotate_and_count_traits(self, record):
for trait in record.traits_with_valid_names:
# Get current EFO ids
# Get current EFO ids - only used for evaluation
existing_efo_ids = set()
for db, iden, _ in trait.current_efo_aligned_xrefs:
curie = OntologyUri(iden, db).curie
if curie:
existing_efo_ids.add(curie)

# Add annotations - only based on preferred name
efo_ids = [
EfoMappedClinVarTrait.format_efo_id(efo_id)
for efo_id, efo_label
in self.string_to_efo_mappings.get(trait.preferred_or_other_valid_name.lower(), [])
target_ontology_ids = [
OntologyMappedClinVarTrait.format_ontology_id(ontology_id)
for ontology_id, ontology_label
in self.string_to_ontology_mappings.get(trait.preferred_or_other_valid_name.lower(), [])
]
trait.add_efo_mappings(efo_ids)
trait.add_ontology_mappings(target_ontology_ids, self.target_ontology)

# Evaluation
if self.eval_xref_mappings and self.eval_latest_mappings:
@@ -145,7 +149,7 @@ def annotate_and_count_traits(self, record):
existing_current_efo_ids.add(cv_id)

annotated_current_efo_ids = set()
for efo_id in efo_ids:
for efo_id in target_ontology_ids:

# Check whether annotated ID is obsolete
self.obsolete_counts['cmat_total'] += 1
@@ -210,25 +214,25 @@ def print_counter(counter):
class AnnotatedClinVarRecord(ClinVarRecord):

def __init__(self, rcv):
super().__init__(rcv, trait_class=EfoMappedClinVarTrait, measure_class=EnsemblAnnotatedClinVarMeasure)
super().__init__(rcv, trait_class=OntologyMappedClinVarTrait, measure_class=EnsemblAnnotatedClinVarMeasure)


class EfoMappedClinVarTrait(ClinVarTrait):
class OntologyMappedClinVarTrait(ClinVarTrait):

def add_efo_mappings(self, efo_ids):
efo_elts = []
for efo_id in efo_ids:
efo_id = self.format_efo_id(efo_id)
def add_ontology_mappings(self, ontology_ids, target_ontology):
ontology_elts = []
for ontology_id in ontology_ids:
ontology_id = self.format_ontology_id(ontology_id)
# Include Status attribute so this isn't included among current xrefs
efo_elts.append(ET.Element('XRef', attrib={
'ID': efo_id, 'DB': 'EFO', 'Status': 'annotated', 'providedBy': PROCESSOR}))
self.trait_xml.extend(efo_elts)
ontology_elts.append(ET.Element('XRef', attrib={
'ID': ontology_id, 'DB': target_ontology, 'Status': 'annotated', 'providedBy': PROCESSOR}))
self.trait_xml.extend(ontology_elts)

@staticmethod
def format_efo_id(efo_id):
if efo_id.startswith('http'):
return efo_id.split('/')[-1].replace('_', ':')
return efo_id
def format_ontology_id(ontology_id):
if ontology_id.startswith('http'):
return ontology_id.split('/')[-1].replace('_', ':')
return ontology_id


class EnsemblAnnotatedClinVarMeasure(ClinVarRecordMeasure):
@@ -308,22 +312,24 @@ def string_to_set(s):
return set(x for x in re.sub(r"{|}|'", '', s).split(', ') if x)


def generate_annotated_clinvar_xml(clinvar_xml_file, efo_mapping_file, gene_mapping_file, output_xml_file,
def generate_annotated_clinvar_xml(clinvar_xml_file, trait_mapping_file, gene_mapping_file, output_xml_file,
eval_gene_file=None, eval_xref_file=None, eval_latest_file=None):
"""Generate an annotated XML file of ClinVar RCVs based on EFO mappings file and gene mapping file (as documented in
"""Generate an annotated XML file of ClinVar RCVs based on trait mapping and gene mapping files (as documented in
clinvar_to_evidence_strings)."""
string_to_efo_mappings = load_efo_mapping(efo_mapping_file)
string_to_ontology_mappings, target_ontology = load_ontology_mapping(trait_mapping_file)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)
# Need both files to do an evaluation
# Need all files to do an evaluation
if eval_gene_file and eval_xref_file and eval_latest_file:
eval_gene_mappings = load_evaluation_gene_mappings(eval_gene_file)
eval_xref_mappings = load_evaluation_xref_mappings(eval_xref_file)
eval_latest_mappings = load_evaluation_latest(eval_latest_file)
dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings,
dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_ontology_mappings, variant_to_gene_mappings,
target_ontology,
eval_gene_mappings=eval_gene_mappings,
eval_xref_mappings=eval_xref_mappings,
eval_latest_mappings=eval_latest_mappings)
else:
dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_efo_mappings, variant_to_gene_mappings)
dataset = AnnotatingClinVarDataset(clinvar_xml_file, string_to_ontology_mappings, variant_to_gene_mappings,
target_ontology)
dataset.write(output_xml_file)
dataset.report()
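
To make the end result of trait annotation concrete, a small worked example combining `format_ontology_id` with the `XRef` construction from `add_ontology_mappings` (the MONDO identifier and `DB` value are illustrative):
```python
import xml.etree.ElementTree as ET

from cmat.output_generation.annotated_clinvar import OntologyMappedClinVarTrait, PROCESSOR

ontology_id = OntologyMappedClinVarTrait.format_ontology_id('http://purl.obolibrary.org/obo/MONDO_0007739')
print(ontology_id)  # MONDO:0007739

xref = ET.Element('XRef', attrib={'ID': ontology_id, 'DB': 'MONDO', 'Status': 'annotated', 'providedBy': PROCESSOR})
print(ET.tostring(xref).decode())
# <XRef ID="MONDO:0007739" DB="MONDO" Status="annotated" providedBy="CMAT" />
```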