Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove debugging, simplify method, remove duplicate method #50

Merged
merged 2 commits on Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/config/download_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ HUMAN_ISO:
url: http://skyhook.berkeleybop.org/silver-issue-325-gopreprocess/products/upstream_and_raw_data/goa_human_isoform-src.gaf.gz
MGI_XREF:
url: https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt
# drives the curation system - a minerva outage breaks this download; outages occur every other Wednesday
GO:
url: http://skyhook.berkeleybop.org/go-ontology-dev/ontology/go.json
GO_RELEASE:
url: http://skyhook.berkeleybop.org/release/ontology/go.json
url: http://skyhook.berkeleybop.org/go-ontology-dev/ontology/go.json
GOA_taxon_10090:
url: https://ftp.ebi.ac.uk/pub/databases/GO/goa/MOUSE/goa_mouse.gaf.gz
GOA_taxon_10090_ISOFORM:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@ def retrieve_ortho_map(self):
genes[pair.get("Gene2ID")].append(pair.get("Gene1ID"))
else:
genes[pair.get("Gene2ID")] = [pair.get("Gene1ID")]

return genes
20 changes: 13 additions & 7 deletions src/gopreprocess/file_processors/gaf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,16 +110,20 @@ def parse_ortho_gaf(self):
continue # remove annotations that don't have a subject in the namespaces we're interested in
if str(source_assoc.evidence.type) not in experimental_evidence_codes:
continue
if self.source is None and (source_assoc.provided_by == self.taxon_to_provider[self.target_taxon]
or source_assoc.provided_by == "GO_Central"
or source_assoc.provided_by == "GOC"):
continue
has_reference = any(reference.namespace == "PMID" for reference in source_assoc.evidence.has_supporting_reference)
if (
source_assoc.provided_by == self.taxon_to_provider[self.target_taxon]
or source_assoc.provided_by == "GO_Central"
):
continue # remove self-annotations
has_reference = any(
reference.namespace == "PMID" for reference in source_assoc.evidence.has_supporting_reference
)
if not has_reference:
counter = counter + 1
if str(source_assoc.object.id) in ["GO:0005515", "GO:0005488"]:
continue
if source_assoc.subject.id.namespace == "UniProtKB":
# TODO convert to report files
# check if the incoming HGNC identifier is in the map we made from UniProt to HGNC via
# the MGI xref file
if str(source_assoc.subject.id) not in self.uniprot_to_hgnc_map.keys():
Expand All @@ -128,7 +132,9 @@ def parse_ortho_gaf(self):
# if it's in the mapped dictionary, then we can replace the UniProt identifier with the
# HGNC identifier, formatting that as a Curie with separate Namespace and ID fields.
mapped_id = self.uniprot_to_hgnc_map[str(source_assoc.subject.id)]
source_assoc.subject.id = Curie(namespace=mapped_id.split(":")[0], identity=mapped_id.split(":")[1])
source_assoc.subject.id = Curie(
namespace=mapped_id.split(":")[0], identity=mapped_id.split(":")[1]
)
self.convertible_annotations.append(source_assoc)
return self.convertible_annotations

Expand Down Expand Up @@ -169,4 +175,4 @@ def parse_p2g_gaf(self):
if str(source_assoc.object.id) in ["GO:0005575", "GO:0008150", "GO:0003674"]:
continue # remove root terms
self.convertible_p2g_annotations.append(source_assoc)
return self.convertible_p2g_annotations
return self.convertible_p2g_annotations
28 changes: 22 additions & 6 deletions src/gopreprocess/ortho_annotation_creation_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@

import collections
import copy
import click
from datetime import datetime
from typing import List

import sys
import pandas as pd
import pystow
from gopreprocess.file_processors.ontology_processor import get_GO_aspector
Expand Down Expand Up @@ -44,9 +45,9 @@ def dump_converted_annotations(converted_target_annotations: List[List[str]], so
:type target_taxon: str

"""

# using pandas in order to take advantage of pystow in terms of file location and handling
df = pd.DataFrame(converted_target_annotations)
print(df.columns)
df = df.applymap(convert_curie_to_string)
# Deduplicate the rows
df_deduplicated = df.drop_duplicates()
Expand Down Expand Up @@ -196,6 +197,7 @@ def convert_annotations(self) -> None:

source_genes = OrthoProcessor(target_genes, ortho_path, self.target_taxon, self.source_taxon).genes


transformed = {}
for key, values in source_genes.items():
for value in values:
Expand Down Expand Up @@ -230,8 +232,8 @@ def convert_annotations(self) -> None:
# need in order to download and store the gene ontology JSON file used to create the closure in the
# GoAspector object.
go_aspector = get_GO_aspector("GO")

for annotation in source_annotations:
click.echo(annotation)
if str(annotation.subject.id) in source_gene_set:
# generate the target annotation based on the source annotation
new_annotations = self.generate_annotation(
Expand All @@ -243,9 +245,17 @@ def convert_annotations(self) -> None:
transformed_source_genes=transformed,
)
for new_annotation in new_annotations:
click.echo(new_annotation.to_gaf_2_2_tsv())
converted_target_annotations.append(new_annotation.to_gaf_2_2_tsv())

dump_converted_annotations(converted_target_annotations, source_taxon=self.source_taxon, target_taxon=self.target_taxon)
if converted_target_annotations:
dump_converted_annotations(converted_target_annotations,
source_taxon=self.source_taxon,
target_taxon=self.target_taxon)
else:
print("FAIL!: no annotations to dump!")
click.echo("No annotations were converted.")
sys.exit(1) # Exit with a non-zero status to indicate failure

def generate_annotation(
self,
Expand Down Expand Up @@ -281,7 +291,9 @@ def generate_annotation(

if str(annotation.subject.id) in source_genes.keys():
for gene in source_genes[str(annotation.subject.id)]:
if gene in transformed_source_genes and len(transformed_source_genes[gene]) > 1 and go_aspector.is_biological_process(str(annotation.object.id)):
if (gene in transformed_source_genes
and len(transformed_source_genes[gene]) > 1
and go_aspector.is_biological_process(str(annotation.object.id))):
output = (
"NON_1TO1_BP"
+ str(annotation.subject.id)
Expand All @@ -294,16 +306,20 @@ def generate_annotation(
+ " "
+ str(annotation.evidence.has_supporting_reference)
)
print("greater than 1 BP")
annotation_skipped.append(output)
else:
new_annotation = copy.deepcopy(annotation)
if str(annotation.subject.id) in hgnc_to_uniprot_map.keys():
print("HGNC to UniProt map", str(annotation.subject.id))
uniprot_id = hgnc_to_uniprot_map[str(annotation.subject.id)] # convert back to UniProtKB ID
uniprot_curie = Curie(namespace=uniprot_id.split(":")[0], identity=uniprot_id.split(":")[1])
new_annotation.evidence.with_support_from = [ConjunctiveSet(elements=[uniprot_curie])]
else:
new_annotation.evidence.with_support_from = [ConjunctiveSet(elements=[str(annotation.subject.id)])]
new_annotation.evidence.has_supporting_reference = [Curie(namespace="GO_REF", identity=self.ortho_reference)]
print("no HGNC to UniProt map", str(annotation.subject.id))
new_annotation.evidence.has_supporting_reference = [Curie(namespace="GO_REF",
identity=self.ortho_reference)]
# if there is only one human ortholog of the mouse gene and the annotation is not a biological
# process, then we add it, else we skip it. inferred from sequence similarity
new_annotation.evidence.type = Curie(namespace="ECO", identity=iso_eco_code.split(":")[1])
Expand Down
Loading