Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove debugging, simplify method, remove duplicate method #50

Merged
merged 2 commits on Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/config/download_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ HUMAN_ISO:
url: http://skyhook.berkeleybop.org/silver-issue-325-gopreprocess/products/upstream_and_raw_data/goa_human_isoform-src.gaf.gz
MGI_XREF:
url: https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt
# drives the curation system - a minerva outage breaks this download; outages occur every other Wednesday
GO:
url: http://skyhook.berkeleybop.org/go-ontology-dev/ontology/go.json
GO_RELEASE:
url: http://skyhook.berkeleybop.org/release/ontology/go.json
url: http://skyhook.berkeleybop.org/go-ontology-dev/ontology/go.json
GOA_taxon_10090:
url: https://ftp.ebi.ac.uk/pub/databases/GO/goa/MOUSE/goa_mouse.gaf.gz
GOA_taxon_10090_ISOFORM:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@ def retrieve_ortho_map(self):
genes[pair.get("Gene2ID")].append(pair.get("Gene1ID"))
else:
genes[pair.get("Gene2ID")] = [pair.get("Gene1ID")]

return genes
20 changes: 13 additions & 7 deletions src/gopreprocess/file_processors/gaf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,16 +110,20 @@ def parse_ortho_gaf(self):
continue # remove annotations that don't have a subject in the namespaces we're interested in
if str(source_assoc.evidence.type) not in experimental_evidence_codes:
continue
if self.source is None and (source_assoc.provided_by == self.taxon_to_provider[self.target_taxon]
or source_assoc.provided_by == "GO_Central"
or source_assoc.provided_by == "GOC"):
continue
has_reference = any(reference.namespace == "PMID" for reference in source_assoc.evidence.has_supporting_reference)
if (
source_assoc.provided_by == self.taxon_to_provider[self.target_taxon]
or source_assoc.provided_by == "GO_Central"
):
continue # remove self-annotations
has_reference = any(
reference.namespace == "PMID" for reference in source_assoc.evidence.has_supporting_reference
)
if not has_reference:
counter = counter + 1
if str(source_assoc.object.id) in ["GO:0005515", "GO:0005488"]:
continue
if source_assoc.subject.id.namespace == "UniProtKB":
# TODO convert to report files
# check if the incoming HGNC identifier is in the map we made from UniProt to HGNC via
# the MGI xref file
if str(source_assoc.subject.id) not in self.uniprot_to_hgnc_map.keys():
Expand All @@ -128,7 +132,9 @@ def parse_ortho_gaf(self):
# if it's in the mapped dictionary, then we can replace the UniProt identifier with the
# HGNC identifier, formatting that as a Curie with separate Namespace and ID fields.
mapped_id = self.uniprot_to_hgnc_map[str(source_assoc.subject.id)]
source_assoc.subject.id = Curie(namespace=mapped_id.split(":")[0], identity=mapped_id.split(":")[1])
source_assoc.subject.id = Curie(
namespace=mapped_id.split(":")[0], identity=mapped_id.split(":")[1]
)
self.convertible_annotations.append(source_assoc)
return self.convertible_annotations

Expand Down Expand Up @@ -169,4 +175,4 @@ def parse_p2g_gaf(self):
if str(source_assoc.object.id) in ["GO:0005575", "GO:0008150", "GO:0003674"]:
continue # remove root terms
self.convertible_p2g_annotations.append(source_assoc)
return self.convertible_p2g_annotations
return self.convertible_p2g_annotations
28 changes: 22 additions & 6 deletions src/gopreprocess/ortho_annotation_creation_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@

import collections
import copy
import click
from datetime import datetime
from typing import List

import sys
import pandas as pd
import pystow
from gopreprocess.file_processors.ontology_processor import get_GO_aspector
Expand Down Expand Up @@ -44,9 +45,9 @@ def dump_converted_annotations(converted_target_annotations: List[List[str]], so
:type target_taxon: str

"""

# using pandas in order to take advantage of pystow in terms of file location and handling
df = pd.DataFrame(converted_target_annotations)
print(df.columns)
df = df.applymap(convert_curie_to_string)
# Deduplicate the rows
df_deduplicated = df.drop_duplicates()
Expand Down Expand Up @@ -196,6 +197,7 @@ def convert_annotations(self) -> None:

source_genes = OrthoProcessor(target_genes, ortho_path, self.target_taxon, self.source_taxon).genes


transformed = {}
for key, values in source_genes.items():
for value in values:
Expand Down Expand Up @@ -230,8 +232,8 @@ def convert_annotations(self) -> None:
# need in order to download and store the gene ontology JSON file used to create the closure in the
# GoAspector object.
go_aspector = get_GO_aspector("GO")

for annotation in source_annotations:
click.echo(annotation)
if str(annotation.subject.id) in source_gene_set:
# generate the target annotation based on the source annotation
new_annotations = self.generate_annotation(
Expand All @@ -243,9 +245,17 @@ def convert_annotations(self) -> None:
transformed_source_genes=transformed,
)
for new_annotation in new_annotations:
click.echo(new_annotation.to_gaf_2_2_tsv())
converted_target_annotations.append(new_annotation.to_gaf_2_2_tsv())

dump_converted_annotations(converted_target_annotations, source_taxon=self.source_taxon, target_taxon=self.target_taxon)
if converted_target_annotations:
dump_converted_annotations(converted_target_annotations,
source_taxon=self.source_taxon,
target_taxon=self.target_taxon)
else:
print("FAIL!: no annotations to dump!")
click.echo("No annotations were converted.")
sys.exit(1) # Exit with a non-zero status to indicate failure

def generate_annotation(
self,
Expand Down Expand Up @@ -281,7 +291,9 @@ def generate_annotation(

if str(annotation.subject.id) in source_genes.keys():
for gene in source_genes[str(annotation.subject.id)]:
if gene in transformed_source_genes and len(transformed_source_genes[gene]) > 1 and go_aspector.is_biological_process(str(annotation.object.id)):
if (gene in transformed_source_genes
and len(transformed_source_genes[gene]) > 1
and go_aspector.is_biological_process(str(annotation.object.id))):
output = (
"NON_1TO1_BP"
+ str(annotation.subject.id)
Expand All @@ -294,16 +306,20 @@ def generate_annotation(
+ " "
+ str(annotation.evidence.has_supporting_reference)
)
print("greater than 1 BP")
annotation_skipped.append(output)
else:
new_annotation = copy.deepcopy(annotation)
if str(annotation.subject.id) in hgnc_to_uniprot_map.keys():
print("HGNC to UniProt map", str(annotation.subject.id))
uniprot_id = hgnc_to_uniprot_map[str(annotation.subject.id)] # convert back to UniProtKB ID
uniprot_curie = Curie(namespace=uniprot_id.split(":")[0], identity=uniprot_id.split(":")[1])
new_annotation.evidence.with_support_from = [ConjunctiveSet(elements=[uniprot_curie])]
else:
new_annotation.evidence.with_support_from = [ConjunctiveSet(elements=[str(annotation.subject.id)])]
new_annotation.evidence.has_supporting_reference = [Curie(namespace="GO_REF", identity=self.ortho_reference)]
print("no HGNC to UniProt map", str(annotation.subject.id))
new_annotation.evidence.has_supporting_reference = [Curie(namespace="GO_REF",
identity=self.ortho_reference)]
# if there is only one human ortholog of the mouse gene and the annotation is not a biological
# process, then we add it, else we skip it. inferred from sequence similarity
new_annotation.evidence.type = Curie(namespace="ECO", identity=iso_eco_code.split(":")[1])
Expand Down
Loading