Merge pull request #675 from biolink/minimal_changes_for_gpad_gpi_2_0

Minimal changes for gpad gpi 2 0
biolink · May 17, 2024 · 798a99b · 798a99b
2 parents 44ad77a + dd89b63
commit 798a99b
Show file tree

Hide file tree

Showing 21 changed files with 10,490 additions and 2,058 deletions.
diff --git a/.github/workflows/make-tests.yaml b/.github/workflows/make-tests.yaml
@@ -9,7 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
+        python: [ "3.9", "3.10", "3.11" ]
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:

diff --git a/.gitignore b/.gitignore
@@ -49,6 +49,8 @@ coverage.xml
 *,cover
 .hypothesis/
 .pytest_cache/
+tests/resources/mgi.gpi
+tests/resources/mgi.gpi.gz
 
 # Translations
 *.mo

diff --git a/Makefile b/Makefile
@@ -24,13 +24,24 @@ foo:
 
 # only run local tests
 travis_test:
-	pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
-	       tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
-		   tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
-		   tests/test_goassociation_model.py tests/test_relations.py \
-		   tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
-		   tests/test_collections.py \
-		   tests/test_gocamgen.py
+	@if [ -d ".venv" ] && [ -f "pyproject.toml" ]; then \
+		echo "Running tests in Poetry environment..."; \
+		poetry run pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
+		tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
+		tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
+		tests/test_goassociation_model.py tests/test_relations.py \
+		tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
+		tests/test_collections.py \
+		tests/test_gocamgen.py; \
+	else \
+		pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
+		tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
+		tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
+		tests/test_goassociation_model.py tests/test_relations.py \
+		tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
+		tests/test_collections.py \
+		tests/test_gocamgen.py; \
+	fi
 
 cleandist:
 	rm dist/* || true

diff --git a/bin/README.md b/bin/README.md
@@ -1 +1,17 @@
 See [command line docs](http://ontobio.readthedocs.io/en/latest/commandline.html#commandline) on ReadTheDocs
+
+To test the validate.py "produce" command (the command that generates the final GPADs in the pipeline during the
+"mega make" stage, aka the "produces GAFs, GPADs, ttl" stage) on a particular source:
+
+```bash
+poetry install
+poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset mgi MGI
+poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset goa_chicken goa
+```
+
+
+To test whether a GAF file is valid (passes all the GORules):
+```bash
+poetry install
+poetry run python3 ontobio-parse-assocs.py --file [path_to_file.gaf] --format GAF -o mgi_valid.gaf --report-md mgi.report.md -r [path_to_go.json] -l all validate
+```
diff --git a/bin/ontobio-parse-assocs.py b/bin/ontobio-parse-assocs.py
@@ -34,6 +34,7 @@
 import sys
 import json
 import logging
+from typing import Dict, List
 
 def main():
     """

diff --git a/bin/validate.py b/bin/validate.py
diff --git a/ontobio/io/assocwriter.py b/ontobio/io/assocwriter.py
@@ -6,6 +6,7 @@
 import datetime
 import json
 import logging
+import click
 
 from typing import List, Union
 
@@ -108,18 +109,23 @@ def write(self, assocs, meta=None):
 GPAD_2_0 = "2.0"
 GPAD_1_2 = "1.2"
 
+
 class GpadWriter(AssocWriter):
     """
     Writes Associations in GPAD format
     """
-    def __init__(self, file=None, version=GPAD_1_2):
+    def __init__(self, file=None, version=None):
         self.file = file
+        click.echo("Writing GPAD version: {}".format(version))
         if version in [GPAD_1_2, GPAD_2_0]:
             self.version = version
         else:
             self.version = GPAD_1_2
 
-        self._write("!gpa-version: {}\n".format(self.version))
+        self._write("!gpad-version: {}\n".format(self.version))
+        click.echo("Writing GPAD version: {}".format(self.version))
+        self._write("!generated-by: {}\n".format("GO Central"))
+        self._write("!date-generated: {}\n".format(str(datetime.datetime.now().strftime("%Y-%m-%dT%H:%M"))))
         self.ecomap = ecomap.EcoMap()
 
     def as_tsv(self, assoc: Union[association.GoAssociation, dict]):
@@ -136,7 +142,6 @@ def as_tsv(self, assoc: Union[association.GoAssociation, dict]):
             return assoc.to_gpad_1_2_tsv()
 
 
-
 class GafWriter(AssocWriter):
     """
     Writes Associations in GAF format.
@@ -151,13 +156,13 @@ class GafWriter(AssocWriter):
 
     The only difference in 2.1 and 2.2 are how qualifiers (column 4) are handled.
     GAF 2.1 allows empty or only `NOT` qualifier values, and only allows
-    `colocalizes_with` and `contributes_to` as qualifer values. However in 2.2
+    `colocalizes_with` and `contributes_to` as qualifier values. However, in 2.2
     qualifier must *not* be empty and cannot have only `NOT` as it's a modifier
-    on existing qualifers. The set of allowed qualifiers in 2.2 is also expanded.
+    on existing qualifiers. The set of allowed qualifiers in 2.2 is also expanded.
 
     So if there's a mismatch between converting from an annotation and a GAF
     version then that annotation is just skipped and not written out with an
-    error message displayed. Mismatch occurances of this kind would appear if
+    error message displayed. Mismatch occurrences of this kind would appear if
     the incoming annotation has a qualifier in the 2.2 set, but 2.1 is being
     written out, or if the qualifier is empty and 2.2 is being written.
     """

diff --git a/ontobio/io/entitywriter.py b/ontobio/io/entitywriter.py
@@ -71,11 +71,16 @@ def write(self, entities, meta=None):
         for e in entities:
             self.write_entity(e)
 
+
 class GpiWriter(EntityWriter):
     """
-    Writes entities in GPI format
+    Writes entities in GPI 1.2 or 2.0 (https://github.com/geneontology/go-annotation/blob/master/specs/gpad-gpi-2-0.md) format
+
+    :param file: file
+    :param version: str
+
+    Takes an "entity" dictionary generated typically from a GoAssociation object
 
-    Takes an entity dictionary:
     {
         'id': id, (String)
         'label': db_object_symbol, (String)
@@ -89,29 +94,79 @@ class GpiWriter(EntityWriter):
         }
     }
     """
-    def __init__(self, file=None):
+    def __init__(self, file=None, version=None):
         self.file = file
+        self.version = version
         if self.file:
-            self.file.write("!gpi-version: 1.2\n")
+            if self.version == "2.0":
+                self.file.write("!gpi-version: 2.0\n")
+            else:
+                self.file.write("!gpi-version: 1.2\n")
 
     def write_entity(self, entity):
         """
         Write a single entity to a line in the output file
+
+        :param entity: dict ; typically a dictionary representing an instance of a GoAssociation object
+        The GPI output version (1.2 or 2.0) is read from self.version; this method takes no version argument.
+        :return: None
+
+               GPI 2.0 spec <-- entity attributes
+               
+            1. DB_Object_ID <-- entity.id (CURIE format)
+            2. DB_Object_symbol <-- entity.label
+            3. DB_Object_Name <-- entity.full_name
+            4. DB_Object_Synonyms <-- entity.synonyms
+            5. DB_Object_Type <-- entity.type
+            6. DB_Object_Taxon <-- entity.taxon
+            7. Encoded_by <-- does not appear in GAF file, this is optional in GPI
+            8. Parent_Protein <-- entity.parents # unclear if this is a list or a single value
+            9. Protein_Containing_Complex_Members <-- does not appear in GAF file, this is optional in GPI
+            10. DB_Xrefs <-- entity.xrefs
+            11. Gene_Product_Properties <-- entity.properties
+
+                GPI 1.2 spec <-- entity attributes
+
+            1. DB <-- entity.id.prefix
+            2. DB_Object_ID	 <-- entity.id.local_id
+            3. DB_Object_Symbol <-- entity.label
+            4. DB_Object_Name <-- entity.full_name
+            5. DB_Object_Synonym(s) <-- entity.synonyms
+            6. DB_Object_Type <-- entity.type
+            7. Taxon <-- entity.taxon
+            8. Parent_Object_ID <-- entity.parents # unclear if this is a list or a single value
+            9. DB_Xref(s) <-- entity.xrefs
+            10. Properties <-- entity.properties
+
         """
-        db, db_object_id = self._split_prefix(entity)
-        taxon = normalize_taxon(entity["taxon"]["id"])
-
-        vals = [
-            db,
-            db_object_id,
-            entity.get('label'),
-            entity.get('full_name'),
-            entity.get('synonyms'),
-            entity.get('type'),
-            taxon,
-            entity.get('parents'),
-            entity.get('xrefs'),
-            entity.get('properties')
-        ]
+
+        if self.version == "2.0":
+            vals = [
+                entity.get('id'),  # DB_Object_ID
+                entity.get('label'),  # DB_Object_symbol
+                entity.get('full_name'),  # DB_Object_Name
+                entity.get('synonyms'),  # DB_Object_Synonyms
+                entity.get('type'),  # DB_Object_Type
+                normalize_taxon(entity.get("taxon").get("id")),  # DB_Object_Taxon
+                "",  # Encoded_by
+                entity.get('parents'),  # Parent_Protein
+                "",  # Protein_Containing_Complex_Members
+                entity.get('xrefs'),  # DB_Xrefs
+                entity.get('properties')  # Gene_Product_Properties
+            ]
+        else:
+            prefix, local_id = self._split_prefix(entity)
+            vals = [
+                prefix,  # DB
+                local_id,  # DB_Object_ID
+                entity.get('label'),  # DB_Object_Symbol
+                entity.get('full_name'),  # DB_Object_Name
+                entity.get('synonyms'),  # DB_Object_Synonym(s)
+                entity.get('type'),  # DB_Object_Type
+                normalize_taxon(entity.get("taxon").get("id")),  # taxon
+                entity.get('parents'),  # Parent_Object_ID
+                entity.get('xrefs'),  # DB_Xref(s)
+                entity.get('properties')  # Properties
+            ]
 
         self._write_row(vals)
diff --git a/ontobio/io/gafgpibridge.py b/ontobio/io/gafgpibridge.py
@@ -3,6 +3,7 @@
 
 from ontobio.model.association import GoAssociation, gp_type_label_to_curie
 
+
 class Entity(dict):
 
     def __init__(self, d):
@@ -13,34 +14,36 @@ def __hash__(self):
         return hash(d)
 
 
+def convert_association(association):
+    """
+    'id' is already `join`ed in both the Association and the Entity,
+    so we don't have to worry about what that looks like. We assume
+    it's correct.
+    """
+
+    if isinstance(association, GoAssociation):
+        # print(json.dumps(association, indent=4))
+        gpi_obj = {
+            'id': str(association.subject.id),
+            'label': association.subject.label,  # db_object_symbol,
+            'full_name': association.subject.fullname,  # db_object_name,
+            'synonyms': association.subject.synonyms,
+            'type': [gp_type_label_to_curie(association.subject.type[0])], #db_object_type,
+            'parents': "",  # GAF does not have this field, but it's optional in GPI
+            'xrefs': "",  # GAF does not have this field, but it's optional in GPI
+            'taxon': {
+                'id': str(association.subject.taxon)
+            }
+        }
+        return Entity(gpi_obj)
+
+    return None
+
+
 class GafGpiBridge(object):
 
     def __init__(self):
         self.cache = []
 
-    def convert_association(self, association) -> Entity:
-        """
-        'id' is already `join`ed in both the Association and the Entity,
-        so we don't have to worry about what that looks like. We assume
-        it's correct.
-        """
-        if isinstance(association, GoAssociation):
-            # print(json.dumps(association, indent=4))
-            gpi_obj = {
-                'id': str(association.subject.id),
-                'label': association.subject.label,  # db_object_symbol,
-                'full_name': association.subject.fullname,  # db_object_name,
-                'synonyms': association.subject.synonyms,
-                'type': [gp_type_label_to_curie(association.subject.type[0])], #db_object_type,
-                'parents': "", # GAF does not have this field, but it's optional in GPI
-                'xrefs': "", # GAF does not have this field, but it's optional in GPI
-                'taxon': {
-                    'id': str(association.subject.taxon)
-                }
-            }
-            return Entity(gpi_obj)
-
-        return None
-
-    def entities(self) -> List[Entity]:
+    def entities(self):
         return list(self.cache)
diff --git a/ontobio/model/association.py b/ontobio/model/association.py
@@ -491,7 +491,7 @@ class GoAssociation:
     """
     The internal model used by the parsers and qc Rules engine that all annotations are parsed into.
 
-    If an annotation textual line cannot be parsed into a GoAssociation then it is not a well formed line.
+    If an annotation textual line cannot be parsed into a GoAssociation then it is not a well-formed line.
 
     This class provides several methods to convert this GoAssociation into other representations, like GAF and GPAD
     of each version, as well as the old style dictionary Association that this class replaced (for compatibility if needed).
@@ -501,7 +501,7 @@ class GoAssociation:
     """
     source_line: Optional[str]
     subject: Subject
-    relation: Curie # This is the relation Curie
+    relation: Curie  # This is the relation Curie
     object: Term
     negated: bool
     qualifiers: List[Curie]
@@ -644,6 +644,12 @@ def to_gpad_2_0_tsv(self) -> List:
         """
 
         props_list = ["{key}={value}".format(key=key, value=value) for key, value in self.properties]
+        gp_isoforms = None
+        if self.subject_extensions:
+            gp_isoforms = self.subject_extensions[0].term
+        if gp_isoforms:
+            self.subject.id = gp_isoforms
+
         return [
             str(self.subject.id),
             "NOT" if self.negated else "",
@@ -656,7 +662,7 @@ def to_gpad_2_0_tsv(self) -> List:
             ymd_str(self.date, "-"),
             self.provided_by,
             ConjunctiveSet.list_to_str(self.object_extensions,
-                conjunct_to_str=lambda conj: conj.display()),
+                                       conjunct_to_str=lambda conj: conj.display()),
             "|".join(props_list)
         ]