Skip to content

Commit

Permalink
Merge pull request #675 from biolink/minimal_changes_for_gpad_gpi_2_0
Browse files Browse the repository at this point in the history
Minimal changes for gpad gpi 2 0
  • Loading branch information
sierra-moxon authored May 17, 2024
2 parents 44ad77a + dd89b63 commit 798a99b
Show file tree
Hide file tree
Showing 21 changed files with 10,490 additions and 2,058 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/make-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
python: [ "3.9", "3.10", "3.11" ]

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ coverage.xml
*,cover
.hypothesis/
.pytest_cache/
tests/resources/mgi.gpi
tests/resources/mgi.gpi.gz

# Translations
*.mo
Expand Down
25 changes: 18 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,24 @@ foo:

# only run local tests
travis_test:
pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
tests/test_goassociation_model.py tests/test_relations.py \
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
tests/test_collections.py \
tests/test_gocamgen.py
@if [ -d ".venv" ] && [ -f "pyproject.toml" ]; then \
echo "Running tests in Poetry environment..."; \
poetry run pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
tests/test_goassociation_model.py tests/test_relations.py \
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
tests/test_collections.py \
tests/test_gocamgen.py; \
else \
pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
tests/test_goassociation_model.py tests/test_relations.py \
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
tests/test_collections.py \
tests/test_gocamgen.py; \
fi

cleandist:
rm dist/* || true
Expand Down
16 changes: 16 additions & 0 deletions bin/README.md
Original file line number Diff line number Diff line change
@@ -1 +1,17 @@
See [command line docs](http://ontobio.readthedocs.io/en/latest/commandline.html#commandline) on ReadTheDocs

To test the `validate produce` command from validate.py (the command that produces the final GPADs in the pipeline
during the "mega make", a.k.a. the "produces GAFs, GPADs, ttl" stage) on a particular source:

```bash
poetry install
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset mgi MGI
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset goa_chicken goa
```


To test whether a GAF file is valid (passes all the GORules):
```bash
poetry install
poetry run python3 ontobio-parse-assocs.py --file [path_to_file.gaf] --format GAF -o mgi_valid.gaf --report-md mgi.report.md -r [path_to_go.json] -l all validate
```
1 change: 1 addition & 0 deletions bin/ontobio-parse-assocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import sys
import json
import logging
from typing import Dict, List

def main():
"""
Expand Down
315 changes: 237 additions & 78 deletions bin/validate.py

Large diffs are not rendered by default.

17 changes: 11 additions & 6 deletions ontobio/io/assocwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import datetime
import json
import logging
import click

from typing import List, Union

Expand Down Expand Up @@ -108,18 +109,23 @@ def write(self, assocs, meta=None):
GPAD_2_0 = "2.0"
GPAD_1_2 = "1.2"


class GpadWriter(AssocWriter):
"""
Writes Associations in GPAD format
"""
def __init__(self, file=None, version=GPAD_1_2):
def __init__(self, file=None, version=None):
self.file = file
click.echo("Writing GPAD version: {}".format(version))
if version in [GPAD_1_2, GPAD_2_0]:
self.version = version
else:
self.version = GPAD_1_2

self._write("!gpa-version: {}\n".format(self.version))
self._write("!gpad-version: {}\n".format(self.version))
click.echo("Writing GPAD version: {}".format(self.version))
self._write("!generated-by: {}\n".format("GO Central"))
self._write("!date-generated: {}\n".format(str(datetime.datetime.now().strftime("%Y-%m-%dT%H:%M"))))
self.ecomap = ecomap.EcoMap()

def as_tsv(self, assoc: Union[association.GoAssociation, dict]):
Expand All @@ -136,7 +142,6 @@ def as_tsv(self, assoc: Union[association.GoAssociation, dict]):
return assoc.to_gpad_1_2_tsv()



class GafWriter(AssocWriter):
"""
Writes Associations in GAF format.
Expand All @@ -151,13 +156,13 @@ class GafWriter(AssocWriter):
The only difference in 2.1 and 2.2 are how qualifiers (column 4) are handled.
GAF 2.1 allows empty or only `NOT` qualifier values, and only allows
`colocalizes_with` and `contributes_to` as qualifer values. However in 2.2
`colocalizes_with` and `contributes_to` as qualifier values. However, in 2.2
qualifier must *not* be empty and cannot have only `NOT` as it's a modifier
on existing qualifers. The set of allowed qualifiers in 2.2 is also expanded.
on existing qualifiers. The set of allowed qualifiers in 2.2 is also expanded.
So if there's a mismatch between converting from an annotation and a GAF
version then that annotation is just skipped and not written out with an
error message displayed. Mismatch occurances of this kind would appear if
error message displayed. Mismatch occurrences of this kind would appear if
the incoming annotation has a qualifier in the 2.2 set, but 2.1 is being
written out, or if the qualifier is empty and 2.2 is being written.
"""
Expand Down
93 changes: 74 additions & 19 deletions ontobio/io/entitywriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,16 @@ def write(self, entities, meta=None):
for e in entities:
self.write_entity(e)


class GpiWriter(EntityWriter):
"""
Writes entities in GPI format
Writes entities in GPI 1.2 or 2.0 (https://github.com/geneontology/go-annotation/blob/master/specs/gpad-gpi-2-0.md) format
:param file: file
:param version: str
Takes an "entity" dictionary generated typically from a GoAssociation object
Takes an entity dictionary:
{
'id': id, (String)
'label': db_object_symbol, (String)
Expand All @@ -89,29 +94,79 @@ class GpiWriter(EntityWriter):
}
}
"""
def __init__(self, file=None):
def __init__(self, file=None, version=None):
self.file = file
self.version = version
if self.file:
self.file.write("!gpi-version: 1.2\n")
if self.version == "2.0":
self.file.write("!gpi-version: 2.0\n")
else:
self.file.write("!gpi-version: 1.2\n")

def write_entity(self, entity):
"""
Write a single entity to a line in the output file
:param entity: dict ; typically a dictionary representing an instance of a GoAssociation object
The GPI version written (1.2 or 2.0) is selected by self.version, which is set in the constructor; this method takes no version argument.
:return: None
GPI 2.0 spec <-- entity attributes
1. DB_Object_ID <-- entity.id (CURIE format)
2. DB_Object_symbol <-- entity.label
3. DB_Object_Name <-- entity.full_name
4. DB_Object_Synonyms <-- entity.synonyms
5. DB_Object_Type <-- entity.type
6. DB_Object_Taxon <-- entity.taxon
7. Encoded_by <-- does not appear in GAF file, this is optional in GPI
8. Parent_Protein <-- entity.parents # unclear if this is a list or a single value
9. Protein_Containing_Complex_Members <-- does not appear in GAF file, this is optional in GPI
10. DB_Xrefs <-- entity.xrefs
11. Gene_Product_Properties <-- entity.properties
GPI 1.2 spec <-- entity attributes
1. DB <-- entity.id.prefix
2. DB_Object_ID <-- entity.id.local_id
3. DB_Object_Symbol <-- entity.label
4. DB_Object_Name <-- entity.full_name
5. DB_Object_Synonym(s) <-- entity.synonyms
6. DB_Object_Type <-- entity.type
7. Taxon <-- entity.taxon
8. Parent_Object_ID <-- entity.parents # unclear if this is a list or a single value
9. DB_Xref(s) <-- entity.xrefs
10. Properties <-- entity.properties
"""
db, db_object_id = self._split_prefix(entity)
taxon = normalize_taxon(entity["taxon"]["id"])

vals = [
db,
db_object_id,
entity.get('label'),
entity.get('full_name'),
entity.get('synonyms'),
entity.get('type'),
taxon,
entity.get('parents'),
entity.get('xrefs'),
entity.get('properties')
]

if self.version == "2.0":
vals = [
entity.get('id'), # DB_Object_ID
entity.get('label'), # DB_Object_symbol
entity.get('full_name'), # DB_Object_Name
entity.get('synonyms'), # DB_Object_Synonyms
entity.get('type'), # DB_Object_Type
normalize_taxon(entity.get("taxon").get("id")), # DB_Object_Taxon
"", # Encoded_by
entity.get('parents'), # Parent_Protein
"", # Protein_Containing_Complex_Members
entity.get('xrefs'), # DB_Xrefs
entity.get('properties') # Gene_Product_Properties
]
else:
prefix, local_id = self._split_prefix(entity)
vals = [
prefix, # DB
local_id, # DB_Object_ID
entity.get('label'), # DB_Object_Symbol
entity.get('full_name'), # DB_Object_Name
entity.get('synonyms'), # DB_Object_Synonym(s)
entity.get('type'), # DB_Object_Type
normalize_taxon(entity.get("taxon").get("id")), # taxon
entity.get('parents'), # Parent_Object_ID
entity.get('xrefs'), # DB_Xref(s)
entity.get('properties') # Properties
]

self._write_row(vals)
53 changes: 28 additions & 25 deletions ontobio/io/gafgpibridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from ontobio.model.association import GoAssociation, gp_type_label_to_curie


class Entity(dict):

def __init__(self, d):
Expand All @@ -13,34 +14,36 @@ def __hash__(self):
return hash(d)


def convert_association(association):
"""
'id' is already `join`ed in both the Association and the Entity,
so we don't have to worry about what that looks like. We assume
it's correct.
"""

if isinstance(association, GoAssociation):
# print(json.dumps(association, indent=4))
gpi_obj = {
'id': str(association.subject.id),
'label': association.subject.label, # db_object_symbol,
'full_name': association.subject.fullname, # db_object_name,
'synonyms': association.subject.synonyms,
'type': [gp_type_label_to_curie(association.subject.type[0])], #db_object_type,
'parents': "", # GAF does not have this field, but it's optional in GPI
'xrefs': "", # GAF does not have this field, but it's optional in GPI
'taxon': {
'id': str(association.subject.taxon)
}
}
return Entity(gpi_obj)

return None


class GafGpiBridge(object):

def __init__(self):
self.cache = []

def convert_association(self, association) -> Entity:
"""
'id' is already `join`ed in both the Association and the Entity,
so we don't have to worry about what that looks like. We assume
it's correct.
"""
if isinstance(association, GoAssociation):
# print(json.dumps(association, indent=4))
gpi_obj = {
'id': str(association.subject.id),
'label': association.subject.label, # db_object_symbol,
'full_name': association.subject.fullname, # db_object_name,
'synonyms': association.subject.synonyms,
'type': [gp_type_label_to_curie(association.subject.type[0])], #db_object_type,
'parents': "", # GAF does not have this field, but it's optional in GPI
'xrefs': "", # GAF does not have this field, but it's optional in GPI
'taxon': {
'id': str(association.subject.taxon)
}
}
return Entity(gpi_obj)

return None

def entities(self) -> List[Entity]:
def entities(self):
return list(self.cache)
12 changes: 9 additions & 3 deletions ontobio/model/association.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ class GoAssociation:
"""
The internal model used by the parsers and qc Rules engine that all annotations are parsed into.
If an annotation textual line cannot be parsed into a GoAssociation then it is not a well formed line.
If an annotation textual line cannot be parsed into a GoAssociation then it is not a well-formed line.
This class provides several methods to convert this GoAssociation into other representations, like GAF and GPAD
of each version, as well as the old style dictionary Association that this class replaced (for compatibility if needed).
Expand All @@ -501,7 +501,7 @@ class GoAssociation:
"""
source_line: Optional[str]
subject: Subject
relation: Curie # This is the relation Curie
relation: Curie # This is the relation Curie
object: Term
negated: bool
qualifiers: List[Curie]
Expand Down Expand Up @@ -644,6 +644,12 @@ def to_gpad_2_0_tsv(self) -> List:
"""

props_list = ["{key}={value}".format(key=key, value=value) for key, value in self.properties]
gp_isoforms = None
if self.subject_extensions:
gp_isoforms = self.subject_extensions[0].term
if gp_isoforms:
self.subject.id = gp_isoforms

return [
str(self.subject.id),
"NOT" if self.negated else "",
Expand All @@ -656,7 +662,7 @@ def to_gpad_2_0_tsv(self) -> List:
ymd_str(self.date, "-"),
self.provided_by,
ConjunctiveSet.list_to_str(self.object_extensions,
conjunct_to_str=lambda conj: conj.display()),
conjunct_to_str=lambda conj: conj.display()),
"|".join(props_list)
]

Expand Down
Loading

0 comments on commit 798a99b

Please sign in to comment.