Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minimal changes for gpad gpi 2 0 #675

Merged
merged 16 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/make-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
python: [ "3.9", "3.10", "3.11" ]

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ coverage.xml
*,cover
.hypothesis/
.pytest_cache/
tests/resources/mgi.gpi
tests/resources/mgi.gpi.gz

# Translations
*.mo
Expand Down
25 changes: 18 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,24 @@ foo:

# only run local tests
travis_test:
pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
tests/test_goassociation_model.py tests/test_relations.py \
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
tests/test_collections.py \
tests/test_gocamgen.py
@if [ -d ".venv" ] && [ -f "pyproject.toml" ]; then \
echo "Running tests in Poetry environment..."; \
poetry run pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
tests/test_goassociation_model.py tests/test_relations.py \
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
tests/test_collections.py \
tests/test_gocamgen.py; \
else \
pytest tests/test_*local*.py tests/test_*parse*.py tests/test*writer*.py tests/test_qc.py \
tests/test_rdfgen.py tests/test_phenosim_engine.py tests/test_ontol.py \
tests/test_validation_rules.py tests/unit/test_annotation_scorer.py \
tests/test_goassociation_model.py tests/test_relations.py \
tests/unit/test_golr_search_query.py tests/unit/test_owlsim2_api.py \
tests/test_collections.py \
tests/test_gocamgen.py; \
fi

cleandist:
rm dist/* || true
Expand Down
16 changes: 16 additions & 0 deletions bin/README.md
Original file line number Diff line number Diff line change
@@ -1 +1,17 @@
See [command line docs](http://ontobio.readthedocs.io/en/latest/commandline.html#commandline) on ReadTheDocs

To test the validate.py "validate" command (invoked below as `validate produce`), which generates the final GPADs in the pipeline
during the "mega make" (aka the "produces GAFs, GPADs, ttl" stage), on a particular source:

```bash
poetry install
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset mgi MGI
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/[PIPELINE_BRANCH_NAME]/" --only-dataset goa_chicken goa
```


To test whether a GAF file is valid (passes all the GORules):
```bash
poetry install
poetry run python3 ontobio-parse-assocs.py --file [path_to_file.gaf] --format GAF -o mgi_valid.gaf --report-md mgi.report.md -r [path_to_go.json] -l all validate
```
1 change: 1 addition & 0 deletions bin/ontobio-parse-assocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import sys
import json
import logging
from typing import Dict, List

def main():
"""
Expand Down
292 changes: 215 additions & 77 deletions bin/validate.py

Large diffs are not rendered by default.

17 changes: 11 additions & 6 deletions ontobio/io/assocwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import datetime
import json
import logging
import click

from typing import List, Union

Expand Down Expand Up @@ -108,18 +109,23 @@ def write(self, assocs, meta=None):
GPAD_2_0 = "2.0"
GPAD_1_2 = "1.2"


class GpadWriter(AssocWriter):
"""
Writes Associations in GPAD format
"""
def __init__(self, file=None, version=GPAD_1_2):
def __init__(self, file=None, version=None):
self.file = file
click.echo("Writing GPAD version: {}".format(version))
if version in [GPAD_1_2, GPAD_2_0]:
self.version = version
else:
self.version = GPAD_1_2

self._write("!gpa-version: {}\n".format(self.version))
self._write("!gpad-version: {}\n".format(self.version))
click.echo("Writing GPAD version: {}".format(self.version))
self._write("!generated-by: {}\n".format("GO Central"))
self._write("!date-generated: {}\n".format(str(datetime.datetime.now().strftime("%Y-%m-%dT%H:%M"))))
self.ecomap = ecomap.EcoMap()

def as_tsv(self, assoc: Union[association.GoAssociation, dict]):
Expand All @@ -136,7 +142,6 @@ def as_tsv(self, assoc: Union[association.GoAssociation, dict]):
return assoc.to_gpad_1_2_tsv()



class GafWriter(AssocWriter):
"""
Writes Associations in GAF format.
Expand All @@ -151,13 +156,13 @@ class GafWriter(AssocWriter):

The only difference between 2.1 and 2.2 is how qualifiers (column 4) are handled.
GAF 2.1 allows empty or only `NOT` qualifier values, and only allows
`colocalizes_with` and `contributes_to` as qualifer values. However in 2.2
`colocalizes_with` and `contributes_to` as qualifier values. However, in 2.2
qualifier must *not* be empty and cannot have only `NOT` as it's a modifier
on existing qualifers. The set of allowed qualifiers in 2.2 is also expanded.
on existing qualifiers. The set of allowed qualifiers in 2.2 is also expanded.

So if there's a mismatch between converting from an annotation and a GAF
version then that annotation is just skipped and not written out with an
error message displayed. Mismatch occurances of this kind would appear if
error message displayed. Mismatch occurrences of this kind would appear if
the incoming annotation has a qualifier in the 2.2 set, but 2.1 is being
written out, or if the qualifier is empty and 2.2 is being written.
"""
Expand Down
93 changes: 74 additions & 19 deletions ontobio/io/entitywriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,16 @@ def write(self, entities, meta=None):
for e in entities:
self.write_entity(e)


class GpiWriter(EntityWriter):
"""
Writes entities in GPI format
Writes entities in GPI 1.2 or 2.0 (https://github.com/geneontology/go-annotation/blob/master/specs/gpad-gpi-2-0.md) format

:param file: file
:param version: str

Takes an "entity" dictionary, typically generated from a GoAssociation object

Takes an entity dictionary:
{
'id': id, (String)
'label': db_object_symbol, (String)
Expand All @@ -89,29 +94,79 @@ class GpiWriter(EntityWriter):
}
}
"""
def __init__(self, file=None):
def __init__(self, file=None, version=None):
self.file = file
self.version = version
if self.file:
self.file.write("!gpi-version: 1.2\n")
if self.version == "2.0":
self.file.write("!gpi-version: 2.0\n")
else:
self.file.write("!gpi-version: 1.2\n")

def write_entity(self, entity):
"""
Write a single entity to a line in the output file

:param entity: dict ; typically a dictionary representing an instance of a GoAssociation object
The GPI output version is taken from self.version: "2.0" writes the GPI 2.0 columns, any other value writes the GPI 1.2 columns.
:return: None

GPI 2.0 spec <-- entity attributes

1. DB_Object_ID <-- entity.id (CURIE format)
2. DB_Object_symbol <-- entity.label
3. DB_Object_Name <-- entity.full_name
4. DB_Object_Synonyms <-- entity.synonyms
5. DB_Object_Type <-- entity.type
6. DB_Object_Taxon <-- entity.taxon
7. Encoded_by <-- does not appear in GAF file, this is optional in GPI
8. Parent_Protein <-- entity.parents # unclear if this is a list or a single value
9. Protein_Containing_Complex_Members <-- does not appear in GAF file, this is optional in GPI
10. DB_Xrefs <-- entity.xrefs
11. Gene_Product_Properties <-- entity.properties

GPI 1.2 spec <-- entity attributes

1. DB <-- entity.id.prefix
2. DB_Object_ID <-- entity.id.local_id
3. DB_Object_Symbol <-- entity.label
4. DB_Object_Name <-- entity.full_name
5. DB_Object_Synonym(s) <-- entity.synonyms
6. DB_Object_Type <-- entity.type
7. Taxon <-- entity.taxon
8. Parent_Object_ID <-- entity.parents # unclear if this is a list or a single value
9. DB_Xref(s) <-- entity.xrefs
10. Properties <-- entity.properties

"""
db, db_object_id = self._split_prefix(entity)
taxon = normalize_taxon(entity["taxon"]["id"])

vals = [
db,
db_object_id,
entity.get('label'),
entity.get('full_name'),
entity.get('synonyms'),
entity.get('type'),
taxon,
entity.get('parents'),
entity.get('xrefs'),
entity.get('properties')
]

if self.version == "2.0":
vals = [
entity.get('id'), # DB_Object_ID
entity.get('label'), # DB_Object_symbol
entity.get('full_name'), # DB_Object_Name
entity.get('synonyms'), # DB_Object_Synonyms
entity.get('type'), # DB_Object_Type
normalize_taxon(entity.get("taxon").get("id")), # DB_Object_Taxon
"", # Encoded_by
entity.get('parents'), # Parent_Protein
"", # Protein_Containing_Complex_Members
entity.get('xrefs'), # DB_Xrefs
entity.get('properties') # Gene_Product_Properties
]
else:
prefix, local_id = self._split_prefix(entity)
vals = [
prefix, # DB
local_id, # DB_Object_ID
entity.get('label'), # DB_Object_Symbol
entity.get('full_name'), # DB_Object_Name
entity.get('synonyms'), # DB_Object_Synonym(s)
entity.get('type'), # DB_Object_Type
normalize_taxon(entity.get("taxon").get("id")), # taxon
entity.get('parents'), # Parent_Object_ID
entity.get('xrefs'), # DB_Xref(s)
entity.get('properties') # Properties
]

self._write_row(vals)
53 changes: 28 additions & 25 deletions ontobio/io/gafgpibridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from ontobio.model.association import GoAssociation, gp_type_label_to_curie


class Entity(dict):

def __init__(self, d):
Expand All @@ -13,34 +14,36 @@ def __hash__(self):
return hash(d)


def convert_association(association):
"""
'id' is already `join`ed in both the Association and the Entity,
so we don't have to worry about what that looks like. We assume
it's correct.
"""

if isinstance(association, GoAssociation):
# print(json.dumps(association, indent=4))
gpi_obj = {
'id': str(association.subject.id),
'label': association.subject.label, # db_object_symbol,
'full_name': association.subject.fullname, # db_object_name,
'synonyms': association.subject.synonyms,
'type': [gp_type_label_to_curie(association.subject.type[0])], #db_object_type,
'parents': "", # GAF does not have this field, but it's optional in GPI
'xrefs': "", # GAF does not have this field, but it's optional in GPI
'taxon': {
'id': str(association.subject.taxon)
}
}
return Entity(gpi_obj)

return None


class GafGpiBridge(object):

def __init__(self):
self.cache = []

def convert_association(self, association) -> Entity:
"""
'id' is already `join`ed in both the Association and the Entity,
so we don't have to worry about what that looks like. We assume
it's correct.
"""
if isinstance(association, GoAssociation):
# print(json.dumps(association, indent=4))
gpi_obj = {
'id': str(association.subject.id),
'label': association.subject.label, # db_object_symbol,
'full_name': association.subject.fullname, # db_object_name,
'synonyms': association.subject.synonyms,
'type': [gp_type_label_to_curie(association.subject.type[0])], #db_object_type,
'parents': "", # GAF does not have this field, but it's optional in GPI
'xrefs': "", # GAF does not have this field, but it's optional in GPI
'taxon': {
'id': str(association.subject.taxon)
}
}
return Entity(gpi_obj)

return None

def entities(self) -> List[Entity]:
def entities(self):
return list(self.cache)
12 changes: 9 additions & 3 deletions ontobio/model/association.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ class GoAssociation:
"""
The internal model used by the parsers and qc Rules engine that all annotations are parsed into.

If an annotation textual line cannot be parsed into a GoAssociation then it is not a well formed line.
If an annotation textual line cannot be parsed into a GoAssociation then it is not a well-formed line.

This class provides several methods to convert this GoAssociation into other representations, like GAF and GPAD
of each version, as well as the old style dictionary Association that this class replaced (for compatibility if needed).
Expand All @@ -501,7 +501,7 @@ class GoAssociation:
"""
source_line: Optional[str]
subject: Subject
relation: Curie # This is the relation Curie
relation: Curie # This is the relation Curie
object: Term
negated: bool
qualifiers: List[Curie]
Expand Down Expand Up @@ -644,6 +644,12 @@ def to_gpad_2_0_tsv(self) -> List:
"""

props_list = ["{key}={value}".format(key=key, value=value) for key, value in self.properties]
gp_isoforms = None
if self.subject_extensions:
gp_isoforms = self.subject_extensions[0].term
if gp_isoforms:
self.subject.id = gp_isoforms

return [
str(self.subject.id),
"NOT" if self.negated else "",
Expand All @@ -656,7 +662,7 @@ def to_gpad_2_0_tsv(self) -> List:
ymd_str(self.date, "-"),
self.provided_by,
ConjunctiveSet.list_to_str(self.object_extensions,
conjunct_to_str=lambda conj: conj.display()),
conjunct_to_str=lambda conj: conj.display()),
"|".join(props_list)
]

Expand Down
Loading
Loading