
Go site 2210 gorule 0000027 must check dbs are in the db xref file #677

Merged
9 changes: 6 additions & 3 deletions bin/ontobio-parse-assocs.py
@@ -146,11 +146,13 @@ def main():
rule_set = assocparser.RuleSet.ALL

goref_metadata = None
ref_species_metadata = None
ref_species_metadata = None
db_type_name_regex_id_syntax = None
if args.metadata_dir:
absolute_metadata = os.path.abspath(args.metadata_dir)
goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs"))
ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id")
db_type_name_regex_id_syntax = metadata.database_type_name_regex_id_syntax(absolute_metadata)

retracted_pub_set = None
if args.retracted_pub_set:
@@ -173,6 +175,7 @@ def main():
gpi_authority_path=args.gpi,
goref_metadata=goref_metadata,
ref_species_metadata=ref_species_metadata,
db_type_name_regex_id_syntax=db_type_name_regex_id_syntax,
retracted_pub_set=retracted_pub_set,
rule_set=rule_set
)
@@ -197,8 +200,8 @@ def main():
outfh = None
if args.outfile is not None:
two_mb = 2097152
outfh = open(args.outfile, "w", buffering=two_mb)
func(ont, args.file, outfh, p, args)
outfh = open(args.outfile, "w", buffering=two_mb)
func(ont, args.file, outfh, p, args)
if filtered_evidence_file:
filtered_evidence_file.close()

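For orientation, the wiring added above reads: resolve the metadata directory the script receives as args.metadata_dir, build a lookup of compiled id_syntax patterns from db-xrefs.yaml, and hand that lookup to the parser config. A minimal sketch of the same flow outside the script (the metadata path is a placeholder, not a real checkout):

    import os

    from ontobio.io import assocparser
    from ontobio.validation import metadata

    # Placeholder path to a go-site style metadata directory containing db-xrefs.yaml.
    absolute_metadata = os.path.abspath("go-site/metadata")

    # {db prefix: {type_name: compiled id_syntax regex}}, built by the new helper further down.
    db_type_name_regex_id_syntax = metadata.database_type_name_regex_id_syntax(absolute_metadata)

    config = assocparser.AssocParserConfig(
        db_type_name_regex_id_syntax=db_type_name_regex_id_syntax)
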
20 changes: 19 additions & 1 deletion ontobio/io/assocparser.py
@@ -234,6 +234,7 @@ def __init__(self,
ref_species_metadata=None,
group_metadata=None,
dbxrefs=None,
db_type_name_regex_id_syntax=None,
retracted_pub_set=None,
suppress_rule_reporting_tags=[],
annotation_inferences=None,
@@ -259,6 +260,7 @@ def __init__(self,
self.goref_metadata = goref_metadata
self.ref_species_metadata = ref_species_metadata
self.group_metadata = group_metadata
self.db_type_name_regex_id_syntax = db_type_name_regex_id_syntax
self.retracted_pub_set = retracted_pub_set
self.suppress_rule_reporting_tags = suppress_rule_reporting_tags
self.annotation_inferences = annotation_inferences
@@ -703,6 +705,7 @@ def _unroll_withfrom_and_replair_obsoletes(self, line: SplitLine, gaf_or_gpad: s
return None
else:
fixed_element_individual = element_individual

if grouped_fixed_elements == '':
grouped_fixed_elements = fixed_element_individual
else:
@@ -816,7 +819,22 @@ def _validate_id(self, id, line: SplitLine, allowed_ids=None, context=None):
if id_prefix not in self.config.class_idspaces:
self.report.error(line.line, Report.INVALID_IDSPACE, id_prefix, "allowed: {}".format(self.config.class_idspaces), rule=27)
return False


# ensure id_syntax is valid, else output a warning
if self.config.db_type_name_regex_id_syntax is not None:
if id_prefix in self.config.db_type_name_regex_id_syntax:
type_name_regex_patterns = self.config.db_type_name_regex_id_syntax[id_prefix]
identity_matches_pattern = False
for regex in type_name_regex_patterns.values():
if regex.match(right):
identity_matches_pattern = True
break
if identity_matches_pattern == False:
self.report.warning(line.line, Report.INVALID_ID, id,
"GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(right, id_prefix), taxon=line.taxon, rule=27)
else:
self.report.warning(line.line, Report.INVALID_ID, id,
"GORULE:0000027: {} not found in list of database names in dbxrefs".format(id_prefix), taxon=line.taxon, rule=27)
return True

def validate_pipe_separated_ids(self, column, line: SplitLine, empty_allowed=False, extra_delims="") -> Optional[List[str]]:
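In plain terms, the check added to _validate_id above splits the identifier into prefix and local part, looks the prefix up in the db-xrefs lookup, and warns under GORULE:0000027 either when the local part matches none of that database's id_syntax patterns or when the prefix is missing from the lookup altogether. A standalone sketch of that decision, with an illustrative lookup rather than a real db-xrefs excerpt:

    import re

    # Same shape as config.db_type_name_regex_id_syntax: {db prefix: {type_name: compiled regex}}.
    lookup = {"PMID": {"entity": re.compile(r"[0-9]+")}}

    def classify(curie: str) -> str:
        """Return 'ok', 'bad-syntax', or 'unknown-db' for a prefixed identifier."""
        prefix, local = curie.split(":", maxsplit=1)
        patterns = lookup.get(prefix)
        if patterns is None:
            return "unknown-db"   # parser: prefix not found in the list of database names in dbxrefs
        if any(regex.match(local) for regex in patterns.values()):
            return "ok"
        return "bad-syntax"       # parser: local part matches no id_syntax pattern for this prefix

    print(classify("PMID:18422602"))       # ok
    print(classify("PMID:PMID:18422602"))  # bad-syntax
    print(classify("BLA:18422602"))        # unknown-db
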
16 changes: 12 additions & 4 deletions ontobio/io/gafparser.py
@@ -215,7 +215,7 @@ def parse_line(self, line):
references = self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line)
if references is None:
# Reporting occurs in above function call
return assocparser.ParseResult(line, [], True)
return assocparser.ParseResult(line, [], True)

# With/From
for wf in assoc.evidence.with_support_from:
@@ -238,7 +238,12 @@ def parse_line(self, line):
if repaired is None:
assoc.object_extensions = []
return assocparser.ParseResult(line, [], True)
assoc.object_extensions = repaired
assoc.object_extensions = repaired

# Check subject_extensions and output warnings
if (0 < len(assoc.subject_extensions)):
for ext in assoc.subject_extensions:
self._validate_id(str(ext.term), split_line)


## Run GO Rules, save split values into individual variables
@@ -264,7 +269,7 @@ def parse_line(self, line):

if self.config.group_idspace is not None and assoc.provided_by not in self.config.group_idspace:
self.report.warning(line, Report.INVALID_ID, assoc.provided_by,
"GORULE:0000027: assigned_by is not present in groups reference", taxon=str(assoc.object.taxon), rule=27)
"GORULE:0000027: {assigned_by} is not present in groups reference".format(assigned_by=assoc.provided_by), taxon=str(assoc.object.taxon), rule=27)

db = assoc.subject.id.namespace
if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
@@ -274,7 +279,10 @@
# If we found a synonym
self.report.warning(line, Report.INVALID_ID_DBXREF, db, "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade), taxon=str(assoc.object.taxon), rule=27)
assoc.subject.id.namespace = upgrade

else:
self.report.warning(line, Report.INVALID_ID, assoc.subject.id.namespace,
"GORULE:0000027: {subject_id_namespace} is not present in dbxrefs".format(subject_id_namespace=assoc.subject.id.namespace), taxon=str(assoc.object.taxon), rule=27)

## --
## db + db_object_id. CARD=1
## --assigned_by
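A usage sketch mirroring the new tests below: with db_type_name_regex_id_syntax supplied in the config, a GAF line whose reference prefix is not listed in db-xrefs (here BLA) still parses, but a GORULE:0000027 warning is recorded in the report. The ontology path is a local placeholder, as in the tests' ONT constant:

    import re

    from ontobio.io import assocparser
    from ontobio.io.gafparser import GafParser
    from ontobio.ontol_factory import OntologyFactory

    lookup = {
        "GO": {"molecular_function": re.compile(r"\d{7}")},
        "PMID": {"entity": re.compile(r"[0-9]+")},
        "PomBase": {"entity": re.compile(r"S\w+(\.)?\w+(\.)?")},
    }

    parser = GafParser(config=assocparser.AssocParserConfig(
        ontology=OntologyFactory().create("go-truncated.json"),  # placeholder local ontology file
        db_type_name_regex_id_syntax=lookup))

    gaf_line = ("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tBLA:18422602\tIPI\t"
                "PomBase:SPAC25A8.01c\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t")
    result = parser.parse_line(gaf_line)

    # Expect one gorule-0000027 message reporting the unknown BLA prefix.
    print(parser.report.to_report_json()["messages"].get("gorule-0000027", []))
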
2 changes: 1 addition & 1 deletion ontobio/io/gpadparser.py
@@ -215,7 +215,7 @@ def parse_line(self, line):
return assocparser.ParseResult(line, [], True)

if not self._validate_id(str(assoc.evidence.type), split_line):
return assocparser.ParseResult(line, [], True)
return assocparser.ParseResult(line, [], True)

if assoc.interacting_taxon:
if not self._validate_taxon(str(assoc.interacting_taxon), split_line):
32 changes: 26 additions & 6 deletions ontobio/validation/metadata.py
@@ -2,6 +2,7 @@
import yaml
import os
import glob
import re

from dataclasses import dataclass

@@ -147,19 +148,38 @@ def source_path(dataset_metadata, target_dir, group):
return path

def database_entities(metadata):
dbxrefs = database_yaml(metadata)

d = BiDiMultiMap()
for entity in dbxrefs:
d[entity["database"]] = set(entity.get("synonyms", []))

return d

def database_type_name_regex_id_syntax(metadata):
dbxrefs = database_yaml(metadata)

d = {}
for entity in dbxrefs:
type_names = {}
entity_types = entity.get("entity_types", {})
for et in entity_types:
if "id_syntax" in et and "type_name" in et:
type_names[et["type_name"]] = re.compile(et["id_syntax"])
if len(type_names) > 0:
d[entity["database"]] = type_names

return d

def database_yaml(metadata):
dbxrefs_path = os.path.join(os.path.abspath(metadata), "db-xrefs.yaml")
try:
with open(dbxrefs_path, "r") as db_xrefs_file:
click.echo("Found db-xrefs at {path}".format(path=dbxrefs_path))
dbxrefs = yaml.load(db_xrefs_file, Loader=yaml.FullLoader)
except Exception as e:
raise click.ClickException("Could not find or read {}: {}".format(dbxrefs_path, str(e)))

d = BiDiMultiMap()
for entity in dbxrefs:
d[entity["database"]] = set(entity.get("synonyms", []))

return d
return dbxrefs

def groups(metadata) -> Set[str]:
groups_path = os.path.join(os.path.abspath(metadata), "groups.yaml")
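The new database_type_name_regex_id_syntax helper above assumes db-xrefs.yaml entries shaped like {database, synonyms, entity_types: [{type_name, id_syntax}, ...]} and returns {database: {type_name: compiled regex}}. A self-contained sketch under that assumption (the YAML content is illustrative, echoing the patterns used in the tests, not a real db-xrefs excerpt):

    import os
    import tempfile

    from ontobio.validation import metadata

    DB_XREFS = r"""
    - database: PMID
      entity_types:
        - type_name: entity
          id_syntax: '[0-9]+'
    - database: PomBase
      entity_types:
        - type_name: entity
          id_syntax: 'S\w+(\.)?\w+(\.)?'
    """

    with tempfile.TemporaryDirectory() as metadata_dir:
        with open(os.path.join(metadata_dir, "db-xrefs.yaml"), "w") as f:
            f.write(DB_XREFS)
        lookup = metadata.database_type_name_regex_id_syntax(metadata_dir)
        print(lookup["PMID"]["entity"].match("18422602") is not None)        # True
        print(lookup["PomBase"]["entity"].match("SPAC25A8.01c") is not None) # True
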
44 changes: 44 additions & 0 deletions tests/test_gafparser.py
@@ -16,6 +16,7 @@
import pytest
import io
import json
import re

ecomap = EcoMap()
ecomap.mappings()
@@ -603,6 +604,49 @@ def test_factory():

assert len(aset.associations_by_subj) > 0
assert found == 2

def test_id_syntax():
database_id_syntax_lookups = {}
go_types = {}
pattern = '\\d{7}'
go_types['molecular_function'] = re.compile(pattern)
go_types['biological_process'] = re.compile(pattern)
go_types['cellular_component'] = re.compile(pattern)
database_id_syntax_lookups['GO'] = go_types

pmid_types = {}
pmid_types['entity'] = re.compile('[0-9]+')
database_id_syntax_lookups['PMID'] = pmid_types

pombase_types = {}
pombase_types['entity'] = re.compile('S\\w+(\\.)?\\w+(\\.)?')
database_id_syntax_lookups['PomBase'] = pombase_types
p = GafParser(config=assocparser.AssocParserConfig(
ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups))

assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tPMID:18422602\tIPI\tPomBase:SPAC25A8.01c\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t")
assert len(assoc_result.associations) == 1
assert assoc_result.skipped == False
messages = p.report.to_report_json()["messages"]
assert "gorule-0000027" not in messages

p = GafParser(config=assocparser.AssocParserConfig(
ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups))
assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tPMID:PMID:18422602\tIPI\tPomBase:SPAC25A8.01c\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t")
assert len(assoc_result.associations) == 1
assert assoc_result.skipped == False
messages = p.report.to_report_json()["messages"]
assert len(messages["gorule-0000027"]) == 1
assert messages["gorule-0000027"][0]["obj"] == "PMID:PMID:18422602"

p = GafParser(config=assocparser.AssocParserConfig(
ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups))
assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tBLA:18422602\tIPI\tPomBase:SPAC25A8.01c\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t")
assert len(assoc_result.associations) == 1
assert assoc_result.skipped == False
messages = p.report.to_report_json()["messages"]
assert len(messages["gorule-0000027"]) == 1
assert messages["gorule-0000027"][0]["obj"] == "BLA:18422602"


def test_gaf_gpi_bridge():
89 changes: 88 additions & 1 deletion tests/test_gpad_parser.py
@@ -7,11 +7,11 @@
from ontobio.model.association import Curie, Subject

import yaml
import re

POMBASE = "tests/resources/truncated-pombase.gpad"
ALT_ID_ONT = "tests/resources/obsolete.json"


def test_obsolete_term_repair_withfrom():

vals = ["ZFIN",
@@ -318,6 +318,93 @@ def test_unmapped_eco_to_gaf_codes():
result = parser.parse_line("\t".join(vals))
assert len(result.associations) == 1

def test_id_syntax():
database_id_syntax_lookups = {}
go_types = {}
pattern = '\\d{7}'
go_types['molecular_function'] = re.compile(pattern)
go_types['biological_process'] = re.compile(pattern)
go_types['cellular_component'] = re.compile(pattern)
database_id_syntax_lookups['GO'] = go_types

pmid_types = {}
pmid_types['entity'] = re.compile('[0-9]+')
database_id_syntax_lookups['PMID'] = pmid_types

pombase_types = {}
pombase_types['entity'] = re.compile('S\\w+(\\.)?\\w+(\\.)?')
database_id_syntax_lookups['PomBase'] = pombase_types

eco_types = {}
eco_types['entity'] = re.compile(pattern)
database_id_syntax_lookups['ECO'] = eco_types

vals = ["PomBase",
"SPAC25A8.01c",
"acts_upstream_of_or_within",
"GO:0007155",
"PMID:15494018",
"ECO:0000305",
"GO:0005913",
"",
"20041026",
"ZFIN",
"",
"PomBase"
]

config = assocparser.AssocParserConfig(
ontology=OntologyFactory().create(ALT_ID_ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)
p = GpadParser(config=config)
result = p.parse_line("\t".join(vals))
assert len(result.associations) == 1
assert result.skipped == False
messages = p.report.to_report_json()["messages"]
assert "gorule-0000027" not in messages

vals = ["PomBase",
"SPAC25A8.01c",
"acts_upstream_of_or_within",
"GO:0007155",
"PMID:PMID:15494018",
"ECO:0000305",
"GO:0005913",
"",
"20041026",
"ZFIN",
"",
"PomBase"
]

p = GpadParser(config=config)
result = p.parse_line("\t".join(vals))
assert len(result.associations) == 1
assert result.skipped == False
messages = p.report.to_report_json()["messages"]
assert len(messages["gorule-0000027"]) == 1
assert messages["gorule-0000027"][0]["obj"] == "PMID:PMID:15494018"

vals = ["PomBase",
"SPAC25A8.01c",
"acts_upstream_of_or_within",
"GO:0007155",
"BLA:15494018",
"ECO:0000305",
"GO:0005913",
"",
"20041026",
"ZFIN",
"",
"PomBase"
]
p = GpadParser(config=config)
result = p.parse_line("\t".join(vals))
assert len(result.associations) == 1
assert result.skipped == False
messages = p.report.to_report_json()["messages"]
assert len(messages["gorule-0000027"]) == 1
assert messages["gorule-0000027"][0]["obj"] == "BLA:15494018"

def test_gpi_check():
report = assocparser.Report(group="unknown", dataset="unknown")
vals = [