Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Go site 2210 gorule 0000027 must check dbs are in the db xref file #677

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions bin/ontobio-parse-assocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,13 @@ def main():
rule_set = assocparser.RuleSet.ALL

goref_metadata = None
ref_species_metadata = None
ref_species_metadata = None
db_type_name_regex_id_syntax = None
if args.metadata_dir:
absolute_metadata = os.path.abspath(args.metadata_dir)
goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs"))
ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id")
db_type_name_regex_id_syntax = metadata.database_type_name_regex_id_syntax(absolute_metadata)

retracted_pub_set = None
if args.retracted_pub_set:
Expand All @@ -173,6 +175,7 @@ def main():
gpi_authority_path=args.gpi,
goref_metadata=goref_metadata,
ref_species_metadata=ref_species_metadata,
db_type_name_regex_id_syntax=db_type_name_regex_id_syntax,
retracted_pub_set=retracted_pub_set,
rule_set=rule_set
)
Expand All @@ -197,8 +200,8 @@ def main():
outfh = None
if args.outfile is not None:
two_mb = 2097152
outfh = open(args.outfile, "w", buffering=two_mb)
func(ont, args.file, outfh, p, args)
outfh = open(args.outfile, "w", buffering=two_mb)
func(ont, args.file, outfh, p, args)
if filtered_evidence_file:
filtered_evidence_file.close()

Expand Down
40 changes: 40 additions & 0 deletions ontobio/io/assocparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ def __init__(self,
ref_species_metadata=None,
group_metadata=None,
dbxrefs=None,
db_type_name_regex_id_syntax=None,
retracted_pub_set=None,
suppress_rule_reporting_tags=[],
annotation_inferences=None,
Expand All @@ -259,6 +260,7 @@ def __init__(self,
self.goref_metadata = goref_metadata
self.ref_species_metadata = ref_species_metadata
self.group_metadata = group_metadata
self.db_type_name_regex_id_syntax = db_type_name_regex_id_syntax
self.retracted_pub_set = retracted_pub_set
self.suppress_rule_reporting_tags = suppress_rule_reporting_tags
self.annotation_inferences = annotation_inferences
Expand Down Expand Up @@ -703,6 +705,11 @@ def _unroll_withfrom_and_replair_obsoletes(self, line: SplitLine, gaf_or_gpad: s
return None
else:
fixed_element_individual = element_individual

#Check db-xref
self._validate_curie_using_db_xrefs(association.Curie.from_str(fixed_element_individual), fixed_element_individual, line)


if grouped_fixed_elements == '':
grouped_fixed_elements = fixed_element_individual
else:
Expand Down Expand Up @@ -768,10 +775,43 @@ def _repair_extensions(self, extensions, line: SplitLine, subclassof=None):
else:
curSet.append(association.ExtensionUnit(relation = association.Curie(e.relation.namespace, e.relation.identity), term = association.Curie(e.term.namespace, e.term.identity)))

#Check db-xref
self._validate_curie_using_db_xrefs(e.term, str(e.term), line)

grouped_set.append(association.ConjunctiveSet(curSet))

return grouped_set

def _validate_curie_using_db_xrefs(self, curie, curieStr, line: SplitLine):
    """Check a CURIE against the ``id_syntax`` patterns loaded from db-xrefs (GORULE:0000027).

    Every failure is reported as a ``Report.INVALID_ID`` warning tagged with
    rule 27; nothing is raised.

    :param curie: the parsed identifier; expected to be an ``association.Curie``.
    :param curieStr: the textual form of the identifier, used in the report
        when ``curie`` itself is missing or is not a ``Curie``.
    :param line: the ``SplitLine`` being parsed; ``line.line`` and
        ``line.taxon`` are forwarded to the report.
    :return: ``False`` if the curie is ``None``, is not a ``Curie``, lacks a
        namespace or identity, or its identity matches none of the patterns
        registered for its namespace; ``True`` otherwise (including when no
        patterns are configured or the namespace has no registered patterns).
    """
    if curie is None:
        self.report.warning(line.line, Report.INVALID_ID, curieStr, "GORULE:0000027: curie is empty", taxon=line.taxon, rule=27)
        return False
    if not isinstance(curie, association.Curie):
        self.report.warning(line.line, Report.INVALID_ID, curieStr, "GORULE:0000027: Not a curie", taxon=line.taxon, rule=27)
        return False
    if curie.namespace is None:
        self.report.warning(line.line, Report.INVALID_ID, curie.namespace, "GORULE:0000027: Curie namespace is empty", taxon=line.taxon, rule=27)
        return False
    if curie.identity is None:
        self.report.warning(line.line, Report.INVALID_ID, curie.identity, "GORULE:0000027: Curie identity is empty", taxon=line.taxon, rule=27)
        return False

    id_syntax_by_db = self.config.db_type_name_regex_id_syntax
    # Pattern checking is optional: without db-xrefs metadata, or for a
    # namespace that declares no id_syntax, the curie is accepted as-is.
    if id_syntax_by_db is None or curie.namespace not in id_syntax_by_db:
        return True

    # One pattern per entity type name; matching any one of them is enough.
    # NOTE(review): ``match`` only anchors at the start of the identity, so
    # trailing characters are rejected only if the pattern itself is
    # end-anchored -- confirm against the patterns in db-xrefs.
    type_name_regex_patterns = id_syntax_by_db[curie.namespace]
    if any(regex.match(curie.identity) for regex in type_name_regex_patterns.values()):
        return True

    # Report against the raw line text (``line.line``), consistent with the
    # other warnings in this method; ``line`` itself is the SplitLine wrapper.
    self.report.warning(line.line, Report.INVALID_ID, curie.identity,
                        "GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(str(curie), curie.namespace), taxon=line.taxon, rule=27)
    return False



def _validate_symbol(self, symbol, line: SplitLine):
if symbol is None or symbol == "":
self.report.warning(line.line, Report.INVALID_SYMBOL, symbol, "GORULE:0000027: symbol is empty",
Expand Down
12 changes: 11 additions & 1 deletion ontobio/io/gafparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ def parse_line(self, line):
print("skipping because {} not validated!".format(assoc.object.id))
return assocparser.ParseResult(line, [], True)

self._validate_curie_using_db_xrefs(assoc.object.id, str(assoc.object.id), split_line)

valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
if valid_goid is None:
return assocparser.ParseResult(line, [], True)
Expand All @@ -216,6 +218,8 @@ def parse_line(self, line):
if references is None:
# Reporting occurs in above function call
return assocparser.ParseResult(line, [], True)
for reference in references:
self._validate_curie_using_db_xrefs(reference, str(reference), split_line)

# With/From
for wf in assoc.evidence.with_support_from:
Expand Down Expand Up @@ -264,7 +268,7 @@ def parse_line(self, line):

if self.config.group_idspace is not None and assoc.provided_by not in self.config.group_idspace:
self.report.warning(line, Report.INVALID_ID, assoc.provided_by,
"GORULE:0000027: assigned_by is not present in groups reference", taxon=str(assoc.object.taxon), rule=27)
"GORULE:0000027: {assigned_by} is not present in groups reference".format(assigned_by=assoc.provided_by), taxon=str(assoc.object.taxon), rule=27)

db = assoc.subject.id.namespace
if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
Expand All @@ -274,7 +278,13 @@ def parse_line(self, line):
# If we found a synonym
self.report.warning(line, Report.INVALID_ID_DBXREF, db, "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade), taxon=str(assoc.object.taxon), rule=27)
assoc.subject.id.namespace = upgrade
else:
self.report.warning(line, Report.INVALID_ID, assoc.subject.id.namespace,
"GORULE:0000027: {subject_id_namespace} is not present in dbxrefs".format(subject_id_namespace=assoc.subject.id.namespace), taxon=str(assoc.object.taxon), rule=27)

# Validate against db-xref id_syntax
self._validate_curie_using_db_xrefs(assoc.subject.id, str(assoc.subject.id), split_line)

## --
## db + db_object_id. CARD=1
## --assigned_by
Expand Down
6 changes: 6 additions & 0 deletions ontobio/io/gpadparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ def parse_line(self, line):
valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
if valid_goid is None:
return assocparser.ParseResult(line, [], True)
self._validate_curie_using_db_xrefs(association.Curie.from_str(valid_goid), valid_goid, split_line)
assoc.object.id = association.Curie.from_str(valid_goid)

go_rule_results = qc.test_go_rules(assoc, self.config)
Expand Down Expand Up @@ -216,6 +217,9 @@ def parse_line(self, line):

if not self._validate_id(str(assoc.evidence.type), split_line):
return assocparser.ParseResult(line, [], True)

#Ensure db and dbid are valid
self._validate_curie_using_db_xrefs(assoc.subject.id, str(assoc.subject.id), split_line)

if assoc.interacting_taxon:
if not self._validate_taxon(str(assoc.interacting_taxon), split_line):
Expand All @@ -235,6 +239,8 @@ def parse_line(self, line):
references = self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mugitty Is there any overlap of logic between self.validate_curie_ids here and self._validate_curie_using_db_xrefs just below? Could self._validate_curie_using_db_xrefs be incorporated into self.validate_curie_ids?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dustine32, validate_curie_ids calls _validate_id, which checks for things such as the "DB:id" form or, for annotations, that the prefix is in the GO id space. However, it does not validate the identifier against the syntax pattern specified in the db-xrefs file. The new check is meant as a catch-all for any identifier in the GAF line that has a database field and an id_syntax.

Copy link
Collaborator

@dustine32 dustine32 Jun 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mugitty Would there be any issue if you incorporated this db-xrefs syntax pattern checking inside _validate_id? It looks like _validate_id assumes every id is a CURIE, which by definition should always have colon-separated database field and id_syntax (else an error will be reported). The only complication I could think of is if a metadata/db-xrefs is not supplied when _validate_id is called but you could just make this optional in _validate_id (i.e., if self.config.db_type_name_regex_id_syntax is not None).

I guess my main concern is separating validation logic throughout the code that really should be in the same place.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dustine32, as you suggested, I could incorporate this check into _validate_id and guard it with a check that self.config.db_type_name_regex_id_syntax is not None.

Let me update

if references is None:
return assocparser.ParseResult(line, [], True)
for reference in references:
self._validate_curie_using_db_xrefs(reference, str(reference), split_line)

# With/From
if assoc.evidence.with_support_from is not None:
Expand Down
32 changes: 26 additions & 6 deletions ontobio/validation/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import yaml
import os
import glob
import re

from dataclasses import dataclass

Expand Down Expand Up @@ -147,19 +148,38 @@ def source_path(dataset_metadata, target_dir, group):
return path

def database_entities(metadata):
    """Map each database named in db-xrefs.yaml to its set of synonyms.

    :param metadata: path to the metadata directory, passed through to
        ``database_yaml``.
    :return: a ``BiDiMultiMap`` keyed by each record's ``database`` value,
        holding the set of that record's ``synonyms`` (empty when the record
        declares none).
    """
    records = database_yaml(metadata)

    entities = BiDiMultiMap()
    for record in records:
        synonyms = set(record.get("synonyms", []))
        entities[record["database"]] = synonyms

    return entities

def database_type_name_regex_id_syntax(metadata):
    """Collect the compiled ``id_syntax`` patterns declared in db-xrefs.yaml.

    :param metadata: path to the metadata directory, passed through to
        ``database_yaml``.
    :return: a dict mapping each record's ``database`` value to a dict of
        ``{type_name: compiled id_syntax pattern}``. Entity types lacking
        either ``type_name`` or ``id_syntax`` are ignored, and databases with
        no usable entity type are left out entirely.
    :raises re.error: if an ``id_syntax`` value is not a valid regular
        expression.
    """
    id_syntax_by_database = {}
    for entity in database_yaml(metadata):
        # ``entity_types`` is iterated as a sequence of mappings; treat a
        # missing key and an explicit YAML null the same way (no entity types)
        # instead of failing on iteration of None.
        entity_types = entity.get("entity_types") or []
        type_names = {
            et["type_name"]: re.compile(et["id_syntax"])
            for et in entity_types
            if "id_syntax" in et and "type_name" in et
        }
        if type_names:
            id_syntax_by_database[entity["database"]] = type_names

    return id_syntax_by_database

def database_yaml(metadata):
    """Load and return the parsed contents of ``db-xrefs.yaml``.

    :param metadata: path to the metadata directory containing
        ``db-xrefs.yaml``; it is made absolute before use.
    :return: whatever ``yaml.load`` produces for the file. The callers in
        this module iterate it as a sequence of per-database mappings.
    :raises click.ClickException: if the file cannot be opened or parsed; the
        message includes the path and the underlying error.
    """
    dbxrefs_path = os.path.join(os.path.abspath(metadata), "db-xrefs.yaml")
    try:
        with open(dbxrefs_path, "r") as db_xrefs_file:
            click.echo("Found db-xrefs at {path}".format(path=dbxrefs_path))
            # NOTE(review): FullLoader is kept as-is; if db-xrefs.yaml could
            # ever come from an untrusted source, yaml.safe_load would be the
            # stricter choice -- confirm before changing.
            dbxrefs = yaml.load(db_xrefs_file, Loader=yaml.FullLoader)
    except Exception as e:
        raise click.ClickException("Could not find or read {}: {}".format(dbxrefs_path, str(e))) from e

    # Return the raw parsed records; building lookup structures from them is
    # left to the callers (database_entities, database_type_name_regex_id_syntax).
    return dbxrefs

def groups(metadata) -> Set[str]:
groups_path = os.path.join(os.path.abspath(metadata), "groups.yaml")
Expand Down
Loading