Skip to content

Commit

Permalink
For #2360 - Updated to handle DBs that have no id_syntax pattern
Browse files (browse the repository at this point in the history)
  • Loading branch information
mugitty committed Sep 6, 2024
1 parent 649b1b5 commit 1100c01
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 13 deletions.
19 changes: 10 additions & 9 deletions ontobio/io/assocparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ def _validate_id(self, id, line: SplitLine, allowed_ids=None, context=None):

if allowed_ids is not None and id_prefix not in allowed_ids:
# For now we only issue a warning here; the annotation itself is not filtered out
self.report.warning(line.line, Report.INVALID_ID_DBXREF, id_prefix, "allowed: {}".format(allowed_ids), rule=27)
self.report.warning(line.line, Report.INVALID_ID_DBXREF, id_prefix, "{} is not present in DB xrefs file".format(id_prefix), rule=27)

# ensure that the ID space of the annotation class (e.g. GO)
# conforms to what is expected
Expand All @@ -824,14 +824,15 @@ def _validate_id(self, id, line: SplitLine, allowed_ids=None, context=None):
if self.config.db_type_name_regex_id_syntax is not None:
if id_prefix in self.config.db_type_name_regex_id_syntax:
type_name_regex_patterns = self.config.db_type_name_regex_id_syntax[id_prefix]
identity_matches_pattern = False
for regex in type_name_regex_patterns.values():
if regex.fullmatch(right):
identity_matches_pattern = True
break
if identity_matches_pattern == False:
self.report.warning(line.line, Report.INVALID_ID, id,
"GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(right, id_prefix), taxon=line.taxon, rule=27)
if type_name_regex_patterns is not None and len(type_name_regex_patterns) > 0:
identity_matches_pattern = False
for regex in type_name_regex_patterns.values():
if regex.fullmatch(right):
identity_matches_pattern = True
break
if identity_matches_pattern == False:
self.report.warning(line.line, Report.INVALID_ID, id,
"GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(right, id_prefix), taxon=line.taxon, rule=27)
else:
self.report.warning(line.line, Report.INVALID_ID, id,
"GORULE:0000027: {} not found in list of database names in dbxrefs".format(id_prefix), taxon=line.taxon, rule=27)
Expand Down
5 changes: 2 additions & 3 deletions ontobio/validation/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,8 @@ def database_type_name_regex_id_syntax(metadata):
entity_types = entity.get("entity_types", {})
for et in entity_types:
if "id_syntax" in et and "type_name" in et:
type_names[et["type_name"]] = re.compile(et["id_syntax"])
if len(type_names) > 0:
d[entity["database"]] = type_names
type_names[et["type_name"]] = re.compile(et["id_syntax"])
d[entity["database"]] = type_names

return d

Expand Down
12 changes: 11 additions & 1 deletion tests/test_gafparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,11 @@ def test_id_syntax():

pombase_types = {}
pombase_types['entity'] = re.compile('S\\w+(\\.)?\\w+(\\.)?')
database_id_syntax_lookups['PomBase'] = pombase_types
database_id_syntax_lookups['PomBase'] = pombase_types

wb_ref_types = {}
database_id_syntax_lookups['WB_REF'] = wb_ref_types

p = GafParser(config=assocparser.AssocParserConfig(
ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups))

Expand All @@ -629,6 +633,12 @@ def test_id_syntax():
assert assoc_result.skipped == False
messages = p.report.to_report_json()["messages"]
assert "gorule-0000027" not in messages

assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tWB_REF:WBPaper00006408|PMID:18422602\tIPI\tPomBase:SPAC25A8.01c\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t")
assert len(assoc_result.associations) == 1
assert assoc_result.skipped == False
messages = p.report.to_report_json()["messages"]
assert "gorule-0000027" not in messages

p = GafParser(config=assocparser.AssocParserConfig(
ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups))
Expand Down
26 changes: 26 additions & 0 deletions tests/test_gpad_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,9 @@ def test_id_syntax():
eco_types['entity'] = re.compile(pattern)
database_id_syntax_lookups['ECO'] = eco_types

wb_ref_types = {}
database_id_syntax_lookups['WB_REF'] = wb_ref_types

vals = ["PomBase",
"SPAC25A8.01c",
"acts_upstream_of_or_within",
Expand All @@ -362,6 +365,29 @@ def test_id_syntax():
messages = p.report.to_report_json()["messages"]
assert "gorule-0000027" not in messages

vals = ["PomBase",
"SPAC25A8.01c",
"acts_upstream_of_or_within",
"GO:0007155",
"WB_REF:WBPaper00006408|PMID:15494018",
"ECO:0000305",
"GO:0005913",
"",
"20041026",
"ZFIN",
"",
"PomBase"
]

config = assocparser.AssocParserConfig(
ontology=OntologyFactory().create(ALT_ID_ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)
p = GpadParser(config=config)
result = p.parse_line("\t".join(vals))
assert len(result.associations) == 1
assert result.skipped == False
messages = p.report.to_report_json()["messages"]
assert "gorule-0000027" not in messages

vals = ["PomBase",
"SPAC25A8.01c",
"acts_upstream_of_or_within",
Expand Down

0 comments on commit 1100c01

Please sign in to comment.