From 1100c01b495909c73477c0f892938f762c3dd2e2 Mon Sep 17 00:00:00 2001 From: Anushya Muruganujan Date: Fri, 6 Sep 2024 15:44:55 -0700 Subject: [PATCH] For #2360 - Updated to handle DBs without id syntax pattern --- ontobio/io/assocparser.py | 19 ++++++++++--------- ontobio/validation/metadata.py | 5 ++--- tests/test_gafparser.py | 12 +++++++++++- tests/test_gpad_parser.py | 26 ++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 13 deletions(-) diff --git a/ontobio/io/assocparser.py b/ontobio/io/assocparser.py index 531ba56c..ccb1bed8 100644 --- a/ontobio/io/assocparser.py +++ b/ontobio/io/assocparser.py @@ -811,7 +811,7 @@ def _validate_id(self, id, line: SplitLine, allowed_ids=None, context=None): if allowed_ids is not None and id_prefix not in allowed_ids: # For now we will just issue a warning here, and we won't filter out the annotation here - self.report.warning(line.line, Report.INVALID_ID_DBXREF, id_prefix, "allowed: {}".format(allowed_ids), rule=27) + self.report.warning(line.line, Report.INVALID_ID_DBXREF, id_prefix, "{} is not present in DB xrefs file".format(id_prefix), rule=27) # ensure that the ID space of the annotation class (e.g. GO) # conforms to what is expected @@ -824,14 +824,15 @@ def _validate_id(self, id, line: SplitLine, allowed_ids=None, context=None): if self.config.db_type_name_regex_id_syntax is not None: if id_prefix in self.config.db_type_name_regex_id_syntax: type_name_regex_patterns = self.config.db_type_name_regex_id_syntax[id_prefix] - identity_matches_pattern = False - for regex in type_name_regex_patterns.values(): - if regex.fullmatch(right): - identity_matches_pattern = True - break - if identity_matches_pattern == False: - self.report.warning(line.line, Report.INVALID_ID, id, - "GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(right, id_prefix), taxon=line.taxon, rule=27) + if type_name_regex_patterns is not None and len(type_name_regex_patterns) > 0: + identity_matches_pattern = False + for regex in type_name_regex_patterns.values(): + if regex.fullmatch(right): + identity_matches_pattern = True + break + if identity_matches_pattern == False: + self.report.warning(line.line, Report.INVALID_ID, id, + "GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(right, id_prefix), taxon=line.taxon, rule=27) else: self.report.warning(line.line, Report.INVALID_ID, id, "GORULE:0000027: {} not found in list of database names in dbxrefs".format(id_prefix), taxon=line.taxon, rule=27) diff --git a/ontobio/validation/metadata.py b/ontobio/validation/metadata.py index b1e969ea..a0a33241 100644 --- a/ontobio/validation/metadata.py +++ b/ontobio/validation/metadata.py @@ -165,9 +165,8 @@ def database_type_name_regex_id_syntax(metadata): entity_types = entity.get("entity_types", {}) for et in entity_types: if "id_syntax" in et and "type_name" in et: - type_names[et["type_name"]] = re.compile(et["id_syntax"]) - if len(type_names) > 0: - d[entity["database"]] = type_names + type_names[et["type_name"]] = re.compile(et["id_syntax"]) + d[entity["database"]] = type_names return d diff --git a/tests/test_gafparser.py b/tests/test_gafparser.py index 53e99bfa..25132735 100644 --- a/tests/test_gafparser.py +++ b/tests/test_gafparser.py @@ -620,7 +620,11 @@ def test_id_syntax(): pombase_types = {} pombase_types['entity'] = re.compile('S\\w+(\\.)?\\w+(\\.)?') - database_id_syntax_lookups['PomBase'] = pombase_types + database_id_syntax_lookups['PomBase'] = pombase_types + + wb_ref_types = {} + database_id_syntax_lookups['WB_REF'] = wb_ref_types + p = GafParser(config=assocparser.AssocParserConfig( ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)) @@ -629,6 +633,12 @@ def test_id_syntax(): assert assoc_result.skipped == False messages = p.report.to_report_json()["messages"] assert "gorule-0000027" not in messages + + assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tWB_REF:WBPaper00006408|PMID:18422602\tIPI\tPomBase:SPAC25A8.01c\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t") + assert len(assoc_result.associations) == 1 + assert assoc_result.skipped == False + messages = p.report.to_report_json()["messages"] + assert "gorule-0000027" not in messages p = GafParser(config=assocparser.AssocParserConfig( ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)) diff --git a/tests/test_gpad_parser.py b/tests/test_gpad_parser.py index 0ceb3d0b..09b835aa 100644 --- a/tests/test_gpad_parser.py +++ b/tests/test_gpad_parser.py @@ -339,6 +339,9 @@ def test_id_syntax(): eco_types['entity'] = re.compile(pattern) database_id_syntax_lookups['ECO'] = eco_types + wb_ref_types = {} + database_id_syntax_lookups['WB_REF'] = wb_ref_types + vals = ["PomBase", "SPAC25A8.01c", "acts_upstream_of_or_within", @@ -362,6 +365,29 @@ def test_id_syntax(): messages = p.report.to_report_json()["messages"] assert "gorule-0000027" not in messages + vals = ["PomBase", + "SPAC25A8.01c", + "acts_upstream_of_or_within", + "GO:0007155", + "WB_REF:WBPaper00006408|PMID:15494018", + "ECO:0000305", + "GO:0005913", + "", + "20041026", + "ZFIN", + "", + "PomBase" + ] + + config = assocparser.AssocParserConfig( + ontology=OntologyFactory().create(ALT_ID_ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups) + p = GpadParser(config=config) + result = p.parse_line("\t".join(vals)) + assert len(result.associations) == 1 + assert result.skipped == False + messages = p.report.to_report_json()["messages"] + assert "gorule-0000027" not in messages + vals = ["PomBase", "SPAC25A8.01c", "acts_upstream_of_or_within",