diff --git a/bin/ontobio-parse-assocs.py b/bin/ontobio-parse-assocs.py index 5f5f9f52..44b9fa42 100755 --- a/bin/ontobio-parse-assocs.py +++ b/bin/ontobio-parse-assocs.py @@ -146,11 +146,13 @@ def main(): rule_set = assocparser.RuleSet.ALL goref_metadata = None - ref_species_metadata = None + ref_species_metadata = None + db_type_name_regex_id_syntax = None if args.metadata_dir: absolute_metadata = os.path.abspath(args.metadata_dir) goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs")) ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id") + db_type_name_regex_id_syntax = metadata.database_type_name_regex_id_syntax(absolute_metadata) retracted_pub_set = None if args.retracted_pub_set: @@ -173,6 +175,7 @@ def main(): gpi_authority_path=args.gpi, goref_metadata=goref_metadata, ref_species_metadata=ref_species_metadata, + db_type_name_regex_id_syntax=db_type_name_regex_id_syntax, retracted_pub_set=retracted_pub_set, rule_set=rule_set ) @@ -197,8 +200,8 @@ def main(): outfh = None if args.outfile is not None: two_mb = 2097152 - outfh = open(args.outfile, "w", buffering=two_mb) - func(ont, args.file, outfh, p, args) + outfh = open(args.outfile, "w", buffering=two_mb) + func(ont, args.file, outfh, p, args) if filtered_evidence_file: filtered_evidence_file.close() diff --git a/ontobio/io/assocparser.py b/ontobio/io/assocparser.py index c8676e05..103b8a7e 100644 --- a/ontobio/io/assocparser.py +++ b/ontobio/io/assocparser.py @@ -234,6 +234,7 @@ def __init__(self, ref_species_metadata=None, group_metadata=None, dbxrefs=None, + db_type_name_regex_id_syntax=None, retracted_pub_set=None, suppress_rule_reporting_tags=[], annotation_inferences=None, @@ -259,6 +260,7 @@ def __init__(self, self.goref_metadata = goref_metadata self.ref_species_metadata = ref_species_metadata self.group_metadata = group_metadata + self.db_type_name_regex_id_syntax = db_type_name_regex_id_syntax self.retracted_pub_set = retracted_pub_set self.suppress_rule_reporting_tags = suppress_rule_reporting_tags self.annotation_inferences = annotation_inferences @@ -703,6 +705,7 @@ def _unroll_withfrom_and_replair_obsoletes(self, line: SplitLine, gaf_or_gpad: s return None else: fixed_element_individual = element_individual + if grouped_fixed_elements == '': grouped_fixed_elements = fixed_element_individual else: @@ -816,7 +819,22 @@ def _validate_id(self, id, line: SplitLine, allowed_ids=None, context=None): if id_prefix not in self.config.class_idspaces: self.report.error(line.line, Report.INVALID_IDSPACE, id_prefix, "allowed: {}".format(self.config.class_idspaces), rule=27) return False - + + # ensure id_syntax is valid, else output a warning + if self.config.db_type_name_regex_id_syntax is not None: + if id_prefix in self.config.db_type_name_regex_id_syntax: + type_name_regex_patterns = self.config.db_type_name_regex_id_syntax[id_prefix] + identity_matches_pattern = False + for regex in type_name_regex_patterns.values(): + if regex.match(right): + identity_matches_pattern = True + break + if identity_matches_pattern == False: + self.report.warning(line.line, Report.INVALID_ID, id, + "GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(right, id_prefix), taxon=line.taxon, rule=27) + else: + self.report.warning(line.line, Report.INVALID_ID, id, + "GORULE:0000027: {} not found in list of database names in dbxrefs".format(id_prefix), taxon=line.taxon, rule=27) return True def validate_pipe_separated_ids(self, column, line: SplitLine, empty_allowed=False, extra_delims="") -> Optional[List[str]]: diff --git a/ontobio/io/gafparser.py b/ontobio/io/gafparser.py index 44b9d9a8..6236b868 100644 --- a/ontobio/io/gafparser.py +++ b/ontobio/io/gafparser.py @@ -215,7 +215,7 @@ def parse_line(self, line): references = self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line) if references is None: # Reporting occurs in above function call - return assocparser.ParseResult(line, [], True) + return assocparser.ParseResult(line, [], True) # With/From for wf in assoc.evidence.with_support_from: @@ -238,7 +238,12 @@ def parse_line(self, line): if repaired is None: assoc.object_extensions = [] return assocparser.ParseResult(line, [], True) - assoc.object_extensions = repaired + assoc.object_extensions = repaired + + # Check subject_extensions and output warnings + if (0 < len(assoc.subject_extensions)): + for ext in assoc.subject_extensions: + self._validate_id(str(ext.term), split_line) ## Run GO Rules, save split values into individual variables @@ -264,7 +269,7 @@ def parse_line(self, line): if self.config.group_idspace is not None and assoc.provided_by not in self.config.group_idspace: self.report.warning(line, Report.INVALID_ID, assoc.provided_by, - "GORULE:0000027: assigned_by is not present in groups reference", taxon=str(assoc.object.taxon), rule=27) + "GORULE:0000027: {assigned_by} is not present in groups reference".format(assigned_by=assoc.provided_by), taxon=str(assoc.object.taxon), rule=27) db = assoc.subject.id.namespace if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces: @@ -274,7 +279,10 @@ def parse_line(self, line): # If we found a synonym self.report.warning(line, Report.INVALID_ID_DBXREF, db, "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade), taxon=str(assoc.object.taxon), rule=27) assoc.subject.id.namespace = upgrade - + else: + self.report.warning(line, Report.INVALID_ID, assoc.subject.id.namespace, + "GORULE:0000027: {subject_id_namespace} is not present in dbxrefs".format(subject_id_namespace=assoc.subject.id.namespace), taxon=str(assoc.object.taxon), rule=27) + ## -- ## db + db_object_id. CARD=1 ## --assigned_by diff --git a/ontobio/io/gpadparser.py b/ontobio/io/gpadparser.py index 4876fa88..b7782a31 100644 --- a/ontobio/io/gpadparser.py +++ b/ontobio/io/gpadparser.py @@ -215,7 +215,7 @@ def parse_line(self, line): return assocparser.ParseResult(line, [], True) if not self._validate_id(str(assoc.evidence.type), split_line): - return assocparser.ParseResult(line, [], True) + return assocparser.ParseResult(line, [], True) if assoc.interacting_taxon: if not self._validate_taxon(str(assoc.interacting_taxon), split_line): diff --git a/ontobio/validation/metadata.py b/ontobio/validation/metadata.py index b5be0c1d..b1e969ea 100644 --- a/ontobio/validation/metadata.py +++ b/ontobio/validation/metadata.py @@ -2,6 +2,7 @@ import yaml import os import glob +import re from dataclasses import dataclass @@ -147,6 +148,30 @@ def source_path(dataset_metadata, target_dir, group): return path def database_entities(metadata): + dbxrefs = database_yaml(metadata) + + d = BiDiMultiMap() + for entity in dbxrefs: + d[entity["database"]] = set(entity.get("synonyms", [])) + + return d + +def database_type_name_regex_id_syntax(metadata): + dbxrefs = database_yaml(metadata) + + d = {} + for entity in dbxrefs: + type_names = {} + entity_types = entity.get("entity_types", {}) + for et in entity_types: + if "id_syntax" in et and "type_name" in et: + type_names[et["type_name"]] = re.compile(et["id_syntax"]) + if len(type_names) > 0: + d[entity["database"]] = type_names + + return d + +def database_yaml(metadata): dbxrefs_path = os.path.join(os.path.abspath(metadata), "db-xrefs.yaml") try: with open(dbxrefs_path, "r") as db_xrefs_file: @@ -154,12 +179,7 @@ def database_entities(metadata): dbxrefs = yaml.load(db_xrefs_file, Loader=yaml.FullLoader) except Exception as e: raise click.ClickException("Could not find or read {}: {}".format(dbxrefs_path, str(e))) - - d = BiDiMultiMap() - for entity in dbxrefs: - d[entity["database"]] = set(entity.get("synonyms", [])) - - return d + return dbxrefs def groups(metadata) -> Set[str]: groups_path = os.path.join(os.path.abspath(metadata), "groups.yaml") diff --git a/tests/test_gafparser.py b/tests/test_gafparser.py index a1c300a9..1ee74396 100644 --- a/tests/test_gafparser.py +++ b/tests/test_gafparser.py @@ -16,6 +16,7 @@ import pytest import io import json +import re ecomap = EcoMap() ecomap.mappings() @@ -603,6 +604,49 @@ def test_factory(): assert len(aset.associations_by_subj) > 0 assert found == 2 + +def test_id_syntax(): + database_id_syntax_lookups = {} + go_types = {} + pattern = '\\d{7}' + go_types['molecular_function'] = re.compile(pattern) + go_types['biological_process'] = re.compile(pattern) + go_types['cellular_component'] = re.compile(pattern) + database_id_syntax_lookups['GO'] = go_types + + pmid_types = {} + pmid_types['entity'] = re.compile('[0-9]+') + database_id_syntax_lookups['PMID'] = pmid_types + + pombase_types = {} + pombase_types['entity'] = re.compile('S\\w+(\\.)?\\w+(\\.)?') + database_id_syntax_lookups['PomBase'] = pombase_types + p = GafParser(config=assocparser.AssocParserConfig( + ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)) + + assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tPMID:18422602\tIPI\tPomBase:SPAC25A8.01c\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t") + assert len(assoc_result.associations) == 1 + assert assoc_result.skipped == False + messages = p.report.to_report_json()["messages"] + assert "gorule-0000027" not in messages + + p = GafParser(config=assocparser.AssocParserConfig( + ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)) + assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tPMID:PMID:18422602\tIPI\tPomBase:SPAC25A8.01c\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t") + assert len(assoc_result.associations) == 1 + assert assoc_result.skipped == False + messages = p.report.to_report_json()["messages"] + assert len(messages["gorule-0000027"]) == 1 + assert messages["gorule-0000027"][0]["obj"] == "PMID:PMID:18422602" + + p = GafParser(config=assocparser.AssocParserConfig( + ontology=OntologyFactory().create(ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups)) + assoc_result = p.parse_line("PomBase\tSPBC1289.03c\tspi1\t\tGO:0005515\tBLA:18422602\tIPI\tPomBase:SPAC25A8.01c\tF\tRan GTPase Spi1\t\tprotein\ttaxon:4896\t20080718\tPomBase\t") + assert len(assoc_result.associations) == 1 + assert assoc_result.skipped == False + messages = p.report.to_report_json()["messages"] + assert len(messages["gorule-0000027"]) == 1 + assert messages["gorule-0000027"][0]["obj"] == "BLA:18422602" def test_gaf_gpi_bridge(): diff --git a/tests/test_gpad_parser.py b/tests/test_gpad_parser.py index b07beaba..c331a8ea 100644 --- a/tests/test_gpad_parser.py +++ b/tests/test_gpad_parser.py @@ -7,11 +7,11 @@ from ontobio.model.association import Curie, Subject import yaml +import re POMBASE = "tests/resources/truncated-pombase.gpad" ALT_ID_ONT = "tests/resources/obsolete.json" - def test_obsolete_term_repair_withfrom(): vals = ["ZFIN", @@ -318,6 +318,93 @@ def test_unmapped_eco_to_gaf_codes(): result = parser.parse_line("\t".join(vals)) assert len(result.associations) == 1 +def test_id_syntax(): + database_id_syntax_lookups = {} + go_types = {} + pattern = '\\d{7}' + go_types['molecular_function'] = re.compile(pattern) + go_types['biological_process'] = re.compile(pattern) + go_types['cellular_component'] = re.compile(pattern) + database_id_syntax_lookups['GO'] = go_types + + pmid_types = {} + pmid_types['entity'] = re.compile('[0-9]+') + database_id_syntax_lookups['PMID'] = pmid_types + + pombase_types = {} + pombase_types['entity'] = re.compile('S\\w+(\\.)?\\w+(\\.)?') + database_id_syntax_lookups['PomBase'] = pombase_types + + eco_types = {} + eco_types['entity'] = re.compile(pattern) + database_id_syntax_lookups['ECO'] = eco_types + + vals = ["PomBase", + "SPAC25A8.01c", + "acts_upstream_of_or_within", + "GO:0007155", + "PMID:15494018", + "ECO:0000305", + "GO:0005913", + "", + "20041026", + "ZFIN", + "", + "PomBase" + ] + + config = assocparser.AssocParserConfig( + ontology=OntologyFactory().create(ALT_ID_ONT), db_type_name_regex_id_syntax=database_id_syntax_lookups) + p = GpadParser(config=config) + result = p.parse_line("\t".join(vals)) + assert len(result.associations) == 1 + assert result.skipped == False + messages = p.report.to_report_json()["messages"] + assert "gorule-0000027" not in messages + + vals = ["PomBase", + "SPAC25A8.01c", + "acts_upstream_of_or_within", + "GO:0007155", + "PMID:PMID:15494018", + "ECO:0000305", + "GO:0005913", + "", + "20041026", + "ZFIN", + "", + "PomBase" + ] + + p = GpadParser(config=config) + result = p.parse_line("\t".join(vals)) + assert len(result.associations) == 1 + assert result.skipped == False + messages = p.report.to_report_json()["messages"] + assert len(messages["gorule-0000027"]) == 1 + assert messages["gorule-0000027"][0]["obj"] == "PMID:PMID:15494018" + + vals = ["PomBase", + "SPAC25A8.01c", + "acts_upstream_of_or_within", + "GO:0007155", + "BLA:15494018", + "ECO:0000305", + "GO:0005913", + "", + "20041026", + "ZFIN", + "", + "PomBase" + ] + p = GpadParser(config=config) + result = p.parse_line("\t".join(vals)) + assert len(result.associations) == 1 + assert result.skipped == False + messages = p.report.to_report_json()["messages"] + assert len(messages["gorule-0000027"]) == 1 + assert messages["gorule-0000027"][0]["obj"] == "BLA:15494018" + def test_gpi_check(): report = assocparser.Report(group="unknown", dataset="unknown") vals = [