biolink · mugitty · Jul 8, 2024 · Jun 12, 2024 · Jun 24, 2024 · Jun 25, 2024
diff --git a/bin/ontobio-parse-assocs.py b/bin/ontobio-parse-assocs.py
@@ -146,11 +146,13 @@ def main():
         rule_set = assocparser.RuleSet.ALL
 
     goref_metadata = None
-    ref_species_metadata = None  
+    ref_species_metadata = None
+    db_type_name_regex_id_syntax = None      
     if args.metadata_dir:
         absolute_metadata = os.path.abspath(args.metadata_dir)
         goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs"))
         ref_species_metadata = metadata.yaml_set(absolute_metadata, "go-reference-species.yaml", "taxon_id")
+        db_type_name_regex_id_syntax = metadata.database_type_name_regex_id_syntax(absolute_metadata)        
 
     retracted_pub_set = None
     if args.retracted_pub_set:
@@ -173,6 +175,7 @@ def main():
         gpi_authority_path=args.gpi,
         goref_metadata=goref_metadata,
         ref_species_metadata=ref_species_metadata,
+        db_type_name_regex_id_syntax=db_type_name_regex_id_syntax,
         retracted_pub_set=retracted_pub_set,
         rule_set=rule_set
     )
@@ -197,8 +200,8 @@ def main():
     outfh = None
     if args.outfile is not None:
         two_mb = 2097152
-        outfh = open(args.outfile, "w", buffering=two_mb)
-    func(ont, args.file, outfh, p, args)
+        outfh = open(args.outfile, "w", buffering=two_mb)      
+    func(ont, args.file, outfh, p, args)   
     if filtered_evidence_file:
         filtered_evidence_file.close()
 

diff --git a/ontobio/io/assocparser.py b/ontobio/io/assocparser.py
@@ -234,6 +234,7 @@ def __init__(self,
                  ref_species_metadata=None,
                  group_metadata=None,
                  dbxrefs=None,
+                 db_type_name_regex_id_syntax=None,                 
                  retracted_pub_set=None,
                  suppress_rule_reporting_tags=[],
                  annotation_inferences=None,
@@ -259,6 +260,7 @@ def __init__(self,
         self.goref_metadata = goref_metadata
         self.ref_species_metadata = ref_species_metadata
         self.group_metadata = group_metadata
+        self.db_type_name_regex_id_syntax = db_type_name_regex_id_syntax        
         self.retracted_pub_set = retracted_pub_set
         self.suppress_rule_reporting_tags = suppress_rule_reporting_tags
         self.annotation_inferences = annotation_inferences
@@ -703,6 +705,11 @@ def _unroll_withfrom_and_replair_obsoletes(self, line: SplitLine, gaf_or_gpad: s
                         return None
                 else:
                     fixed_element_individual = element_individual
+
+                #Check db-xref
+                self._validate_curie_using_db_xrefs(association.Curie.from_str(fixed_element_individual), fixed_element_individual, line)
+
+
                 if grouped_fixed_elements == '':
                     grouped_fixed_elements = fixed_element_individual
                 else:
@@ -768,10 +775,43 @@ def _repair_extensions(self, extensions, line: SplitLine, subclassof=None):
                 else:
                     curSet.append(association.ExtensionUnit(relation = association.Curie(e.relation.namespace, e.relation.identity), term = association.Curie(e.term.namespace, e.term.identity)))        
 
+                #Check db-xref
+                self._validate_curie_using_db_xrefs(e.term, str(e.term), line)
+
             grouped_set.append(association.ConjunctiveSet(curSet))
 
         return grouped_set
 
+    def _validate_curie_using_db_xrefs(self, curie, curieStr, line: SplitLine):
+        """Check a CURIE against the db-xrefs id_syntax patterns (GORULE:0000027).
+
+        curieStr is the text reported when curie is None or not a Curie; line
+        supplies the raw line and taxon for the report. Warns and returns False
+        for a malformed curie or an identity matching no pattern configured for
+        its namespace; returns True otherwise (also when nothing is configured).
+        """
+        if curie is None:
+            self.report.warning(line.line, Report.INVALID_ID, curieStr, "GORULE:0000027: curie is empty", taxon=line.taxon, rule=27)
+            return False
+        if not isinstance(curie, association.Curie):
+            self.report.warning(line.line, Report.INVALID_ID, curieStr, "GORULE:0000027: Not a curie", taxon=line.taxon, rule=27)
+            return False
+        if curie.namespace is None:
+            self.report.warning(line.line, Report.INVALID_ID, curie.namespace, "GORULE:0000027: Curie namespace is empty", taxon=line.taxon, rule=27)
+            return False
+        if curie.identity is None:
+            self.report.warning(line.line, Report.INVALID_ID, curie.identity, "GORULE:0000027: Curie identity is empty", taxon=line.taxon, rule=27)
+            return False
+
+        id_syntax = self.config.db_type_name_regex_id_syntax
+        if id_syntax is not None and curie.namespace in id_syntax:
+            if not any(regex.match(curie.identity) for regex in id_syntax[curie.namespace].values()):
+                # report.warning takes the raw line text (line.line), as in the checks above
+                self.report.warning(line.line, Report.INVALID_ID, curie.identity,
+                    "GORULE:0000027: {} does not match any id_syntax patterns for {} in dbxrefs".format(str(curie), curie.namespace), taxon=line.taxon, rule=27)
+                return False
+        return True
+
     def _validate_symbol(self, symbol, line: SplitLine):
         if symbol is None or symbol == "":
             self.report.warning(line.line, Report.INVALID_SYMBOL, symbol, "GORULE:0000027: symbol is empty",

diff --git a/ontobio/io/gafparser.py b/ontobio/io/gafparser.py
@@ -207,6 +207,8 @@ def parse_line(self, line):
             print("skipping because {} not validated!".format(assoc.object.id))
             return assocparser.ParseResult(line, [], True)
 
+        self._validate_curie_using_db_xrefs(assoc.object.id, str(assoc.object.id), split_line)
+
         valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
         if valid_goid is None:
             return assocparser.ParseResult(line, [], True)
@@ -216,6 +218,8 @@ def parse_line(self, line):
         if references is None:
             # Reporting occurs in above function call
             return assocparser.ParseResult(line, [], True)
+        for reference in references:
+            self._validate_curie_using_db_xrefs(reference, str(reference), split_line)        
 
         # With/From
         for wf in assoc.evidence.with_support_from:
@@ -264,7 +268,7 @@ def parse_line(self, line):
 
         if self.config.group_idspace is not None and assoc.provided_by not in self.config.group_idspace:
             self.report.warning(line, Report.INVALID_ID, assoc.provided_by,
-                "GORULE:0000027: assigned_by is not present in groups reference", taxon=str(assoc.object.taxon), rule=27)
+                "GORULE:0000027: {assigned_by} is not present in groups reference".format(assigned_by=assoc.provided_by), taxon=str(assoc.object.taxon), rule=27)
 
         db = assoc.subject.id.namespace
         if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces:
@@ -274,7 +278,13 @@ def parse_line(self, line):
                 # If we found a synonym
                 self.report.warning(line, Report.INVALID_ID_DBXREF, db, "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade), taxon=str(assoc.object.taxon), rule=27)
                 assoc.subject.id.namespace = upgrade
+            else:
+                self.report.warning(line, Report.INVALID_ID, assoc.subject.id.namespace,
+                "GORULE:0000027: {subject_id_namespace} is not present in dbxrefs".format(subject_id_namespace=assoc.subject.id.namespace), taxon=str(assoc.object.taxon), rule=27)    
 
+        # Validate against db-xref id_syntax
+        self._validate_curie_using_db_xrefs(assoc.subject.id, str(assoc.subject.id), split_line) 
+
         ## --
         ## db + db_object_id. CARD=1
         ## --assigned_by

diff --git a/ontobio/io/gpadparser.py b/ontobio/io/gpadparser.py
@@ -184,6 +184,7 @@ def parse_line(self, line):
         valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line)
         if valid_goid is None:
             return assocparser.ParseResult(line, [], True)
+        self._validate_curie_using_db_xrefs(association.Curie.from_str(valid_goid),  valid_goid, split_line)
         assoc.object.id = association.Curie.from_str(valid_goid)
 
         go_rule_results = qc.test_go_rules(assoc, self.config)
@@ -216,6 +217,9 @@ def parse_line(self, line):
 
         if not self._validate_id(str(assoc.evidence.type), split_line):
             return assocparser.ParseResult(line, [], True)
+
+        #Ensure db and dbid are valid
+        self._validate_curie_using_db_xrefs(assoc.subject.id, str(assoc.subject.id), split_line)         
 
         if assoc.interacting_taxon:
             if not self._validate_taxon(str(assoc.interacting_taxon), split_line):
@@ -235,6 +239,8 @@ def parse_line(self, line):
         references = self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line)
         if references is None:
             return assocparser.ParseResult(line, [], True)
+        for reference in references:
+            self._validate_curie_using_db_xrefs(reference, str(reference), split_line)  
 
         # With/From
         if assoc.evidence.with_support_from is not None:

diff --git a/ontobio/validation/metadata.py b/ontobio/validation/metadata.py
@@ -2,6 +2,7 @@
 import yaml
 import os
 import glob
+import re
 
 from dataclasses import dataclass
 
@@ -147,19 +148,38 @@ def source_path(dataset_metadata, target_dir, group):
     return path
 
 def database_entities(metadata):
+    """Build a BiDiMultiMap giving each db-xrefs entry's "database" its set of "synonyms"."""
+    xrefs = database_yaml(metadata)
+    synonyms_by_db = BiDiMultiMap()
+    for xref in xrefs:
+        synonyms_by_db[xref["database"]] = set(xref.get("synonyms", []))
+
+    return synonyms_by_db
+
+def database_type_name_regex_id_syntax(metadata):
+    """Return {database: {type_name: compiled id_syntax regex}} from db-xrefs.yaml."""
+    patterns_by_db = {}
+    for entity in database_yaml(metadata):
+        # `or []` tolerates an `entity_types:` key whose YAML value is null
+        type_names = {
+            et["type_name"]: re.compile(et["id_syntax"])
+            for et in entity.get("entity_types") or []
+            if "id_syntax" in et and "type_name" in et
+        }
+        if type_names:  # databases without any usable pattern are left out
+            patterns_by_db[entity["database"]] = type_names
+
+    return patterns_by_db
+
+def database_yaml(metadata):  # parse <metadata>/db-xrefs.yaml; ClickException if it cannot be found or read
     dbxrefs_path = os.path.join(os.path.abspath(metadata), "db-xrefs.yaml")
     try:
         with open(dbxrefs_path, "r") as db_xrefs_file:
             click.echo("Found db-xrefs at {path}".format(path=dbxrefs_path))
             dbxrefs = yaml.load(db_xrefs_file, Loader=yaml.FullLoader)
     except Exception as e:
         raise click.ClickException("Could not find or read {}: {}".format(dbxrefs_path, str(e)))
-
-    d = BiDiMultiMap()
-    for entity in dbxrefs:
-        d[entity["database"]] = set(entity.get("synonyms", []))
-
-    return d
+    return dbxrefs  # whatever yaml.load produced, returned unmodified
 
 def groups(metadata) -> Set[str]:
     groups_path = os.path.join(os.path.abspath(metadata), "groups.yaml")