Improve resolving of URLs via banana curation

biopragmatics · Mar 18, 2021 · 7c908e2 · 7c908e2
1 parent 59432fa
commit 7c908e2
Show file tree

Hide file tree

Showing 7 changed files with 213 additions and 20 deletions.
diff --git a/src/bioregistry/__init__.py b/src/bioregistry/__init__.py
@@ -3,7 +3,11 @@
 """Extract registry information."""
 
 from .resolve import (  # noqa
-    get, get_email, get_example, get_format, get_name, get_pattern, get_pattern_re, get_version, get_versions,
-    is_deprecated, namespace_in_lui, normalize_prefix, validate,
+    get, get_banana, get_email, get_example, get_format, get_identifiers_org_prefix, get_name, get_pattern,
+    get_pattern_re, get_version, get_versions, is_deprecated, namespace_in_lui, normalize_prefix,
+)
+from .resolve_identifier import ( # noqa
+    get_identifiers_org_curie, get_identifiers_org_url, get_obofoundry_link, get_ols_link,
+    get_providers, validate,
 )
 from .utils import read_bioregistry  # noqa
diff --git a/src/bioregistry/app/wsgi.py b/src/bioregistry/app/wsgi.py
@@ -53,14 +53,13 @@ def _get_identifier(prefix, identifier):
         return abort(404, f'invalid prefix: {prefix}')
     if not bioregistry.validate(prefix, identifier):
         return abort(404, f'invalid identifier: {prefix}:{identifier} for pattern {bioregistry.get_pattern(prefix)}')
-    formatter = bioregistry.get_format(prefix)
-    if formatter is None:
-        return abort(404, f'missing resolution for {prefix}')
+    providers = bioregistry.get_providers(prefix, identifier)
+    if not providers:
+        return abort(404, f'no providers available for {prefix}:{identifier}')
 
-    url = formatter.replace('$1', identifier)
     return dict(
         query=dict(prefix=prefix, identifier=identifier),
-        url=url,
+        providers=providers,
     )
 
 

diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json
@@ -5138,6 +5138,7 @@
     }
   },
   "did": {
+    "banana": "did",
     "miriam": {
       "deprecated": false,
       "description": "DIDs are an effort by the W3C Credentials Community Group and the wider Internet identity community to define identifiers that can be registered, updated, resolved, and revoked without any dependency on a central authority or intermediary.",
@@ -8896,6 +8897,7 @@
     }
   },
   "gramene.growthstage": {
+    "banana": "gramene.growthstage:GRO\\",
     "miriam": {
       "deprecated": false,
       "description": "Gramene is a comparative genome mapping database for grasses and crop plants. It combines a semi-automatically generated database of cereal genomic and expressed sequence tag sequences, genetic maps, map relations, quantitative trait loci (QTL), and publications, with a curated database of mutants (genes and alleles), molecular markers, and proteins. This collection refers to growth stage ontology information in Gramene.",
@@ -12923,6 +12925,7 @@
     }
   },
   "mge": {
+    "banana": "mge",
     "miriam": {
       "deprecated": false,
       "description": "ACLAME is a database dedicated to the collection and classification of mobile genetic elements (MGEs) from various sources, comprising all known phage genomes, plasmids and transposons.",
@@ -14440,6 +14443,7 @@
     }
   },
   "mzspec": {
+    "banana": "mzspec",
     "miriam": {
       "deprecated": false,
       "description": "The Universal Spectrum Identifier (USI) is a compound identifier that provides an abstract path to refer to a single spectrum generated by a mass spectrometer, and potentially the ion that is thought to have produced it.",
@@ -15627,6 +15631,7 @@
     }
   },
   "ocid": {
+    "banana": "ocid",
     "miriam": {
       "deprecated": false,
       "description": "'ocid' stands for \"Ontology Concept Identifiers\" and are 12 digit long integers covering IDs in topical ontologies from anatomy up to toxicology.",
@@ -15979,6 +15984,7 @@
     }
   },
   "oma.hog": {
+    "banana": "HOG",
     "example": "0459895",
     "miriam": {
       "deprecated": false,
@@ -15990,6 +15996,7 @@
       "prefix": "oma.hog",
       "sampleId": "HOG:0459895"
     },
+    "url": "https://omabrowser.org/oma/hog/HOG:$1",
     "namespace.rewrite": "HOG"
   },
   "oma.protein": {
@@ -17302,6 +17309,7 @@
     ]
   },
   "peco": {
+    "banana": "EO",
     "bioportal": {
       "name": "Plant Experimental Conditions Ontology",
       "prefix": "PECO"
@@ -21257,6 +21265,7 @@
     }
   },
   "swh": {
+    "banana": "swh",
     "miriam": {
       "deprecated": false,
       "description": "Software Heritage is the universal archive of software source code.",
@@ -22880,6 +22889,7 @@
     ]
   },
   "vario": {
+    "banana": "VariO",
     "bioportal": {
       "name": "Variation Ontology",
       "prefix": "VARIO"

diff --git a/src/bioregistry/resolve.py b/src/bioregistry/resolve.py
@@ -16,7 +16,6 @@
     'get_pattern',
     'get_pattern_re',
     'namespace_in_lui',
-    'validate',
     'get_format',
     'get_example',
     'is_deprecated',
@@ -95,17 +94,34 @@ def namespace_in_lui(prefix: str) -> Optional[bool]:
     return entry.get('miriam', {}).get('namespaceEmbeddedInLui')
 
 
-def validate(prefix: str, identifier: str) -> Optional[bool]:
-    """Validate the identifier against the prefix's pattern, if it exists."""
-    pattern = get_pattern_re(prefix)
-    if pattern is None:
+def get_identifiers_org_prefix(prefix: str) -> Optional[str]:
+    """Look up the identifiers.org (MIRIAM) prefix mapped to a Bioregistry prefix.
+
+    :param prefix: The Bioregistry prefix to look up.
+    :return: The ``prefix`` value stored under the entry's ``miriam`` key, or
+        ``None`` when the lookup helper finds no such mapping.
+    """
+    return _get_mapped_prefix(prefix, external='miriam')
+
+
+def get_obofoundry_prefix(prefix: str) -> Optional[str]:
+    """Look up the OBO Foundry prefix mapped to a Bioregistry prefix.
+
+    :param prefix: The Bioregistry prefix to look up.
+    :return: The ``prefix`` value stored under the entry's ``obofoundry`` key,
+        or ``None`` when the lookup helper finds no such mapping.
+    """
+    return _get_mapped_prefix(prefix, external='obofoundry')
+
+
+def get_ols_prefix(prefix: str) -> Optional[str]:
+    """Look up the OLS prefix mapped to a Bioregistry prefix.
+
+    :param prefix: The Bioregistry prefix to look up.
+    :return: The ``prefix`` value stored under the entry's ``ols`` key, or
+        ``None`` when the lookup helper finds no such mapping.
+    """
+    return _get_mapped_prefix(prefix, external='ols')
+
+
+def _get_mapped_prefix(prefix: str, external: str) -> Optional[str]:
+    """Get the prefix recorded for an entry under the given external registry key.
+
+    :param prefix: The Bioregistry prefix to look up.
+    :param external: The key of the external registry's sub-dictionary in the
+        entry (callers in this module pass ``'miriam'``, ``'obofoundry'``, or ``'ols'``).
+    :return: The ``prefix`` value of that sub-dictionary, or ``None`` when
+        ``get`` returns no entry or the entry has no such mapping.
+    """
+    entry = get(prefix)
+    if entry is None:
         return None
+    # A missing sub-dictionary falls back to {} so the chained lookup yields None.
+    return entry.get(external, {}).get('prefix')
 
-    if namespace_in_lui(prefix) and not identifier.startswith(f'{prefix.upper()}:'):
-        # Some cases do not use uppercase
-        identifier = f'{prefix.upper()}:{identifier}'
 
-    return bool(pattern.match(identifier))
+def get_banana(prefix: str) -> Optional[str]:
+    """Get the curated "banana" for a prefix, if any.
+
+    The banana is the optional redundant prefix that goes before an identifier.
+
+    :param prefix: The Bioregistry prefix to look up.
+    :return: The value of the entry's ``banana`` key, or ``None`` when there
+        is no entry for the prefix or no banana has been curated for it.
+    """
+    record = get(prefix)
+    return None if record is None else record.get('banana')
 
 
 def get_format(prefix: str) -> Optional[str]:
@@ -116,10 +132,12 @@ def get_format(prefix: str) -> Optional[str]:
     url = entry.get('url')
     if url is not None:
         return url
-    miriam_id = entry.get('miriam', {}).get('prefix')
+    miriam_id = get_identifiers_org_prefix(prefix)
     if miriam_id is not None:
         if namespace_in_lui(prefix):
-            miriam_id = miriam_id.upper()  # not exact solution, some less common ones don't use capitalization
+            # not exact solution, some less common ones don't use capitalization
+            # align with the banana solution
+            miriam_id = miriam_id.upper()
         return f'https://identifiers.org/{miriam_id}:$1'
     ols_id = entry.get('ols', {}).get('prefix')
     if ols_id is not None:

diff --git a/src/bioregistry/resolve_identifier.py b/src/bioregistry/resolve_identifier.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+
+"""Resolvers for CURIE (e.g., pairs of prefix and identifier)."""
+
+from typing import Mapping, Optional
+
+from .resolve import (
+    get_banana, get_format, get_identifiers_org_prefix, get_obofoundry_prefix, get_ols_prefix, get_pattern_re,
+    namespace_in_lui,
+)
+
+__all__ = [
+    'validate',
+    'get_providers',
+    'get_identifiers_org_url',
+    'get_identifiers_org_curie',
+    'get_obofoundry_link',
+    'get_ols_link',
+]
+
+
+def validate(prefix: str, identifier: str) -> Optional[bool]:
+    """Validate the identifier against the prefix's pattern, if it exists.
+
+    :param prefix: The Bioregistry prefix whose pattern should be used.
+    :param identifier: The local identifier, given with or without the
+        redundant prefix that some namespaces embed in their identifiers.
+    :return: ``None`` if no pattern is available for the prefix, otherwise
+        whether the (possibly re-prefixed) identifier matches the pattern.
+    """
+    pattern = get_pattern_re(prefix)
+    if pattern is None:
+        return None
+
+    if namespace_in_lui(prefix):
+        # Not every namespace embeds its prefix in upper case, so prefer the
+        # curated banana and only fall back to the upper-cased prefix.
+        redundant_prefix = f'{get_banana(prefix) or prefix.upper()}:'
+        if not identifier.startswith(redundant_prefix):
+            identifier = f'{redundant_prefix}{identifier}'
+
+    return bool(pattern.match(identifier))
+
+
+def get_providers(prefix: str, identifier: str) -> Mapping[str, str]:
+    """Collect every URL that can be built for the CURIE, keyed by provider name.
+
+    :param prefix: The Bioregistry prefix.
+    :param identifier: The local identifier within the prefix's namespace.
+    :return: A mapping from provider key (``bioregistry``, ``miriam``,
+        ``obofoundry``, ``ols``) to URL. Providers that yield no URL are left
+        out, so the mapping may be empty.
+    """
+    resolvers = (
+        ('miriam', get_identifiers_org_url),
+        ('obofoundry', get_obofoundry_link),
+        ('ols', get_ols_link),
+    )
+    rv = {}
+    url_template = get_format(prefix)
+    if url_template:
+        # The format string uses $1 as the placeholder for the identifier.
+        rv['bioregistry'] = url_template.replace('$1', identifier)
+    candidates = ((name, resolver(prefix, identifier)) for name, resolver in resolvers)
+    rv.update((name, url) for name, url in candidates if url)
+    return rv
+
+
+def get_identifiers_org_url(prefix: str, identifier: str) -> Optional[str]:
+    """Build the identifiers.org URL for the given CURIE.
+
+    :param prefix: The Bioregistry prefix.
+    :param identifier: The local identifier within the prefix's namespace.
+    :return: ``https://identifiers.org/`` followed by the identifiers.org
+        CURIE, or ``None`` when no such CURIE can be built.
+    """
+    miriam_curie = get_identifiers_org_curie(prefix, identifier)
+    return None if miriam_curie is None else f'https://identifiers.org/{miriam_curie}'
+
+
+def get_identifiers_org_curie(prefix: str, identifier: str) -> Optional[str]:
+    """Get the identifiers.org CURIE for the given CURIE.
+
+    :param prefix: The Bioregistry prefix.
+    :param identifier: The local identifier, given with or without the
+        redundant prefix that some namespaces embed in their identifiers.
+    :return: The CURIE to append to ``https://identifiers.org/``, or ``None``
+        if the prefix has no identifiers.org mapping.
+    """
+    miriam_prefix = get_identifiers_org_prefix(prefix)
+    if miriam_prefix is None:
+        return None
+    if not namespace_in_lui(prefix):
+        # Use the identifiers.org prefix, which is not guaranteed to be the
+        # same string as the Bioregistry prefix.
+        return f'{miriam_prefix}:{identifier}'
+    # The namespace is embedded in the local identifier. A curated banana gives
+    # its exact spelling; otherwise assume the upper-cased identifiers.org
+    # prefix, matching the fallback used by get_format().
+    redundant_prefix = f'{get_banana(prefix) or miriam_prefix.upper()}:'
+    # Compare including the ':' delimiter so an identifier that merely begins
+    # with the same letters is not mistaken for an already-prefixed one.
+    if identifier.startswith(redundant_prefix):
+        return identifier
+    return f'{redundant_prefix}{identifier}'
+
+
+def get_obofoundry_link(prefix: str, identifier: str) -> Optional[str]:
+    """Build the OBO Foundry PURL for the given CURIE, if possible.
+
+    :param prefix: The Bioregistry prefix.
+    :param identifier: The local identifier within the prefix's namespace.
+    :return: A URL of the form ``http://purl.obolibrary.org/obo/<PREFIX>_<identifier>``
+        using the upper-cased OBO Foundry prefix, or ``None`` when the prefix
+        has no OBO Foundry mapping.
+    """
+    mapped = get_obofoundry_prefix(prefix)
+    if mapped is not None:
+        return f'http://purl.obolibrary.org/obo/{mapped.upper()}_{identifier}'
+    return None
+
+
+def get_ols_link(prefix: str, identifier: str) -> Optional[str]:
+    """Build the OLS term URL for the given CURIE, if possible.
+
+    :param prefix: The Bioregistry prefix.
+    :param identifier: The local identifier within the prefix's namespace.
+    :return: The OLS terms URL, which passes the OBO Foundry link as its
+        ``iri`` query parameter, or ``None`` when either the OLS prefix or the
+        OBO Foundry link is unavailable.
+    """
+    ontology = get_ols_prefix(prefix)
+    term_iri = get_obofoundry_link(prefix, identifier)
+    if ontology is not None and term_iri is not None:
+        return f'https://www.ebi.ac.uk/ols/ontologies/{ontology}/terms?iri={term_iri}'
+    return None
diff --git a/tests/test_identifiers_org.py b/tests/test_identifiers_org.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+"""Tests for identifiers.org."""
+
+import unittest
+
+import requests
+
+from bioregistry import get_identifiers_org_curie, get_identifiers_org_url
+
+
+class TestIdentifiersOrg(unittest.TestCase):
+    """Tests for identifiers.org."""
+
+    def test_url(self):
+        """Test formatting identifiers.org CURIEs and URLs, then check that the URLs resolve.
+
+        This test sends HTTP requests to identifiers.org, so it needs network access.
+        """
+        for prefix, identifier, expected, reason in [
+            ('efo', '0000400', 'efo:0000400', 'test simple concatenation'),
+            ('chebi', 'CHEBI:1234', 'CHEBI:1234', 'test redundant namespace (standard)'),
+            ('chebi', '1234', 'CHEBI:1234', 'test exclusion of redundant namespace (standard)'),
+            (
+                'mzspec',
+                'PXD002255::ES_XP_Ubi_97H_HCD_349:scan:9617:LAEIYVNSSFYK/2',
+                'mzspec:PXD002255::ES_XP_Ubi_97H_HCD_349:scan:9617:LAEIYVNSSFYK/2',
+                'test simple concatenation with false banana',
+            ),
+            (
+                'mzspec',
+                'mzspec:PXD002255::ES_XP_Ubi_97H_HCD_349:scan:9617:LAEIYVNSSFYK/2',
+                'mzspec:PXD002255::ES_XP_Ubi_97H_HCD_349:scan:9617:LAEIYVNSSFYK/2',
+                'test simple concatenation (redundant) with false banana',
+            ),
+        ]:
+            # Include the reason so a failing sub-test says which case it covers.
+            with self.subTest(p=prefix, i=identifier, reason=reason):
+                curie = get_identifiers_org_curie(prefix, identifier)
+                self.assertEqual(expected, curie, msg='wrong CURIE')
+
+                url = get_identifiers_org_url(prefix, identifier)
+                self.assertEqual(f'https://identifiers.org/{curie}', url, msg='wrong URL')
+
+                # Check that the URL resolves. Requests has no default timeout,
+                # so bound the call to keep a stalled server from hanging the suite.
+                res = requests.get(url, timeout=15)
+                self.assertEqual(200, res.status_code, msg=res.reason)
diff --git a/tests/test_resolve.py b/tests/test_resolve.py
@@ -4,6 +4,8 @@
 
 import unittest
 
+import requests
+
 import bioregistry
 
 
@@ -65,7 +67,6 @@ def test_validate_false(self):
             with self.subTest(prefix=prefix, identifier=identifier):
                 self.assertFalse(bioregistry.validate(prefix, identifier))
 
-    @unittest.skip
     def test_lui(self):
         """Test the LUI makes sense (spoilers, they don't).
 
@@ -76,9 +77,33 @@ def test_lui(self):
         for prefix in bioregistry.read_bioregistry():
             if not bioregistry.namespace_in_lui(prefix):
                 continue
+            if bioregistry.get_banana(prefix):
+                continue  # rewrite rules are applied to prefixes with bananas
+            if prefix in {'ark', 'obi'}:
+                continue  # these patterns on identifiers.org are garb
             with self.subTest(prefix=prefix):
                 re_pattern = bioregistry.get_pattern(prefix)
+                miriam_prefix = bioregistry.get_identifiers_org_prefix(prefix)
                 self.assertTrue(
-                    re_pattern.startswith(f'^{prefix.upper()}') or re_pattern.startswith(prefix.upper()),
+                    re_pattern.startswith(f'^{miriam_prefix.upper()}') or re_pattern.startswith(miriam_prefix.upper()),
                     msg=f'{prefix} pattern: {re_pattern}',
                 )
+
+    def test_banana(self):
+        """Test that entries curated with a new banana are resolved properly.
+
+        This test sends HTTP requests to identifiers.org, so it needs network access.
+        """
+        for prefix, entry in bioregistry.read_bioregistry().items():
+            banana = entry.get('banana')
+            if banana is None:
+                continue
+            if prefix in {'gramene.growthstage', 'oma.hog'}:
+                continue  # identifiers.org is broken for these prefixes
+            with self.subTest(
+                prefix=prefix,
+                banana=banana,
+                pattern=bioregistry.get_pattern(prefix),
+            ):
+                identifier = bioregistry.get_example(prefix)
+                self.assertIsNotNone(identifier)
+                # Use the name re-exported by the package rather than reaching
+                # into the submodule through an implicit attribute.
+                url = bioregistry.get_identifiers_org_url(prefix, identifier)
+                self.assertIsNotNone(url, msg=f'no identifiers.org URL for {prefix}:{identifier}')
+                # Requests has no default timeout, so bound the call to keep a
+                # stalled server from hanging the suite.
+                res = requests.get(url, timeout=15)
+                self.assertEqual(200, res.status_code, msg=f'failed with URL: {url}')