Skip to content

Commit

Permalink
Improve resolving of URLs via banana curation
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Mar 18, 2021
1 parent 59432fa commit 7c908e2
Show file tree
Hide file tree
Showing 7 changed files with 213 additions and 20 deletions.
8 changes: 6 additions & 2 deletions src/bioregistry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
"""Extract registry information."""

from .resolve import ( # noqa
get, get_email, get_example, get_format, get_name, get_pattern, get_pattern_re, get_version, get_versions,
is_deprecated, namespace_in_lui, normalize_prefix, validate,
get, get_banana, get_email, get_example, get_format, get_identifiers_org_prefix, get_name, get_pattern,
get_pattern_re, get_version, get_versions, is_deprecated, namespace_in_lui, normalize_prefix,
)
from .resolve_identifier import ( # noqa
get_identifiers_org_curie, get_identifiers_org_url, get_obofoundry_link, get_ols_link,
get_providers, validate,
)
from .utils import read_bioregistry # noqa
9 changes: 4 additions & 5 deletions src/bioregistry/app/wsgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,13 @@ def _get_identifier(prefix, identifier):
return abort(404, f'invalid prefix: {prefix}')
if not bioregistry.validate(prefix, identifier):
return abort(404, f'invalid identifier: {prefix}:{identifier} for pattern {bioregistry.get_pattern(prefix)}')
formatter = bioregistry.get_format(prefix)
if formatter is None:
return abort(404, f'missing resolution for {prefix}')
providers = bioregistry.get_providers(prefix, identifier)
if not providers:
return abort(404, f'no providers available for {prefix}:{identifier}')

url = formatter.replace('$1', identifier)
return dict(
query=dict(prefix=prefix, identifier=identifier),
url=url,
providers=providers,
)


Expand Down
10 changes: 10 additions & 0 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -5138,6 +5138,7 @@
}
},
"did": {
"banana": "did",
"miriam": {
"deprecated": false,
"description": "DIDs are an effort by the W3C Credentials Community Group and the wider Internet identity community to define identifiers that can be registered, updated, resolved, and revoked without any dependency on a central authority or intermediary.",
Expand Down Expand Up @@ -8896,6 +8897,7 @@
}
},
"gramene.growthstage": {
"banana": "gramene.growthstage:GRO\\",
"miriam": {
"deprecated": false,
"description": "Gramene is a comparative genome mapping database for grasses and crop plants. It combines a semi-automatically generated database of cereal genomic and expressed sequence tag sequences, genetic maps, map relations, quantitative trait loci (QTL), and publications, with a curated database of mutants (genes and alleles), molecular markers, and proteins. This collection refers to growth stage ontology information in Gramene.",
Expand Down Expand Up @@ -12923,6 +12925,7 @@
}
},
"mge": {
"banana": "mge",
"miriam": {
"deprecated": false,
"description": "ACLAME is a database dedicated to the collection and classification of mobile genetic elements (MGEs) from various sources, comprising all known phage genomes, plasmids and transposons.",
Expand Down Expand Up @@ -14440,6 +14443,7 @@
}
},
"mzspec": {
"banana": "mzspec",
"miriam": {
"deprecated": false,
"description": "The Universal Spectrum Identifier (USI) is a compound identifier that provides an abstract path to refer to a single spectrum generated by a mass spectrometer, and potentially the ion that is thought to have produced it.",
Expand Down Expand Up @@ -15627,6 +15631,7 @@
}
},
"ocid": {
"banana": "ocid",
"miriam": {
"deprecated": false,
"description": "'ocid' stands for \"Ontology Concept Identifiers\" and are 12 digit long integers covering IDs in topical ontologies from anatomy up to toxicology.",
Expand Down Expand Up @@ -15979,6 +15984,7 @@
}
},
"oma.hog": {
"banana": "HOG",
"example": "0459895",
"miriam": {
"deprecated": false,
Expand All @@ -15990,6 +15996,7 @@
"prefix": "oma.hog",
"sampleId": "HOG:0459895"
},
"url": "https://omabrowser.org/oma/hog/HOG:$1",
"namespace.rewrite": "HOG"
},
"oma.protein": {
Expand Down Expand Up @@ -17302,6 +17309,7 @@
]
},
"peco": {
"banana": "EO",
"bioportal": {
"name": "Plant Experimental Conditions Ontology",
"prefix": "PECO"
Expand Down Expand Up @@ -21257,6 +21265,7 @@
}
},
"swh": {
"banana": "swh",
"miriam": {
"deprecated": false,
"description": "Software Heritage is the universal archive of software source code.",
Expand Down Expand Up @@ -22880,6 +22889,7 @@
]
},
"vario": {
"banana": "VariO",
"bioportal": {
"name": "Variation Ontology",
"prefix": "VARIO"
Expand Down
40 changes: 29 additions & 11 deletions src/bioregistry/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
'get_pattern',
'get_pattern_re',
'namespace_in_lui',
'validate',
'get_format',
'get_example',
'is_deprecated',
Expand Down Expand Up @@ -95,17 +94,34 @@ def namespace_in_lui(prefix: str) -> Optional[bool]:
return entry.get('miriam', {}).get('namespaceEmbeddedInLui')


def validate(prefix: str, identifier: str) -> Optional[bool]:
"""Validate the identifier against the prefix's pattern, if it exists."""
pattern = get_pattern_re(prefix)
if pattern is None:
def get_identifiers_org_prefix(prefix: str) -> Optional[str]:
    """Look up the identifiers.org prefix recorded for a prefix.

    :param prefix: The prefix to look up
    :returns: The ``prefix`` value stored under the entry's ``miriam`` key,
        or None if the lookup yields nothing.
    """
    miriam_prefix = _get_mapped_prefix(prefix, external='miriam')
    return miriam_prefix


def get_obofoundry_prefix(prefix: str) -> Optional[str]:
    """Look up the OBO Foundry prefix recorded for a prefix.

    :param prefix: The prefix to look up
    :returns: The ``prefix`` value stored under the entry's ``obofoundry`` key,
        or None if the lookup yields nothing.
    """
    obofoundry_prefix = _get_mapped_prefix(prefix, external='obofoundry')
    return obofoundry_prefix


def get_ols_prefix(prefix: str) -> Optional[str]:
    """Look up the OLS prefix recorded for a prefix.

    :param prefix: The prefix to look up
    :returns: The ``prefix`` value stored under the entry's ``ols`` key,
        or None if the lookup yields nothing.
    """
    ols_prefix = _get_mapped_prefix(prefix, external='ols')
    return ols_prefix


def _get_mapped_prefix(prefix: str, external: str) -> Optional[str]:
    """Get the prefix that an external registry uses for the given prefix.

    :param prefix: The prefix to look up via :func:`get`
    :param external: The key of the external registry's sub-dictionary
        inside the entry (e.g., ``miriam``, ``obofoundry``, ``ols``)
    :returns: The ``prefix`` value in that sub-dictionary, or None if there is
        no entry, no such sub-dictionary, or no ``prefix`` key in it.
    """
    record = get(prefix)
    if record is None:
        return None
    # A missing sub-dictionary is treated as empty so the lookup yields None
    external_record = record.get(external, {})
    return external_record.get('prefix')

if namespace_in_lui(prefix) and not identifier.startswith(f'{prefix.upper()}:'):
# Some cases do not use uppercase
identifier = f'{prefix.upper()}:{identifier}'

return bool(pattern.match(identifier))
def get_banana(prefix: str) -> Optional[str]:
    """Get the optional redundant prefix ("banana") to go before an identifier.

    :param prefix: The prefix to look up via :func:`get`
    :returns: The value stored under the entry's ``banana`` key, or None if
        there is no entry or the entry has no ``banana`` key.
    """
    record = get(prefix)
    return None if record is None else record.get('banana')


def get_format(prefix: str) -> Optional[str]:
Expand All @@ -116,10 +132,12 @@ def get_format(prefix: str) -> Optional[str]:
url = entry.get('url')
if url is not None:
return url
miriam_id = entry.get('miriam', {}).get('prefix')
miriam_id = get_identifiers_org_prefix(prefix)
if miriam_id is not None:
if namespace_in_lui(prefix):
miriam_id = miriam_id.upper() # not exact solution, some less common ones don't use capitalization
# not exact solution, some less common ones don't use capitalization
# align with the banana solution
miriam_id = miriam_id.upper()
return f'https://identifiers.org/{miriam_id}:$1'
ols_id = entry.get('ols', {}).get('prefix')
if ols_id is not None:
Expand Down
94 changes: 94 additions & 0 deletions src/bioregistry/resolve_identifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-

"""Resolvers for CURIE (e.g., pairs of prefix and identifier)."""

from typing import Mapping, Optional

from .resolve import (
get_banana, get_format, get_identifiers_org_prefix, get_obofoundry_prefix, get_ols_prefix, get_pattern_re,
namespace_in_lui,
)

__all__ = [
'validate',
'get_providers',
'get_identifiers_org_url',
'get_identifiers_org_curie',
'get_obofoundry_link',
'get_ols_link',
]


def validate(prefix: str, identifier: str) -> Optional[bool]:
    """Validate the identifier against the prefix's pattern, if it exists.

    :param prefix: The prefix in the CURIE
    :param identifier: The identifier in the CURIE, given either with or
        without the redundant namespace ("banana") in front
    :returns: None if no pattern is available for the prefix, otherwise
        whether the (possibly re-namespaced) identifier matches the pattern.
    """
    pattern = get_pattern_re(prefix)
    if pattern is None:
        return None

    if namespace_in_lui(prefix):
        # Prefer the curated banana, since not every embedded namespace is the
        # upper-cased prefix (e.g., ``mzspec`` or ``HOG`` for ``oma.hog``).
        # Fall back to the upper-cased prefix when no banana is curated.
        banana = get_banana(prefix) or prefix.upper()
        if not identifier.startswith(f'{banana}:'):
            identifier = f'{banana}:{identifier}'

    return bool(pattern.match(identifier))


def get_providers(prefix: str, identifier: str) -> Mapping[str, str]:
    """Collect every URL that can be built for the CURIE.

    :param prefix: The prefix in the CURIE
    :param identifier: The identifier in the CURIE
    :returns: A dictionary from provider key (``bioregistry``, ``miriam``,
        ``obofoundry``, ``ols``) to URL. Providers for which no URL could be
        built are left out, so the result may be empty.
    """
    rv = {}

    # The Bioregistry's own format string uses ``$1`` as the identifier placeholder
    url_template = get_format(prefix)
    if url_template:
        rv['bioregistry'] = url_template.replace('$1', identifier)

    resolvers = {
        'miriam': get_identifiers_org_url,
        'obofoundry': get_obofoundry_link,
        'ols': get_ols_link,
    }
    for key, resolver in resolvers.items():
        url = resolver(prefix, identifier)
        if url:
            rv[key] = url

    return rv


def get_identifiers_org_url(prefix: str, identifier: str) -> Optional[str]:
    """Build the identifiers.org URL for the given CURIE.

    :param prefix: The prefix in the CURIE
    :param identifier: The identifier in the CURIE
    :returns: ``https://identifiers.org/`` followed by the identifiers.org
        CURIE, or None if no identifiers.org CURIE could be produced.
    """
    miriam_curie = get_identifiers_org_curie(prefix, identifier)
    if miriam_curie is not None:
        return f'https://identifiers.org/{miriam_curie}'
    return None


def get_identifiers_org_curie(prefix: str, identifier: str) -> Optional[str]:
    """Get the identifiers.org CURIE for the given CURIE.

    :param prefix: The prefix in the CURIE
    :param identifier: The identifier in the CURIE, given either with or
        without the redundant namespace ("banana") in front
    :returns: None if the prefix has no identifiers.org prefix. Otherwise, the
        CURIE to append to ``https://identifiers.org/``.
    """
    miriam_prefix = get_identifiers_org_prefix(prefix)
    if miriam_prefix is None:
        return None

    if not namespace_in_lui(prefix):
        # Simple concatenation. Use the identifiers.org prefix here rather than
        # the input prefix, since the two are not guaranteed to be the same.
        return f'{miriam_prefix}:{identifier}'

    # The namespace is embedded in the local unique identifier. Prefer the
    # curated banana and fall back to the upper-cased identifiers.org prefix,
    # which is not exact since some less common ones don't use capitalization.
    banana = get_banana(prefix) or miriam_prefix.upper()

    # Include the colon delimiter in the check so an identifier that merely
    # begins with the same letters is not mistaken for an already-namespaced one.
    if identifier.startswith(f'{banana}:'):
        return identifier
    return f'{banana}:{identifier}'


def get_obofoundry_link(prefix: str, identifier: str) -> Optional[str]:
    """Build the OBO Foundry PURL for the given CURIE, if possible.

    :param prefix: The prefix in the CURIE
    :param identifier: The identifier in the CURIE
    :returns: A URL under ``http://purl.obolibrary.org/obo/`` made from the
        upper-cased OBO Foundry prefix, an underscore, and the identifier;
        or None if the prefix has no OBO Foundry prefix.
    """
    obofoundry_prefix = get_obofoundry_prefix(prefix)
    if obofoundry_prefix is not None:
        return f'http://purl.obolibrary.org/obo/{obofoundry_prefix.upper()}_{identifier}'
    return None


def get_ols_link(prefix: str, identifier: str) -> Optional[str]:
    """Build the OLS term URL for the given CURIE, if possible.

    :param prefix: The prefix in the CURIE
    :param identifier: The identifier in the CURIE
    :returns: An OLS URL whose ``iri`` query parameter is the OBO Foundry link
        for the CURIE, or None if either the OLS prefix or the OBO Foundry
        link is unavailable.
    """
    ontology = get_ols_prefix(prefix)
    iri = get_obofoundry_link(prefix, identifier)
    # Both pieces are required to build the URL
    if ontology is not None and iri is not None:
        return f'https://www.ebi.ac.uk/ols/ontologies/{ontology}/terms?iri={iri}'
    return None
43 changes: 43 additions & 0 deletions tests/test_identifiers_org.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-

"""Tests for identifiers.org."""

import unittest

import requests

from bioregistry import get_identifiers_org_curie, get_identifiers_org_url


class TestIdentifiersOrg(unittest.TestCase):
    """Tests for identifiers.org."""

    def test_url(self):
        """Test formatting CURIEs and URLs, and that the URLs resolve.

        Each case is a tuple of prefix, identifier, expected identifiers.org
        CURIE, and a short explanation of what the case covers.
        """
        for prefix, identifier, expected, reason in [
            ('efo', '0000400', 'efo:0000400', 'test simple concatenation'),
            ('chebi', 'CHEBI:1234', 'CHEBI:1234', 'test redundant namespace (standard)'),
            ('chebi', '1234', 'CHEBI:1234', 'test exclusion of redundant namespace (standard)'),
            (
                'mzspec',
                'PXD002255::ES_XP_Ubi_97H_HCD_349:scan:9617:LAEIYVNSSFYK/2',
                'mzspec:PXD002255::ES_XP_Ubi_97H_HCD_349:scan:9617:LAEIYVNSSFYK/2',
                'test simple concatenation with false banana',
            ),
            (
                'mzspec',
                'mzspec:PXD002255::ES_XP_Ubi_97H_HCD_349:scan:9617:LAEIYVNSSFYK/2',
                'mzspec:PXD002255::ES_XP_Ubi_97H_HCD_349:scan:9617:LAEIYVNSSFYK/2',
                'test simple concatenation (redundant) with false banana',
            ),
        ]:
            # Include the reason so a failing case explains what it was checking
            with self.subTest(p=prefix, i=identifier, reason=reason):
                curie = get_identifiers_org_curie(prefix, identifier)
                self.assertEqual(expected, curie, msg='wrong CURIE')

                url = get_identifiers_org_url(prefix, identifier)
                self.assertEqual(f'https://identifiers.org/{curie}', url, msg='wrong URL')

                # Check that the URL resolves. A timeout is given so that an
                # unresponsive server fails the test instead of hanging it.
                res = requests.get(url, timeout=30)
                self.assertEqual(200, res.status_code, msg=res.reason)
29 changes: 27 additions & 2 deletions tests/test_resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import unittest

import requests

import bioregistry


Expand Down Expand Up @@ -65,7 +67,6 @@ def test_validate_false(self):
with self.subTest(prefix=prefix, identifier=identifier):
self.assertFalse(bioregistry.validate(prefix, identifier))

@unittest.skip
def test_lui(self):
"""Test the LUI makes sense (spoilers, they don't).
Expand All @@ -76,9 +77,33 @@ def test_lui(self):
for prefix in bioregistry.read_bioregistry():
if not bioregistry.namespace_in_lui(prefix):
continue
if bioregistry.get_banana(prefix):
continue # rewrite rules are applied to prefixes with bananas
if prefix in {'ark', 'obi'}:
continue # these patterns on identifiers.org are garb
with self.subTest(prefix=prefix):
re_pattern = bioregistry.get_pattern(prefix)
miriam_prefix = bioregistry.get_identifiers_org_prefix(prefix)
self.assertTrue(
re_pattern.startswith(f'^{prefix.upper()}') or re_pattern.startswith(prefix.upper()),
re_pattern.startswith(f'^{miriam_prefix.upper()}') or re_pattern.startswith(miriam_prefix.upper()),
msg=f'{prefix} pattern: {re_pattern}',
)

def test_banana(self):
    """Test that entries curated with a new banana are resolved properly.

    For each registry entry with a ``banana`` key, the example identifier is
    turned into an identifiers.org URL, which is expected to give HTTP 200.
    """
    for prefix, entry in bioregistry.read_bioregistry().items():
        banana = entry.get('banana')
        if banana is None:
            continue
        if prefix in {'gramene.growthstage', 'oma.hog'}:
            continue  # identifiers.org is broken for these prefixes
        with self.subTest(
            prefix=prefix,
            banana=banana,
            pattern=bioregistry.get_pattern(prefix),
        ):
            identifier = bioregistry.get_example(prefix)
            self.assertIsNotNone(identifier)
            url = bioregistry.resolve_identifier.get_identifiers_org_url(prefix, identifier)
            # A timeout is given so that an unresponsive server fails the
            # test instead of hanging it.
            res = requests.get(url, timeout=30)
            self.assertEqual(200, res.status_code, msg=f'failed with URL: {url}')

0 comments on commit 7c908e2

Please sign in to comment.