-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3cf0cf2
commit 70f4180
Showing
6 changed files
with
448 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
"""Provide source fetching for ChemIDplus.""" | ||
import logging | ||
import re | ||
from pathlib import Path | ||
from typing import Optional, Tuple | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class ChemIDplusData(DataSource):
    """Fetch and locate ChemIDplus data files."""

    def __init__(self, data_dir: Optional[Path] = None, silent: bool = False) -> None:
        """Register the source name and defer remaining setup to ``DataSource``.

        :param data_dir: direct location to store data files in. If not provided, tries
            to find a "chemidplus" subdirectory within the path at environment variable
            $WAGS_TAILS_DIR, or within a "wags_tails" subdirectory under environment
            variables $XDG_DATA_HOME or $XDG_DATA_DIRS, or finally, at
            ``~/.local/share/``
        :param silent: if True, don't print any info/updates to console
        """
        # The source name is assigned before the parent initializer runs.
        self._src_name = "chemidplus"
        super().__init__(data_dir, silent)

    @staticmethod
    def _get_latest_version() -> str:
        """Read the release date out of the head of the current ChemIDplus file.

        Only a short byte range of the remote XML file is requested; the version is
        the ``date`` attribute (``YYYY-MM-DD``) found in that fragment.

        :return: latest release value
        :raise RemoteDataError: if unable to parse version number from data file
        """
        response = requests.get(
            "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
            # leave some slack so the date attribute falls inside the fragment
            headers={"Range": "bytes=0-300"},
        )
        response.raise_for_status()
        date_match = re.search(
            r" date=\"([0-9]{4}-[0-9]{2}-[0-9]{2})\">", response.text
        )
        if not date_match:
            raise RemoteDataError(
                "Unable to parse latest ChemIDplus version number from partial access to latest file"
            )
        return date_match.group(1)

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Return the path to the newest ChemIDplus data file and its version.

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            # No network access on this path: newest matching local file wins.
            local_file = self._get_latest_local_file("chemidplus_*.db")
            return local_file, self._parse_file_version(local_file)

        version = self._get_latest_version()
        # NOTE(review): the remote resource is an XML file but is stored under a
        # ``.db`` suffix -- confirm that this naming is intended.
        target = self._data_dir / f"chemidplus_{version}.db"
        if force_refresh or not target.exists():
            self._http_download(
                "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
                target,
            )
        else:
            _logger.debug(
                f"Found existing file, {target.name}, matching latest version {version}."
            )
        return target, version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
"""Provide source fetching for DrugBank.""" | ||
import logging | ||
from pathlib import Path | ||
from typing import Optional, Tuple | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class DrugBankDataData(DataSource):
    """Provide access to DrugBank database."""

    def __init__(self, data_dir: Optional[Path] = None, silent: bool = False) -> None:
        """Set common class parameters.

        :param data_dir: direct location to store data files in. If not provided, tries
            to find a "drugbank" subdirectory within the path at environment variable
            $WAGS_TAILS_DIR, or within a "wags_tails" subdirectory under environment
            variables $XDG_DATA_HOME or $XDG_DATA_DIRS, or finally, at
            ``~/.local/share/``
        :param silent: if True, don't print any info/updates to console
        """
        # The source name is assigned before the parent initializer runs.
        self._src_name = "drugbank"
        super().__init__(data_dir, silent)

    @staticmethod
    def _get_latest_version() -> Tuple[str, str]:
        """Retrieve latest version value and its base URL from the releases listing.

        The first entry of the JSON array returned by the releases endpoint is
        treated as the latest release.

        :return: latest release value and base download URL
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        releases_url = "https://go.drugbank.com/releases.json"
        r = requests.get(releases_url)
        r.raise_for_status()
        try:
            latest = r.json()[0]
            return latest["version"], latest["url"]
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # ValueError: body was not valid JSON; TypeError: payload had an
            # unexpected shape. Both mean the version could not be parsed, so
            # they are reported the same way as a missing key or empty list.
            raise RemoteDataError(
                "Unable to parse latest DrugBank version number from releases API endpoint"
            ) from e

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            # No network access on this path: newest matching local file wins.
            file_path = self._get_latest_local_file("drugbank_*.db")
            return file_path, self._parse_file_version(file_path)

        latest_version, latest_url_base = self._get_latest_version()
        latest_url = f"{latest_url_base}/downloads/all-drugbank-vocabulary"
        latest_file = self._data_dir / f"drugbank_{latest_version}.db"
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
            return latest_file, latest_version
        # The download is passed through the zip handler before landing at
        # ``latest_file``.
        self._http_download(latest_url, latest_file, handler=self._zip_handler)
        return latest_file, latest_version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
"""Provide source fetching for Drugs@FDA.""" | ||
import logging | ||
from pathlib import Path | ||
from typing import Optional, Tuple | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class DrugsAtFdaData(DataSource):
    """Provide access to Drugs@FDA database."""

    def __init__(self, data_dir: Optional[Path] = None, silent: bool = False) -> None:
        """Set common class parameters.

        :param data_dir: direct location to store data files in. If not provided, tries
            to find a "drugsatfda" subdirectory within the path at environment variable
            $WAGS_TAILS_DIR, or within a "wags_tails" subdirectory under environment
            variables $XDG_DATA_HOME or $XDG_DATA_DIRS, or finally, at
            ``~/.local/share/``
        :param silent: if True, don't print any info/updates to console
        """
        # The source name is assigned before the parent initializer runs.
        self._src_name = "drugsatfda"
        super().__init__(data_dir, silent)

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value from the openFDA download index.

        The version is the ``export_date`` recorded for the ``drugsfda`` dataset.

        :return: latest release value
        :raise RemoteDataError: if unable to parse version number from the download
            index
        """
        r = requests.get("https://api.fda.gov/download.json")
        r.raise_for_status()
        try:
            return r.json()["results"]["drug"]["drugsfda"]["export_date"]
        except (KeyError, TypeError, ValueError) as e:
            # ValueError: body was not valid JSON; TypeError: an intermediate
            # value was not a mapping. Both are parse failures for our purposes.
            raise RemoteDataError(
                "Unable to parse latest Drugs@FDA version number from releases API endpoint"
            ) from e

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            # No network access on this path: newest matching local file wins.
            file_path = self._get_latest_local_file("drugsatfda_*.json")
            return file_path, self._parse_file_version(file_path)

        latest_version = self._get_latest_version()
        latest_url = "https://download.open.fda.gov/drug/drugsfda/drug-drugsfda-0001-of-0001.json.zip"
        latest_file = self._data_dir / f"drugsatfda_{latest_version}.json"
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
            return latest_file, latest_version
        # The download is passed through the zip handler before landing at
        # ``latest_file``.
        self._http_download(latest_url, latest_file, handler=self._zip_handler)
        return latest_file, latest_version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
"""Provide source fetching for NCI Thesaurus.""" | ||
import logging | ||
import re | ||
from pathlib import Path | ||
from typing import Optional, Tuple | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
|
||
_logger = logging.getLogger(__name__) | ||
|
||
|
||
class NcitData(DataSource):
    """Provide access to NCI Thesaurus database."""

    def __init__(self, data_dir: Optional[Path] = None, silent: bool = False) -> None:
        """Set common class parameters.

        :param data_dir: direct location to store data files in. If not provided, tries
            to find a "ncit" subdirectory within the path at environment variable
            $WAGS_TAILS_DIR, or within a "wags_tails" subdirectory under environment
            variables $XDG_DATA_HOME or $XDG_DATA_DIRS, or finally, at
            ``~/.local/share/``
        :param silent: if True, don't print any info/updates to console
        """
        # The source name is assigned before the parent initializer runs.
        self._src_name = "ncit"
        super().__init__(data_dir, silent)

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value by scanning the NCIt browser homepage.

        Each line of the page is stripped and checked for a leading
        ``Version:NN.NNx`` token (two digits, a dot, two digits, one word
        character); the first hit is returned.

        :return: latest release value
        :raise RemoteDataError: if no line of the homepage HTML yields a version
        """
        r = requests.get("https://ncithesaurus.nci.nih.gov/ncitbrowser/")
        r.raise_for_status()
        pattern = re.compile(r"Version:(\d\d\.\d\d\w)")
        for line in r.text.split("\n"):
            if "Version" not in line:
                continue
            # ``match`` anchors at the start, so the token must begin the
            # stripped line.
            match = pattern.match(line.strip())
            if match:
                return match.group(1)
        # Only give up once every line has been inspected; a non-matching line
        # that merely mentions "Version" must not abort the scan.
        raise RemoteDataError(
            "Unable to parse latest NCIt version number homepage HTML."
        )

    @staticmethod
    def _get_url(version: str) -> str:
        """Locate URL for requested version of NCIt data.

        NCI has a somewhat inconsistent file structure, so several candidate
        locations are probed in order: the base directory, the per-release archive
        directory, and an archive directory nested under a folder named ``20``
        plus the first two characters of the version.

        :param version: requested version
        :return: URL for NCIt OWL file
        :raise RemoteDataError: if unexpected NCI directory structure is encountered
        """
        base_url = "https://evs.nci.nih.gov/ftp1/NCI_Thesaurus"
        release_fname = f"Thesaurus_{version}.OWL.zip"
        candidate_urls = (
            f"{base_url}/{release_fname}",
            f"{base_url}/archive/{version}_Release/{release_fname}",
            f"{base_url}/archive/20{version[0:2]}/{version}_Release/{release_fname}",
        )
        for url in candidate_urls:
            # ``stream=True`` defers fetching the body: only the status line and
            # headers are needed here, not the whole release archive. The
            # context manager closes the response so the connection is released.
            with requests.get(url, stream=True) as response:
                if response.status_code == 200:
                    return url
        raise RemoteDataError(f"Unable to locate URL for NCIt version {version}")

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            # No network access on this path: newest matching local file wins.
            file_path = self._get_latest_local_file("ncit_*.owl")
            return file_path, self._parse_file_version(file_path)

        latest_version = self._get_latest_version()
        latest_file = self._data_dir / f"ncit_{latest_version}.owl"
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
            return latest_file, latest_version
        # The download URL is resolved only when a download is actually needed,
        # since locating it costs extra requests.
        url = self._get_url(latest_version)
        self._http_download(url, latest_file, handler=self._zip_handler)
        return latest_file, latest_version
Oops, something went wrong.