Skip to content

Commit

Permalink
add init therapy
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Oct 20, 2023
1 parent 3cf0cf2 commit 70f4180
Show file tree
Hide file tree
Showing 6 changed files with 448 additions and 0 deletions.
19 changes: 19 additions & 0 deletions src/wags_tails/base_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import re
import tempfile
import zipfile
from pathlib import Path
from typing import Callable, Dict, Generator, Optional, Tuple

Expand Down Expand Up @@ -115,6 +116,24 @@ def _get_data_base() -> Path:
data_base_dir.mkdir(exist_ok=True, parents=True)
return data_base_dir

def _zip_handler(self, dl_path: Path, outfile_path: Path) -> None:
    """Extract the largest file in a zip archive, then delete the archive.

    Simple download callback: the largest non-directory member of the archive at
    ``dl_path`` is written to ``outfile_path``. The archive is removed afterwards,
    whether or not extraction succeeded, so failed runs do not leave temp files
    behind.

    :param Path dl_path: path to temp data file (the downloaded zip archive)
    :param Path outfile_path: path to save file within
    :raise IndexError: if the archive contains no files
    """
    try:
        with zipfile.ZipFile(dl_path, "r") as zip_ref:
            members = [info for info in zip_ref.infolist() if not info.is_dir()]
            if not members:
                # IndexError is what an empty archive has always raised here; the
                # type is kept for backward compatibility, with a usable message.
                raise IndexError(f"No files found in zip archive: {dl_path}")
            # max() returns the first of equally-sized members, matching the
            # previous stable descending sort.
            target = max(members, key=lambda info: info.file_size)
            # Overriding the member's filename makes extract() write it as
            # ``outfile_path.name`` directly under ``outfile_path.parent`` rather
            # than under its archived name/path.
            target.filename = outfile_path.name
            zip_ref.extract(target, path=outfile_path.parent)
    finally:
        # Guard so a missing download doesn't mask the original error.
        if os.path.exists(dl_path):
            os.remove(dl_path)

def _http_download(
self,
url: str,
Expand Down
78 changes: 78 additions & 0 deletions src/wags_tails/chemidplus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Provide source fetching for ChemIDplus."""
import logging
import re
from pathlib import Path
from typing import Optional, Tuple

import requests

from .base_source import DataSource, RemoteDataError

_logger = logging.getLogger(__name__)


class ChemIDplusData(DataSource):
    """Provide access to ChemIDplus database."""

    def __init__(self, data_dir: Optional[Path] = None, silent: bool = False) -> None:
        """Set common class parameters.

        :param data_dir: direct location to store data files in. If not provided, tries
            to find a "chemidplus" subdirectory within the path at environment variable
            $WAGS_TAILS_DIR, or within a "wags_tails" subdirectory under environment
            variables $XDG_DATA_HOME or $XDG_DATA_DIRS, or finally, at
            ``~/.local/share/``
        :param silent: if True, don't print any info/updates to console
        """
        # Assigned ahead of the base initializer; presumably DataSource reads it
        # while resolving the data directory -- confirm against the base class.
        self._src_name = "chemidplus"
        super().__init__(data_dir, silent)

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value

        Requests only the first bytes of the current ChemIDplus XML file and pulls
        the ``date`` attribute out of that fragment.

        :return: latest release value
        :raise RemoteDataError: if unable to parse version number from data file
        """
        response = requests.get(
            "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
            # leave some slack to capture date
            headers={"Range": "bytes=0-300"},
        )
        response.raise_for_status()
        date_match = re.search(
            r" date=\"([0-9]{4}-[0-9]{2}-[0-9]{2})\">", response.text
        )
        if date_match is None:
            raise RemoteDataError(
                "Unable to parse latest ChemIDplus version number from partial access to latest file"
            )
        return date_match.group(1)

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            local_path = self._get_latest_local_file("chemidplus_*.db")
            return local_path, self._parse_file_version(local_path)

        latest_version = self._get_latest_version()
        # NOTE(review): the remote resource is an XML file but is stored with a
        # ``.db`` suffix -- confirm this naming is intended.
        latest_file = self._data_dir / f"chemidplus_{latest_version}.db"
        if force_refresh or not latest_file.exists():
            self._http_download(
                "https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
                latest_file,
            )
        else:
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
        return latest_file, latest_version
74 changes: 74 additions & 0 deletions src/wags_tails/drugbank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Provide source fetching for DrugBank."""
import logging
from pathlib import Path
from typing import Optional, Tuple

import requests

from .base_source import DataSource, RemoteDataError

_logger = logging.getLogger(__name__)


# NOTE(review): the class name repeats "Data"; it is left unchanged here so that
# existing imports keep working.
class DrugBankDataData(DataSource):
    """Provide access to DrugBank database."""

    def __init__(self, data_dir: Optional[Path] = None, silent: bool = False) -> None:
        """Set common class parameters.

        :param data_dir: direct location to store data files in. If not provided, tries
            to find a "drugbank" subdirectory within the path at environment variable
            $WAGS_TAILS_DIR, or within a "wags_tails" subdirectory under environment
            variables $XDG_DATA_HOME or $XDG_DATA_DIRS, or finally, at
            ``~/.local/share/``
        :param silent: if True, don't print any info/updates to console
        """
        self._src_name = "drugbank"
        super().__init__(data_dir, silent)

    @staticmethod
    def _get_latest_version() -> Tuple[str, str]:
        """Retrieve latest version value

        Reads the first entry of the DrugBank releases JSON listing.

        :return: latest release value and base download URL
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        releases_url = "https://go.drugbank.com/releases.json"
        r = requests.get(releases_url)
        r.raise_for_status()
        try:
            latest = r.json()[0]
            return latest["version"], latest["url"]
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # ValueError covers a body that is not valid JSON; TypeError covers a
            # payload whose shape is not list-of-objects. Chain the cause so the
            # underlying parsing failure stays visible in the traceback.
            raise RemoteDataError(
                "Unable to parse latest DrugBank version number from releases API endpoint"
            ) from e

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            file_path = self._get_latest_local_file("drugbank_*.db")
            return file_path, self._parse_file_version(file_path)

        latest_version, latest_url_base = self._get_latest_version()
        latest_url = f"{latest_url_base}/downloads/all-drugbank-vocabulary"
        latest_file = self._data_dir / f"drugbank_{latest_version}.db"
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
            return latest_file, latest_version
        # The download is a zip archive; the handler unpacks it to ``latest_file``.
        self._http_download(latest_url, latest_file, handler=self._zip_handler)
        return latest_file, latest_version
73 changes: 73 additions & 0 deletions src/wags_tails/drugsatfda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""Provide source fetching for Drugs@FDA."""
import logging
from pathlib import Path
from typing import Optional, Tuple

import requests

from .base_source import DataSource, RemoteDataError

_logger = logging.getLogger(__name__)


class DrugsAtFdaData(DataSource):
    """Provide access to Drugs@FDA database."""

    def __init__(self, data_dir: Optional[Path] = None, silent: bool = False) -> None:
        """Set common class parameters.

        :param data_dir: direct location to store data files in. If not provided, tries
            to find a "drugsatfda" subdirectory within the path at environment variable
            $WAGS_TAILS_DIR, or within a "wags_tails" subdirectory under environment
            variables $XDG_DATA_HOME or $XDG_DATA_DIRS, or finally, at
            ``~/.local/share/``
        :param silent: if True, don't print any info/updates to console
        """
        self._src_name = "drugsatfda"
        super().__init__(data_dir, silent)

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value

        Reads the ``export_date`` of the ``drugsfda`` entry in the openFDA download
        listing.

        :return: latest release value
        :raise RemoteDataError: if unable to parse version number from releases API
        """
        r = requests.get("https://api.fda.gov/download.json")
        r.raise_for_status()
        r_json = r.json()
        try:
            return r_json["results"]["drug"]["drugsfda"]["export_date"]
        except (KeyError, TypeError) as e:
            # TypeError covers a nesting level that is not a mapping. The message
            # previously named DrugBank (copy-paste slip); chain the cause so the
            # missing key is visible in the traceback.
            raise RemoteDataError(
                "Unable to parse latest Drugs@FDA version number from releases API endpoint"
            ) from e

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            file_path = self._get_latest_local_file("drugsatfda_*.json")
            return file_path, self._parse_file_version(file_path)

        latest_version = self._get_latest_version()
        latest_url = "https://download.open.fda.gov/drug/drugsfda/drug-drugsfda-0001-of-0001.json.zip"
        latest_file = self._data_dir / f"drugsatfda_{latest_version}.json"
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
            return latest_file, latest_version
        # The download is a zip archive; the handler unpacks it to ``latest_file``.
        self._http_download(latest_url, latest_file, handler=self._zip_handler)
        return latest_file, latest_version
110 changes: 110 additions & 0 deletions src/wags_tails/ncit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Provide source fetching for NCI Thesaurus."""
import logging
import re
from pathlib import Path
from typing import Optional, Tuple

import requests

from .base_source import DataSource, RemoteDataError

_logger = logging.getLogger(__name__)


class NcitData(DataSource):
    """Provide access to NCI Thesaurus database."""

    def __init__(self, data_dir: Optional[Path] = None, silent: bool = False) -> None:
        """Set common class parameters.

        :param data_dir: direct location to store data files in. If not provided, tries
            to find a "ncit" subdirectory within the path at environment variable
            $WAGS_TAILS_DIR, or within a "wags_tails" subdirectory under environment
            variables $XDG_DATA_HOME or $XDG_DATA_DIRS, or finally, at
            ``~/.local/share/``
        :param silent: if True, don't print any info/updates to console
        """
        self._src_name = "ncit"
        super().__init__(data_dir, silent)

    @staticmethod
    def _get_latest_version() -> str:
        """Retrieve latest version value

        Scans the NCIt browser homepage line by line for a line that, once
        stripped, begins with ``Version:<version>``.

        :return: latest release value
        :raise RemoteDataError: if unable to parse version number from the homepage
            HTML
        """
        r = requests.get("https://ncithesaurus.nci.nih.gov/ncitbrowser/")
        r.raise_for_status()
        pattern = re.compile(r"Version:(\d\d\.\d\d\w)")
        for line in r.text.split("\n"):
            # match() anchors at the start, so only lines beginning with
            # "Version:" (after stripping whitespace) qualify.
            match = pattern.match(line.strip())
            if match:
                return match.group(1)
        raise RemoteDataError(
            "Unable to parse latest NCIt version number homepage HTML."
        )

    @staticmethod
    def _get_url(version: str) -> str:
        """Locate URL for requested version of NCIt data.

        NCI has a somewhat inconsistent file structure, so some tricks are needed:
        the base directory is tried first, then two archive directory layouts.

        :param version: requested version
        :return: URL for NCIt OWL file
        :raise RemoteDataError: if unexpected NCI directory structure is encountered
        """
        base_url = "https://evs.nci.nih.gov/ftp1/NCI_Thesaurus"
        release_fname = f"Thesaurus_{version}.OWL.zip"
        candidate_urls = (
            # base NCIt directory
            f"{base_url}/{release_fname}",
            # NCIt archive directories
            f"{base_url}/archive/{version}_Release/{release_fname}",
            f"{base_url}/archive/20{version[0:2]}/{version}_Release/{release_fname}",
        )
        for candidate in candidate_urls:
            # stream=True fetches only the status line and headers; closing the
            # response without reading avoids downloading the whole archive just
            # to check that it exists.
            with requests.get(candidate, stream=True) as response:
                if response.status_code == 200:
                    return candidate
        raise RemoteDataError(f"Unable to locate URL for NCIt version {version}")

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data, and its version value

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            file_path = self._get_latest_local_file("ncit_*.owl")
            return file_path, self._parse_file_version(file_path)

        latest_version = self._get_latest_version()
        latest_file = self._data_dir / f"ncit_{latest_version}.owl"
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
            return latest_file, latest_version
        url = self._get_url(latest_version)
        # The download is a zip archive; the handler unpacks it to ``latest_file``.
        self._http_download(url, latest_file, handler=self._zip_handler)
        return latest_file, latest_version
Loading

0 comments on commit 70f4180

Please sign in to comment.