Skip to content

Commit

Permalink
feat: add custom source
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Oct 25, 2023
1 parent 69cb105 commit 3a5f7bd
Show file tree
Hide file tree
Showing 16 changed files with 279 additions and 17 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ docstring-quotes = "double"
[tool.ruff.per-file-ignores]
# ANN001 - missing-type-function-argument
# ANN2 - missing-return-type
# ANN102 - missing-type-annotation
# ANN201 - missing-return-type-undocumented-public-function
# ANN102 - missing-type-cls
# D103 - Missing docstring in public function
Expand All @@ -97,3 +98,4 @@ docstring-quotes = "double"
"tests/*" = ["ANN001", "ANN102", "ANN2"]
"*__init__.py" = ["F401"]
"docs/source/conf.py" = ["D100", "I001", "D103", "ANN201", "ANN001"]
"src/wags_tails/base_source.py" = ["ANN102"]
4 changes: 3 additions & 1 deletion src/wags_tails/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Data acquisition tools for Wagnerds."""
from .chembl import ChemblData
from .chemidplus import ChemIDplusData
from .custom import CustomData
from .drugbank import DrugBankData
from .drugsatfda import DrugsAtFdaData
from .guide_to_pharmacology import GToPLigandData
Expand All @@ -9,12 +10,13 @@
from .rxnorm import RxNormData

__all__ = [
"MondoData",
"ChemblData",
"ChemIDplusData",
"CustomData",
"DrugBankData",
"DrugsAtFdaData",
"GToPLigandData",
"HemOncData",
"MondoData",
"RxNormData",
]
18 changes: 11 additions & 7 deletions src/wags_tails/base_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ def get_latest(

### shared utilities

@staticmethod
def _get_data_base() -> Path:
@classmethod
def _get_data_base(cls) -> Path:
"""Get base data storage location.
By default, conform to `XDG Base Directory Specification <https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html>`_,
Expand Down Expand Up @@ -103,8 +103,8 @@ def _get_data_base() -> Path:
data_base_dir.mkdir(exist_ok=True, parents=True)
return data_base_dir

@staticmethod
def _zip_handler(dl_path: Path, outfile_path: Path) -> None:
@classmethod
def _zip_handler(cls, dl_path: Path, outfile_path: Path) -> None:
"""Provide simple callback function to extract the largest file within a given
zipfile and save it within the appropriate data directory.
Expand All @@ -123,12 +123,14 @@ def _zip_handler(dl_path: Path, outfile_path: Path) -> None:
zip_ref.extract(target, path=outfile_path.parent)
os.remove(dl_path)

@classmethod
def _http_download(
self,
cls,
url: str,
outfile_path: Path,
headers: Optional[Dict] = None,
handler: Optional[Callable[[Path, Path], None]] = None,
tqdm_params: Optional[Dict] = None,
) -> None:
"""Perform HTTP download of remote data file.
Expand All @@ -139,6 +141,8 @@ def _http_download(
:param handler: provide if downloaded file requires additional action, e.g.
it's a zip file.
"""
if not tqdm_params:
tqdm_params = {}
_logger.info(f"Downloading {outfile_path.name} from {url}...")
if handler:
dl_path = Path(tempfile.gettempdir()) / "wags_tails_tmp"
Expand All @@ -149,7 +153,7 @@ def _http_download(
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0))
with open(dl_path, "wb") as h:
if not self._tqdm_params["disable"]:
if not tqdm_params["disable"]:
if "apiKey" in url:
pattern = r"&apiKey=.{8}-.{4}-.{4}-.{4}-.{12}"
print_url = re.sub(pattern, "", os.path.basename(url))
Expand All @@ -158,7 +162,7 @@ def _http_download(
print(f"Downloading {os.path.basename(url)}")
with tqdm(
total=total_size,
**self._tqdm_params,
**tqdm_params,
) as progress_bar:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
Expand Down
4 changes: 2 additions & 2 deletions src/wags_tails/chembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@

import requests

from wags_tails.base_source import DataSource, RemoteDataError
from wags_tails.version_utils import parse_file_version

from .base_source import DataSource, RemoteDataError

_logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -97,5 +96,6 @@ def get_latest(
f"https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_{latest_version}_sqlite.tar.gz",
latest_file,
handler=self._open_tarball,
tqdm_params=self._tqdm_params,
)
return latest_file, latest_version
9 changes: 7 additions & 2 deletions src/wags_tails/chemidplus.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""Provide source fetching for ChemIDplus."""
import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

import requests

from wags_tails.version_utils import parse_file_version
from wags_tails.version_utils import DATE_VERSION_PATTERN, parse_file_version

from .base_source import DataSource, RemoteDataError

Expand Down Expand Up @@ -42,7 +43,10 @@ def _get_latest_version() -> str:
r.raise_for_status()
result = re.search(r" date=\"([0-9]{4}-[0-9]{2}-[0-9]{2})\">", r.text)
if result:
return result.groups()[0]
raw_date = result.groups()[0]
return datetime.strptime(raw_date, "%Y-%m-%d").strftime(
DATE_VERSION_PATTERN
)
else:
raise RemoteDataError(
"Unable to parse latest ChemIDplus version number from partial access to latest file"
Expand Down Expand Up @@ -76,5 +80,6 @@ def get_latest(
self._http_download(
"https://ftp.nlm.nih.gov/projects/chemidlease/CurrentChemID.xml",
latest_file,
tqdm_params=self._tqdm_params,
)
return latest_file, latest_version
84 changes: 84 additions & 0 deletions src/wags_tails/custom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Provide data acquisition class for custom data acquisition needs.
For some source data (e.g. Wikidata, for Thera-py), fetching data is a more involved and
customized process, but this library should be very dependency-light to ensure broad
compatibility.
"""
import logging
from pathlib import Path
from typing import Callable, Optional, Tuple

from wags_tails.base_source import DataSource
from wags_tails.version_utils import parse_file_version

_logger = logging.getLogger(__name__)


class CustomData(DataSource):
    """Data acquisition class using custom, user-provided acquisition methods.

    The caller supplies two callbacks: one that reports the latest available version
    of the data, and one that actually fetches the data to a given path. This class
    handles file naming (``<src_name>_<version>.<file_suffix>``) and the decision of
    whether a download is needed.
    """

    def __init__(
        self,
        src_name: str,
        file_suffix: str,
        latest_version_cb: Callable[[], str],
        download_cb: Callable[[Path, str], None],
        data_dir: Optional[Path] = None,
        silent: bool = False,
    ) -> None:
        """Set common class parameters.

        :param src_name: Name of source. Used to set some default file naming and location
            parameters.
        :param file_suffix: file type. Used to set some default naming and location
            parameters.
        :param latest_version_cb: function for acquiring latest version, returning that
            value as a string
        :param download_cb: function for acquiring data, taking arguments for the Path
            to save the file to, and the latest version of the data
        :param data_dir: direct location to store data files in. If not provided, tries
            to find a ``<src_name>`` subdirectory within the path at environment variable
            $WAGS_TAILS_DIR, or within a "wags_tails" subdirectory under environment
            variables $XDG_DATA_HOME or $XDG_DATA_DIRS, or finally, at
            ``~/.local/share/``
        :param silent: if True, don't print any info/updates to console
        """
        # These are assigned before delegating to the parent initializer --
        # presumably it reads ``_src_name`` when resolving the data directory
        # (TODO confirm against DataSource.__init__), so keep this ordering.
        self._src_name = src_name
        self._file_suffix = file_suffix
        self._get_latest_version = latest_version_cb
        self._download_data = download_cb
        super().__init__(data_dir, silent)

    def get_latest(
        self, from_local: bool = False, force_refresh: bool = False
    ) -> Tuple[Path, str]:
        """Get path to latest version of data.

        Note that the ``from_local`` lookup extracts the version with a ``\\d+``
        capture group, i.e. it expects the version portion of local file names to
        consist solely of digits.

        :param from_local: if True, use latest available local file
        :param force_refresh: if True, fetch and return data from remote regardless of
            whether a local copy is present
        :return: Path to location of data, and version value of it
        :raise ValueError: if both ``force_refresh`` and ``from_local`` are True
        """
        # Imported locally so this class does not depend on a module-level import.
        import re

        if force_refresh and from_local:
            raise ValueError("Cannot set both `force_refresh` and `from_local`")

        if from_local:
            file_path = self._get_latest_local_file(
                f"{self._src_name}_*.{self._file_suffix}"
            )
            # Escape the user-provided name/suffix and the literal dot separator so
            # they are matched verbatim rather than interpreted as regex syntax.
            version_pattern = (
                rf"{re.escape(self._src_name)}_(\d+)\.{re.escape(self._file_suffix)}"
            )
            return file_path, parse_file_version(file_path, version_pattern)

        latest_version = self._get_latest_version()
        latest_file = (
            self._data_dir / f"{self._src_name}_{latest_version}.{self._file_suffix}"
        )
        if (not force_refresh) and latest_file.exists():
            _logger.debug(
                f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
            )
            return latest_file, latest_version

        # Either no local copy matches the latest version, or a refresh was forced.
        self._download_data(latest_file, latest_version)
        return latest_file, latest_version
7 changes: 6 additions & 1 deletion src/wags_tails/drugbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,5 +93,10 @@ def get_latest(
f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
)
return latest_file, latest_version
self._http_download(latest_url, latest_file, handler=self._zip_handler)
self._http_download(
latest_url,
latest_file,
handler=self._zip_handler,
tqdm_params=self._tqdm_params,
)
return latest_file, latest_version
7 changes: 6 additions & 1 deletion src/wags_tails/drugsatfda.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,10 @@ def get_latest(
f"Found existing file, {latest_file.name}, matching latest version {latest_version}."
)
return latest_file, latest_version
self._http_download(latest_url, latest_file, handler=self._zip_handler)
self._http_download(
latest_url,
latest_file,
handler=self._zip_handler,
tqdm_params=self._tqdm_params,
)
return latest_file, latest_version
2 changes: 2 additions & 0 deletions src/wags_tails/guide_to_pharmacology.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,10 @@ def _download_files(self, file_paths: GtoPLigandPaths) -> None:
self._http_download(
"https://www.guidetopharmacology.org/DATA/ligands.tsv",
file_paths.ligands,
tqdm_params=self._tqdm_params,
)
self._http_download(
"https://www.guidetopharmacology.org/DATA/ligand_id_mapping.tsv",
file_paths.ligand_id_mapping,
tqdm_params=self._tqdm_params,
)
1 change: 1 addition & 0 deletions src/wags_tails/hemonc.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,5 +140,6 @@ def get_latest(
handler=lambda dl_path, save_path: self._download_handler(
dl_path, file_paths
),
tqdm_params=self._tqdm_params,
)
return file_paths, latest_version
3 changes: 2 additions & 1 deletion src/wags_tails/mondo.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def get_latest(
)
return latest_file, latest_version
else:
self._http_download(data_url, latest_file) # type: ignore
self._http_download(data_url, latest_file, tqdm_params=self._tqdm_params)
return latest_file, latest_version

def get_specific(
Expand Down Expand Up @@ -113,5 +113,6 @@ def get_specific(
self._http_download(
f"https://github.com/monarch-initiative/mondo/releases/download/{version}/mondo.owl",
local_file,
tqdm_params=self._tqdm_params,
)
return local_file
4 changes: 3 additions & 1 deletion src/wags_tails/ncit.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,7 @@ def get_latest(
)
return latest_file, latest_version
url = self._get_url(latest_version)
self._http_download(url, latest_file, handler=self._zip_handler)
self._http_download(
url, latest_file, handler=self._zip_handler, tqdm_params=self._tqdm_params
)
return latest_file, latest_version
4 changes: 3 additions & 1 deletion src/wags_tails/rxnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ def _download_file(self, file_path: Path, version: str) -> None:
dl_url = f"https://download.nlm.nih.gov/umls/kss/rxnorm/RxNorm_full_{fmt_version}.zip"
url = f"https://uts-ws.nlm.nih.gov/download?url={dl_url}&apiKey={api_key}"

self._http_download(url, file_path, handler=self._zip_handler)
self._http_download(
url, file_path, handler=self._zip_handler, tqdm_params=self._tqdm_params
)

def get_latest(
self, from_local: bool = False, force_refresh: bool = False
Expand Down
6 changes: 6 additions & 0 deletions tests/fixtures/chemidplus.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<file name="chemid-2023-02-22.xml" date="2023-02-22">
<Chemical id="0000034742" displayFormula="C12-H14-O4" displayName="Monobutyl phthalate">
<NameList><NameOfSubstance>Monobutyl phthalate<SourceList><Source>MeSH</Source></SourceList></NameOfSubstance><Synonyms>Mono-n-butyl-phthalate<SourceList><Source>NLM</Source></SourceList></Synonyms></NameList>
</Chemical>
</file>
Loading

0 comments on commit 3a5f7bd

Please sign in to comment.