Skip to content

Commit

Permalink
feat: use latest HGNC data API (#58)
Browse files — browse the repository at this point in the history
  • Loading branch information
jsstevenson authored Nov 26, 2024
1 parent d6774ad commit 29de5a4
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 20 deletions.
36 changes: 25 additions & 11 deletions src/wags_tails/hgnc.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
"""Provide data fetching for HGNC."""

import ftplib
import datetime
from pathlib import Path

from wags_tails.base_source import DataSource
from wags_tails.utils.downloads import download_ftp
import requests

from wags_tails.base_source import DataSource, RemoteDataError
from wags_tails.utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http
from wags_tails.utils.versioning import DATE_VERSION_PATTERN


class HgncData(DataSource):
Expand All
@@ -22,21 +25,32 @@ def _get_latest_version(self) -> str:
:return: latest release value
"""
with ftplib.FTP(self._host) as ftp:
ftp.login()
timestamp = ftp.voidcmd(f"MDTM {self._directory_path}{self._host_filename}")
return timestamp[4:12]
r = requests.get(
"https://rest.genenames.org/info",
timeout=HTTPS_REQUEST_TIMEOUT,
headers={"Accept": "application/json"},
)
r.raise_for_status()
r_json = r.json()
try:
date = r_json["lastModified"]
except KeyError as e:
msg = f"Unable to parse latest {self._src_name} version number from info API endpoint"
raise RemoteDataError(msg) from e
return (
datetime.datetime.strptime(date.split("T")[0], "%Y-%m-%d")
.replace(tzinfo=datetime.UTC)
.strftime(DATE_VERSION_PATTERN)
)

def _download_data(self, version: str, outfile: Path) -> None: # noqa: ARG002
"""Download data file to specified location.
:param version: version to acquire
:param outfile: location and filename for final data file
"""
download_ftp(
self._host,
self._directory_path,
self._host_filename,
download_http(
"https://storage.googleapis.com/public-download-files/hgnc/json/json/hgnc_complete_set.json",
outfile,
tqdm_params=self._tqdm_params,
)
Empty file added tests/fixtures/hgnc.json
Empty file.
1 change: 1 addition & 0 deletions tests/fixtures/hgnc_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"lastModified": "2024-11-22T13:08:53.008Z", "numDoc": 45634, "responseHeader": {"QTime": 0, "status": 0}, "searchableFields": ["curator_notes", "alias_name", "vega_id", "ena", "prev_name", "locus_type", "entrez_id", "uniprot_ids", "alias_symbol", "omim_id", "locus_group", "status", "ccds_id", "location", "mane_select", "rna_central_id", "refseq_accession", "ucsc_id", "mgd_id", "ensembl_gene_id", "symbol", "rgd_id", "name", "hgnc_id", "prev_symbol"], "storedFields": ["agr", "mamit-trnadb", "gtrnadb", "ena", "date_name_changed", "location", "bioparadigms_slc", "gene_group_id", "locus_group", "prev_name", "date_symbol_changed", "uniprot_ids", "omim_id", "lncipedia", "ccds_id", "enzyme_id", "alias_name", "imgt", "snornabase", "name", "prev_symbol", "pseudogene.org", "hgnc_id", "ensembl_gene_id", "symbol", "alias_name", "cosmic", "vega_id", "date_modified", "curator_notes", "lsdb", "cd", "horde_id", "status", "locus_type", "merops", "entrez_id", "alias_symbol", "mirbase", "_version_", "uuid", "orphanet", "refseq_accession", "iuphar", "homeodb", "mane_select", "rna_central_id", "location", "gencc", "lncrnadb", "date_approved_reserved", "rgd_id", "pubmed_id", "gene_group", "ucsc_id", "mgd_id", "prev_name"]}
70 changes: 61 additions & 9 deletions tests/test_hgnc.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,49 @@
"""Test HGNC data source."""

import json
from pathlib import Path

import pytest
import requests_mock

from wags_tails import HgncData


@pytest.fixture()
def hgnc_data_dir(base_data_dir: Path):
"""Provide HGNC data directory."""
"""Provide fixture for HGNC wags-tails directory"""
directory = base_data_dir / "hgnc"
directory.mkdir(exist_ok=True, parents=True)
return directory


@pytest.fixture()
def hgnc(hgnc_data_dir: Path):
"""Provide ChemblData fixture"""
"""Provide fixture for HGNC fetcher instance"""
return HgncData(hgnc_data_dir, silent=True)


def test_get_latest_local(
@pytest.fixture(scope="module")
def info_response(fixture_dir):
    """Load the stored HGNC release-info API response.

    :param fixture_dir: path-like base directory containing ``hgnc_info.json``
    :return: parsed JSON content of ``hgnc_info.json``
    """
    info_path = fixture_dir / "hgnc_info.json"
    with info_path.open() as handle:
        payload = json.load(handle)
    return payload


@pytest.fixture(scope="module")
def hgnc_file(fixture_dir):
    """Read the stored HGNC data file as raw bytes.

    :param fixture_dir: path-like base directory containing ``hgnc.json``
    :return: unmodified byte content of ``hgnc.json``
    """
    data_path = fixture_dir / "hgnc.json"
    # read_bytes() opens in binary mode, reads everything, and closes the file
    return data_path.read_bytes()


def test_get_latest(
hgnc: HgncData,
hgnc_data_dir: Path,
info_response: dict,
hgnc_file: str,
):
"""Test local file management in HgncData.get_latest()"""
"""Test HGNC fetcher"""
with pytest.raises(
ValueError, match="Cannot set both `force_refresh` and `from_local`"
):
Expand All
@@ -34,8 +52,42 @@ def test_get_latest_local(
with pytest.raises(FileNotFoundError):
hgnc.get_latest(from_local=True)

file_path = hgnc_data_dir / "hgnc_20230914.json"
file_path.touch()
path, version = hgnc.get_latest(from_local=True)
assert path == file_path
assert version == "20230914"
with requests_mock.Mocker() as m:
m.get(
"https://rest.genenames.org/info",
json=info_response,
)
m.get(
"https://storage.googleapis.com/public-download-files/hgnc/json/json/hgnc_complete_set.json",
content=hgnc_file,
)
path, version = hgnc.get_latest()
assert path == hgnc_data_dir / "hgnc_20241122.json"
assert path.exists()
assert version == "20241122"
assert m.call_count == 2

path, version = hgnc.get_latest()
assert path == hgnc_data_dir / "hgnc_20241122.json"
assert path.exists()
assert version == "20241122"
assert m.call_count == 3

path, version = hgnc.get_latest(from_local=True)
assert path == hgnc_data_dir / "hgnc_20241122.json"
assert path.exists()
assert version == "20241122"
assert m.call_count == 3

(hgnc_data_dir / "hgnc_20230923.json").touch()
path, version = hgnc.get_latest(from_local=True)
assert path == hgnc_data_dir / "hgnc_20241122.json"
assert path.exists()
assert version == "20241122"
assert m.call_count == 3

path, version = hgnc.get_latest(force_refresh=True)
assert path == hgnc_data_dir / "hgnc_20241122.json"
assert path.exists()
assert version == "20241122"
assert m.call_count == 5

0 comments on commit 29de5a4

Please sign in to comment.