-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
388 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
"""Fetches transcript mapping data from Ensembl BioMart.""" | ||
|
||
from pathlib import Path | ||
|
||
from wags_tails.base_source import UnversionedDataSource | ||
from wags_tails.utils.downloads import download_http | ||
|
||
# BioMart XML query: asks the "hsapiens_gene_ensembl" dataset for gene,
# transcript and peptide IDs (plain and versioned), the MANE Select transcript
# and the external gene name, formatted as TSV with a header row.
QUERY = (
    '<Query virtualSchemaName="default" formatter="TSV" header="1" datasetConfigVersion="0.6">'
    '<Dataset name="hsapiens_gene_ensembl" interface="default">'
    '<Attribute name="ensembl_gene_id" />'
    '<Attribute name="ensembl_gene_id_version" />'
    '<Attribute name="ensembl_transcript_id" />'
    '<Attribute name="ensembl_transcript_id_version" />'
    '<Attribute name="ensembl_peptide_id" />'
    '<Attribute name="ensembl_peptide_id_version" />'
    '<Attribute name="transcript_mane_select" />'
    '<Attribute name="external_gene_name" />'
    "</Dataset></Query>"
)
|
||
|
||
class EnsemblTranscriptMappingData(UnversionedDataSource):
    """Data source for the Ensembl BioMart transcript mapping table.

    The file is requested from the BioMart ``martservice`` endpoint using the
    module-level ``QUERY`` document and stored with a ``tsv`` file type.
    """

    _src_name = "ensembl_transcript_mappings"
    _filetype = "tsv"

    def _download_data(self, version: str, outfile: Path) -> None:
        """Fetch the BioMart query result and write it to ``outfile``.

        :param version: version to acquire (not referenced by this method; the
            request URL is the same for every call)
        :param outfile: location and filename for final data file
        """
        # The XML query document is passed as the value of the `query` parameter.
        query_url = f"http://ensembl.org/biomart/martservice?query={QUERY}"
        download_http(query_url, outfile, tqdm_params=self._tqdm_params)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
"""Fetches NCBI LRG_RefSeqGene data.""" | ||
import re | ||
from pathlib import Path | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http | ||
|
||
|
||
class NcbiLrgRefSeqGeneData(DataSource):
    """Data source for the NCBI ``LRG_RefSeqGene`` file."""

    _src_name = "ncbi_lrg_refseqgene"
    _filetype = "tsv"

    def _get_latest_version(self) -> str:
        """Derive the latest version from the RefSeqGene directory listing.

        The first line of the listing page that mentions ``LRG_RefSeqGene`` is
        searched for a ``YYYY-MM-DD`` date, which is returned with the dashes
        removed (e.g. ``2024-02-01`` -> ``20240201``).

        :return: latest release value
        :raise RemoteDataError: if no listing line mentions the file, or that
            line contains no date
        """
        url = "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/"
        response = requests.get(url, timeout=HTTPS_REQUEST_TIMEOUT)
        response.raise_for_status()

        # Both failure modes below report the same message.
        msg = f"Unable to parse LRG_RefSeqGene updated date from directory at {url}"

        listing_line = next(
            (line for line in response.text.split("\n") if "LRG_RefSeqGene" in line),
            None,
        )
        if listing_line is None:
            raise RemoteDataError(msg)

        date_match = re.search(r"\d\d\d\d-\d\d-\d\d", listing_line)
        if date_match is None:
            raise RemoteDataError(msg)
        return date_match.group(0).replace("-", "")

    def _download_data(self, version: str, outfile: Path) -> None:
        """Fetch the ``LRG_RefSeqGene`` file and write it to ``outfile``.

        :param version: version to acquire (not referenced by this method; the
            download URL is fixed)
        :param outfile: location and filename for final data file
        """
        file_url = (
            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene"
        )
        download_http(file_url, outfile, tqdm_params=self._tqdm_params)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
"""Fetches NCBI MANE summary data.""" | ||
from pathlib import Path | ||
|
||
import requests | ||
|
||
from .base_source import DataSource, RemoteDataError | ||
from .utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http, handle_gzip | ||
|
||
|
||
class NcbiManeSummaryData(DataSource):
    """Data source for the NCBI MANE summary file."""

    _src_name = "ncbi_mane_summary"
    _filetype = "txt"

    def _get_latest_version(self) -> str:
        """Read the latest version from the MANE ``README_versions.txt`` file.

        The version is taken to be the second tab-separated field on the first
        line of the README.

        :return: latest release value
        :raise RemoteDataError: if the first line has no second tab-separated field
        """
        latest_readme_url = "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/README_versions.txt"
        response = requests.get(latest_readme_url, timeout=HTTPS_REQUEST_TIMEOUT)
        response.raise_for_status()

        first_line = response.text.partition("\n")[0]
        fields = first_line.split("\t")
        try:
            return fields[1]
        except IndexError as e:
            msg = f"Unable to parse latest NCBI MANE summary version number from README at {latest_readme_url}"
            raise RemoteDataError(msg) from e

    def _download_data(self, version: str, outfile: Path) -> None:
        """Fetch the gzipped MANE summary for ``version`` and write it to ``outfile``.

        :param version: version to acquire; interpolated into both the release
            directory and the file name of the download URL
        :param outfile: location and filename for final data file
        """
        summary_url = f"https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_{version}/MANE.GRCh38.v{version}.summary.txt.gz"
        # The remote file is a .gz archive, so the gzip handler is supplied.
        download_http(
            summary_url,
            outfile,
            handler=handle_gzip,
            tqdm_params=self._tqdm_params,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> | ||
<html> | ||
<head> | ||
<title>Index of /refseq/H_sapiens/RefSeqGene</title> | ||
</head> | ||
<body> | ||
<h1>Index of /refseq/H_sapiens/RefSeqGene</h1> | ||
<pre>Name Last modified Size <hr><a href="/refseq/H_sapiens/">Parent Directory</a> - | ||
<a href="presentations/">presentations/</a> 2012-02-14 06:11 - | ||
<a href="Aligned2RefSeqGene">Aligned2RefSeqGene</a> 2024-02-01 06:05 789K | ||
<a href="GCF_000001405.25_refseqgene_alignments.gff3">GCF_000001405.25_refseqgene_alignments.gff3</a> 2022-10-04 14:35 2.5M | ||
<a href="LRG_RefSeqGene">LRG_RefSeqGene</a> 2024-02-01 06:07 2.5M | ||
<a href="README.txt">README.txt</a> 2016-03-29 15:33 6.8K | ||
<a href="gene_RefSeqGene">gene_RefSeqGene</a> 2024-02-01 06:05 192K | ||
<a href="refseqgene.1.genomic.fna.gz">refseqgene.1.genomic.fna.gz</a> 2024-01-30 10:57 26M | ||
<a href="refseqgene.1.genomic.gbff.gz">refseqgene.1.genomic.gbff.gz</a> 2024-01-30 10:58 439M | ||
<a href="refseqgene.2.genomic.fna.gz">refseqgene.2.genomic.fna.gz</a> 2024-01-30 10:58 37M | ||
<a href="refseqgene.2.genomic.gbff.gz">refseqgene.2.genomic.gbff.gz</a> 2024-01-30 10:58 428M | ||
<a href="refseqgene.3.genomic.fna.gz">refseqgene.3.genomic.fna.gz</a> 2024-01-30 10:58 33M | ||
<a href="refseqgene.3.genomic.gbff.gz">refseqgene.3.genomic.gbff.gz</a> 2024-01-30 10:58 460M | ||
<a href="refseqgene.4.genomic.fna.gz">refseqgene.4.genomic.fna.gz</a> 2024-01-30 10:58 22M | ||
<a href="refseqgene.4.genomic.gbff.gz">refseqgene.4.genomic.gbff.gz</a> 2024-01-30 10:58 316M | ||
<a href="refseqgene.5.genomic.fna.gz">refseqgene.5.genomic.fna.gz</a> 2024-01-30 10:58 23M | ||
<a href="refseqgene.5.genomic.gbff.gz">refseqgene.5.genomic.gbff.gz</a> 2024-01-30 10:58 337M | ||
<a href="refseqgene.6.genomic.fna.gz">refseqgene.6.genomic.fna.gz</a> 2024-01-30 10:58 33M | ||
<a href="refseqgene.6.genomic.gbff.gz">refseqgene.6.genomic.gbff.gz</a> 2024-01-30 10:58 470M | ||
<a href="refseqgene.7.genomic.fna.gz">refseqgene.7.genomic.fna.gz</a> 2024-01-30 10:58 14M | ||
<a href="refseqgene.7.genomic.gbff.gz">refseqgene.7.genomic.gbff.gz</a> 2024-01-30 10:58 188M | ||
<a href="refseqgene.files.installed">refseqgene.files.installed</a> 2024-01-30 10:58 861 | ||
<hr></pre> | ||
<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html">HHS Vulnerability Disclosure</a> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
MANE Version 1.3 | ||
NCBI RefSeq Annotation Release GCF_000001405.40-RS_2023_10 | ||
Ensembl Release 112 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
"""Test Ensembl Transcript Mappings data source.""" | ||
from pathlib import Path | ||
|
||
import pytest | ||
import requests_mock | ||
|
||
from wags_tails.ensembl_transcript_mappings import EnsemblTranscriptMappingData | ||
|
||
|
||
@pytest.fixture()
def mappings_data_dir(base_data_dir: Path):
    """Create (if needed) and return the ensembl transcript mappings data directory."""
    data_dir = base_data_dir.joinpath("ensembl_transcript_mappings")
    data_dir.mkdir(parents=True, exist_ok=True)
    return data_dir
|
||
|
||
@pytest.fixture()
def ensembl_transcript_mappings(mappings_data_dir: Path):
    """Build an EnsemblTranscriptMappingData instance rooted at the test data directory."""
    source = EnsemblTranscriptMappingData(mappings_data_dir, silent=True)
    return source
|
||
|
||
def test_get_latest(
    ensembl_transcript_mappings: EnsemblTranscriptMappingData,
    mappings_data_dir: Path,
):
    """Exercise EnsemblTranscriptMappingData.get_latest() across its fetch modes."""
    source = ensembl_transcript_mappings
    expected_file = mappings_data_dir / "ensembl_transcript_mappings.tsv"

    # Conflicting flags are rejected.
    with pytest.raises(
        ValueError, match="Cannot set both `force_refresh` and `from_local`"
    ):
        source.get_latest(from_local=True, force_refresh=True)

    # Nothing has been downloaded yet, so a local-only lookup fails.
    with pytest.raises(FileNotFoundError):
        source.get_latest(from_local=True)

    with requests_mock.Mocker() as mocker:
        mocker.get(
            'http://ensembl.org/biomart/martservice?query=<Query virtualSchemaName="default" formatter="TSV" header="1" datasetConfigVersion="0.6"><Dataset name="hsapiens_gene_ensembl" interface="default"><Attribute name="ensembl_gene_id" /><Attribute name="ensembl_gene_id_version" /><Attribute name="ensembl_transcript_id" /><Attribute name="ensembl_transcript_id_version" /><Attribute name="ensembl_peptide_id" /><Attribute name="ensembl_peptide_id_version" /><Attribute name="transcript_mane_select" /><Attribute name="external_gene_name" /></Dataset></Query>',
            text="",
        )

        # First call downloads the file.
        file_path, file_version = source.get_latest()
        assert file_path == expected_file
        assert file_path.exists()
        assert file_version == ""
        assert mocker.call_count == 1

        # Second call finds the existing file.
        file_path, file_version = source.get_latest()
        assert file_path == expected_file
        assert file_path.exists()
        assert file_version == ""
        assert mocker.call_count == 1, "don't make extra call if data already exists"

        # Local-only lookup now succeeds without any request.
        file_path, file_version = source.get_latest(from_local=True)
        assert file_path == expected_file
        assert file_path.exists()
        assert mocker.call_count == 1, "don't make extra call if `from_local` == True"

        # A forced refresh downloads again.
        file_path, file_version = source.get_latest(force_refresh=True)
        assert file_path == expected_file
        assert file_path.exists()
        assert file_version == ""
        assert mocker.call_count == 2, "make extra call if `force_refresh` == True"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
"""Test NCBI LRG_RefSeqGene data source.""" | ||
from pathlib import Path | ||
|
||
import pytest | ||
import requests_mock | ||
|
||
from wags_tails import NcbiLrgRefSeqGeneData | ||
|
||
|
||
@pytest.fixture()
def ncbi_lrg_refseqgene_data_dir(base_data_dir: Path):
    """Create (if needed) and return the LRG_RefSeqGene data directory."""
    data_dir = base_data_dir.joinpath("ncbi_lrg_refseqgene")
    data_dir.mkdir(parents=True, exist_ok=True)
    return data_dir
|
||
|
||
@pytest.fixture()
def ncbi_lrg_refseqgene(ncbi_lrg_refseqgene_data_dir: Path):
    """Build an NcbiLrgRefSeqGeneData instance rooted at the test data directory."""
    source = NcbiLrgRefSeqGeneData(ncbi_lrg_refseqgene_data_dir, silent=True)
    return source
|
||
|
||
@pytest.fixture(scope="module")
def index_html_file(fixture_dir: Path):
    """Return the saved NIH file index page text, used for getting the latest version."""
    index_page = fixture_dir / "ncbi_lrg_refseqgene_index.html"
    return index_page.read_text()
|
||
|
||
def test_get_latest(
    ncbi_lrg_refseqgene: NcbiLrgRefSeqGeneData,
    ncbi_lrg_refseqgene_data_dir: Path,
    index_html_file: str,
):
    """Exercise NcbiLrgRefSeqGeneData.get_latest() across its fetch modes."""
    source = ncbi_lrg_refseqgene
    expected_file = ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"

    # Conflicting flags are rejected.
    with pytest.raises(
        ValueError, match="Cannot set both `force_refresh` and `from_local`"
    ):
        source.get_latest(from_local=True, force_refresh=True)

    # Nothing has been downloaded yet, so a local-only lookup fails.
    with pytest.raises(FileNotFoundError):
        source.get_latest(from_local=True)

    with requests_mock.Mocker() as mocker:
        # Directory listing page (source of the version date) and the data file.
        mocker.get(
            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/",
            text=index_html_file,
        )
        mocker.get(
            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene",
            text="",
        )

        # First call: two requests are expected in total.
        file_path, file_version = source.get_latest()
        assert file_path == expected_file
        assert file_path.exists()
        assert file_version == "20240201"
        assert mocker.call_count == 2

        # Second call: exactly one further request is expected.
        file_path, file_version = source.get_latest()
        assert file_path == expected_file
        assert file_path.exists()
        assert file_version == "20240201"
        assert mocker.call_count == 3

        # Local-only lookup makes no request.
        file_path, file_version = source.get_latest(from_local=True)
        assert file_path == expected_file
        assert file_path.exists()
        assert mocker.call_count == 3

        # An older local file must not be preferred over the newer one.
        (ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240131.tsv").touch()
        file_path, file_version = source.get_latest(from_local=True)
        assert file_path == expected_file
        assert file_path.exists()
        assert file_version == "20240201"
        assert mocker.call_count == 3

        # A forced refresh is expected to issue two more requests.
        file_path, file_version = source.get_latest(force_refresh=True)
        assert file_path == expected_file
        assert file_path.exists()
        assert file_version == "20240201"
        assert mocker.call_count == 5
Oops, something went wrong.