Skip to content

Commit

Permalink
Merge pull request #109 from sudlab/ns-rse/io-update-file-descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
ns-rse authored Dec 9, 2024
2 parents 1b30165 + ba68c3a commit 1541287
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 47 deletions.
58 changes: 31 additions & 27 deletions isoslam/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,21 @@ def create_config(args: argparse.Namespace | None = None) -> None:

def load_file(file_path: str | Path) -> Any:
"""
Load files.
Load files of different types.
Supports the following file types...
+----------+--------------------------------------------------------------------------+
| File | Description |
+==========+==========================================================================+
| ``.bam`` | The sequence data that is to be analysed. |
+----------+--------------------------------------------------------------------------+
| ``.bed`` | The locations of introns/splice junctions. |
+----------+--------------------------------------------------------------------------+
| ``.gtf`` | Transcript structures from which the ``.bed`` file is derived. |
+----------+--------------------------------------------------------------------------+
| ``.vcf`` | Locations of known sequence differences from the reference sequence. |
+----------+--------------------------------------------------------------------------+
Parameters
----------
Expand Down Expand Up @@ -206,16 +220,16 @@ def _get_loader(file_ext: str = "bam") -> Callable: # type: ignore[type-arg]
return _load_bed
if file_ext == ".gtf":
return _load_gtf
if file_ext == ".tbi":
return _load_tbi
if file_ext == ".vcf" or file_ext == ".vcf.gz":
return _load_vcf
raise ValueError(file_ext)


def _load_bam(bam_file: str | Path) -> pysam.libcalignmentfile.AlignmentFile:
"""
Load '.bam' file.
Load ``.bam`` file.
``.bam`` files are the sequence data that is to be analysed.
Parameters
----------
Expand All @@ -235,7 +249,9 @@ def _load_bam(bam_file: str | Path) -> pysam.libcalignmentfile.AlignmentFile:

def _load_bed(bed_file: str | Path) -> TextIO:
"""
Open '.bed' file for reading, supports gzip compressed formats.
Open ``.bed`` file for reading, supports gzip compressed formats.
``.bed`` files contain the locations of introns/splice junctions.
Parameters
----------
Expand All @@ -257,7 +273,9 @@ def _load_bed(bed_file: str | Path) -> TextIO:

def _load_gtf(gtf_file: str | Path) -> pysam.libctabix.tabix_generic_iterator:
"""
Load '.gtf' file and return as an iterable.
Load ``.gtf`` file and return as an iterable.
``.gtf`` files contain the transcript structures from which the ``.bed`` file is derived.
Parameters
----------
Expand All @@ -277,7 +295,13 @@ def _load_gtf(gtf_file: str | Path) -> pysam.libctabix.tabix_generic_iterator:

def _load_vcf(vcf_file: str | Path) -> pysam.libcbcf.VariantFile:
"""
Load '.vcf' file.
Load ``.vcf`` file.
``.vcf`` files contain the locations of known sequence differences from the reference sequence. Any ``T > C``
conversions (e.g. SNPs) that match the location of known sequence variations will be removed. These can be
obtained from a reference collection of variation data (such as `dbSNP
<https://www.ncbi.nlm.nih.gov/projects/SNP/get_html.cgi?whichHtml=overview>`_) or derived directly from the RNAseq
reads.
Parameters
----------
Expand All @@ -293,23 +317,3 @@ def _load_vcf(vcf_file: str | Path) -> pysam.libcbcf.VariantFile:
return pysam.VariantFile(vcf_file)
except FileNotFoundError as e:
raise e


def _load_tbi(tbi_file: str | Path) -> pysam.libcbcf.VariantFile:
    """
    Open a ``.tbi`` file with ``pysam.VariantFile``.

    Parameters
    ----------
    tbi_file : str | Path
        Location of the ``.tbi`` file to open, given either as a string or a pathlib Path.

    Returns
    -------
    pysam.libcbcf.VariantFile
        The object created by ``pysam.VariantFile`` for ``tbi_file``.

    Raises
    ------
    FileNotFoundError
        Re-raised unchanged if ``pysam.VariantFile`` raises it.

    Notes
    -----
    NOTE(review): ``.tbi`` files are presumably tabix indexes rather than variant data; confirm that
    ``pysam.VariantFile`` is the intended reader for them.
    """
    try:
        opened = pysam.VariantFile(tbi_file)
    except FileNotFoundError as missing:
        # Propagate the error untouched so callers see the original exception.
        raise missing
    return opened
20 changes: 0 additions & 20 deletions tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,26 +200,6 @@ def test_load_gtf(file_path: str | Path, object_type: str) -> None:
assert isinstance(gtf_file, object_type)


@pytest.mark.xfail(reason="File not in correct format.")
@pytest.mark.parametrize(
    ("file_path", "object_type", "compression", "is_remote"),
    [
        pytest.param(
            RESOURCES / "vcf" / "d0.vcf.gz.tbi",
            pysam.libcbcf.VariantFile,
            "BGZF",
            False,
            id="d0 tbi as Path",
        ),
        pytest.param(
            "tests/resources/vcf/d0.vcf.gz.tbi",
            pysam.libcbcf.VariantFile,
            "BGZF",
            False,
            id="d0 tbi as str",
        ),
    ],
)
def test_load_tbi(
    file_path: str | Path,
    object_type: pysam.libcbcf.VariantFile,
    compression: str,
    is_remote: bool,
) -> None:
    """Check the type, compression and remote flag of the object returned by ``io._load_tbi``."""
    # The same file is supplied once as a Path and once as a plain string.
    loaded = io._load_tbi(file_path)
    assert isinstance(loaded, object_type)
    assert loaded.compression == compression
    assert loaded.is_remote == is_remote


@pytest.mark.parametrize(
("file_path", "object_type", "compression", "is_remote"),
[
Expand Down

0 comments on commit 1541287

Please sign in to comment.