From ba68c3ae21039036286ce2c64aa77632bee675a2 Mon Sep 17 00:00:00 2001 From: Neil Shephard Date: Mon, 9 Dec 2024 11:39:20 +0000 Subject: [PATCH] doc(io): Improve docstrings of io functions Short descriptions for main file types added. Removes method to load `.tbi` files for now, test was failing anyway, suspect these are intermediary index files. --- isoslam/io.py | 58 ++++++++++++++++++++++++++---------------------- tests/test_io.py | 20 ----------------- 2 files changed, 31 insertions(+), 47 deletions(-) diff --git a/isoslam/io.py b/isoslam/io.py index c476c20..768454d 100644 --- a/isoslam/io.py +++ b/isoslam/io.py @@ -162,7 +162,21 @@ def create_config(args: argparse.Namespace | None = None) -> None: def load_file(file_path: str | Path) -> Any: """ - Load files. + Load files of different types. + + Supports the following file types... + + +----------+--------------------------------------------------------------------------+ + | File | Description | + +==========+==========================================================================+ + | ``.bam`` | The sequence data that is to be analysed. | + +----------+--------------------------------------------------------------------------+ + | ``.bed`` | The locations of introns/splice junctions. | + +----------+--------------------------------------------------------------------------+ + | ``.gtf`` | Transcript structures from which the ``.bed`` file is derived. | + +----------+--------------------------------------------------------------------------+ + | ``.vcf`` | Locations of known sequence differences from the reference sequence. 
| + +----------+--------------------------------------------------------------------------+ Parameters ---------- @@ -206,8 +220,6 @@ def _get_loader(file_ext: str = "bam") -> Callable: # type: ignore[type-arg] return _load_bed if file_ext == ".gtf": return _load_gtf - if file_ext == ".tbi": - return _load_tbi if file_ext == ".vcf" or file_ext == ".vcf.gz": return _load_vcf raise ValueError(file_ext) @@ -215,7 +227,9 @@ def _get_loader(file_ext: str = "bam") -> Callable: # type: ignore[type-arg] def _load_bam(bam_file: str | Path) -> pysam.libcalignmentfile.AlignmentFile: """ - Load '.bam' file. + Load ``.bam`` file. + + ``.bam`` files are the sequence data that is to be analysed. Parameters ---------- @@ -235,7 +249,9 @@ def _load_bam(bam_file: str | Path) -> pysam.libcalignmentfile.AlignmentFile: def _load_bed(bed_file: str | Path) -> TextIO: """ - Open '.bed' file for reading, supports gzip compressed formats. + Open ``.bed`` file for reading, supports gzip compressed formats. + + ``.bed`` files contain the locations of introns/splice junctions. Parameters ---------- @@ -257,7 +273,9 @@ def _load_bed(bed_file: str | Path) -> TextIO: def _load_gtf(gtf_file: str | Path) -> pysam.libctabix.tabix_generic_iterator: """ - Load '.gtf' file and return as an iterable. + Load ``.gtf`` file and return as an iterable. + + ``.gtf`` files contain the transcript structures from which the ``.bed`` file is derived. Parameters ---------- @@ -277,7 +295,13 @@ def _load_gtf(gtf_file: str | Path) -> pysam.libctabix.tabix_generic_iterator: def _load_vcf(vcf_file: str | Path) -> pysam.libcbcf.VariantFile: """ - Load '.vcf' file. + Load ``.vcf`` file. + + ``.vcf`` files contain the locations of known sequence differences from the reference sequence. Any ``T > C`` + (e.g. SNPs) conversions that match the location of known sequence variations will be removed. 
These can be + obtained from a reference collection of variation data (such as `dbSNP + <https://www.ncbi.nlm.nih.gov/snp/>`_) or derived directly from the RNAseq + reads. Parameters ---------- @@ -293,23 +317,3 @@ def _load_vcf(vcf_file: str | Path) -> pysam.libcbcf.VariantFile: return pysam.VariantFile(vcf_file) except FileNotFoundError as e: raise e - - -def _load_tbi(tbi_file: str | Path) -> pysam.libcbcf.VariantFile: - """ - Load '.tbi' file. - - Parameters - ---------- - tbi_file : str | Path - Path, as string or pathlib Path, to a '.tbi' file that is to be loaded. - - Returns - ------- - pysam.libcbcf.VariantFile - Loads the specified TBI file. - """ - try: - return pysam.VariantFile(tbi_file) - except FileNotFoundError as e: - raise e diff --git a/tests/test_io.py b/tests/test_io.py index 3e093b8..9ebfc33 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -200,26 +200,6 @@ def test_load_gtf(file_path: str | Path, object_type: str) -> None: assert isinstance(gtf_file, object_type) -@pytest.mark.xfail(reason="File not in correct format.") -@pytest.mark.parametrize( - ("file_path", "object_type", "compression", "is_remote"), - [ - pytest.param( - RESOURCES / "vcf" / "d0.vcf.gz.tbi", pysam.libcbcf.VariantFile, "BGZF", False, id="d0 tbi as Path" - ), - pytest.param("tests/resources/vcf/d0.vcf.gz.tbi", pysam.libcbcf.VariantFile, "BGZF", False, id="d0 tbi as str"), - ], -) -def test_load_tbi( - file_path: str | Path, object_type: pysam.libcbcf.VariantFile, compression: str, is_remote: bool -) -> None: - """Test loading of tbi file.""" - tbi_file = io._load_tbi(file_path) - assert isinstance(tbi_file, object_type) - assert tbi_file.compression == compression - assert tbi_file.is_remote == is_remote - - @pytest.mark.parametrize( ("file_path", "object_type", "compression", "is_remote"), [