diff --git a/isoslam/isoslam.py b/isoslam/isoslam.py
index dd20fed..843cac7 100644
--- a/isoslam/isoslam.py
+++ b/isoslam/isoslam.py
@@ -4,22 +4,25 @@
 from pathlib import Path
 from typing import Any
 
+from loguru import logger
+
 from isoslam import io
 
 
 def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int, int, Any, Any]]]:
     """
-    Extract features from `.bed` file and return as a dictionary indexed by transcript_id.
+    Extract features from ``.bed`` file and return as a dictionary indexed by ``transcript_id``.
 
     Parameters
     ----------
     bed_file : str | Path
-        Path, as string or pathlib Path, to a `.bed` file.
+        Path, as string or pathlib Path, to a ``.bed`` file.
 
     Returns
     -------
     dict[Any, list[tuple[Any, int, int, Any, Any]]]
-        Nested dictionary of chromosome, start, end and bedstrand indexed by transcript_id.
+        Dictionary of ``chromosome``, ``start``, ``end``, ``transcript_id`` and ``bedstrand`` indexed by
+        ``transcript_id``.
     """
     coordinates = defaultdict(list)
     for line in io.load_file(bed_file):
@@ -34,4 +37,31 @@ def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int,
                 contents[5],
             )
         )
+    logger.info(f"Extracted features from : {bed_file}")
     return coordinates
+
+
+def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, Any], defaultdict[Any, list[Any]]]:
+    """
+    Extract strand and transcript ID data from ``.gtf`` file.
+
+    Parameters
+    ----------
+    gtf_file : str | Path
+        Path, as string or pathlib Path, to a ``.gtf`` file.
+
+    Returns
+    -------
+    tuple[defaultdict[Any, Any], defaultdict[Any, list[Any]]]
+        Two dictionaries keyed by ``gene_id``: the first holds the ``strand``, the second the list of
+        ``transcript_id`` values, both taken only from entries whose feature is ``transcript``.
+    """
+    strand = defaultdict(str)
+    transcript = defaultdict(list)
+    for entry in io.load_file(gtf_file):
+        if entry.feature != "transcript":
+            continue
+        strand[entry.gene_id] = entry.strand
+        transcript[entry.gene_id].append(entry.transcript_id)
+    logger.info(f"Extracted features from : {gtf_file}")
+    return (strand, transcript)
diff --git a/tests/test_isoslam.py b/tests/test_isoslam.py
index c93fd43..985c56e 100644
--- a/tests/test_isoslam.py
+++ b/tests/test_isoslam.py
@@ -40,3 +40,23 @@ def test_isoslam_extract_transcripts(
 ) -> None:
     """Test extraction of tanscript data from bed file using extract_transcripts()."""
     assert isoslam.extract_transcripts(bed_file) == expected_transcript
+
+
+@pytest.mark.parametrize(
+    ("gtf_file", "expected_strand", "expected_transcript"),
+    [
+        pytest.param(  # type: ignore[misc]
+            RESOURCES / "gtf" / "test_wash1.gtf",
+            {"MSTRG.63147": "-"},
+            {"MSTRG.63147": ["ENST00000442898"]},
+            id="gtf file as Path",
+        ),
+    ],
+)
+def test_extract_strand_transcript(
+    gtf_file: str | Path, expected_strand: dict[Any, Any], expected_transcript: dict[Any, Any]
+) -> None:
+    """Test extraction of strand and transcript from gtf file using extract_strand_transcript()."""
+    strand, transcript = isoslam.extract_strand_transcript(gtf_file)
+    assert strand == expected_strand
+    assert transcript == expected_transcript