feature: extract strand and transcript from gtf file

Closes #87. Adds a function and test to extract `strand` and `transcript_id` into dictionaries indexed by `gene_id`.
sudlab · Nov 29, 2024 · 8f69539 · 8f69539
1 parent 099e89f
commit 8f69539
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 3 deletions.
diff --git a/isoslam/isoslam.py b/isoslam/isoslam.py
@@ -9,17 +9,18 @@
 
 def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int, int, Any, Any]]]:
     """
-    Extract features from `.bed` file and return as a dictionary indexed by transcript_id.
+    Extract features from ``.bed`` file and return as a dictionary indexed by ``transcript_id``.
 
     Parameters
     ----------
     bed_file : str | Path
-        Path, as string or pathlib Path, to a `.bed` file.
+        Path, as string or pathlib Path, to a ``.bed`` file.
 
     Returns
     -------
     dict[Any, list[tuple[Any, int, int, Any, Any]]]
-        Nested dictionary of chromosome, start, end and bedstrand indexed by transcript_id.
+        Dictionary of ``chromosome``, ``start``, ``end``, ``transcript_id`` and ``bedstrand`` indexed by
+        ``transcript_id``.
     """
     coordinates = defaultdict(list)
     for line in io.load_file(bed_file):
@@ -35,3 +36,28 @@ def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int,
             )
         )
     return coordinates
+
+
+def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, Any], defaultdict[Any, list[Any]]]:
+    """
+    Extract strand and transcript ID data from ``.gtf`` file.
+
+    Parameters
+    ----------
+    gtf_file : str | Path
+        Path, as string or pathlib Path, to a ``.gtf`` file.
+
+    Returns
+    -------
+    tuple[defaultdict[Any, Any], defaultdict[Any, list[Any]]]
+        Two dictionaries keyed by ``gene_id``: the first maps each gene to its ``strand``, the second maps
+        each gene to a list of its ``transcript_id`` values.
+    """
+    strand = defaultdict(str)
+    transcript = defaultdict(list)
+    for entry in io.load_file(gtf_file):
+        if not entry.feature == "transcript":
+            continue
+        strand[entry.gene_id] = entry.strand
+        transcript[entry.gene_id].append(entry.transcript_id)
+    return (strand, transcript)
diff --git a/tests/test_isoslam.py b/tests/test_isoslam.py
@@ -40,3 +40,25 @@ def test_isoslam_extract_transcripts(
 ) -> None:
     """Test extraction of tanscript data from bed file using extract_transcripts()."""
     assert isoslam.extract_transcripts(bed_file) == expected_transcript
+
+
+@pytest.mark.parametrize(
+    ("gtf_file", "expected_strand", "expected_transcript"),
+    [
+        pytest.param(  # type: ignore[misc]
+            RESOURCES / "gtf" / "test_wash1.gtf",
+            {"MSTRG.63147": "-"},
+            {"MSTRG.63147": ["ENST00000442898"]},
+            id="gtf file as Path",
+        ),
+    ],
+)
+def test_extract_strand_transcript(
+    gtf_file: str | Path, expected_strand: dict[Any, Any], expected_transcript: dict[Any, Any]
+) -> None:
+    """Test extraction of strand and transcript from gtf file using extract_strand_transcript()."""
+    strand, transcript = isoslam.extract_strand_transcript(gtf_file)
+    print(f"{strand=}")
+    print(f"{transcript}")
+    assert strand == expected_strand
+    assert transcript == expected_transcript