Skip to content

Commit

Permalink
feature: extract strand and transcript from gtf file
Browse files Browse the repository at this point in the history
Closes #87

Function and test to extract `strand` and `transcript_id` to dictionaries indexed by `gene_id`.
  • Loading branch information
ns-rse committed Nov 29, 2024
1 parent 099e89f commit 8f69539
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 3 deletions.
32 changes: 29 additions & 3 deletions isoslam/isoslam.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,18 @@

def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int, int, Any, Any]]]:
"""
Extract features from `.bed` file and return as a dictionary indexed by transcript_id.
Extract features from ``.bed`` file and return as a dictionary indexed by ``transcript_id``.
Parameters
----------
bed_file : str | Path
Path, as string or pathlib Path, to a `.bed` file.
Path, as string or pathlib Path, to a ``.bed`` file.
Returns
-------
dict[Any, list[tuple[Any, int, int, Any, Any]]]
Nested dictionary of chromosome, start, end and bedstrand indexed by transcript_id.
Dictionary of ``chromosome``, ``start``, ``end``, ``transcript_id`` and ``bedstrand`` indexed by
``transcript_id``.
"""
coordinates = defaultdict(list)
for line in io.load_file(bed_file):
Expand All @@ -35,3 +36,28 @@ def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int,
)
)
return coordinates


def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, Any], defaultdict[Any, list[Any]]]:
    """
    Extract strand and transcript ID data from ``.gtf`` file.

    Only entries whose ``feature`` is ``transcript`` contribute to the result, all other entries are skipped.

    Parameters
    ----------
    gtf_file : str | Path
        Path, as string or pathlib Path, to a ``.gtf`` file.

    Returns
    -------
    tuple[defaultdict[Any, Any], defaultdict[Any, list[Any]]]
        Two dictionaries are returned, both using the ``gene_id`` as the key. The first holds the ``strand`` (if a
        gene has several transcript entries the strand of the last one read is kept), the second holds a list of
        every ``transcript_id`` seen for the gene, in the order they were read. Both are ``defaultdict`` instances,
        so looking up an unknown ``gene_id`` inserts and returns an empty string / empty list rather than raising
        ``KeyError``.
    """
    strand: defaultdict[Any, Any] = defaultdict(str)
    transcript: defaultdict[Any, list[Any]] = defaultdict(list)
    for entry in io.load_file(gtf_file):
        if entry.feature != "transcript":
            continue
        # One strand per gene (overwritten on each transcript entry), but every transcript ID is accumulated.
        strand[entry.gene_id] = entry.strand
        transcript[entry.gene_id].append(entry.transcript_id)
    return (strand, transcript)
22 changes: 22 additions & 0 deletions tests/test_isoslam.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,25 @@ def test_isoslam_extract_transcripts(
) -> None:
"""Test extraction of tanscript data from bed file using extract_transcripts()."""
assert isoslam.extract_transcripts(bed_file) == expected_transcript


@pytest.mark.parametrize(
    ("gtf_file", "expected_strand", "expected_transcript"),
    [
        pytest.param(  # type: ignore[misc]
            RESOURCES / "gtf" / "test_wash1.gtf",
            {"MSTRG.63147": "-"},
            {"MSTRG.63147": ["ENST00000442898"]},
            id="gtf file as Path",
        ),
    ],
)
def test_extract_strand_transcript(
    gtf_file: str | Path, expected_strand: dict[Any, Any], expected_transcript: dict[Any, Any]
) -> None:
    """Test extraction of strand and transcript from gtf file using extract_strand_transcript()."""
    strand, transcript = isoslam.extract_strand_transcript(gtf_file)
    # The function returns defaultdicts; they compare equal to plain dicts with the same items.
    assert strand == expected_strand
    assert transcript == expected_transcript

0 comments on commit 8f69539

Please sign in to comment.