Skip to content

Commit

Permalink
Add fetch() to genome object
Browse files Browse the repository at this point in the history
  • Loading branch information
casblaauw committed Dec 3, 2024
1 parent 61c080a commit 192c882
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 3 deletions.
37 changes: 37 additions & 0 deletions src/crested/_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pysam import FastaFile

import crested._conf as conf
from crested.utils import reverse_complement


class Genome:
Expand Down Expand Up @@ -146,6 +147,42 @@ def name(self) -> str:
return basename
return self._name

def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -> str:
"""
Fetch a sequence from a genomic region.
Start and end denote 0-based, half-open intervals, following the bed convention.
Parameters
----------
chrom
The chromosome of the region to extract.
start
The start of the region to extract. Assumes 0-indexed positions.
end
The end of the region to extract, exclusive.
strand
The strand of the region. If '-', the sequence is reverse-complemented. Default is "+".
region
Alternatively, a region string to parse. If supplied together with chrom/start/end, explicit coordinates take priority.
Returns
-------
The requested sequence, as a string.
"""
if region and any(chrom, start, end):
logger.warning("Both region and chrom/start/end supplied. Using chrom/start/end...")
elif region:
if region[-2] == ":":
chrom, start_end, strand = region.split(":")
else:
chrom, start_end = region.split(":")
start, end = map(int, start_end.split("-"))
seq = self.fasta.fetch(reference=chrom, start=start, end=end)
if strand == "-":
return reverse_complement(seq)
else:
return seq

def register_genome(genome: Genome):
"""
Expand Down
4 changes: 1 addition & 3 deletions src/crested/utils/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,9 +350,7 @@ def fetch_sequences(
genome = _resolve_genome(genome)
seqs = []
for region in regions:
chrom, start_end = region.split(":")
start, end = start_end.split("-")
seq = genome.fasta.fetch(chrom, int(start), int(end))
seq = genome.fetch(region = region)
if uppercase:
seq = seq.upper()
seqs.append(seq)
Expand Down
42 changes: 42 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,45 @@ def test_import_beds_with_genome(genome):
assert all(
warning_text_filtered not in msg for msg in warning_messages
), "Warning about filtered regions was unexpectedly raised."

def test_genome_fetch(genome):
"""Test reading the genome."""
import crested

fasta_file = genome
genome = crested.Genome(
fasta=fasta_file,
chrom_sizes="tests/data/test.chrom.sizes",
)
seq = genome.fetch('chr1', 10000, 10100)
assert len(seq) == 100


def test_genome_fetch_region(genome):
"""Test reading the genome with a region string."""
import crested

fasta_file = genome
genome = crested.Genome(
fasta=fasta_file,
chrom_sizes="tests/data/test.chrom.sizes",
)
seq1 = genome.fetch('chr1', 10000, 10100)
seq2 = genome.fetch(region = 'chr1:10000-10100')
assert seq1 == seq2

def test_genome_fetch_reverse(genome):
"""Test reading the genome on the negative strand."""
import crested

fasta_file = genome
genome = crested.Genome(
fasta=fasta_file,
chrom_sizes="tests/data/test.chrom.sizes",
)
seq_forward = genome.fetch('chr1', 10000, 10100)
seq_rev = genome.fetch('chr1', 10000, 10100, "-")
seq_rev_region = genome.fetch(region = 'chr1:10000-10100:-')
assert seq_rev == crested.utils.reverse_complement(seq_forward)
assert seq_rev_region == seq_rev

0 comments on commit 192c882

Please sign in to comment.