diff --git a/src/crested/_genome.py b/src/crested/_genome.py index b40ea5c..8d17329 100644 --- a/src/crested/_genome.py +++ b/src/crested/_genome.py @@ -10,6 +10,7 @@ from pysam import FastaFile import crested._conf as conf +from crested.utils import reverse_complement class Genome: @@ -146,6 +147,42 @@ def name(self) -> str: return basename return self._name + def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -> str: + """ + Fetch a sequence from a genomic region. + + Start and end denote 0-based, half-open intervals, following the bed convention. + + Parameters + ---------- + chrom + The chromosome of the region to extract. + start + The start of the region to extract. Assumes 0-indexed positions. + end + The end of the region to extract, exclusive. + strand + The strand of the region. If '-', the sequence is reverse-complemented. Default is "+". + region + Alternatively, a region string to parse. If supplied together with chrom/start/end, explicit coordinates take priority. + + Returns + ------- + The requested sequence, as a string. + """ + if region and any(chrom, start, end): + logger.warning("Both region and chrom/start/end supplied. Using chrom/start/end...") + elif region: + if region[-2] == ":": + chrom, start_end, strand = region.split(":") + else: + chrom, start_end = region.split(":") + start, end = map(int, start_end.split("-")) + seq = self.fasta.fetch(reference=chrom, start=start, end=end) + if strand == "-": + return reverse_complement(seq) + else: + return seq def register_genome(genome: Genome): """ diff --git a/src/crested/utils/_utils.py b/src/crested/utils/_utils.py index 3c1e321..844a74b 100644 --- a/src/crested/utils/_utils.py +++ b/src/crested/utils/_utils.py @@ -350,9 +350,7 @@ def fetch_sequences( genome = _resolve_genome(genome) seqs = [] for region in regions: - chrom, start_end = region.split(":") - start, end = start_end.split("-") - seq = genome.fasta.fetch(chrom, int(start), int(end)) + seq = genome.fetch(region = region) if uppercase: seq = seq.upper() seqs.append(seq) diff --git a/tests/test_data.py b/tests/test_data.py index 8745f34..4a00a22 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -105,3 +105,45 @@ def test_import_beds_with_genome(genome): assert all( warning_text_filtered not in msg for msg in warning_messages ), "Warning about filtered regions was unexpectedly raised." + +def test_genome_fetch(genome): + """Test reading the genome.""" + import crested + + fasta_file = genome + genome = crested.Genome( + fasta=fasta_file, + chrom_sizes="tests/data/test.chrom.sizes", + ) + seq = genome.fetch('chr1', 10000, 10100) + assert len(seq) == 100 + + +def test_genome_fetch_region(genome): + """Test reading the genome with a region string.""" + import crested + + fasta_file = genome + genome = crested.Genome( + fasta=fasta_file, + chrom_sizes="tests/data/test.chrom.sizes", + ) + seq1 = genome.fetch('chr1', 10000, 10100) + seq2 = genome.fetch(region = 'chr1:10000-10100') + assert seq1 == seq2 + +def test_genome_fetch_reverse(genome): + """Test reading the genome on the negative strand.""" + import crested + + fasta_file = genome + genome = crested.Genome( + fasta=fasta_file, + chrom_sizes="tests/data/test.chrom.sizes", + ) + seq_forward = genome.fetch('chr1', 10000, 10100) + seq_rev = genome.fetch('chr1', 10000, 10100, "-") + seq_rev_region = genome.fetch(region = 'chr1:10000-10100:-') + assert seq_rev == crested.utils.reverse_complement(seq_forward) + assert seq_rev_region == seq_rev +