Skip to content

Commit

Permalink
feat: Primer3.__init__() takes a list of VCF paths or a pre-construct…
Browse files Browse the repository at this point in the history
…ed VariantLookup
  • Loading branch information
emmcauley committed Oct 14, 2024
1 parent 0a83439 commit 8d049f5
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 64 deletions.
4 changes: 0 additions & 4 deletions prymer/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
from prymer.api.variant_lookup import VariantLookup
from prymer.api.variant_lookup import VariantOverlapDetector
from prymer.api.variant_lookup import VariantType
from prymer.api.variant_lookup import cached
from prymer.api.variant_lookup import disk_based

__all__ = [
"ClusteredIntervals",
Expand All @@ -38,6 +36,4 @@
"VariantLookup",
"FileBasedVariantLookup",
"VariantOverlapDetector",
"cached",
"disk_based",
]
54 changes: 27 additions & 27 deletions prymer/api/variant_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,11 @@
Two concrete implementations are provided that both take a list of VCF files to be queried:
- [`FileBasedVariantLookup`][prymer.api.variant_lookup.FileBasedVariantLookup] -- performs
disk-based retrieval of variants (using a VCF index). This class is recommended for large VCFs. The
[`disk_based()`][prymer.api.variant_lookup.disk_based] alternative constructor is
provided for easy construction of this object.
disk-based retrieval of variants (using a VCF index). This class is recommended for large VCFs.
- [`VariantOverlapDetector`][prymer.api.variant_lookup.VariantOverlapDetector] -- reads in
variants into memory and uses an
[`pybedlite.overlap_detector.OverlapDetector`](https://pybedlite.readthedocs.io/en/latest/api.html#pybedlite.overlap_detector.OverlapDetector)
for querying. This class is recommended for small VCFs. The
[`cached()`][prymer.api.variant_lookup.cached] alternative constructor is provided for
easy construction of this object.
for querying. This class is recommended for small VCFs.
Each class can also use minor allele frequency (MAF) to filter variants.
Expand All @@ -28,7 +24,7 @@
```python
>>> from pathlib import Path
>>> lookup = cached(vcf_paths=[Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.00, include_missing_mafs=True)
>>> lookup = VariantLookup.build(vcf_paths=[Path("./tests/api/data/miniref.variants.vcf.gz")], use_cache=True,min_maf=0.00, include_missing_mafs=True)
>>> lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
>>> variants = lookup.query(refname="chr2", start=7999, end=9900)
Expand Down Expand Up @@ -254,6 +250,30 @@ def __init__(
self.min_maf: Optional[float] = min_maf
self.include_missing_mafs: bool = include_missing_mafs

@classmethod
def build(
cls,
vcf_paths: list[Path],
use_cache: Optional[bool] = True,
min_maf: Optional[float] = 0.00,
include_missing_mafs: Optional[bool] = False,
) -> "VariantLookup":
"""Constructs a `VariantLookup` object based on size of VCF(s) to query.
If `use_cache` is True, return a `VariantOverlapDetector` that caches all variants in
memory for fast lookup (appropriate for small VCFs). Otherwise, return a
`FileBasedVariantLookup` that queries indexed VCFs on disk for each lookup (appropriate
for larger VCFs).
"""
if use_cache:
return VariantOverlapDetector(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
return FileBasedVariantLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)

@final
def query(
self,
Expand Down Expand Up @@ -427,23 +447,3 @@ def calc_maf_from_filter(variant: pysam.VariantRecord) -> Optional[float]:
maf = num_alt / len(gts)

return maf


def cached(
vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False
) -> VariantOverlapDetector:
"""Constructs a `VariantLookup` that caches all variants in memory for fast lookup.
Appropriate for small VCFs."""
return VariantOverlapDetector(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)


def disk_based(
vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False
) -> FileBasedVariantLookup:
"""Constructs a `VariantLookup` that queries indexed VCFs on disk for each lookup.
Appropriate for large VCFs."""
return FileBasedVariantLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
46 changes: 20 additions & 26 deletions prymer/primer3/primer3.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
>>> from prymer.api.variant_lookup import VariantLookup, VariantOverlapDetector
>>> genome_fasta = Path("./tests/primer3/data/miniref.fa")
>>> genome_vcf = Path("./tests/primer3/data/miniref.variants.vcf.gz")
>>> designer = Primer3(genome_fasta=genome_fasta, list_of_vcfs=[genome_vcf], use_file_based_lookup=True)
>>> designer = Primer3(genome_fasta=genome_fasta, vcfs_or_lookup=[genome_vcf], use_cache=True)
```
Expand Down Expand Up @@ -143,11 +143,8 @@
from prymer.api.primer_pair import PrimerPair
from prymer.api.span import Span
from prymer.api.span import Strand
from prymer.api.variant_lookup import FileBasedVariantLookup
from prymer.api.variant_lookup import SimpleVariant
from prymer.api.variant_lookup import VariantOverlapDetector
from prymer.api.variant_lookup import cached
from prymer.api.variant_lookup import disk_based
from prymer.api.variant_lookup import VariantLookup
from prymer.primer3.primer3_failure_reason import Primer3FailureReason
from prymer.primer3.primer3_input import Primer3Input
from prymer.primer3.primer3_input_tag import Primer3InputTag
Expand Down Expand Up @@ -230,17 +227,18 @@ def __init__(
self,
genome_fasta: Path,
executable: Optional[str] = None,
list_of_vcfs: Optional[list[Path]] = None,
use_file_based_lookup: bool = True,
vcfs_or_lookup: Optional[list[Path] | VariantLookup] = None,
use_cache: bool = False,
min_maf: float = 0.0,
include_missing_mafs: bool = False,
) -> None:
"""
Args:
genome_fasta: Path to reference genome .fasta file
executable: string representation of the path to primer3_core
list_of_vcfs: an optional list of VCF files with which to hard-mask variants
use_file_based_lookup: whether to use a file-based `VariantLookup`
vcfs_or_lookup: an optional list of VCF files or `VariantLookup` object
with which to hard-mask variants
use_cache: whether to use an in-memory `VariantLookup` for fast querying
min_maf: an optional minimum Minor Allele Frequency with which to filter a list of
VCF files (return only variants with at least this minor allele frequency)
include_missing_mafs: when filtering variants with a minor allele frequency,
Expand All @@ -255,9 +253,18 @@ def __init__(
executable="primer3_core" if executable is None else executable
)
command: list[str] = [f"{executable_path}"]
if vcfs_or_lookup is None or isinstance(vcfs_or_lookup, VariantLookup):
self.vcfs_or_lookup = vcfs_or_lookup
elif isinstance(vcfs_or_lookup, list) and all(
isinstance(path, Path) for path in vcfs_or_lookup
): # if provided path, create appropriate `VariantLookup`
self.vcfs_or_lookup = VariantLookup.build(
vcf_paths=vcfs_or_lookup,
use_cache=use_cache,
min_maf=min_maf,
include_missing_mafs=include_missing_mafs,
)

self.list_of_vcfs = list_of_vcfs
self.use_file_based_lookup = use_file_based_lookup
self.min_maf = min_maf
self.include_missing_mafs = include_missing_mafs
self._fasta = pysam.FastaFile(filename=f"{genome_fasta}")
Expand Down Expand Up @@ -299,24 +306,11 @@ def get_design_sequences(self, region: Span) -> tuple[str, str]:
soft_masked = self._fasta.fetch(
reference=region.refname, start=region.start - 1, end=region.end
)
if self.list_of_vcfs is None:
if self.vcfs_or_lookup is None:
hard_masked = soft_masked
return soft_masked, hard_masked
variant_lookup: Union[FileBasedVariantLookup, VariantOverlapDetector]
if self.use_file_based_lookup is True:
variant_lookup = disk_based(
vcf_paths=self.list_of_vcfs,
min_maf=self.min_maf,
include_missing_mafs=self.include_missing_mafs,
)
else:
variant_lookup = cached(
vcf_paths=self.list_of_vcfs,
min_maf=self.min_maf,
include_missing_mafs=self.include_missing_mafs,
)

overlapping_variants: list[SimpleVariant] = variant_lookup.query(
overlapping_variants: list[SimpleVariant] = self.vcfs_or_lookup.query(
refname=region.refname, start=region.start, end=region.end
)
positions: list[int] = []
Expand Down
14 changes: 8 additions & 6 deletions tests/api/test_variant_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@
from prymer.api.variant_lookup import VariantLookup
from prymer.api.variant_lookup import VariantOverlapDetector
from prymer.api.variant_lookup import VariantType
from prymer.api.variant_lookup import cached
from prymer.api.variant_lookup import calc_maf_from_filter
from prymer.api.variant_lookup import disk_based


@pytest.mark.parametrize(
Expand Down Expand Up @@ -451,17 +449,21 @@ def test_simple_variant_conversion_logs(
def test_missing_index_file_raises(temp_missing_path: Path) -> None:
"""Test that both VariantLookup objects raise an error with a missing index file."""
with pytest.raises(ValueError, match="Cannot perform fetch with missing index file for VCF"):
disk_based(vcf_paths=[temp_missing_path], min_maf=0.01, include_missing_mafs=False)
FileBasedVariantLookup(
vcf_paths=[temp_missing_path], min_maf=0.01, include_missing_mafs=False
)
with pytest.raises(ValueError, match="Cannot perform fetch with missing index file for VCF"):
cached(vcf_paths=[temp_missing_path], min_maf=0.01, include_missing_mafs=False)
VariantOverlapDetector(
vcf_paths=[temp_missing_path], min_maf=0.01, include_missing_mafs=False
)


def test_missing_vcf_files_raises() -> None:
"""Test that an error is raised when no VCF_paths are provided."""
with pytest.raises(ValueError, match="No VCF paths given to query"):
disk_based(vcf_paths=[], min_maf=0.01, include_missing_mafs=False)
FileBasedVariantLookup(vcf_paths=[], min_maf=0.01, include_missing_mafs=False)
with pytest.raises(ValueError, match="No VCF paths given to query"):
cached(vcf_paths=[], min_maf=0.01, include_missing_mafs=False)
VariantOverlapDetector(vcf_paths=[], min_maf=0.01, include_missing_mafs=False)


@pytest.mark.parametrize("random_seed", [1, 10, 100, 1000, 10000])
Expand Down
40 changes: 39 additions & 1 deletion tests/primer3/test_primer3.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
import pytest
from fgpyo.sequence import reverse_complement

from prymer.api import FileBasedVariantLookup
from prymer.api import MinOptMax
from prymer.api import Primer
from prymer.api import PrimerPair
from prymer.api import Span
from prymer.api import Strand
from prymer.api import VariantLookup
from prymer.api import VariantOverlapDetector
from prymer.primer3 import DesignLeftPrimersTask
from prymer.primer3 import DesignPrimerPairsTask
from prymer.primer3 import DesignRightPrimersTask
Expand Down Expand Up @@ -380,7 +383,7 @@ def test_variant_lookup(
expected_soft_masked: str,
) -> None:
"""Test that MAF filtering and masking are working as expected."""
with Primer3(genome_fasta=genome_ref, list_of_vcfs=[vcf_path], min_maf=0.01) as designer:
with Primer3(genome_fasta=genome_ref, vcfs_or_lookup=[vcf_path], min_maf=0.01) as designer:
actual_soft_masked, actual_hard_masked = designer.get_design_sequences(region=region)
assert actual_hard_masked == expected_hard_masked
assert actual_soft_masked == expected_soft_masked
Expand All @@ -392,6 +395,41 @@ def test_variant_lookup(
assert actual_soft_masked == expected_soft_masked


@pytest.mark.parametrize(
"use_cache,expected_var_lookup_type",
[(True, VariantOverlapDetector), (False, FileBasedVariantLookup)],
)
def test_variant_lookup_construction(
genome_ref: Path, vcf_path: Path, use_cache: bool, expected_var_lookup_type: VariantLookup
) -> None:
"""Test that, given a list of VCFs, Primer3 init() correctly sets
`self.vcfs_or_lookup` as expected."""

with Primer3(
genome_fasta=genome_ref, vcfs_or_lookup=[vcf_path], min_maf=0.01, use_cache=use_cache
) as designer:
assert type(designer.vcfs_or_lookup) is expected_var_lookup_type


def test_variant_lookup_preconstruction(genome_ref: Path, vcf_path: Path) -> None:
"""Test that Primer3.__init__() triggers correct VariantLookup.build()."""
with Primer3(
genome_fasta=genome_ref,
vcfs_or_lookup=FileBasedVariantLookup(
vcf_paths=[vcf_path], min_maf=0.01, include_missing_mafs=False
),
) as designer:
assert type(designer.vcfs_or_lookup) is FileBasedVariantLookup

with Primer3(
genome_fasta=genome_ref,
vcfs_or_lookup=VariantOverlapDetector(
vcf_paths=[vcf_path], min_maf=0.01, include_missing_mafs=False
),
) as designer:
assert type(designer.vcfs_or_lookup) is VariantOverlapDetector


def test_screen_pair_results(
valid_primer_pairs: list[PrimerPair], genome_ref: Path, pair_primer_params: Primer3Parameters
) -> None:
Expand Down

0 comments on commit 8d049f5

Please sign in to comment.