From 8d049f585e34932da545be18b839797600214469 Mon Sep 17 00:00:00 2001 From: Erin McAuley Date: Mon, 14 Oct 2024 10:56:22 -0400 Subject: [PATCH] feat: Primer3.__init__() takes a list of VCF paths or a pre-constructed VariantLookup --- prymer/api/__init__.py | 4 --- prymer/api/variant_lookup.py | 54 ++++++++++++++++---------------- prymer/primer3/primer3.py | 46 ++++++++++++--------------- tests/api/test_variant_lookup.py | 14 +++++---- tests/primer3/test_primer3.py | 40 ++++++++++++++++++++++- 5 files changed, 94 insertions(+), 64 deletions(-) diff --git a/prymer/api/__init__.py b/prymer/api/__init__.py index bccf36c..3cb7ae7 100644 --- a/prymer/api/__init__.py +++ b/prymer/api/__init__.py @@ -16,8 +16,6 @@ from prymer.api.variant_lookup import VariantLookup from prymer.api.variant_lookup import VariantOverlapDetector from prymer.api.variant_lookup import VariantType -from prymer.api.variant_lookup import cached -from prymer.api.variant_lookup import disk_based __all__ = [ "ClusteredIntervals", @@ -38,6 +36,4 @@ "VariantLookup", "FileBasedVariantLookup", "VariantOverlapDetector", - "cached", - "disk_based", ] diff --git a/prymer/api/variant_lookup.py b/prymer/api/variant_lookup.py index 82f5833..583e3c0 100644 --- a/prymer/api/variant_lookup.py +++ b/prymer/api/variant_lookup.py @@ -10,15 +10,11 @@ Two concrete implementations are provided that both take a list of VCF files to be queried: - [`FileBasedVariantLookup`][prymer.api.variant_lookup.FileBasedVariantLookup] -- performs -disk-based retrieval of variants (using a VCF index). This class is recommended for large VCFs. The -[`disk_based()`][prymer.api.variant_lookup.disk_based] alternative constructor is -provided for easy construction of this object. +disk-based retrieval of variants (using a VCF index). This class is recommended for large VCFs. - [`VariantOverlapDetector`][prymer.api.variant_lookup.VariantOverlapDetector] -- reads in variants into memory and uses an [`pybedlite.overlap_detector.OverlapDetector`](https://pybedlite.readthedocs.io/en/latest/api.html#pybedlite.overlap_detector.OverlapDetector) -for querying. This class is recommended for small VCFs. The -[`cached()`][prymer.api.variant_lookup.cached] alternative constructor is provided for -easy construction of this object. +for querying. This class is recommended for small VCFs. Each class can also use minor allele frequency (MAF) to filter variants. @@ -28,7 +24,7 @@ ```python >>> from pathlib import Path ->>> lookup = cached(vcf_paths=[Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.00, include_missing_mafs=True) +>>> lookup = VariantLookup.build(vcf_paths=[Path("./tests/api/data/miniref.variants.vcf.gz")], use_cache=True,min_maf=0.00, include_missing_mafs=True) >>> lookup.query(refname="chr2", start=7999, end=8000) [SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='', end=8000, variant_type=, maf=None)] >>> variants = lookup.query(refname="chr2", start=7999, end=9900) @@ -254,6 +250,30 @@ def __init__( self.min_maf: Optional[float] = min_maf self.include_missing_mafs: bool = include_missing_mafs + @classmethod + def build( + cls, + vcf_paths: list[Path], + use_cache: Optional[bool] = True, + min_maf: Optional[float] = 0.00, + include_missing_mafs: Optional[bool] = False, + ) -> "VariantLookup": + """Constructs a `VariantLookup` object based on size of VCF(s) to query. + + If `use_cache` is True, return a `VariantOverlapDetector` that caches all variants in + memory for fast lookup (appropriate for small VCFs). Otherwise, return a + `FileBasedVariantLookup` that queries indexed VCFs on disk for each lookup (appropriate + for larger VCFs). + + """ + if use_cache: + return VariantOverlapDetector( + vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs + ) + return FileBasedVariantLookup( + vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs + ) + @final def query( self, @@ -427,23 +447,3 @@ def calc_maf_from_filter(variant: pysam.VariantRecord) -> Optional[float]: maf = num_alt / len(gts) return maf - - -def cached( - vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False -) -> VariantOverlapDetector: - """Constructs a `VariantLookup` that caches all variants in memory for fast lookup. - Appropriate for small VCFs.""" - return VariantOverlapDetector( - vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs - ) - - -def disk_based( - vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False -) -> FileBasedVariantLookup: - """Constructs a `VariantLookup` that queries indexed VCFs on disk for each lookup. - Appropriate for large VCFs.""" - return FileBasedVariantLookup( - vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs - ) diff --git a/prymer/primer3/primer3.py b/prymer/primer3/primer3.py index 86d57bc..de4c808 100644 --- a/prymer/primer3/primer3.py +++ b/prymer/primer3/primer3.py @@ -24,7 +24,7 @@ >>> from prymer.api.variant_lookup import VariantLookup, VariantOverlapDetector >>> genome_fasta = Path("./tests/primer3/data/miniref.fa") >>> genome_vcf = Path("./tests/primer3/data/miniref.variants.vcf.gz") ->>> designer = Primer3(genome_fasta=genome_fasta, list_of_vcfs=[genome_vcf], use_file_based_lookup=True) +>>> designer = Primer3(genome_fasta=genome_fasta, vcfs_or_lookup=[genome_vcf], use_cache=True) ``` @@ -143,11 +143,8 @@ from prymer.api.primer_pair import PrimerPair from prymer.api.span import Span from prymer.api.span import Strand -from prymer.api.variant_lookup import FileBasedVariantLookup from prymer.api.variant_lookup import SimpleVariant -from prymer.api.variant_lookup import VariantOverlapDetector -from prymer.api.variant_lookup import cached -from prymer.api.variant_lookup import disk_based +from prymer.api.variant_lookup import VariantLookup from prymer.primer3.primer3_failure_reason import Primer3FailureReason from prymer.primer3.primer3_input import Primer3Input from prymer.primer3.primer3_input_tag import Primer3InputTag @@ -230,8 +227,8 @@ def __init__( self, genome_fasta: Path, executable: Optional[str] = None, - list_of_vcfs: Optional[list[Path]] = None, - use_file_based_lookup: bool = True, + vcfs_or_lookup: Optional[list[Path] | VariantLookup] = None, + use_cache: bool = False, min_maf: float = 0.0, include_missing_mafs: bool = False, ) -> None: @@ -239,8 +236,9 @@ def __init__( Args: genome_fasta: Path to reference genome .fasta file executable: string representation of the path to primer3_core - list_of_vcfs: an optional list of VCF files with which to hard-mask variants - use_file_based_lookup: whether to use a file-based `VariantLookup` + vcfs_or_lookup: an optional list of VCF files or `VariantLookup` object + with which to hard-mask variants + use_cache: whether to use an in-memory `VariantLookup` for fast querying min_maf: an optional minimum Minor Allele Frequency with which to filter a list of VCF files (return only variants with at least this minor allele frequency) include_missing_mafs: when filtering variants with a minor allele frequency, @@ -255,9 +253,18 @@ def __init__( executable="primer3_core" if executable is None else executable ) command: list[str] = [f"{executable_path}"] + if vcfs_or_lookup is None or isinstance(vcfs_or_lookup, VariantLookup): + self.vcfs_or_lookup = vcfs_or_lookup + elif isinstance(vcfs_or_lookup, list) and all( + isinstance(path, Path) for path in vcfs_or_lookup + ): # if provided path, create appropriate `VariantLookup` + self.vcfs_or_lookup = VariantLookup.build( + vcf_paths=vcfs_or_lookup, + use_cache=use_cache, + min_maf=min_maf, + include_missing_mafs=include_missing_mafs, + ) - self.list_of_vcfs = list_of_vcfs - self.use_file_based_lookup = use_file_based_lookup self.min_maf = min_maf self.include_missing_mafs = include_missing_mafs self._fasta = pysam.FastaFile(filename=f"{genome_fasta}") @@ -299,24 +306,11 @@ def get_design_sequences(self, region: Span) -> tuple[str, str]: soft_masked = self._fasta.fetch( reference=region.refname, start=region.start - 1, end=region.end ) - if self.list_of_vcfs is None: + if self.vcfs_or_lookup is None: hard_masked = soft_masked return soft_masked, hard_masked - variant_lookup: Union[FileBasedVariantLookup, VariantOverlapDetector] - if self.use_file_based_lookup is True: - variant_lookup = disk_based( - vcf_paths=self.list_of_vcfs, - min_maf=self.min_maf, - include_missing_mafs=self.include_missing_mafs, - ) - else: - variant_lookup = cached( - vcf_paths=self.list_of_vcfs, - min_maf=self.min_maf, - include_missing_mafs=self.include_missing_mafs, - ) - overlapping_variants: list[SimpleVariant] = variant_lookup.query( + overlapping_variants: list[SimpleVariant] = self.vcfs_or_lookup.query( refname=region.refname, start=region.start, end=region.end ) positions: list[int] = [] diff --git a/tests/api/test_variant_lookup.py b/tests/api/test_variant_lookup.py index a6eda6c..f5d0f89 100644 --- a/tests/api/test_variant_lookup.py +++ b/tests/api/test_variant_lookup.py @@ -20,9 +20,7 @@ from prymer.api.variant_lookup import VariantLookup from prymer.api.variant_lookup import VariantOverlapDetector from prymer.api.variant_lookup import VariantType -from prymer.api.variant_lookup import cached from prymer.api.variant_lookup import calc_maf_from_filter -from prymer.api.variant_lookup import disk_based @pytest.mark.parametrize( @@ -451,17 +449,21 @@ def test_simple_variant_conversion_logs( def test_missing_index_file_raises(temp_missing_path: Path) -> None: """Test that both VariantLookup objects raise an error with a missing index file.""" with pytest.raises(ValueError, match="Cannot perform fetch with missing index file for VCF"): - disk_based(vcf_paths=[temp_missing_path], min_maf=0.01, include_missing_mafs=False) + FileBasedVariantLookup( + vcf_paths=[temp_missing_path], min_maf=0.01, include_missing_mafs=False + ) with pytest.raises(ValueError, match="Cannot perform fetch with missing index file for VCF"): - cached(vcf_paths=[temp_missing_path], min_maf=0.01, include_missing_mafs=False) + VariantOverlapDetector( + vcf_paths=[temp_missing_path], min_maf=0.01, include_missing_mafs=False + ) def test_missing_vcf_files_raises() -> None: """Test that an error is raised when no VCF_paths are provided.""" with pytest.raises(ValueError, match="No VCF paths given to query"): - disk_based(vcf_paths=[], min_maf=0.01, include_missing_mafs=False) + FileBasedVariantLookup(vcf_paths=[], min_maf=0.01, include_missing_mafs=False) with pytest.raises(ValueError, match="No VCF paths given to query"): - cached(vcf_paths=[], min_maf=0.01, include_missing_mafs=False) + VariantOverlapDetector(vcf_paths=[], min_maf=0.01, include_missing_mafs=False) @pytest.mark.parametrize("random_seed", [1, 10, 100, 1000, 10000]) diff --git a/tests/primer3/test_primer3.py b/tests/primer3/test_primer3.py index 4ba342f..58f961c 100644 --- a/tests/primer3/test_primer3.py +++ b/tests/primer3/test_primer3.py @@ -6,11 +6,14 @@ import pytest from fgpyo.sequence import reverse_complement +from prymer.api import FileBasedVariantLookup from prymer.api import MinOptMax from prymer.api import Primer from prymer.api import PrimerPair from prymer.api import Span from prymer.api import Strand +from prymer.api import VariantLookup +from prymer.api import VariantOverlapDetector from prymer.primer3 import DesignLeftPrimersTask from prymer.primer3 import DesignPrimerPairsTask from prymer.primer3 import DesignRightPrimersTask @@ -380,7 +383,7 @@ def test_variant_lookup( expected_soft_masked: str, ) -> None: """Test that MAF filtering and masking are working as expected.""" - with Primer3(genome_fasta=genome_ref, list_of_vcfs=[vcf_path], min_maf=0.01) as designer: + with Primer3(genome_fasta=genome_ref, vcfs_or_lookup=[vcf_path], min_maf=0.01) as designer: actual_soft_masked, actual_hard_masked = designer.get_design_sequences(region=region) assert actual_hard_masked == expected_hard_masked assert actual_soft_masked == expected_soft_masked @@ -392,6 +395,41 @@ def test_variant_lookup( assert actual_soft_masked == expected_soft_masked +@pytest.mark.parametrize( + "use_cache,expected_var_lookup_type", + [(True, VariantOverlapDetector), (False, FileBasedVariantLookup)], +) +def test_variant_lookup_construction( + genome_ref: Path, vcf_path: Path, use_cache: bool, expected_var_lookup_type: VariantLookup +) -> None: + """Test that, given a list of VCFs, Primer3 init() correctly sets + `self.vcfs_or_lookup` as expected.""" + + with Primer3( + genome_fasta=genome_ref, vcfs_or_lookup=[vcf_path], min_maf=0.01, use_cache=use_cache + ) as designer: + assert type(designer.vcfs_or_lookup) is expected_var_lookup_type + + +def test_variant_lookup_preconstruction(genome_ref: Path, vcf_path: Path) -> None: + """Test that Primer3.__init__() triggers correct VariantLookup.build().""" + with Primer3( + genome_fasta=genome_ref, + vcfs_or_lookup=FileBasedVariantLookup( + vcf_paths=[vcf_path], min_maf=0.01, include_missing_mafs=False + ), + ) as designer: + assert type(designer.vcfs_or_lookup) is FileBasedVariantLookup + + with Primer3( + genome_fasta=genome_ref, + vcfs_or_lookup=VariantOverlapDetector( + vcf_paths=[vcf_path], min_maf=0.01, include_missing_mafs=False + ), + ) as designer: + assert type(designer.vcfs_or_lookup) is VariantOverlapDetector + + def test_screen_pair_results( valid_primer_pairs: list[PrimerPair], genome_ref: Path, pair_primer_params: Primer3Parameters ) -> None: