Skip to content

Commit

Permalink
feat: public VariantLookup class
Browse files Browse the repository at this point in the history
  • Loading branch information
emmcauley committed Nov 27, 2024
1 parent 385e1b1 commit e2d71aa
Show file tree
Hide file tree
Showing 5 changed files with 319 additions and 194 deletions.
12 changes: 4 additions & 8 deletions prymer/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
from prymer.api.span import BedLikeCoords
from prymer.api.span import Span
from prymer.api.span import Strand
from prymer.api.variant_lookup import FileBasedVariantLookup
from prymer.api.variant_lookup import SimpleVariant

# from prymer.api.variant_lookup import _DiskBasedLookup
# from prymer.api.variant_lookup import _InMemoryLookup
# from prymer.api.variant_lookup import _VariantLookup
from prymer.api.variant_lookup import VariantLookup
from prymer.api.variant_lookup import VariantOverlapDetector
from prymer.api.variant_lookup import VariantType
from prymer.api.variant_lookup import cached
from prymer.api.variant_lookup import disk_based

__all__ = [
"ClusteredIntervals",
Expand All @@ -30,8 +30,4 @@
"VariantType",
"SimpleVariant",
"VariantLookup",
"FileBasedVariantLookup",
"VariantOverlapDetector",
"cached",
"disk_based",
]
174 changes: 103 additions & 71 deletions prymer/api/variant_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,20 @@
[`query()`][prymer.api.variant_lookup.VariantLookup.query] method for retrieving variants
that overlap the given range.
Two concrete implementations are provided that both take a list of VCF files to be queried:
- [`FileBasedVariantLookup`][prymer.api.variant_lookup.FileBasedVariantLookup] -- performs
disk-based retrieval of variants (using a VCF index). This class is recommended for large VCFs. The
[`disk_based()`][prymer.api.variant_lookup.disk_based] alternative constructor is
provided for easy construction of this object.
- [`VariantOverlapDetector`][prymer.api.variant_lookup.VariantOverlapDetector] -- reads in
variants into memory and uses an
[`VariantLookup`][prymer.api.variant_lookup.VariantLookup] needs a list of VCF files to be queried,
a minimum Minimum Allele Frequency with which to optionally filter variants, a boolean `include_missing_mafs` flag, and a
boolean `cached` flag. It is recommended to set `cached` to `False` for large VCFs. When `cached`
is True, variants from smaller VCFs are loaded into memory.
[`pybedlite.overlap_detector.OverlapDetector`](https://pybedlite.readthedocs.io/en/latest/api.html#pybedlite.overlap_detector.OverlapDetector)
for querying. This class is recommended for small VCFs. The
[`cached()`][prymer.api.variant_lookup.cached] alternative constructor is provided for
easy construction of this object.
Each class can also use minor allele frequency (MAF) to filter variants.
is used for querying.
The helper class `SimpleVariant` is included to facilitate VCF querying and reporting out results.
## Examples
```python
>>> from pathlib import Path
>>> lookup = cached(vcf_paths=[Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.00, include_missing_mafs=True)
>>> lookup = VariantLookup(vcf_paths=[Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.00, include_missing_mafs=True)
>>> lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
>>> variants = lookup.query(refname="chr2", start=7999, end=9900)
Expand Down Expand Up @@ -67,7 +59,6 @@
from types import TracebackType
from typing import ContextManager
from typing import Optional
from typing import final

import pysam
from fgpyo.vcf import reader
Expand Down Expand Up @@ -237,7 +228,7 @@ def build(simple_variant: SimpleVariant) -> "_VariantInterval":
)


class VariantLookup(ABC):
class _VariantLookup(ABC):
"""Base class to represent a variant from a given genomic range.
Attributes:
Expand All @@ -258,7 +249,11 @@ def __init__(
self.min_maf: Optional[float] = min_maf
self.include_missing_mafs: bool = include_missing_mafs

@final
@abstractmethod
def close(self) -> None:
pass

Check warning on line 254 in prymer/api/variant_lookup.py

View check run for this annotation

Codecov / codecov/patch

prymer/api/variant_lookup.py#L254

Added line #L254 was not covered by tests

@abstractmethod
def query(
self,
refname: str,
Expand All @@ -277,20 +272,7 @@ def query(
include_missing_mafs: whether to include variants with a missing MAF
(overrides self.include_missing_mafs)
"""
if maf is None:
maf = self.min_maf
if include_missing_mafs is None:
include_missing_mafs = self.include_missing_mafs

variants = self._query(refname=refname, start=start, end=end)
if len(variants) == 0:
_logger.debug(f"No variants extracted from region of interest: {refname}:{start}-{end}")
if maf is None or maf <= 0.0:
return variants
elif include_missing_mafs: # return variants with a MAF above threshold or missing
return [v for v in variants if (v.maf is None or v.maf >= maf)]
else:
return [v for v in variants if v.maf is not None and v.maf >= maf]
pass

Check warning on line 275 in prymer/api/variant_lookup.py

View check run for this annotation

Codecov / codecov/patch

prymer/api/variant_lookup.py#L275

Added line #L275 was not covered by tests

@staticmethod
def to_variants(
Expand Down Expand Up @@ -319,20 +301,84 @@ def to_variants(
simple_vars.extend(simple_variants)
return sorted(simple_vars, key=lambda v: (v.pos, v.id))

@abstractmethod
def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
"""Subclasses must implement this method."""

class VariantLookup(ContextManager):
def __init__(
self,
vcf_paths: list[Path],
min_maf: Optional[float],
include_missing_mafs: bool,
cached: bool = True,
):
self.vcf_paths: list[Path] = vcf_paths
self.min_maf: Optional[float] = min_maf
self.include_missing_mafs: bool = include_missing_mafs
self.cached: bool = cached

self._lookup: _VariantLookup

if cached:
self._lookup = _InMemoryLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
else:
self._lookup = _DiskBasedLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)

def __enter__(self) -> "VariantLookup":
return self

def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
traceback: Optional[TracebackType],
) -> None:
self._lookup.close()

def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:

maf = maf if maf is not None else self.min_maf
include_missing_mafs = (
include_missing_mafs if include_missing_mafs is not None else self.include_missing_mafs
)
variants: list[SimpleVariant] = self._lookup.query(
refname=refname,
start=start,
end=end,
)

if len(variants) == 0:
_logger.debug(f"No variants extracted from region of interest: {refname}:{start}-{end}")
if maf is None or maf <= 0.0:
return variants
elif include_missing_mafs: # return variants with a MAF above threshold or missing
return [v for v in variants if (v.maf is None or v.maf >= maf)]
else:
return [v for v in variants if v.maf is not None and v.maf >= maf]

def close(self) -> None:
"""Close the underlying VCF file handles."""
self._lookup.close()

Check warning on line 370 in prymer/api/variant_lookup.py

View check run for this annotation

Codecov / codecov/patch

prymer/api/variant_lookup.py#L370

Added line #L370 was not covered by tests


class FileBasedVariantLookup(ContextManager, VariantLookup):
class _DiskBasedLookup(_VariantLookup):
"""Implementation of `VariantLookup` that queries against indexed VCF files each time a query is
performed. Assumes the index is located adjacent to the VCF file and has the same base name with
either a .csi or .tbi suffix.
Example:
```python
>>> with FileBasedVariantLookup([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0, include_missing_mafs=False) as lookup:
>>> with _DiskBasedLookup([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0, include_missing_mafs=False) as lookup:
... lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
Expand All @@ -355,7 +401,7 @@ def __init__(self, vcf_paths: list[Path], min_maf: Optional[float], include_miss
open_fh = pysam.VariantFile(str(path))
self._readers.append(open_fh)

def __enter__(self) -> "FileBasedVariantLookup":
def __enter__(self) -> "_DiskBasedLookup":
"""Enter the context manager."""
return self

Expand All @@ -369,7 +415,14 @@ def __exit__(
self.close()
return None

def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:
"""Queries variants from the VCFs used by this lookup and returns a `SimpleVariant`."""
simple_variants: list[SimpleVariant] = []
for fh, path in zip(self._readers, self.vcf_paths, strict=True):
Expand All @@ -387,7 +440,7 @@ def close(self) -> None:
handle.close()


class VariantOverlapDetector(VariantLookup):
class _InMemoryLookup(_VariantLookup):
"""Implements `VariantLookup` by reading the entire VCF into memory and loading the resulting
Variants into an `OverlapDetector`."""

Expand Down Expand Up @@ -415,7 +468,14 @@ def __init__(self, vcf_paths: list[Path], min_maf: Optional[float], include_miss
)
self._overlap_detector.add_all(variant_intervals)

def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:
"""Queries variants from the VCFs used by this lookup."""
query = Interval(
refname=refname, start=start - 1, end=end
Expand All @@ -426,6 +486,9 @@ def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
)
return sorted(overlapping_variants, key=lambda v: (v.pos, v.id))

def close(self) -> None:
pass

Check warning on line 490 in prymer/api/variant_lookup.py

View check run for this annotation

Codecov / codecov/patch

prymer/api/variant_lookup.py#L490

Added line #L490 was not covered by tests


# module-level functions
def calc_maf_from_filter(variant: pysam.VariantRecord) -> Optional[float]:
Expand Down Expand Up @@ -460,34 +523,3 @@ def calc_maf_from_filter(variant: pysam.VariantRecord) -> Optional[float]:
maf = num_alt / len(gts)

return maf


def cached(
vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False
) -> VariantOverlapDetector:
"""Constructs a `VariantLookup` that caches all variants in memory for fast lookup.
Appropriate for small VCFs."""
return VariantOverlapDetector(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)


def disk_based(
vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False
) -> FileBasedVariantLookup:
"""Constructs a `VariantLookup` that queries indexed VCFs on disk for each lookup.
Appropriate for large VCFs.
Example:
```python
>>> with disk_based([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0) as lookup:
... lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
```
""" # noqa: E501
return FileBasedVariantLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
37 changes: 28 additions & 9 deletions prymer/primer3/primer3.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
nucleotide sequences can be retrieved. The full path to the `primer3` executable can provided,
otherwise it is assumed to be on the PATH. Furthermore, optionally a
[`VariantLookup`][prymer.api.variant_lookup.VariantLookup] may be provided to
hard-mask the design and target regions as to avoid design primers over polymorphic sites.
hard-mask the design and target regions so as to avoid design primers over polymorphic sites.
```python
>>> from pathlib import Path
>>> from prymer.api.variant_lookup import VariantLookup, VariantOverlapDetector
>>> from prymer.api.variant_lookup import _VariantLookup, _InMemoryLookup
>>> genome_fasta = Path("./tests/primer3/data/miniref.fa")
>>> genome_vcf = Path("./tests/primer3/data/miniref.variants.vcf.gz")
>>> variant_lookup: VariantLookup = VariantOverlapDetector(vcf_paths=[genome_vcf], min_maf=0.01, include_missing_mafs=False)
>>> designer = Primer3(genome_fasta=genome_fasta, variant_lookup=variant_lookup)
>>> variant_lookup: VariantLookup = VariantLookup(vcf_paths=[genome_vcf], min_maf=0.01, include_missing_mafs=False)
>>> designer = Primer3(genome_fasta=genome_fasta, variants=variant_lookup)
```
Expand Down Expand Up @@ -229,13 +229,14 @@ def __init__(
self,
genome_fasta: Path,
executable: Optional[str] = None,
variant_lookup: Optional[VariantLookup] = None,
variants: list[Path] | VariantLookup | None = None,
) -> None:
"""
Args:
genome_fasta: Path to reference genome .fasta file
executable: string representation of the path to primer3_core
variant_lookup: VariantLookup object to facilitate hard-masking variants
variants: an optional list of VCF `Path`s or VariantLookup object to facilitate
hard-masking variants
Assumes the sequence dictionary is located adjacent to the .fasta file and has the same
base name with a .dict suffix.
Expand All @@ -246,12 +247,30 @@ def __init__(
)
command: list[str] = [f"{executable_path}"]

self.variant_lookup = variant_lookup
self.variant_lookup: Optional[VariantLookup]
# If no variants are given, or they are in a pre-constructed `VariantLookup` object,
# set the object attribute
if variants is None or isinstance(variants, VariantLookup):
self.variant_lookup = variants
# If `variants` is a list of `Path`s to indexed VCF files, create a `VariantLookup` object
# with default settings (no MAF filtering, do not include variants with missing MAFs)
elif isinstance(variants, list) and all(
isinstance(path, Path) for path in variants
): # if provided path, create appropriate `VariantLookup`
self.variant_lookup = VariantLookup(

Check warning on line 260 in prymer/primer3/primer3.py

View check run for this annotation

Codecov / codecov/patch

prymer/primer3/primer3.py#L260

Added line #L260 was not covered by tests
vcf_paths=variants,
cached=True,
min_maf=0.00,
include_missing_mafs=False,
)
else:
raise ValueError(
"Variant lookup is required to be a list of `Path` objects or a"
f" pre-constructed `VariantLookup` object: received {variants}"
)
self._fasta = pysam.FastaFile(filename=f"{genome_fasta}")

dict_path = genome_fasta.with_suffix(".dict")
# TODO: This is a placeholder while waiting for #160 to be resolved
# https://github.com/fulcrumgenomics/fgpyo/pull/160
with reader(dict_path, file_type=sam.SamFileType.SAM) as fh:
self._dict: SequenceDictionary = SequenceDictionary.from_sam(header=fh.header)

Expand Down
Loading

0 comments on commit e2d71aa

Please sign in to comment.