Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Calculate mean over fixed size windows while converting intervals to values #1

Merged
merged 7 commits into from
Dec 23, 2023
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ repos:
args: [ "-c", "pyproject.toml" ]
exclude: ^tests/
additional_dependencies: [ "bandit[toml]" ]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
hooks:
- id: codespell
additional_dependencies: ["tomli"]
#- repo: https://github.com/codespell-project/codespell
# rev: v2.2.2
# hooks:
# - id: codespell
# additional_dependencies: ["tomli"]
59 changes: 59 additions & 0 deletions benchmark/06_benchmark_interval_to_values.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
"""

import time

import numpy as np
import pandas as pd

from bigwig_loader import config
from bigwig_loader import intervals_to_values_gpu
from bigwig_loader.collection import BigWigCollection


def some_intervals(
    positions_file="example_data/benchmark_positions.tsv", half_window=5000
):
    """Build the query intervals used by the benchmark.

    Reads a tab-separated table with a ``chr`` and a ``center`` column and
    returns one interval of ``2 * half_window`` positions around every center.

    Args:
        positions_file: path (or file-like object) of the tab-separated table.
            Defaults to ``example_data/benchmark_positions.tsv``, resolved
            relative to the current working directory.
        half_window: number of positions on either side of each center.

    Returns:
        Tuple ``(chromosomes, starts, ends)``: the chromosome names as a list
        and two NumPy arrays with ``starts = center - half_window`` and
        ``ends = center + half_window``.
    """
    df = pd.read_csv(positions_file, sep="\t")
    starts = df["center"].to_numpy() - half_window
    return list(df["chr"]), starts, starts + 2 * half_window


def run_benchmark(
    collection,
    batch_sizes,
    chromosomes,
    starts,
    ends,
    *,
    n_burn_in=20,
    n_repeats=20,
):
    """Time ``collection.get_batch`` for every requested batch size.

    For each batch size the first ``batch_size`` intervals are queried
    ``n_burn_in`` times without timing, then ``n_repeats`` times with timing.
    One line is printed per batch size: the batch size, the mean and the
    standard deviation of the seconds per ``get_batch`` call.

    Args:
        collection: object exposing ``get_batch(chromosomes, starts, ends)``.
        batch_sizes: iterable of batch sizes to benchmark.
        chromosomes: sliceable sequence of chromosome names.
        starts: sliceable sequence of interval starts.
        ends: sliceable sequence of interval ends.
        n_burn_in: number of untimed warm-up calls per batch size.
        n_repeats: number of timed calls per batch size.
    """
    for batch_size in batch_sizes:
        chrom = chromosomes[:batch_size]
        start = starts[:batch_size]
        end = ends[:batch_size]

        # Burn in: untimed calls so that one-off start-up costs do not end up
        # in the measurements below.
        for _ in range(n_burn_in):
            collection.get_batch(chrom, start, end)

        # NOTE(review): only host-side wall-clock time is measured. If
        # get_batch returns before its GPU work has finished, these numbers
        # underestimate the real cost -- confirm whether a device
        # synchronization is needed here.
        elapsed = []
        previous = time.perf_counter()
        for _ in range(n_repeats):
            collection.get_batch(chrom, start, end)
            now = time.perf_counter()
            elapsed.append(now - previous)
            previous = now
        print(batch_size, np.mean(elapsed), np.std(elapsed))


def run_all_benchmarks(bigwig_path=config.bigwig_dir):
    """Benchmark ``get_batch`` with and without kernel routing.

    Loads a ``BigWigCollection`` from ``bigwig_path`` and runs the benchmark
    over a fixed list of batch sizes twice: first with
    ``intervals_to_values_gpu.ROUTE_KERNELS`` as it currently is, then with it
    set to ``False``. The flag is put back to its previous value afterwards so
    that calling this function does not leave the module reconfigured.

    Args:
        bigwig_path: location of the bigwig files; defaults to
            ``config.bigwig_dir``.
    """
    # Report the path that is actually used, which is not necessarily the
    # configured default.
    print("Loading from:", bigwig_path)
    bigwig_loader_collection = BigWigCollection(bigwig_path, first_n_files=None)
    chromosomes, starts, ends = some_intervals()
    batch_sizes = [1, 2, 64, 128] + [256 * i for i in range(1, 6)]

    print("routing to different cupy kernels:")
    run_benchmark(bigwig_loader_collection, batch_sizes, chromosomes, starts, ends)

    # Remember the current setting so it can be restored even if the second
    # benchmark run raises.
    route_kernels_before = getattr(intervals_to_values_gpu, "ROUTE_KERNELS", True)
    intervals_to_values_gpu.ROUTE_KERNELS = False
    try:
        print("routing to same cupy kernel:")
        run_benchmark(
            bigwig_loader_collection, batch_sizes, chromosomes, starts, ends
        )
    finally:
        intervals_to_values_gpu.ROUTE_KERNELS = route_kernels_before


if __name__ == "__main__":
    run_all_benchmarks()
9 changes: 8 additions & 1 deletion bigwig_loader/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,19 @@ def get_batch(
chromosomes: Union[Sequence[str], npt.NDArray[np.generic]],
start: Union[Sequence[int], npt.NDArray[np.int64]],
end: Union[Sequence[int], npt.NDArray[np.int64]],
window_size: int = 1,
out: Optional[cp.ndarray] = None,
) -> cp.ndarray:
memory_bank = self._get_memory_bank()
memory_bank.reset()

if (end[0] - start[0]) % window_size:
raise ValueError(
f"Sequence length {end[0] - start[0]} is not divisible by window size {window_size}"
)

if out is None:
sequence_length = end[0] - start[0]
sequence_length = (end[0] - start[0]) // window_size
out = self._get_out_tensor(len(start), sequence_length)

abs_start = self.make_positions_global(chromosomes, start)
Expand Down Expand Up @@ -142,6 +148,7 @@ def get_batch(
track_values=value[row_number_start:row_number_end],
query_starts=cp.asarray(abs_start, dtype=cp.uint32),
query_ends=cp.asarray(abs_end, dtype=cp.uint32),
window_size=window_size,
out=partial_out,
)
i = bigwig_end
Expand Down
9 changes: 8 additions & 1 deletion bigwig_loader/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class BigWigDataset:
center_bin_to_predict: if given, only do prediction on a central window. Should be
smaller than or equal to sequence_length. If not given will be the same as
sequence_length.
window_size: size of the fixed windows over which the target is averaged, downsampling its resolution by this factor (default 1: no downsampling).
batch_size: batch size
super_batch_size: batch size that is used in the background to load data from
bigwig files. Should be larger than batch_size. If None, it will be equal to
Expand Down Expand Up @@ -61,6 +62,7 @@ def __init__(
reference_genome_path: Path,
sequence_length: int = 1000,
center_bin_to_predict: Optional[int] = None,
window_size: int = 1,
batch_size: int = 256,
super_batch_size: Optional[int] = None,
batches_per_epoch: Optional[int] = None,
Expand Down Expand Up @@ -97,6 +99,7 @@ def __init__(
reference_genome_path=reference_genome_path,
sequence_length=sequence_length,
center_bin_to_predict=center_bin_to_predict,
window_size=window_size,
batch_size=super_batch_size,
batches_per_epoch=super_batches_per_epoch,
maximum_unknown_bases_fraction=maximum_unknown_bases_fraction,
Expand Down Expand Up @@ -159,6 +162,7 @@ class BigWigSuperDataset:
center_bin_to_predict: if given, only do prediction on a central window. Should be
smaller than or equal to sequence_length. If not given will be the same as
sequence_length.
window_size: size of the fixed windows over which the target is averaged; the target then has center_bin_to_predict // window_size bins (default 1: no downsampling).
batch_size: batch size
batches_per_epoch: because the length of an epoch is slightly arbitrary here,
the number of batches can be set by hand. If not the number of batches per
Expand All @@ -184,6 +188,7 @@ def __init__(
reference_genome_path: Path,
sequence_length: int = 1000,
center_bin_to_predict: Optional[int] = None,
window_size: int = 1,
batch_size: int = 256,
batches_per_epoch: Optional[int] = None,
maximum_unknown_bases_fraction: float = 0.1,
Expand Down Expand Up @@ -213,6 +218,7 @@ def __init__(
self.center_bin_to_predict = center_bin_to_predict
else:
self.center_bin_to_predict = sequence_length
self.window_size = window_size
self.batch_size = batch_size
self.batches_per_epoch = (
batches_per_epoch
Expand Down Expand Up @@ -282,7 +288,7 @@ def _out(self) -> cp.ndarray:
(
len(self.bigwig_collection),
self.batch_size,
self.center_bin_to_predict,
self.center_bin_to_predict // self.window_size,
),
dtype=cp.float32,
)
Expand All @@ -303,6 +309,7 @@ def __next__(self) -> tuple[Any, cp.ndarray]:
chromosomes,
start,
end,
window_size=self.window_size,
out=self._out,
)
return sequences, target
Expand Down
Loading