pfizer-opensource · jorenretel · Dec 23, 2023 · Dec 21, 2023 · Dec 21, 2023 · Dec 21, 2023
@@ -44,8 +44,8 @@ repos:
       args: [ "-c", "pyproject.toml" ]
       exclude: ^tests/
       additional_dependencies: [ "bandit[toml]" ]
--   repo: https://github.com/codespell-project/codespell
-    rev: v2.2.2
-    hooks:
-      - id: codespell
-        additional_dependencies: ["tomli"]
+#-   repo: https://github.com/codespell-project/codespell
+#    rev: v2.2.2
+#    hooks:
+#      - id: codespell
+#        additional_dependencies: ["tomli"]
@@ -0,0 +1,59 @@
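+"""
+Benchmark ``BigWigCollection.get_batch`` over a range of batch sizes.
+
+The benchmark is run twice: once with
+``intervals_to_values_gpu.ROUTE_KERNELS`` left as it is and once with it set
+to ``False``. For every batch size the mean and standard deviation of the
+time per ``get_batch`` call are printed.
+"""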
+
+import time
+
+import numpy as np
+import pandas as pd
+
+from bigwig_loader import config
+from bigwig_loader import intervals_to_values_gpu
+from bigwig_loader.collection import BigWigCollection
+
+
+def some_intervals():
+    """Load the benchmark positions and turn them into fixed-width intervals.
+
+    Reads ``example_data/benchmark_positions.tsv`` (tab separated, with at
+    least the columns ``chr`` and ``center``) relative to the current working
+    directory.
+
+    Returns:
+        A tuple ``(chromosomes, starts, ends)`` where ``chromosomes`` is a list
+        built from the ``chr`` column, ``starts`` is ``center - 5000`` and
+        ``ends`` is ``starts + 10000``.
+    """
+    positions = pd.read_csv("example_data/benchmark_positions.tsv", sep="\t")
+    centers = positions["center"].values
+    interval_starts = centers - 5000
+    interval_ends = interval_starts + 10000
+    chromosome_names = list(positions["chr"])
+    return chromosome_names, interval_starts, interval_ends
+
+
+def run_benchmark(
+    collection, batch_sizes, chromosomes, starts, ends, burn_in=20, repeats=20
+):
+    """Time ``collection.get_batch`` for each batch size and print statistics.
+
+    For every entry in ``batch_sizes`` the first ``batch_size`` intervals are
+    selected, ``get_batch`` is called ``burn_in`` times without being timed and
+    then ``repeats`` more times while the duration of each call is measured
+    with ``time.perf_counter``. One line is printed per batch size: the number
+    of intervals actually used, followed by the mean and the standard
+    deviation of the measured durations in seconds.
+
+    Args:
+        collection: object exposing ``get_batch(chromosomes, start, end)``.
+        batch_sizes: iterable of batch sizes to benchmark.
+        chromosomes: sliceable sequence of chromosome names, one per interval.
+        starts: sliceable sequence of interval starts.
+        ends: sliceable sequence of interval ends.
+        burn_in: number of untimed warm-up calls per batch size.
+        repeats: number of timed calls per batch size.
+
+    Raises:
+        ValueError: if ``repeats`` is smaller than 1, because no statistics
+            can be computed without at least one measurement.
+    """
+    if repeats < 1:
+        raise ValueError(f"repeats must be at least 1, got {repeats}")
+
+    for batch_size in batch_sizes:
+        chrom = chromosomes[:batch_size]
+        start = starts[:batch_size]
+        end = ends[:batch_size]
+
+        # Warm-up calls are executed but deliberately not timed.
+        for _ in range(burn_in):
+            collection.get_batch(chrom, start, end)
+
+        elapsed = []
+        for _ in range(repeats):
+            tic = time.perf_counter()
+            collection.get_batch(chrom, start, end)
+            elapsed.append(time.perf_counter() - tic)
+
+        # Slicing truncates silently when fewer intervals than batch_size are
+        # available, so report the size that was really benchmarked.
+        print(len(chrom), np.mean(elapsed), np.std(elapsed))
+
+
+def run_all_benchmarks(bigwig_path=config.bigwig_dir):
+    """Benchmark ``get_batch`` with and without kernel routing.
+
+    Builds a ``BigWigCollection`` from ``bigwig_path`` and runs
+    ``run_benchmark`` twice over the same intervals and batch sizes: first
+    with ``intervals_to_values_gpu.ROUTE_KERNELS`` as it is found, then with it
+    set to ``False``. The flag is put back to its previous value afterwards so
+    that calling this function does not change the behaviour of later code.
+
+    Args:
+        bigwig_path: location handed to ``BigWigCollection``; defaults to
+            ``config.bigwig_dir``.
+    """
+    # Report the path that is actually used, not the configured default.
+    print("Loading from:", bigwig_path)
+    bigwig_loader_collection = BigWigCollection(bigwig_path, first_n_files=None)
+    chromosomes, starts, ends = some_intervals()
+    batch_sizes = [1, 2, 64, 128] + [256 * i for i in range(1, 6)]
+
+    print("routing to different cupy kernels:")
+    run_benchmark(bigwig_loader_collection, batch_sizes, chromosomes, starts, ends)
+
+    # ROUTE_KERNELS is module-level state: remember it and restore it even if
+    # the second benchmark run raises.
+    previous_routing = intervals_to_values_gpu.ROUTE_KERNELS
+    intervals_to_values_gpu.ROUTE_KERNELS = False
+    try:
+        print("routing to same cupy kernel:")
+        run_benchmark(
+            bigwig_loader_collection, batch_sizes, chromosomes, starts, ends
+        )
+    finally:
+        intervals_to_values_gpu.ROUTE_KERNELS = previous_routing
+
+
+# Run the complete benchmark when this file is executed as a script.
+if __name__ == "__main__":
+    run_all_benchmarks()
@@ -93,13 +93,19 @@ def get_batch(
         chromosomes: Union[Sequence[str], npt.NDArray[np.generic]],
         start: Union[Sequence[int], npt.NDArray[np.int64]],
         end: Union[Sequence[int], npt.NDArray[np.int64]],
+        window_size: int = 1,
         out: Optional[cp.ndarray] = None,
     ) -> cp.ndarray:
         memory_bank = self._get_memory_bank()
         memory_bank.reset()
 
+        if (end[0] - start[0]) % window_size:
+            raise ValueError(
+                f"Sequence length {end[0] - start[0]} is not divisible by window size {window_size}"
+            )
+
         if out is None:
-            sequence_length = end[0] - start[0]
+            sequence_length = (end[0] - start[0]) // window_size
             out = self._get_out_tensor(len(start), sequence_length)
 
         abs_start = self.make_positions_global(chromosomes, start)
@@ -142,6 +148,7 @@ def get_batch(
                 track_values=value[row_number_start:row_number_end],
                 query_starts=cp.asarray(abs_start, dtype=cp.uint32),
                 query_ends=cp.asarray(abs_end, dtype=cp.uint32),
+                window_size=window_size,
                 out=partial_out,
             )
             i = bigwig_end

@@ -33,6 +33,7 @@ class BigWigDataset:
         center_bin_to_predict: if given, only do prediction on a central window. Should be
             smaller than or equal to sequence_length. If not given will be the same as
             sequence_length.
+        window_size: used to down sample the resolution of the target from sequence_length
         batch_size: batch size
         super_batch_size: batch size that is used in the background to load data from
             bigwig files. Should be larger than batch_size. If None, it will be equal to
@@ -61,6 +62,7 @@ def __init__(
         reference_genome_path: Path,
         sequence_length: int = 1000,
         center_bin_to_predict: Optional[int] = None,
+        window_size: int = 1,
         batch_size: int = 256,
         super_batch_size: Optional[int] = None,
         batches_per_epoch: Optional[int] = None,
@@ -97,6 +99,7 @@ def __init__(
             reference_genome_path=reference_genome_path,
             sequence_length=sequence_length,
             center_bin_to_predict=center_bin_to_predict,
+            window_size=window_size,
             batch_size=super_batch_size,
             batches_per_epoch=super_batches_per_epoch,
             maximum_unknown_bases_fraction=maximum_unknown_bases_fraction,
@@ -159,6 +162,7 @@ class BigWigSuperDataset:
         center_bin_to_predict: if given, only do prediction on a central window. Should be
             smaller than or equal to sequence_length. If not given will be the same as
             sequence_length.
+        window_size: used to down sample the resolution of the target from sequence_length
         batch_size: batch size
         batches_per_epoch: because the length of an epoch is slightly arbitrary here,
             the number of batches can be set by hand. If not the number of batches per
@@ -184,6 +188,7 @@ def __init__(
         reference_genome_path: Path,
         sequence_length: int = 1000,
         center_bin_to_predict: Optional[int] = None,
+        window_size: int = 1,
         batch_size: int = 256,
         batches_per_epoch: Optional[int] = None,
         maximum_unknown_bases_fraction: float = 0.1,
@@ -213,6 +218,7 @@ def __init__(
             self.center_bin_to_predict = center_bin_to_predict
         else:
             self.center_bin_to_predict = sequence_length
+        self.window_size = window_size
         self.batch_size = batch_size
         self.batches_per_epoch = (
             batches_per_epoch
@@ -282,7 +288,7 @@ def _out(self) -> cp.ndarray:
                 (
                     len(self.bigwig_collection),
                     self.batch_size,
-                    self.center_bin_to_predict,
+                    self.center_bin_to_predict // self.window_size,
                 ),
                 dtype=cp.float32,
             )
@@ -303,6 +309,7 @@ def __next__(self) -> tuple[Any, cp.ndarray]:
                 chromosomes,
                 start,
                 end,
+                window_size=self.window_size,
                 out=self._out,
             )
             return sequences, target