
Flush cache #246

Status: Draft. Wants to merge 30 commits into master.

Commits (30; the file changes shown below are from 2 of these commits)
81a68a4  Added RegisterObserver with common interface among backends (fjwillemsen, Feb 8, 2024)
943b3c4  Added test for RegisterObserver, added clause in case of mocktest (fjwillemsen, Feb 8, 2024)
1681730  Added useful error message in case Register Observer is not supported (fjwillemsen, Feb 9, 2024)
f153945  Added tests for Register Observer for OpenCL and HIP backends (fjwillemsen, Feb 9, 2024)
7bd7c2b  Added instruction for pointing cache directory elsewhere (fjwillemsen, Feb 12, 2024)
9dea137  Non-argument streams are now correctly passed in the CuPy and NVCUDA … (fjwillemsen, Feb 12, 2024)
df54145  Fixed several issues pertaining to the setting of clocks, in particul… (fjwillemsen, Feb 15, 2024)
4cc4a13  Time spent setting NVML parameters (clock & memory frequency, power) … (fjwillemsen, Feb 15, 2024)
e309bc1  Time spent setting NVML parameters (clock & memory frequency, power) … (fjwillemsen, Feb 15, 2024)
d6aac8b  Removed redundant print statement (fjwillemsen, Feb 15, 2024)
a020791  Added L2 cache size property to CUDA backends (fjwillemsen, Feb 28, 2024)
6e6e5fb  Added specification to CUPY compiler options (fjwillemsen, Feb 28, 2024)
f15338f  Added L2 cache size property to OpenCL, HIP and mocked PyCUDA backends (fjwillemsen, Feb 28, 2024)
00ac419  Added function to check for compute capability validity, improved che… (fjwillemsen, Feb 28, 2024)
55ab074  Added a flush kernel to clear the L2 cache between runs (fjwillemsen, Feb 28, 2024)
e106bae  Added a flush kernel to clear the L2 cache between runs (fjwillemsen, Feb 28, 2024)
0cb5e3a  Made function for scaling the compute capability to a valid one, adde… (fjwillemsen, Feb 29, 2024)
b682506  Applied suggestions from comments by @csbnw (fjwillemsen, Mar 1, 2024)
da907b1  Removed redundant comments / printing (fjwillemsen, Mar 1, 2024)
2396bdf  Added L2 cache size information to backends (fjwillemsen, Mar 1, 2024)
eced775  Added L2 flush kernel (fjwillemsen, Mar 1, 2024)
143889f  Switched to new attempt for flushing L2 using memset (fjwillemsen, Mar 1, 2024)
651eea7  Added implementation of allocate numpy array function (fjwillemsen, Mar 1, 2024)
7d8d48f  Added new flush L2 cache method using memset (fjwillemsen, Mar 2, 2024)
9911f4c  Added a standard method for freeing memory from the GPU (fjwillemsen, Mar 4, 2024)
47c2cca  Circumvented an issue where list.remove(val) was not properly impleme… (fjwillemsen, Mar 4, 2024)
157ca41  Added the ability to recopy array arguments with every kernel launch,… (fjwillemsen, Mar 7, 2024)
98afa60  Renamed to for clarity, added check (fjwillemsen, Mar 7, 2024)
cfecdc5  Improved getting L2 cache size (fjwillemsen, Apr 12, 2024)
108e14c  Small improvements to flushing arrays (fjwillemsen, Apr 12, 2024)
kernel_tuner/backends/cupy.py (2 changes: 1 addition & 1 deletion)

@@ -47,7 +47,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.devprops = dev.attributes
         self.cc = dev.compute_capability
         self.max_threads = self.devprops["MaxThreadsPerBlock"]
-        self.cache_size_L2 = self.devprops["L2CacheSize"]
+        self.cache_size_L2 = int(self.devprops["L2CacheSize"])

         self.iterations = iterations
         self.current_module = None

Collaborator comment on this line: "Also cast this to int for consistency?"
kernel_tuner/backends/hip.py (2 changes: 1 addition & 1 deletion)

@@ -59,7 +59,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None

         self.name = self.hipProps._name.decode('utf-8')
         self.max_threads = self.hipProps.maxThreadsPerBlock
-        self.cache_size_L2 = self.hipProps.l2CacheSize
+        self.cache_size_L2 = int(self.hipProps.l2CacheSize)
         self.device = device
         self.compiler_options = compiler_options or []
         self.iterations = iterations
kernel_tuner/backends/nvcuda.py (3 changes: 2 additions & 1 deletion)

@@ -72,6 +72,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device
         )
         cuda_error_check(err)
+        self.cache_size_L2 = int(self.cache_size_L2)
         self.cc = f"{major}{minor}"
         self.iterations = iterations
         self.current_module = None

@@ -330,7 +331,7 @@ def memset(allocation, value, size):
         :type size: int

         """
-        err = cudart.cudaMemset(allocation, value, size)
+        err = cudart.cudaMemset(allocation.__init__(), value, size)
         cuda_error_check(err)

     @staticmethod
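As background for the memset wrapper above: cuda-python's runtime bindings return status codes inside tuples rather than raising exceptions, which is why every call in this backend is followed by cuda_error_check. A minimal standalone sketch of allocating and memsetting a device buffer with that API (the buffer size is illustrative, error handling reduced to asserts):

```python
# A minimal sketch, assuming the cuda-python package (pip install cuda-python).
# kernel_tuner routes the returned status through cuda_error_check instead of asserts.
from cuda import cudart

nbytes = 4 * 1024 * 1024                  # illustrative buffer size (4 MiB)
err, dev_ptr = cudart.cudaMalloc(nbytes)  # returns (status, device pointer)
assert err == cudart.cudaError_t.cudaSuccess

(err,) = cudart.cudaMemset(dev_ptr, 1, nbytes)  # write 0x01 into every byte
assert err == cudart.cudaError_t.cudaSuccess

(err,) = cudart.cudaFree(dev_ptr)
assert err == cudart.cudaError_t.cudaSuccess
```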
kernel_tuner/backends/pycuda.py (2 changes: 1 addition & 1 deletion)

@@ -101,7 +101,7 @@ def _finish_up():
             str(k): v for (k, v) in self.context.get_device().get_attributes().items()
         }
         self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
-        self.cache_size_L2 = devprops["L2_CACHE_SIZE"]
+        self.cache_size_L2 = int(devprops["L2_CACHE_SIZE"])
         cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
             devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
         )
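A note on why all four backends now cast the L2 cache size to int: the flush logic in core.py (below) is gated on `isinstance(self.dev.cache_size_L2, int)`, and device attribute queries often come back as NumPy or ctypes scalars that fail that check. A minimal sketch of the pitfall, assuming NumPy and an illustrative value:

```python
# A minimal sketch of the isinstance pitfall; the 4 MiB value is illustrative.
import numpy as np

l2_size = np.int32(4 * 1024 * 1024)   # device attribute returned as a NumPy scalar
print(isinstance(l2_size, int))       # False: np.int32 is not a Python int
print(isinstance(int(l2_size), int))  # True once cast, so flushing is enabled
```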
kernel_tuner/core.py (10 changes: 6 additions & 4 deletions)

@@ -339,8 +339,10 @@ def __init__(
         self.max_threads = dev.max_threads
         self.flush_possible = lang.upper() not in ['OPENCL', 'HIP', 'C', 'FORTRAN'] and isinstance(self.dev.cache_size_L2, int) and self.dev.cache_size_L2 > 0
         if self.flush_possible:
-            t = np.int32
-            self.flush_array = np.zeros((self.dev.cache_size_L2 // t(0).itemsize), order='F').astype(t)
+            self.flush_type = np.uint8
+            size = (self.dev.cache_size_L2 // self.flush_type(0).itemsize)
+            # self.flush_array = np.zeros((size), order='F', dtype=self.flush_type)
+            self.flush_array = np.empty((size), order='F', dtype=self.flush_type)
+            self.flush_alloc = None
         if not quiet:
             print("Using: " + self.dev.name)

Collaborator suggested change on this hunk: remove the leftover commented-out line
    # self.flush_array = np.zeros((size), order='F', dtype=self.flush_type)
@@ -353,7 +355,7 @@ def flush_cache(self):
             self.dev.free_mem(self.flush_alloc)
         # inspired by https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh#L51
         self.flush_alloc = self.dev.allocate_ndarray(self.flush_array)
-        self.dev.memset(self.flush_alloc, value=0, size=self.flush_array.nbytes)
+        self.dev.memset(self.flush_alloc, value=1, size=self.flush_array.nbytes)

     def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True, recopy_arrays=None):
         """
@@ -367,7 +369,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True, recopy_arrays=None):
         ]

         self.dev.synchronize()
-        for _ in range(self.iterations):
+        for i in range(self.iterations):
             if flush_cache:
                 self.flush_cache()
             if recopy_arrays is not None:

Collaborator comment on the changed loop: "i doesn't seem to be used below. the for-loop on line 377 even defines its own i."
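Taken together, the benchmark loop flushes the L2 cache (and optionally recopies input arrays) before every timed iteration so each run starts cold. A condensed sketch of that control flow; the names flush_l2, recopy_inputs, and run_and_time are hypothetical stand-ins, not kernel_tuner's actual API:

```python
# A condensed sketch of the cold-cache benchmark loop; flush_l2, recopy_inputs,
# and run_and_time are hypothetical stand-ins for the methods shown in the diff.
def benchmark(run_and_time, iterations=7, flush_l2=None, recopy_inputs=None):
    times = []
    for _ in range(iterations):
        if flush_l2 is not None:
            flush_l2()           # evict previous results from L2 before timing
        if recopy_inputs is not None:
            recopy_inputs()      # restore inputs the kernel may have overwritten
        times.append(run_and_time())
    return times
```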