KernelTuner · fjwillemsen · Feb 8, 2024 · Feb 8, 2024 · Feb 9, 2024 · Feb 9, 2024
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
@@ -85,7 +85,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster):
              - /path/to/directory
     * [Optional] both Mamba and Miniconda can be automatically activated via :bash:`~/.bashrc`. Do not forget to add these (usually provided at the end of the installation).
     * Exit the shell and re-enter to make sure Conda is available. :bash:`cd` to the kernel tuner directory.
-    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`.
+    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linu, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable. 
-    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linu, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable. 
+    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linux, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable. 
-    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linu, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable. 
+    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linux, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable. 
     * [Optional] update Conda if available before continuing: :bash:`conda update -n base -c conda-forge conda`.
 #. Setup a virtual environment: :bash:`conda create --name kerneltuner python=3.11` (or whatever Python version and environment name you prefer).
 #. Activate the virtual environment: :bash:`conda activate kerneltuner`.

diff --git a/kernel_tuner/backends/backend.py b/kernel_tuner/backends/backend.py
@@ -2,6 +2,7 @@
 from __future__ import print_function
 
 from abc import ABC, abstractmethod
+from numpy import ndarray
 
 
 class Backend(ABC):
@@ -65,6 +66,16 @@ class GPUBackend(Backend):
     def __init__(self, device, iterations, compiler_options, observers):
         pass
 
+    @abstractmethod
+    def allocate_ndarray(self, array: ndarray) -> object:
+        """This method must allocate on the GPU a buffer for a given np.ndarray and return the pointer.
+
+        The returned handle is backend-specific (e.g. a device pointer or a device array object),
+        hence the generic `object` annotation instead of the builtin function `any`.
+        """
+        pass
+
+    @abstractmethod
+    def free_mem(self, pointer):
+        """This method must free on the GPU a buffer for a given pointer.
+
+        :param pointer: backend-specific handle to free; presumably one returned by allocate_ndarray (confirm with callers).
+        """
+        pass
+
     @abstractmethod
     def copy_constant_memory_args(self, cmem_args):
         """This method must implement the allocation and copy of constant memory to the GPU."""

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
@@ -1,5 +1,6 @@
 """This module contains all Cupy specific kernel_tuner functions."""
 from __future__ import print_function
+from warnings import warn
 
 import numpy as np
 
@@ -46,6 +47,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.devprops = dev.attributes
         self.cc = dev.compute_capability
         self.max_threads = self.devprops["MaxThreadsPerBlock"]
+        self.cache_size_L2 = int(self.devprops["L2CacheSize"])
 
         self.iterations = iterations
         self.current_module = None
@@ -82,6 +84,18 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.env = env
         self.name = env["device_name"]
 
+    def allocate_ndarray(self, array):
+        """Create a CuPy array from the given numpy array, track it in self.allocations and return it."""
+        device_array = cp.array(array)
+        self.allocations.append(device_array)
+        return device_array
+
+    def free_mem(self, pointer):
+        """Stop tracking a CuPy allocation so that its memory can be released.
+
+        :param pointer: the CuPy array to release, as returned by allocate_ndarray.
+        :raises ValueError: if pointer is not tracked in self.allocations.
+        """
+        # match on identity: `==` on CuPy arrays is element-wise, so list.remove cannot be used,
+        # and comparing contents would also match other allocations that hold equal data
+        for index, alloc in enumerate(self.allocations):
+            if alloc is pointer:
+                # CuPy uses the Python reference counter to free upon disuse, so dropping this
+                # reference is all that can be done here; the caller must drop its own as well
+                del self.allocations[index]
+                return
+        raise ValueError("pointer is not an allocation tracked by this backend")
+
     def ready_argument_list(self, arguments):
         """Ready argument list to be passed to the kernel, allocates gpu mem.
 
@@ -97,8 +111,7 @@ def ready_argument_list(self, arguments):
         for arg in arguments:
             # if arg i is a numpy array copy to device
             if isinstance(arg, np.ndarray):
-                alloc = cp.array(arg)
-                self.allocations.append(alloc)
+                alloc = self.allocate_ndarray(arg)
                 gpu_args.append(alloc)
             # if not a numpy array, just pass argument along
             else:
@@ -124,6 +137,7 @@ def compile(self, kernel_instance):
         compiler_options = self.compiler_options
         if not any(["-std=" in opt for opt in self.compiler_options]):
             compiler_options = ["--std=c++11"] + self.compiler_options
+        # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/main/cupy/cuda/compiler.py#L145
 
         options = tuple(compiler_options)
 
@@ -132,6 +146,7 @@ def compile(self, kernel_instance):
         )
 
         self.func = self.current_module.get_function(kernel_name)
+        self.num_regs = self.func.num_regs
         return self.func
 
     def start_event(self):
@@ -197,6 +212,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
             of the grid
         :type grid: tuple(int, int)
         """
+        if stream is None:
+            stream = self.stream
         func(grid, threads, gpu_args, stream=stream, shared_mem=self.smem_size)
 
     def memset(self, allocation, value, size):

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
@@ -59,6 +59,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
 
         self.name = self.hipProps._name.decode('utf-8')
         self.max_threads = self.hipProps.maxThreadsPerBlock
+        self.cache_size_L2 = int(self.hipProps.l2CacheSize)
         self.device = device
         self.compiler_options = compiler_options or []
         self.iterations = iterations
@@ -85,6 +86,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         for obs in self.observers:
             obs.register_device(self)
 
+    def allocate_ndarray(self, array):
+        return hip.hipMalloc(array.nbytes)
-    def allocate_ndarray(self, array):
-        return hip.hipMalloc(array.nbytes)
+    def allocate_ndarray(self, array):
+        alloc = hip.hipMalloc(array.nbytes)
+        self.allocations.append(alloc)
+        return alloc
-    def allocate_ndarray(self, array):
-        return hip.hipMalloc(array.nbytes)
+    def allocate_ndarray(self, array):
+        """Allocate array.nbytes bytes with hip.hipMalloc, track the allocation and return it (no data is copied here)."""
+        device_ptr = hip.hipMalloc(array.nbytes)
+        self.allocations.append(device_ptr)
+        return device_ptr
+
+    def free_mem(self, pointer):
+        """Not supported by this backend: always raises NotImplementedError."""
+        raise NotImplementedError("PyHIP currently does not have a free function")
 
     def ready_argument_list(self, arguments):
         """Ready argument list to be passed to the HIP function.
@@ -106,7 +112,7 @@ def ready_argument_list(self, arguments):
             # Allocate space on device for array and convert to ctypes
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():
-                    device_ptr = hip.hipMalloc(arg.nbytes)
+                    device_ptr = self.allocate_ndarray(arg)
                     data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str]))
                     hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes)
                     # may be part of run_kernel, return allocations here instead

diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
@@ -1,9 +1,11 @@
 """This module contains all NVIDIA cuda-python specific kernel_tuner functions."""
+from warnings import warn
+
 import numpy as np
 
 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
-from kernel_tuner.util import SkippableFailure, cuda_error_check
+from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc
 
 # embedded in try block to be able to generate documentation
 # and run tests without cuda-python installed
@@ -66,6 +68,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
         )
         cuda_error_check(err)
+        err, self.cache_size_L2 = cudart.cudaDeviceGetAttribute(
+            cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device
+        )
+        cuda_error_check(err)
+        self.cache_size_L2 = int(self.cache_size_L2)
         self.cc = f"{major}{minor}"
         self.iterations = iterations
         self.current_module = None
@@ -107,9 +114,19 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
 
     def __del__(self):
-        for device_memory in self.allocations:
-            if isinstance(device_memory, cuda.CUdeviceptr):
-                err = cuda.cuMemFree(device_memory)
-                cuda_error_check(err)
+        """Free all device pointers that are still tracked in self.allocations."""
+        # iterate over a copy: free_mem removes each pointer from self.allocations,
+        # and mutating the list while looping over it would skip every other entry
+        for device_memory in list(self.allocations):
+            # skip anything that is not a device pointer, as free_mem asserts on the type
+            if isinstance(device_memory, cuda.CUdeviceptr):
+                self.free_mem(device_memory)
+
+    def allocate_ndarray(self, array):
+        """Allocate array.nbytes bytes of device memory, track the pointer and return it (no data is copied here)."""
+        status, device_ptr = cuda.cuMemAlloc(array.nbytes)
+        cuda_error_check(status)
+        self.allocations.append(device_ptr)
+        return device_ptr
+
+    def free_mem(self, pointer):
+        """Stop tracking the given cuda.CUdeviceptr and free it with cuda.cuMemFree."""
+        assert isinstance(pointer, cuda.CUdeviceptr)
+        # untrack first; list.remove raises ValueError if the pointer is not in self.allocations
+        self.allocations.remove(pointer)
+        err = cuda.cuMemFree(pointer)
+        cuda_error_check(err)
 
     def ready_argument_list(self, arguments):
         """Ready argument list to be passed to the kernel, allocates gpu mem.
@@ -126,9 +143,7 @@ def ready_argument_list(self, arguments):
         for arg in arguments:
             # if arg is a numpy array copy it to device
             if isinstance(arg, np.ndarray):
-                err, device_memory = cuda.cuMemAlloc(arg.nbytes)
-                cuda_error_check(err)
-                self.allocations.append(device_memory)
+                device_memory = self.allocate_ndarray(arg)
                 gpu_args.append(device_memory)
                 self.memcpy_htod(device_memory, arg)
             # if not array, just pass along
@@ -161,12 +176,12 @@ def compile(self, kernel_instance):
             compiler_options.append(b"--std=c++11")
         if not any(["--std=" in opt for opt in self.compiler_options]):
             self.compiler_options.append("--std=c++11")
-        if not any([b"--gpu-architecture=" in opt for opt in compiler_options]):
+        if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
             compiler_options.append(
-                f"--gpu-architecture=compute_{self.cc}".encode("UTF-8")
+                f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
             )
-        if not any(["--gpu-architecture=" in opt for opt in self.compiler_options]):
-            self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}")
+        if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
+            self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
 
         err, program = nvrtc.nvrtcCreateProgram(
             str.encode(kernel_string), b"CUDAProgram", 0, [], []
@@ -192,6 +207,11 @@ def compile(self, kernel_instance):
             )
             cuda_error_check(err)
 
+            # get the number of registers per thread used in this kernel
+            num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func)
+            assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccesful: code {num_regs[0]}"
+            self.num_regs = num_regs[1]
+
         except RuntimeError as re:
             _, n = nvrtc.nvrtcGetProgramLogSize(program)
             log = b" " * n
@@ -273,6 +293,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
             of the grid
         :type grid: tuple(int, int)
         """
+        if stream is None:
+            stream = self.stream
         arg_types = list()
         for arg in gpu_args:
             if isinstance(arg, cuda.CUdeviceptr):
@@ -309,7 +331,7 @@ def memset(allocation, value, size):
         :type size: int
 
         """
-        err = cudart.cudaMemset(allocation, value, size)
+        err = cudart.cudaMemset(allocation.__init__(), value, size)
         cuda_error_check(err)
 
     @staticmethod

diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py
@@ -45,6 +45,10 @@ def __init__(
         self.max_threads = self.ctx.devices[0].get_info(
             cl.device_info.MAX_WORK_GROUP_SIZE
         )
+        # TODO the L2 cache size request fails
+        # self.cache_size_L2 = self.ctx.devices[0].get_info(
+        #     cl.device_affinity_domain.L2_CACHE
+        # )
         self.compiler_options = compiler_options or []
 
         # observer stuff
@@ -68,6 +72,13 @@ def __init__(
         self.env = env
         self.name = dev.name
 
+    def allocate_ndarray(self, array):
+        """Create a read-write cl.Buffer in self.ctx that is initialized from the host array, and return it."""
+        buffer_flags = self.mf.READ_WRITE | self.mf.COPY_HOST_PTR
+        return cl.Buffer(self.ctx, buffer_flags, hostbuf=array)
+
+    def free_mem(self, pointer):
+        """Release the given cl.Buffer by calling its release() method."""
+        assert isinstance(pointer, cl.Buffer)
+        pointer.release()
+
     def ready_argument_list(self, arguments):
         """Ready argument list to be passed to the kernel, allocates gpu mem.
 
@@ -83,13 +94,7 @@ def ready_argument_list(self, arguments):
         for arg in arguments:
             # if arg i is a numpy array copy to device
             if isinstance(arg, np.ndarray):
-                gpu_args.append(
-                    cl.Buffer(
-                        self.ctx,
-                        self.mf.READ_WRITE | self.mf.COPY_HOST_PTR,
-                        hostbuf=arg,
-                    )
-                )
+                gpu_args.append(self.allocate_ndarray(arg))
             # if not an array, just pass argument along
             else:
                 gpu_args.append(arg)

diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
@@ -101,6 +101,7 @@ def _finish_up():
             str(k): v for (k, v) in self.context.get_device().get_attributes().items()
         }
         self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
+        self.cache_size_L2 = int(devprops["L2_CACHE_SIZE"])
         cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
             devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
         )
@@ -151,7 +152,17 @@ def __del__(self):
         for gpu_mem in self.allocations:
             # if needed for when using mocks during testing
             if hasattr(gpu_mem, "free"):
-                gpu_mem.free()
+                self.free_mem(gpu_mem)
+
+    def allocate_ndarray(self, array):
+        """Allocate array.nbytes bytes with drv.mem_alloc, track the allocation and return it (no data is copied here)."""
+        device_alloc = drv.mem_alloc(array.nbytes)
+        self.allocations.append(device_alloc)
+        return device_alloc
+
+    def free_mem(self, pointer):
+        """Stop tracking the given allocation and release it by calling its free() method."""
+        assert hasattr(pointer, "free")
+        # untrack first; list.remove raises ValueError if the pointer is not in self.allocations
+        self.allocations.remove(pointer)
+        pointer.free()
 
     def ready_argument_list(self, arguments):
         """Ready argument list to be passed to the kernel, allocates gpu mem.
@@ -168,8 +179,7 @@ def ready_argument_list(self, arguments):
         for arg in arguments:
             # if arg i is a numpy array copy to device
             if isinstance(arg, np.ndarray):
-                alloc = drv.mem_alloc(arg.nbytes)
-                self.allocations.append(alloc)
+                alloc = self.allocate_ndarray(arg)
                 gpu_args.append(alloc)
                 drv.memcpy_htod(gpu_args[-1], arg)
             elif isinstance(arg, torch.Tensor):
@@ -218,6 +228,8 @@ def compile(self, kernel_instance):
             )
 
             self.func = self.current_module.get_function(kernel_name)
+            if not isinstance(self.func, str):
+                self.num_regs = self.func.num_regs
             return self.func
         except drv.CompileError as e:
             if "uses too much shared data" in e.stderr: