From 81a68a418c5699ccef7e442af0588501f2f9e560 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 8 Feb 2024 16:57:19 +0100
Subject: [PATCH 01/30] Added RegisterObserver with common interface among
 backends

---
 kernel_tuner/backends/cupy.py      |  1 +
 kernel_tuner/backends/nvcuda.py    |  5 +++++
 kernel_tuner/backends/pycuda.py    |  1 +
 kernel_tuner/observers/register.py | 12 ++++++++++++
 4 files changed, 19 insertions(+)
 create mode 100644 kernel_tuner/observers/register.py

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index a1e13ff03..19cb55a93 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -132,6 +132,7 @@ def compile(self, kernel_instance):
         )

         self.func = self.current_module.get_function(kernel_name)
+        self.num_regs = self.func.num_regs
         return self.func

     def start_event(self):
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index c6fb73d5e..70bb637c5 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -192,6 +192,11 @@ def compile(self, kernel_instance):
             )
             cuda_error_check(err)

+            # get the number of registers per thread used in this kernel
+            num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func)
+            assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccessful: code {num_regs[0]}"
+            self.num_regs = num_regs[1]
+
         except RuntimeError as re:
             _, n = nvrtc.nvrtcGetProgramLogSize(program)
             log = b" " * n
diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
index 3c168f824..3111d1a71 100644
--- a/kernel_tuner/backends/pycuda.py
+++ b/kernel_tuner/backends/pycuda.py
@@ -218,6 +218,7 @@ def compile(self, kernel_instance):
             )

             self.func = self.current_module.get_function(kernel_name)
+            self.num_regs = self.func.num_regs
             return self.func
         except drv.CompileError as e:
             if "uses too much shared data" in e.stderr:
diff --git a/kernel_tuner/observers/register.py b/kernel_tuner/observers/register.py
new file mode 100644
index 000000000..b1310db2f
--- /dev/null
+++ b/kernel_tuner/observers/register.py
@@ -0,0 +1,12 @@
+from kernel_tuner.observers.observer import BenchmarkObserver
+
+class RegisterObserver(BenchmarkObserver):
+    """Observer for counting the number of registers."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def get_results(self):
+        return {
+            "num_regs": self.dev.num_regs
+        }
\ No newline at end of file
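For reference, a minimal usage sketch of the new RegisterObserver. The vector-add kernel, sizes, and tunable parameters below are illustrative assumptions and not part of the patch; only the observer import, the observers= argument, and the "num_regs" result key come from the code above.

    import numpy as np
    import kernel_tuner
    from kernel_tuner.observers.register import RegisterObserver

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            c[i] = a[i] + b[i];
        }
    }
    """
    size = 1000000
    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(a)
    args = [c, a, b, np.int32(size)]

    # the observer adds the per-thread register count to every measured configuration
    results, env = kernel_tuner.tune_kernel(
        "vector_add", kernel_string, size, args, {"block_size_x": [128, 256, 512]},
        observers=[RegisterObserver()], lang="CUDA")
    print(results[0]["num_regs"])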
From 943b3c470c72bde8141ee40a5c4647aad0050280 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 8 Feb 2024 17:24:36 +0100
Subject: [PATCH 02/30] Added test for RegisterObserver, added clause in case
 of mock test

---
 kernel_tuner/backends/pycuda.py |  3 ++-
 test/test_observers.py         | 26 +++++++++++++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
index 3111d1a71..7fddc9393 100644
--- a/kernel_tuner/backends/pycuda.py
+++ b/kernel_tuner/backends/pycuda.py
@@ -218,7 +218,8 @@ def compile(self, kernel_instance):
             )

             self.func = self.current_module.get_function(kernel_name)
-            self.num_regs = self.func.num_regs
+            if not isinstance(self.func, str):
+                self.num_regs = self.func.num_regs
             return self.func
         except drv.CompileError as e:
             if "uses too much shared data" in e.stderr:
diff --git a/test/test_observers.py b/test/test_observers.py
index d881fed74..1765be334 100644
--- a/test/test_observers.py
+++ b/test/test_observers.py
@@ -2,9 +2,10 @@

 import kernel_tuner
 from kernel_tuner.observers.nvml import NVMLObserver
+from kernel_tuner.observers.register import RegisterObserver
 from kernel_tuner.observers.observer import BenchmarkObserver

-from .context import skip_if_no_pycuda, skip_if_no_pynvml
+from .context import skip_if_no_pycuda, skip_if_no_pynvml, skip_if_no_cupy, skip_if_no_cuda
 from .test_runners import env  # noqa: F401

@@ -20,6 +21,29 @@ def test_nvml_observer(env):
     assert "temperature" in result[0]
     assert result[0]["temperature"] > 0

+@skip_if_no_pycuda
+def test_register_observer_pycuda(env):
+    registerobserver = RegisterObserver()
+    env[-1]["block_size_x"] = [128]
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='CUDA')
+    assert "num_regs" in result[0]
+    assert result[0]["num_regs"] > 0
+
+@skip_if_no_cupy
+def test_register_observer_cupy(env):
+    registerobserver = RegisterObserver()
+    env[-1]["block_size_x"] = [128]
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='CuPy')
+    assert "num_regs" in result[0]
+    assert result[0]["num_regs"] > 0
+
+@skip_if_no_cuda
+def test_register_observer_nvcuda(env):
+    registerobserver = RegisterObserver()
+    env[-1]["block_size_x"] = [128]
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='NVCUDA')
+    assert "num_regs" in result[0]
+    assert result[0]["num_regs"] > 0

 @skip_if_no_pycuda
 def test_custom_observer(env):

From 1681730399f91d2143c0f7630b0c8e87a5438b42 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Fri, 9 Feb 2024 11:43:26 +0100
Subject: [PATCH 03/30] Added useful error message in case Register Observer
 is not supported

---
 kernel_tuner/observers/register.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel_tuner/observers/register.py b/kernel_tuner/observers/register.py
index b1310db2f..92f22ffd8 100644
--- a/kernel_tuner/observers/register.py
+++ b/kernel_tuner/observers/register.py
@@ -7,6 +7,10 @@ def __init__(self) -> None:
         super().__init__()

     def get_results(self):
+        try:
+            registers_per_thread = self.dev.num_regs
+        except AttributeError:
+            raise NotImplementedError(f"Backend '{type(self.dev).__name__}' does not support reporting the number of registers per thread")
         return {
-            "num_regs": self.dev.num_regs
+            "num_regs": registers_per_thread
         }
\ No newline at end of file

From f153945b0f2c6a7a55d2dde4fa3f6ad1a8ce310e Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Fri, 9 Feb 2024 11:44:11 +0100
Subject: [PATCH 04/30] Added tests for Register Observer for OpenCL and HIP
 backends

---
 test/test_observers.py | 57 ++++++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/test/test_observers.py b/test/test_observers.py
index 1765be334..c1cc460a9 100644
--- a/test/test_observers.py
+++ b/test/test_observers.py
@@ -1,12 +1,14 @@
-
-
 import kernel_tuner
 from kernel_tuner.observers.nvml import NVMLObserver
 from kernel_tuner.observers.register import RegisterObserver
 from kernel_tuner.observers.observer import BenchmarkObserver

-from .context import skip_if_no_pycuda, skip_if_no_pynvml, skip_if_no_cupy, skip_if_no_cuda
+from .context import skip_if_no_pycuda, skip_if_no_pynvml, skip_if_no_cupy, skip_if_no_cuda, skip_if_no_opencl, skip_if_no_pyhip
 from .test_runners import env  # noqa: F401
+from .test_opencl_functions import env as env_opencl  # noqa: F401
+from .test_hip_functions import env as env_hip  # noqa: F401
+
+from pytest import raises

@@ -22,39 +24,46 @@ def test_nvml_observer(env):
     assert "temperature" in result[0]
     assert result[0]["temperature"] > 0

 @skip_if_no_pycuda
-def test_register_observer_pycuda(env):
-    registerobserver = RegisterObserver()
+def test_custom_observer(env):
     env[-1]["block_size_x"] = [128]
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='CUDA')
+
+    class MyObserver(BenchmarkObserver):
+        def get_results(self):
+            return {"name": self.dev.name}
+
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[MyObserver()])
+
+    assert "name" in result[0]
+    assert len(result[0]["name"]) > 0
+
+@skip_if_no_pycuda
+def test_register_observer_pycuda(env):
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CUDA')
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0

 @skip_if_no_cupy
 def test_register_observer_cupy(env):
-    registerobserver = RegisterObserver()
-    env[-1]["block_size_x"] = [128]
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='CuPy')
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CuPy')
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0

 @skip_if_no_cuda
 def test_register_observer_nvcuda(env):
-    registerobserver = RegisterObserver()
-    env[-1]["block_size_x"] = [128]
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='NVCUDA')
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='NVCUDA')
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0

-@skip_if_no_pycuda
-def test_custom_observer(env):
-    env[-1]["block_size_x"] = [128]
-
-    class MyObserver(BenchmarkObserver):
-        def get_results(self):
-            return {"name": self.dev.name}
-
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[MyObserver()])
-
-    assert "name" in result[0]
-    assert len(result[0]["name"]) > 0
+@skip_if_no_opencl
+def test_register_observer_opencl(env_opencl):
+    with raises(NotImplementedError) as err:
+        kernel_tuner.tune_kernel(*env_opencl, observers=[RegisterObserver()], lang='OpenCL')
+    assert err.errisinstance(NotImplementedError)
+    assert "OpenCL" in str(err.value)
+
+@skip_if_no_pyhip
+def test_register_observer_hip(env_hip):
+    with raises(NotImplementedError) as err:
+        kernel_tuner.tune_kernel(*env_hip, observers=[RegisterObserver()], lang='HIP')
+    assert err.errisinstance(NotImplementedError)
+    assert "Hip" in str(err.value)

From 7bd7c2b7ac67b35add3060a42e0a069980f2f871 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Mon, 12 Feb 2024 12:07:34 +0100
Subject: [PATCH 05/30] Added instruction for pointing cache directory
 elsewhere

---
 CONTRIBUTING.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 65f7140ce..793a066cb 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -85,7 +85,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster):
         - /path/to/directory
     * [Optional] both Mamba and Miniconda can be automatically activated via :bash:`~/.bashrc`. Do not forget to add these (usually provided at the end of the installation).
     * Exit the shell and re-enter to make sure Conda is available. :bash:`cd` to the kernel tuner directory.
-    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`.
+    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linux, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable.
     * [Optional] update Conda if available before continuing: :bash:`conda update -n base -c conda-forge conda`.
 #. Setup a virtual environment: :bash:`conda create --name kerneltuner python=3.11` (or whatever Python version and environment name you prefer).
 #. Activate the virtual environment: :bash:`conda activate kerneltuner`.
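A short sketch of the two cache variables in practice, following the CONTRIBUTING.rst :bash: convention (the paths are illustrative placeholders):

    export PIP_CACHE_DIR=/scratch/$USER/pip-cache
    export XDG_CACHE_HOME=/scratch/$USER/.cache
    pip cache dir   # should now print the redirected location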
From 9dea13795e1ccdc0d150ba75bb11c4966057df80 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Mon, 12 Feb 2024 14:50:30 +0100
Subject: [PATCH 06/30] Non-argument streams are now correctly passed in the
 CuPy and NVCUDA backends

---
 kernel_tuner/backends/cupy.py   | 2 ++
 kernel_tuner/backends/nvcuda.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index 19cb55a93..2e494a14f 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -198,6 +198,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
             of the grid
         :type grid: tuple(int, int)
         """
+        if stream is None:
+            stream = self.stream
         func(grid, threads, gpu_args, stream=stream, shared_mem=self.smem_size)

     def memset(self, allocation, value, size):
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 70bb637c5..4884a3a6d 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -278,6 +278,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
             of the grid
         :type grid: tuple(int, int)
         """
+        if stream is None:
+            stream = self.stream
         arg_types = list()
         for arg in gpu_args:
             if isinstance(arg, cuda.CUdeviceptr):

From df5414541d970472852ebe9cfb6c117e1ef27f94 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 15 Feb 2024 16:18:09 +0100
Subject: [PATCH 07/30] Fixed several issues pertaining to the setting of
 clocks, in particular one where the memory clock would always be seen as
 not-equal due to a rounding error

---
 kernel_tuner/observers/nvml.py | 44 +++++++++++++---------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py
index d33327a3c..200d98992 100644
--- a/kernel_tuner/observers/nvml.py
+++ b/kernel_tuner/observers/nvml.py
@@ -135,6 +135,8 @@ def persistence_mode(self, new_mode):
             raise ValueError(
                 "Illegal value for persistence mode, should be either 0 or 1"
             )
+        if self.persistence_mode == new_mode:
+            return
         try:
             pynvml.nvmlDeviceSetPersistenceMode(self.dev, new_mode)
             self._persistence_mode = pynvml.nvmlDeviceGetPersistenceMode(self.dev)
@@ -168,21 +170,15 @@ def set_clocks(self, mem_clock, gr_clock):
                 self.nvidia_smi,
                 "-i",
                 str(self.id),
-                "--lock-gpu-clocks=" + str(gr_clock) + "," + str(gr_clock),
             ]
-            subprocess.run(args, check=True)
-            args = [
-                "sudo",
-                self.nvidia_smi,
-                "-i",
-                str(self.id),
-                "--lock-memory-clocks=" + str(mem_clock) + "," + str(mem_clock),
-            ]
-            subprocess.run(args, check=True)
+            command_set_mem_clocks = f"--lock-memory-clocks={str(mem_clock)},{str(mem_clock)}"
+            command_set_gpu_clocks = f"--lock-gpu-clocks={str(gr_clock)},{str(gr_clock)}"
+            subprocess.run(args + [command_set_gpu_clocks], check=True)
+            subprocess.run(args + [command_set_mem_clocks], check=True)
         else:
             try:
-                if self.persistence_mode != 0:
-                    self.persistence_mode = 0
+                if self.persistence_mode != 1:
+                    self.persistence_mode = 1
             except Exception:
                 pass
             try:
@@ -233,24 +229,20 @@ def reset_clocks(self):
         if (
             gr_app_clock != self.gr_clock_default or
             mem_app_clock != self.mem_clock_default
-        ):
+            ):
             self.set_clocks(self.mem_clock_default, self.gr_clock_default)

     @property
     def gr_clock(self):
         """Control the graphics clock (may require permission), only values compatible with the memory clock can be set directly."""
-        return pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_GRAPHICS)
+        if self.use_locked_clocks:
+            return pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_GRAPHICS)
+        else:
+            return pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_GRAPHICS)

     @gr_clock.setter
     def gr_clock(self, new_clock):
-        cur_clock = (
-            pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_GRAPHICS)
-            if self.use_locked_clocks
-            else pynvml.nvmlDeviceGetApplicationsClock(
-                self.dev, pynvml.NVML_CLOCK_GRAPHICS
-            )
-        )
-        if new_clock != cur_clock:
+        if new_clock != self.gr_clock:
             self.set_clocks(self.mem_clock, new_clock)

     @property
@@ -268,12 +260,8 @@ def mem_clock(self):

     @mem_clock.setter
     def mem_clock(self, new_clock):
-        cur_clock = (
-            pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_MEM)
-            if self.use_locked_clocks
-            else pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_MEM)
-        )
-        if new_clock != cur_clock:
+        if new_clock != self.mem_clock:
+            print(f"mem_clock setter calls set_clocks because {new_clock=} != {cur_clock=}")
             self.set_clocks(new_clock, self.gr_clock)

     @property
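These NVML clock controls are driven from a tuning script through the special tunable parameters handled in core.py (see the next patch). A hedged sketch, reusing the kernel setup from the sketch after PATCH 01; the clock frequencies in MHz are illustrative and device-specific:

    from kernel_tuner.observers.nvml import NVMLObserver

    tune_params = {"block_size_x": [128, 256]}
    tune_params["nvml_gr_clock"] = [1350, 1500]  # graphics clocks to lock to
    tune_params["nvml_mem_clock"] = [9501]       # memory clock to lock to
    results, env = kernel_tuner.tune_kernel(
        "vector_add", kernel_string, size, args, tune_params,
        observers=[NVMLObserver()], lang="CUDA")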
From 4cc4a1399b2f48fc6cdbc65e9ede021365f91aec Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 15 Feb 2024 16:51:51 +0100
Subject: [PATCH 08/30] Time spent setting NVML parameters (clock & memory
 frequency, power) goes to framework time instead of benchmark time

---
 kernel_tuner/core.py               | 23 ++++++++++++++-------
 kernel_tuner/runners/sequential.py |  2 +-
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 174cd3af5..1cd39b6a3 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -391,12 +391,8 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
         for obs in self.continuous_observers:
             result.update(obs.get_results())

-    def benchmark(self, func, gpu_args, instance, verbose, objective):
-        """benchmark the kernel instance"""
-        logging.debug("benchmark " + instance.name)
-        logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
-        logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
-
+    def set_nvml_parameters(self, instance):
+        """Set the NVML parameters. Setting these outside the benchmark itself avoids the setup time leaking into the benchmark time."""
         if self.use_nvml:
             if "nvml_pwr_limit" in instance.params:
                 new_limit = int(
@@ -409,6 +405,15 @@ def benchmark(self, func, gpu_args, instance, verbose, objective):
             if "nvml_mem_clock" in instance.params:
                 self.nvml.mem_clock = instance.params["nvml_mem_clock"]

+    def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
+        """Benchmark the kernel instance."""
+        logging.debug("benchmark " + instance.name)
+        logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
+        logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
+
+        if self.use_nvml and not skip_nvml_setting:
+            self.set_nvml_parameters(instance)
+
         # Call the observers to register the configuration to be benchmarked
         for obs in self.dev.observers:
             obs.register_configuration(instance.params)
@@ -577,11 +582,15 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options,

             # benchmark
             if func:
+                # setting the NVML parameters here keeps this time out of the benchmark time; it ends up in framework time instead
+                if self.use_nvml:
+                    self.set_nvml_parameters(instance)
                 start_benchmark = time.perf_counter()
                 result.update(
-                    self.benchmark(func, gpu_args, instance, verbose, to.objective)
+                    self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=True)
                 )
                 last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
+                print(f"Benchmark time: {last_benchmark_time}")

         except Exception as e:
             # dump kernel sources to temp file
diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py
index c493a0089..23d9dc2ba 100644
--- a/kernel_tuner/runners/sequential.py
+++ b/kernel_tuner/runners/sequential.py
@@ -100,7 +100,7 @@ def run(self, parameter_space, tuning_options):
             params = process_metrics(params, tuning_options.metrics)

             # get the framework time by estimating based on other times
-            total_time = 1000 * (perf_counter() - self.start_time) - warmup_time
+            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)  # TODO is it valid that we deduct the warmup time here?
             params['strategy_time'] = self.last_strategy_time
             params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0)
             params['timestamp'] = str(datetime.now(timezone.utc))
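The effect of this change is visible in the per-configuration time breakdown that the sequential runner attaches to each result. A sketch of inspecting it, using the field names from kernel_tuner/runners/sequential.py above and a results list as returned by tune_kernel:

    for config in results:
        overhead = config["framework_time"] + config["strategy_time"]
        print(config["timestamp"], config["benchmark_time"], config["compile_time"], overhead)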
From e309bc1de7bcbf0291b099a3472de1c62099973d Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 15 Feb 2024 16:59:53 +0100
Subject: [PATCH 09/30] Time spent setting NVML parameters (clock & memory
 frequency, power) goes to framework time instead of benchmark time

---
 kernel_tuner/core.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 1cd39b6a3..18d3245d6 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -590,7 +590,6 @@
                     self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=True)
                 )
                 last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
-                print(f"Benchmark time: {last_benchmark_time}")

         except Exception as e:
             # dump kernel sources to temp file

From d6aac8b443d64fe6d4071a8ee19ca471c8a57308 Mon Sep 17 00:00:00 2001
From: Floris-Jan Willemsen
Date: Thu, 15 Feb 2024 17:08:51 +0100
Subject: [PATCH 10/30] Removed redundant print statement

---
 kernel_tuner/observers/nvml.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py
index 200d98992..0bd9adc84 100644
--- a/kernel_tuner/observers/nvml.py
+++ b/kernel_tuner/observers/nvml.py
@@ -261,7 +261,6 @@ def mem_clock(self):
     @mem_clock.setter
     def mem_clock(self, new_clock):
         if new_clock != self.mem_clock:
-            print(f"mem_clock setter calls set_clocks because {new_clock=} != {cur_clock=}")
             self.set_clocks(new_clock, self.gr_clock)

     @property

From a020791a555912ce98a136c652d6f343fc95ee1e Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Wed, 28 Feb 2024 10:13:25 +0100
Subject: [PATCH 11/30] Added L2 cache size property to CUDA backends

---
 kernel_tuner/backends/cupy.py   | 1 +
 kernel_tuner/backends/nvcuda.py | 4 ++++
 kernel_tuner/backends/pycuda.py | 1 +
 3 files changed, 6 insertions(+)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index 2e494a14f..e6d43cebf 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -46,6 +46,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.devprops = dev.attributes
         self.cc = dev.compute_capability
         self.max_threads = self.devprops["MaxThreadsPerBlock"]
+        self.cache_size_L2 = self.devprops["L2CacheSize"]
         self.iterations = iterations
         self.current_module = None
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 4884a3a6d..068cf453d 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -66,6 +66,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
         )
         cuda_error_check(err)
+        err, self.cache_size_L2 = cudart.cudaDeviceGetAttribute(
+            cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device
+        )
+        cuda_error_check(err)
         self.cc = f"{major}{minor}"
         self.iterations = iterations
         self.current_module = None
diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
index 7fddc9393..659f51594 100644
--- a/kernel_tuner/backends/pycuda.py
+++ b/kernel_tuner/backends/pycuda.py
@@ -101,6 +101,7 @@ def _finish_up():
             str(k): v for (k, v) in
self.context.get_device().get_attributes().items() } self.max_threads = devprops["MAX_THREADS_PER_BLOCK"] + self.cache_size_L2 = devprops["L2_CACHE_SIZE"] cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str( devprops.get("COMPUTE_CAPABILITY_MINOR", "0") ) From 6e6e5fb9bfeac34a87af19a1ea7c8f7e0e21d11a Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Wed, 28 Feb 2024 10:18:45 +0100 Subject: [PATCH 12/30] Added specification to CUPY compiler options --- kernel_tuner/backends/cupy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index e6d43cebf..9a817a1c6 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -125,6 +125,10 @@ def compile(self, kernel_instance): compiler_options = self.compiler_options if not any(["-std=" in opt for opt in self.compiler_options]): compiler_options = ["--std=c++11"] + self.compiler_options + if not any([b"--gpu-architecture=" in opt for opt in compiler_options]): + compiler_options.append( + f"--gpu-architecture=compute_{self.cc}".encode("UTF-8") + ) options = tuple(compiler_options) From f15338faf2c771078c01393254740c24dd99b5a9 Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Wed, 28 Feb 2024 11:50:04 +0100 Subject: [PATCH 13/30] Added L2 cache size property to OpenCL, HIP and mocked PyCUDA backends --- kernel_tuner/backends/hip.py | 1 + kernel_tuner/backends/opencl.py | 4 ++++ test/test_pycuda_mocked.py | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 1db4cb302..dc0ffb1cb 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -59,6 +59,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.name = self.hipProps._name.decode('utf-8') self.max_threads = self.hipProps.maxThreadsPerBlock + self.cache_size_L2 = self.hipProps.l2CacheSize self.device = device self.compiler_options = compiler_options or [] self.iterations = iterations diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py index af3be1c00..4946d804f 100644 --- a/kernel_tuner/backends/opencl.py +++ b/kernel_tuner/backends/opencl.py @@ -45,6 +45,10 @@ def __init__( self.max_threads = self.ctx.devices[0].get_info( cl.device_info.MAX_WORK_GROUP_SIZE ) + # TODO the L2 cache size request fails + # self.cache_size_L2 = self.ctx.devices[0].get_info( + # cl.device_affinity_domain.L2_CACHE + # ) self.compiler_options = compiler_options or [] # observer stuff diff --git a/test/test_pycuda_mocked.py b/test/test_pycuda_mocked.py index 21f352a3f..e47fc8e8e 100644 --- a/test/test_pycuda_mocked.py +++ b/test/test_pycuda_mocked.py @@ -13,7 +13,8 @@ def setup_mock(drv): context = Mock() devprops = {'MAX_THREADS_PER_BLOCK': 1024, 'COMPUTE_CAPABILITY_MAJOR': 5, - 'COMPUTE_CAPABILITY_MINOR': 5} + 'COMPUTE_CAPABILITY_MINOR': 5, + 'L2_CACHE_SIZE': 4096} context.return_value.get_device.return_value.get_attributes.return_value = devprops context.return_value.get_device.return_value.compute_capability.return_value = "55" drv.Device.return_value.retain_primary_context.return_value = context() From 00ac419643f237fd39a21601e45289332b70132b Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Wed, 28 Feb 2024 12:54:04 +0100 Subject: [PATCH 14/30] Added function to check for compute capability validity, improved check on gpu-architecture compiler option, added gpu-architecture auto-adding to CuPy --- kernel_tuner/backends/cupy.py | 11 +++++++---- 
kernel_tuner/backends/nvcuda.py | 19 ++++++++++++------- kernel_tuner/util.py | 6 ++++++ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index 9a817a1c6..9b06ddb20 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -1,10 +1,12 @@ """This module contains all Cupy specific kernel_tuner functions.""" from __future__ import print_function +from warnings import warn import numpy as np from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.cupy import CupyRuntimeObserver +from kernel_tuner.util import is_valid_nvrtc_gpu_arch_cc # embedded in try block to be able to generate documentation # and run tests without cupy installed @@ -125,10 +127,11 @@ def compile(self, kernel_instance): compiler_options = self.compiler_options if not any(["-std=" in opt for opt in self.compiler_options]): compiler_options = ["--std=c++11"] + self.compiler_options - if not any([b"--gpu-architecture=" in opt for opt in compiler_options]): - compiler_options.append( - f"--gpu-architecture=compute_{self.cc}".encode("UTF-8") - ) + if is_valid_nvrtc_gpu_arch_cc(self.cc): + if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in compiler_options]): + compiler_options.append(f"--gpu-architecture=compute_{self.cc}") + else: + warn(f"Could not add compiler option '--gpu-architecture=compute_{self.cc}' as {self.cc} is an invalid target") options = tuple(compiler_options) diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 068cf453d..9c3c37097 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -1,9 +1,11 @@ """This module contains all NVIDIA cuda-python specific kernel_tuner functions.""" +from warnings import warn + import numpy as np from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.nvcuda import CudaRuntimeObserver -from kernel_tuner.util import SkippableFailure, cuda_error_check +from kernel_tuner.util import SkippableFailure, cuda_error_check, is_valid_nvrtc_gpu_arch_cc # embedded in try block to be able to generate documentation # and run tests without cuda-python installed @@ -165,12 +167,15 @@ def compile(self, kernel_instance): compiler_options.append(b"--std=c++11") if not any(["--std=" in opt for opt in self.compiler_options]): self.compiler_options.append("--std=c++11") - if not any([b"--gpu-architecture=" in opt for opt in compiler_options]): - compiler_options.append( - f"--gpu-architecture=compute_{self.cc}".encode("UTF-8") - ) - if not any(["--gpu-architecture=" in opt for opt in self.compiler_options]): - self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}") + if is_valid_nvrtc_gpu_arch_cc(self.cc): + if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]): + compiler_options.append( + f"--gpu-architecture=compute_{self.cc}".encode("UTF-8") + ) + if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]): + self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}") + else: + warn(f"Could not add compiler option '--gpu-architecture=compute_{self.cc}' as {self.cc} is an invalid target") err, program = nvrtc.nvrtcCreateProgram( str.encode(kernel_string), b"CUDAProgram", 0, [], [] diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 6e9cdf5b0..d0adeb047 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -570,6 +570,12 @@ def get_total_timings(results, 
env, overhead_time): return env +def is_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> bool: + """Returns whether the Compute Capability is a valid argument for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.""" + valid_cc = ['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a'] + return str(compute_capability) in valid_cc + + def print_config(config, tuning_options, runner): """Print the configuration string with tunable parameters and benchmark results.""" print_config_output(tuning_options.tune_params, config, runner.quiet, tuning_options.metrics, runner.units) From 55ab07473d4cc12501bce74eae305e5f2573c707 Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Wed, 28 Feb 2024 12:55:17 +0100 Subject: [PATCH 15/30] Added a flush kernel to clear the L2 cache between runs --- kernel_tuner/core.py | 53 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 18d3245d6..ee831c836 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -340,14 +340,62 @@ def __init__( if not quiet: print("Using: " + self.dev.name) - def benchmark_default(self, func, gpu_args, threads, grid, result): - """Benchmark one kernel execution at a time""" + if lang.upper() not in ['OPENCL', 'C', 'FORTRAN']: + # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130 + flush_gpu_string = """ + __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const char val = d_flush[idx]; + if (do_write * val) { + d_flush2[idx] = val; + } + } + """ + cache_size = self.dev.cache_size_L2 + d_flush = np.ones((cache_size), order='F').astype(np.float32) + d_flush2 = np.ones((cache_size), order='F').astype(np.float32) + self.flush_kernel_gpu_args = [d_flush, d_flush2, np.int32(True)] + + from kernel_tuner.interface import Options + options = { + 'kernel_name': 'flush_gpu', + 'lang': 'CUDA', + 'arguments': self.flush_kernel_gpu_args, + 'problem_size': cache_size, + 'grid_div_x': None, + 'grid_div_y': None, + 'grid_div_z': None, + 'block_size_names': None, + } + options = Options(options) + flush_kernel_lang = lang.upper() if lang.upper() in ['CUDA', 'CUPY', 'NVCUDA'] else 'CUPY' + flush_kernel_source = KernelSource('flush_gpu', flush_gpu_string, flush_kernel_lang) + self.flush_kernel_instance = self.create_kernel_instance(flush_kernel_source, kernel_options=options, params=dict(), verbose=not quiet) + self.flush_kernel = self.compile_kernel(self.flush_kernel_instance, verbose=not quiet) + self.flush_kernel_gpu_args = self.ready_argument_list(self.flush_kernel_gpu_args) + + # from kernel_tuner.kernelbuilder import PythonKernel + # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args) + + def flush_cache(self): + """This special function can be called to flush the L2 cache.""" + if hasattr(self, 'flush_kernel'): + return + self.dev.synchronize() + assert self.run_kernel(self.flush_kernel, self.flush_kernel_gpu_args, self.flush_kernel_instance) + # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args) + self.dev.synchronize() + + def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True): + """Benchmark one kernel execution at a time. 
Run with `flush_cache=True` to avoid caching effects between iterations.""" observers = [ obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver) ] self.dev.synchronize() for _ in range(self.iterations): + if flush_cache: + self.flush_cache() for obs in observers: obs.before_start() self.dev.synchronize() @@ -1008,3 +1056,4 @@ def wrap_templated_kernel(kernel_string, kernel_name): new_kernel_string += wrapper_function return new_kernel_string, name + "_wrapper" + \ No newline at end of file From e106baeacbb4b12c0556e0cd90b6246d8f2f0704 Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Wed, 28 Feb 2024 12:55:54 +0100 Subject: [PATCH 16/30] Added a flush kernel to clear the L2 cache between runs --- kernel_tuner/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index ee831c836..cf775d308 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -1056,4 +1056,3 @@ def wrap_templated_kernel(kernel_string, kernel_name): new_kernel_string += wrapper_function return new_kernel_string, name + "_wrapper" - \ No newline at end of file From 0cb5e3a4a9acd4d3aecb19c1ad953d1b899c7d56 Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Thu, 29 Feb 2024 21:46:27 +0100 Subject: [PATCH 17/30] Made function for scaling the compute capability to a valid one, added tests for this function, removed setting --gpu-architecture for CuPy as it is already set internally --- kernel_tuner/backends/cupy.py | 7 +------ kernel_tuner/backends/nvcuda.py | 17 +++++++---------- kernel_tuner/util.py | 22 +++++++++++++++++----- test/test_util_functions.py | 13 +++++++++++++ 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index 9b06ddb20..f53663daa 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -6,7 +6,6 @@ from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.cupy import CupyRuntimeObserver -from kernel_tuner.util import is_valid_nvrtc_gpu_arch_cc # embedded in try block to be able to generate documentation # and run tests without cupy installed @@ -127,11 +126,7 @@ def compile(self, kernel_instance): compiler_options = self.compiler_options if not any(["-std=" in opt for opt in self.compiler_options]): compiler_options = ["--std=c++11"] + self.compiler_options - if is_valid_nvrtc_gpu_arch_cc(self.cc): - if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in compiler_options]): - compiler_options.append(f"--gpu-architecture=compute_{self.cc}") - else: - warn(f"Could not add compiler option '--gpu-architecture=compute_{self.cc}' as {self.cc} is an invalid target") + # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/20ccd63c0acc40969c851b1917dedeb032209e8b/cupy/cuda/compiler.py#L145 options = tuple(compiler_options) diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 9c3c37097..0a74f6d9f 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -5,7 +5,7 @@ from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.nvcuda import CudaRuntimeObserver -from kernel_tuner.util import SkippableFailure, cuda_error_check, is_valid_nvrtc_gpu_arch_cc +from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc # embedded in try block to be able to generate documentation # and run tests without cuda-python installed @@ -167,15 +167,12 @@ def compile(self, 
kernel_instance): compiler_options.append(b"--std=c++11") if not any(["--std=" in opt for opt in self.compiler_options]): self.compiler_options.append("--std=c++11") - if is_valid_nvrtc_gpu_arch_cc(self.cc): - if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]): - compiler_options.append( - f"--gpu-architecture=compute_{self.cc}".encode("UTF-8") - ) - if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]): - self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}") - else: - warn(f"Could not add compiler option '--gpu-architecture=compute_{self.cc}' as {self.cc} is an invalid target") + if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]): + compiler_options.append( + f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8") + ) + if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]): + self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}") err, program = nvrtc.nvrtcCreateProgram( str.encode(kernel_string), b"CUDAProgram", 0, [], [] diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index d0adeb047..77aa2607d 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -221,7 +221,7 @@ def check_block_size_names(block_size_names): if not isinstance(block_size_names, list): raise ValueError("block_size_names should be a list of strings!") if len(block_size_names) > 3: - raise ValueError("block_size_names should not contain more than 3 names!") + raise ValueError(f"block_size_names should not contain more than 3 names! ({block_size_names=})") if not all([isinstance(name, "".__class__) for name in block_size_names]): raise ValueError("block_size_names should contain only strings!") @@ -570,10 +570,22 @@ def get_total_timings(results, env, overhead_time): return env -def is_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> bool: - """Returns whether the Compute Capability is a valid argument for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.""" - valid_cc = ['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a'] - return str(compute_capability) in valid_cc +def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str: + """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.""" + valid_cc = ['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a'] # must be in ascending order, when updating also update test_to_valid_nvrtc_gpu_arch_cc + compute_capability = str(compute_capability) + if len(compute_capability) < 2: + raise ValueError(f"Compute capability '{compute_capability}' must be at least of length 2, is {len(compute_capability)}") + if compute_capability in valid_cc: + return compute_capability + # if the compute capability does not match, scale down to the nearest matching + subset_cc = [cc for cc in valid_cc if compute_capability[0] == cc[0]] + if len(subset_cc) > 0: + # get the next-highest valid CC + highest_cc_index = max([i for i, cc in enumerate(subset_cc) if int(cc[1]) <= int(compute_capability[1])]) + return subset_cc[highest_cc_index] + # if all else fails, return the default 52 + return '52' def print_config(config, tuning_options, runner): diff --git a/test/test_util_functions.py b/test/test_util_functions.py index c66964b0e..a202d5de3 100644 
--- a/test/test_util_functions.py
+++ b/test/test_util_functions.py
@@ -146,6 +146,19 @@ def test_get_thread_block_dimensions():
     assert threads[2] == 1


+def test_to_valid_nvrtc_gpu_arch_cc():
+    assert to_valid_nvrtc_gpu_arch_cc("89") == "89"
+    assert to_valid_nvrtc_gpu_arch_cc("88") == "87"
+    assert to_valid_nvrtc_gpu_arch_cc("86") == "80"
+    assert to_valid_nvrtc_gpu_arch_cc("40") == "52"
+    assert to_valid_nvrtc_gpu_arch_cc("90b") == "90a"
+    assert to_valid_nvrtc_gpu_arch_cc("91c") == "90a"
+    assert to_valid_nvrtc_gpu_arch_cc("10123001") == "52"
+    for invalid_cc in ("", "1"):
+        with pytest.raises(ValueError):
+            to_valid_nvrtc_gpu_arch_cc(invalid_cc)
+
+
 def test_prepare_kernel_string():
     kernel = "this is a weird kernel"
     grid = (3, 7)

From b6825060ce1dfa1efd4b10e1f6807e76189b898b Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Fri, 1 Mar 2024 17:45:41 +0100
Subject: [PATCH 18/30] Applied suggestions from comments by @csbnw

---
 kernel_tuner/backends/cupy.py   |  3 +-
 kernel_tuner/backends/hip.py    |  1 -
 kernel_tuner/backends/nvcuda.py |  4 ---
 kernel_tuner/backends/opencl.py |  4 ---
 kernel_tuner/backends/pycuda.py |  1 -
 kernel_tuner/core.py            | 52 ++------------------------------
 test/test_pycuda_mocked.py      |  3 +-
 test/test_util_functions.py     |  2 +-
 8 files changed, 5 insertions(+), 65 deletions(-)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index f53663daa..914f211a7 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -47,7 +47,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.devprops = dev.attributes
         self.cc = dev.compute_capability
         self.max_threads = self.devprops["MaxThreadsPerBlock"]
-        self.cache_size_L2 = self.devprops["L2CacheSize"]
         self.iterations = iterations
         self.current_module = None
@@ -125,7 +125,7 @@ def compile(self, kernel_instance):
         compiler_options = self.compiler_options
         if not any(["-std=" in opt for opt in self.compiler_options]):
             compiler_options = ["--std=c++11"] + self.compiler_options
-        # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/20ccd63c0acc40969c851b1917dedeb032209e8b/cupy/cuda/compiler.py#L145
+        # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/main/cupy/cuda/compiler.py#L145

         options = tuple(compiler_options)
diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index dc0ffb1cb..1db4cb302 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -59,7 +59,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None

         self.name = self.hipProps._name.decode('utf-8')
         self.max_threads = self.hipProps.maxThreadsPerBlock
-        self.cache_size_L2 = self.hipProps.l2CacheSize
         self.device = device
         self.compiler_options = compiler_options or []
         self.iterations = iterations
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 0a74f6d9f..15259cb23 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -68,10 +68,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
         )
         cuda_error_check(err)
-        err, self.cache_size_L2 = cudart.cudaDeviceGetAttribute(
-            cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device
-        )
-        cuda_error_check(err)
         self.cc = f"{major}{minor}"
         self.iterations = iterations
         self.current_module = None
diff --git a/kernel_tuner/backends/opencl.py
b/kernel_tuner/backends/opencl.py index 4946d804f..af3be1c00 100644 --- a/kernel_tuner/backends/opencl.py +++ b/kernel_tuner/backends/opencl.py @@ -45,10 +45,6 @@ def __init__( self.max_threads = self.ctx.devices[0].get_info( cl.device_info.MAX_WORK_GROUP_SIZE ) - # TODO the L2 cache size request fails - # self.cache_size_L2 = self.ctx.devices[0].get_info( - # cl.device_affinity_domain.L2_CACHE - # ) self.compiler_options = compiler_options or [] # observer stuff diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 659f51594..7fddc9393 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -101,7 +101,6 @@ def _finish_up(): str(k): v for (k, v) in self.context.get_device().get_attributes().items() } self.max_threads = devprops["MAX_THREADS_PER_BLOCK"] - self.cache_size_L2 = devprops["L2_CACHE_SIZE"] cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str( devprops.get("COMPUTE_CAPABILITY_MINOR", "0") ) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index cf775d308..76bed9497 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -340,62 +340,14 @@ def __init__( if not quiet: print("Using: " + self.dev.name) - if lang.upper() not in ['OPENCL', 'C', 'FORTRAN']: - # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130 - flush_gpu_string = """ - __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - const char val = d_flush[idx]; - if (do_write * val) { - d_flush2[idx] = val; - } - } - """ - cache_size = self.dev.cache_size_L2 - d_flush = np.ones((cache_size), order='F').astype(np.float32) - d_flush2 = np.ones((cache_size), order='F').astype(np.float32) - self.flush_kernel_gpu_args = [d_flush, d_flush2, np.int32(True)] - - from kernel_tuner.interface import Options - options = { - 'kernel_name': 'flush_gpu', - 'lang': 'CUDA', - 'arguments': self.flush_kernel_gpu_args, - 'problem_size': cache_size, - 'grid_div_x': None, - 'grid_div_y': None, - 'grid_div_z': None, - 'block_size_names': None, - } - options = Options(options) - flush_kernel_lang = lang.upper() if lang.upper() in ['CUDA', 'CUPY', 'NVCUDA'] else 'CUPY' - flush_kernel_source = KernelSource('flush_gpu', flush_gpu_string, flush_kernel_lang) - self.flush_kernel_instance = self.create_kernel_instance(flush_kernel_source, kernel_options=options, params=dict(), verbose=not quiet) - self.flush_kernel = self.compile_kernel(self.flush_kernel_instance, verbose=not quiet) - self.flush_kernel_gpu_args = self.ready_argument_list(self.flush_kernel_gpu_args) - - # from kernel_tuner.kernelbuilder import PythonKernel - # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args) - - def flush_cache(self): - """This special function can be called to flush the L2 cache.""" - if hasattr(self, 'flush_kernel'): - return - self.dev.synchronize() - assert self.run_kernel(self.flush_kernel, self.flush_kernel_gpu_args, self.flush_kernel_instance) - # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args) - self.dev.synchronize() - - def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True): - """Benchmark one kernel execution at a time. 
Run with `flush_cache=True` to avoid caching effects between iterations."""
+    def benchmark_default(self, func, gpu_args, threads, grid, result):
+        """Benchmark one kernel execution at a time."""
         observers = [
             obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
         ]

         self.dev.synchronize()
         for _ in range(self.iterations):
-            if flush_cache:
-                self.flush_cache()
             for obs in observers:
                 obs.before_start()
             self.dev.synchronize()
diff --git a/test/test_pycuda_mocked.py b/test/test_pycuda_mocked.py
index e47fc8e8e..6bdfeef07 100644
--- a/test/test_pycuda_mocked.py
+++ b/test/test_pycuda_mocked.py
@@ -13,7 +13,8 @@ def setup_mock(drv):
     context = Mock()
     devprops = {'MAX_THREADS_PER_BLOCK': 1024,
                 'COMPUTE_CAPABILITY_MAJOR': 5,
-                'COMPUTE_CAPABILITY_MINOR': 5,
-                'L2_CACHE_SIZE': 4096}
+                'COMPUTE_CAPABILITY_MINOR': 5,}
     context.return_value.get_device.return_value.get_attributes.return_value = devprops
     context.return_value.get_device.return_value.compute_capability.return_value = "55"
     drv.Device.return_value.retain_primary_context.return_value = context()
diff --git a/test/test_util_functions.py b/test/test_util_functions.py
index a202d5de3..970603d83 100644
--- a/test/test_util_functions.py
+++ b/test/test_util_functions.py
@@ -153,7 +153,7 @@ def test_to_valid_nvrtc_gpu_arch_cc():
     assert to_valid_nvrtc_gpu_arch_cc("40") == "52"
     assert to_valid_nvrtc_gpu_arch_cc("90b") == "90a"
     assert to_valid_nvrtc_gpu_arch_cc("91c") == "90a"
-    assert to_valid_nvrtc_gpu_arch_cc("10123001") == "52"
+    assert to_valid_nvrtc_gpu_arch_cc("1234") == "52"
     for invalid_cc in ("", "1"):
         with pytest.raises(ValueError):
             to_valid_nvrtc_gpu_arch_cc(invalid_cc)

From da907b180ac70e0abe4da4ed8003b8c408b65ef4 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Fri, 1 Mar 2024 17:54:45 +0100
Subject: [PATCH 19/30] Removed redundant comments / printing

---
 kernel_tuner/runners/sequential.py | 2 +-
 kernel_tuner/util.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py
index 23d9dc2ba..aeebd5116 100644
--- a/kernel_tuner/runners/sequential.py
+++ b/kernel_tuner/runners/sequential.py
@@ -100,7 +100,7 @@ def run(self, parameter_space, tuning_options):
             params = process_metrics(params, tuning_options.metrics)

             # get the framework time by estimating based on other times
-            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)  # TODO is it valid that we deduct the warmup time here?
+            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)
             params['strategy_time'] = self.last_strategy_time
             params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0)
             params['timestamp'] = str(datetime.now(timezone.utc))
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index 77aa2607d..df73f2127 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -221,7 +221,7 @@ def check_block_size_names(block_size_names):
     if not isinstance(block_size_names, list):
         raise ValueError("block_size_names should be a list of strings!")
     if len(block_size_names) > 3:
-        raise ValueError(f"block_size_names should not contain more than 3 names!
({block_size_names=})") + raise ValueError("block_size_names should not contain more than 3 names!") if not all([isinstance(name, "".__class__) for name in block_size_names]): raise ValueError("block_size_names should contain only strings!") From 2396bdf5dd5eb75e21bd741ca2689a0bbb8c63a4 Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Fri, 1 Mar 2024 18:02:06 +0100 Subject: [PATCH 20/30] Added L2 cache size information to backends --- kernel_tuner/backends/cupy.py | 1 + kernel_tuner/backends/hip.py | 1 + kernel_tuner/backends/nvcuda.py | 4 ++++ kernel_tuner/backends/opencl.py | 4 ++++ kernel_tuner/backends/pycuda.py | 1 + 5 files changed, 11 insertions(+) diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index 914f211a7..da2e9c1fe 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -47,6 +47,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.devprops = dev.attributes self.cc = dev.compute_capability self.max_threads = self.devprops["MaxThreadsPerBlock"] + self.cache_size_L2 = self.devprops["L2CacheSize"] self.iterations = iterations self.current_module = None diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 1db4cb302..dc0ffb1cb 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -59,6 +59,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.name = self.hipProps._name.decode('utf-8') self.max_threads = self.hipProps.maxThreadsPerBlock + self.cache_size_L2 = self.hipProps.l2CacheSize self.device = device self.compiler_options = compiler_options or [] self.iterations = iterations diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 15259cb23..0a74f6d9f 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -68,6 +68,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device ) cuda_error_check(err) + err, self.cache_size_L2 = cudart.cudaDeviceGetAttribute( + cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device + ) + cuda_error_check(err) self.cc = f"{major}{minor}" self.iterations = iterations self.current_module = None diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py index af3be1c00..4946d804f 100644 --- a/kernel_tuner/backends/opencl.py +++ b/kernel_tuner/backends/opencl.py @@ -45,6 +45,10 @@ def __init__( self.max_threads = self.ctx.devices[0].get_info( cl.device_info.MAX_WORK_GROUP_SIZE ) + # TODO the L2 cache size request fails + # self.cache_size_L2 = self.ctx.devices[0].get_info( + # cl.device_affinity_domain.L2_CACHE + # ) self.compiler_options = compiler_options or [] # observer stuff diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 7fddc9393..659f51594 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -101,6 +101,7 @@ def _finish_up(): str(k): v for (k, v) in self.context.get_device().get_attributes().items() } self.max_threads = devprops["MAX_THREADS_PER_BLOCK"] + self.cache_size_L2 = devprops["L2_CACHE_SIZE"] cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str( devprops.get("COMPUTE_CAPABILITY_MINOR", "0") ) From eced775fac8b39c17931dc3891418a763e033b8b Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Fri, 1 Mar 2024 18:07:11 +0100 Subject: [PATCH 21/30] Added L2 flush kernel --- kernel_tuner/core.py | 53 
++++++++++++++++++++++++++++++++++++-- test/test_pycuda_mocked.py | 3 ++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 76bed9497..265849b00 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -340,14 +340,63 @@ def __init__( if not quiet: print("Using: " + self.dev.name) - def benchmark_default(self, func, gpu_args, threads, grid, result): - """Benchmark one kernel execution at a time.""" + # if lang.upper() not in ['OPENCL', 'C', 'FORTRAN']: + # # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130 + # flush_gpu_string = """ + # __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) { + # const int idx = blockIdx.x * blockDim.x + threadIdx.x; + # const char val = d_flush[idx]; + # if (do_write * val) { + # d_flush2[idx] = val; + # } + # } + # """ + # cache_size = self.dev.cache_size_L2 + # assert cache_size > 0 and cache_size % 256 == 0, f"Cache size has invalid value {cache_size}" + # d_flush = np.ones((cache_size), order='F').astype(np.float32) + # d_flush2 = np.ones((cache_size), order='F').astype(np.float32) + # self.flush_kernel_gpu_args = [d_flush, d_flush2, np.int32(True)] + + # from kernel_tuner.interface import Options + # options = { + # 'kernel_name': 'flush_gpu', + # 'lang': 'CUDA', + # 'arguments': self.flush_kernel_gpu_args, + # 'problem_size': cache_size, + # 'grid_div_x': None, + # 'grid_div_y': None, + # 'grid_div_z': None, + # 'block_size_names': None, + # } + # options = Options(options) + # flush_kernel_lang = lang.upper() if lang.upper() in ['CUDA', 'CUPY', 'NVCUDA'] else 'CUPY' + # flush_kernel_source = KernelSource('flush_gpu', flush_gpu_string, flush_kernel_lang) + # self.flush_kernel_instance = self.create_kernel_instance(flush_kernel_source, kernel_options=options, params=dict(), verbose=not quiet) + # self.flush_kernel = self.compile_kernel(self.flush_kernel_instance, verbose=not quiet) + # self.flush_kernel_gpu_args = self.ready_argument_list(self.flush_kernel_gpu_args) + + # # from kernel_tuner.kernelbuilder import PythonKernel + # # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args) + + def flush_cache(self): + """This special function can be called to flush the L2 cache.""" + if hasattr(self, 'flush_kernel'): + return + self.dev.synchronize() + assert self.run_kernel(self.flush_kernel, self.flush_kernel_gpu_args, self.flush_kernel_instance) + # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args) + self.dev.synchronize() + + def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True): + """Benchmark one kernel execution at a time. 
Run with `flush_cache=True` to avoid caching effects between iterations.""" observers = [ obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver) ] self.dev.synchronize() for _ in range(self.iterations): + if flush_cache: + self.flush_cache() for obs in observers: obs.before_start() self.dev.synchronize() diff --git a/test/test_pycuda_mocked.py b/test/test_pycuda_mocked.py index 6bdfeef07..e47fc8e8e 100644 --- a/test/test_pycuda_mocked.py +++ b/test/test_pycuda_mocked.py @@ -13,7 +13,8 @@ def setup_mock(drv): context = Mock() devprops = {'MAX_THREADS_PER_BLOCK': 1024, 'COMPUTE_CAPABILITY_MAJOR': 5, - 'COMPUTE_CAPABILITY_MINOR': 5,} + 'COMPUTE_CAPABILITY_MINOR': 5, + 'L2_CACHE_SIZE': 4096} context.return_value.get_device.return_value.get_attributes.return_value = devprops context.return_value.get_device.return_value.compute_capability.return_value = "55" drv.Device.return_value.retain_primary_context.return_value = context() From 143889f317035fd49cb8f9be051cc787c0534856 Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Fri, 1 Mar 2024 19:27:52 +0100 Subject: [PATCH 22/30] Switched to new attempt for flushing L2 using memset --- kernel_tuner/core.py | 47 +++----------------------------------------- 1 file changed, 3 insertions(+), 44 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 265849b00..f61a07bee 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -337,55 +337,14 @@ def __init__( self.units = dev.units self.name = dev.name self.max_threads = dev.max_threads + self.flush_kernel_possible = lang.upper() not in ['OPENCL', 'C', 'FORTRAN'] and self.dev.cache_size_L2 > 0 if not quiet: print("Using: " + self.dev.name) - # if lang.upper() not in ['OPENCL', 'C', 'FORTRAN']: - # # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130 - # flush_gpu_string = """ - # __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) { - # const int idx = blockIdx.x * blockDim.x + threadIdx.x; - # const char val = d_flush[idx]; - # if (do_write * val) { - # d_flush2[idx] = val; - # } - # } - # """ - # cache_size = self.dev.cache_size_L2 - # assert cache_size > 0 and cache_size % 256 == 0, f"Cache size has invalid value {cache_size}" - # d_flush = np.ones((cache_size), order='F').astype(np.float32) - # d_flush2 = np.ones((cache_size), order='F').astype(np.float32) - # self.flush_kernel_gpu_args = [d_flush, d_flush2, np.int32(True)] - - # from kernel_tuner.interface import Options - # options = { - # 'kernel_name': 'flush_gpu', - # 'lang': 'CUDA', - # 'arguments': self.flush_kernel_gpu_args, - # 'problem_size': cache_size, - # 'grid_div_x': None, - # 'grid_div_y': None, - # 'grid_div_z': None, - # 'block_size_names': None, - # } - # options = Options(options) - # flush_kernel_lang = lang.upper() if lang.upper() in ['CUDA', 'CUPY', 'NVCUDA'] else 'CUPY' - # flush_kernel_source = KernelSource('flush_gpu', flush_gpu_string, flush_kernel_lang) - # self.flush_kernel_instance = self.create_kernel_instance(flush_kernel_source, kernel_options=options, params=dict(), verbose=not quiet) - # self.flush_kernel = self.compile_kernel(self.flush_kernel_instance, verbose=not quiet) - # self.flush_kernel_gpu_args = self.ready_argument_list(self.flush_kernel_gpu_args) - - # # from kernel_tuner.kernelbuilder import PythonKernel - # # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args) - def 
flush_cache(self): """This special function can be called to flush the L2 cache.""" - if hasattr(self, 'flush_kernel'): - return - self.dev.synchronize() - assert self.run_kernel(self.flush_kernel, self.flush_kernel_gpu_args, self.flush_kernel_instance) - # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args) - self.dev.synchronize() + if self.flush_kernel_possible: + self.dev.memset(, value=0, size=self.dev.cache_size_L2) def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True): """Benchmark one kernel execution at a time. Run with `flush_cache=True` to avoid caching effects between iterations.""" From 651eea7e6ce09d3651885e3a0b3233864e1009c4 Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Fri, 1 Mar 2024 20:07:47 +0100 Subject: [PATCH 23/30] Added implementation of allocate numpy array function --- kernel_tuner/backends/backend.py | 6 ++++++ kernel_tuner/backends/cupy.py | 8 ++++++-- kernel_tuner/backends/hip.py | 4 +++- kernel_tuner/backends/nvcuda.py | 10 +++++++--- kernel_tuner/backends/opencl.py | 11 ++++------- kernel_tuner/backends/pycuda.py | 8 ++++++-- kernel_tuner/core.py | 3 +-- 7 files changed, 33 insertions(+), 17 deletions(-) diff --git a/kernel_tuner/backends/backend.py b/kernel_tuner/backends/backend.py index a37c9d6e7..b50fe47fe 100644 --- a/kernel_tuner/backends/backend.py +++ b/kernel_tuner/backends/backend.py @@ -2,6 +2,7 @@ from __future__ import print_function from abc import ABC, abstractmethod +from numpy import ndarray class Backend(ABC): @@ -65,6 +66,11 @@ class GPUBackend(Backend): def __init__(self, device, iterations, compiler_options, observers): pass + @abstractmethod + def allocate_ndarray(self, array: ndarray) -> any: + """This method allocates a buffer for a given np.ndarray and returns the pointer.""" + pass + @abstractmethod def copy_constant_memory_args(self, cmem_args): """This method must implement the allocation and copy of constant memory to the GPU.""" diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index da2e9c1fe..6d571340f 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -84,6 +84,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.env = env self.name = env["device_name"] + def allocate_ndarray(self, array): + alloc = cp.array(array) + self.allocations.append(alloc) + return alloc + def ready_argument_list(self, arguments): """Ready argument list to be passed to the kernel, allocates gpu mem. @@ -99,8 +104,7 @@ def ready_argument_list(self, arguments): for arg in arguments: # if arg i is a numpy array copy to device if isinstance(arg, np.ndarray): - alloc = cp.array(arg) - self.allocations.append(alloc) + alloc = self.allocate_ndarray(arg) gpu_args.append(alloc) # if not a numpy array, just pass argument along else: diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index dc0ffb1cb..13ec7fe20 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -86,6 +86,8 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None for obs in self.observers: obs.register_device(self) + def allocate_ndarray(self, array): + return hip.hipMalloc(array.nbytes) def ready_argument_list(self, arguments): """Ready argument list to be passed to the HIP function. 
@@ -107,7 +109,7 @@ def ready_argument_list(self, arguments): # Allocate space on device for array and convert to ctypes if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): - device_ptr = hip.hipMalloc(arg.nbytes) + device_ptr = self.allocate_ndarray(arg) data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes) # may be part of run_kernel, return allocations here instead diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 0a74f6d9f..77e554fc8 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -117,6 +117,12 @@ def __del__(self): err = cuda.cuMemFree(device_memory) cuda_error_check(err) + def allocate_ndarray(self, array): + err, device_memory = cuda.cuMemAlloc(array.nbytes) + cuda_error_check(err) + self.allocations.append(device_memory) + return device_memory + def ready_argument_list(self, arguments): """Ready argument list to be passed to the kernel, allocates gpu mem. @@ -132,9 +138,7 @@ def ready_argument_list(self, arguments): for arg in arguments: # if arg is a numpy array copy it to device if isinstance(arg, np.ndarray): - err, device_memory = cuda.cuMemAlloc(arg.nbytes) - cuda_error_check(err) - self.allocations.append(device_memory) + device_memory = self.allocate_ndarray(arg) gpu_args.append(device_memory) self.memcpy_htod(device_memory, arg) # if not array, just pass along diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py index 4946d804f..8c11925a3 100644 --- a/kernel_tuner/backends/opencl.py +++ b/kernel_tuner/backends/opencl.py @@ -72,6 +72,9 @@ def __init__( self.env = env self.name = dev.name + def allocate_ndarray(self, array): + return cl.Buffer(self.ctx, self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, hostbuf=array) + def ready_argument_list(self, arguments): """Ready argument list to be passed to the kernel, allocates gpu mem. @@ -87,13 +90,7 @@ def ready_argument_list(self, arguments): for arg in arguments: # if arg i is a numpy array copy to device if isinstance(arg, np.ndarray): - gpu_args.append( - cl.Buffer( - self.ctx, - self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, - hostbuf=arg, - ) - ) + gpu_args.append(self.allocate_ndarray(arg)) # if not an array, just pass argument along else: gpu_args.append(arg) diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 659f51594..512d39fbf 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -154,6 +154,11 @@ def __del__(self): if hasattr(gpu_mem, "free"): gpu_mem.free() + def allocate_ndarray(self, array): + alloc = drv.mem_alloc(array.nbytes) + self.allocations.append(alloc) + return alloc + def ready_argument_list(self, arguments): """Ready argument list to be passed to the kernel, allocates gpu mem. 
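
Note: the five backend implementations above all satisfy the same contract: given a np.ndarray, allocate a device buffer of array.nbytes and return the backend-specific handle. A minimal sketch of the backend-agnostic call pattern this enables (illustrative only; `dev` and `ready_args_sketch` are hypothetical stand-ins for any of the GPUBackend subclasses touched in this patch, not code from the PR):

    import numpy as np

    def ready_args_sketch(dev, arguments):
        # Backend-agnostic argument preparation, assuming only the
        # allocate_ndarray contract introduced in this patch: the returned
        # handle is backend-specific (cp.ndarray, CUdeviceptr, cl.Buffer, ...).
        gpu_args = []
        for arg in arguments:
            if isinstance(arg, np.ndarray):
                gpu_args.append(dev.allocate_ndarray(arg))
            else:
                gpu_args.append(arg)  # scalars are passed through unchanged
        return gpu_args
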
@@ -169,8 +174,7 @@ def ready_argument_list(self, arguments): for arg in arguments: # if arg i is a numpy array copy to device if isinstance(arg, np.ndarray): - alloc = drv.mem_alloc(arg.nbytes) - self.allocations.append(alloc) + alloc = self.allocate_ndarray(arg) gpu_args.append(alloc) drv.memcpy_htod(gpu_args[-1], arg) elif isinstance(arg, torch.Tensor): diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index f61a07bee..dcbc2c10f 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -343,8 +343,7 @@ def __init__( def flush_cache(self): """This special function can be called to flush the L2 cache.""" - if self.flush_kernel_possible: - self.dev.memset(, value=0, size=self.dev.cache_size_L2) + return def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True): """Benchmark one kernel execution at a time. Run with `flush_cache=True` to avoid caching effects between iterations.""" From 7d8d48fda8c47884980d9d5bd565dd6bc981c327 Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Sat, 2 Mar 2024 17:50:20 +0100 Subject: [PATCH 24/30] Added new flush L2 cache method using memset --- kernel_tuner/backends/backend.py | 2 +- kernel_tuner/core.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/backends/backend.py b/kernel_tuner/backends/backend.py index b50fe47fe..d260f0acf 100644 --- a/kernel_tuner/backends/backend.py +++ b/kernel_tuner/backends/backend.py @@ -68,7 +68,7 @@ def __init__(self, device, iterations, compiler_options, observers): @abstractmethod def allocate_ndarray(self, array: ndarray) -> any: - """This method allocates a buffer for a given np.ndarray and returns the pointer.""" + """This method must allocate on the GPU a buffer for a given np.ndarray and return the pointer.""" pass @abstractmethod diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index dcbc2c10f..82472672a 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -337,13 +337,19 @@ def __init__( self.units = dev.units self.name = dev.name self.max_threads = dev.max_threads - self.flush_kernel_possible = lang.upper() not in ['OPENCL', 'C', 'FORTRAN'] and self.dev.cache_size_L2 > 0 + self.flush_possible = lang.upper() not in ['OPENCL', 'C', 'FORTRAN'] and isinstance(self.dev.cache_size_L2, int) and self.dev.cache_size_L2 > 0 + if self.flush_possible: + t = np.int32 + self.flush_array = np.zeros((self.dev.cache_size_L2 // t(0).itemsize), order='F').astype(t) if not quiet: print("Using: " + self.dev.name) def flush_cache(self): """This special function can be called to flush the L2 cache.""" - return + if self.flush_possible: + # inspired by https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh#L51 + alloc = self.dev.allocate_ndarray(self.flush_array) + self.dev.memset(alloc, value=0, size=self.flush_array.nbytes) def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True): """Benchmark one kernel execution at a time. 
Run with `flush_cache=True` to avoid caching effects between iterations.""" From 9911f4c8e58d96168839a51b90adba0b7e18163c Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Mon, 4 Mar 2024 17:49:05 +0100 Subject: [PATCH 25/30] Added a standard method for freeing memory from the GPU --- kernel_tuner/backends/backend.py | 5 +++++ kernel_tuner/backends/cupy.py | 4 ++++ kernel_tuner/backends/hip.py | 3 +++ kernel_tuner/backends/nvcuda.py | 10 +++++++--- kernel_tuner/backends/opencl.py | 4 ++++ kernel_tuner/backends/pycuda.py | 7 ++++++- kernel_tuner/core.py | 7 +++++-- 7 files changed, 34 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/backends/backend.py b/kernel_tuner/backends/backend.py index d260f0acf..b8a90bbc0 100644 --- a/kernel_tuner/backends/backend.py +++ b/kernel_tuner/backends/backend.py @@ -71,6 +71,11 @@ def allocate_ndarray(self, array: ndarray) -> any: """This method must allocate on the GPU a buffer for a given np.ndarray and return the pointer.""" pass + @abstractmethod + def free_mem(self, pointer): + """This method must free on the GPU a buffer for a given pointer.""" + pass + @abstractmethod def copy_constant_memory_args(self, cmem_args): """This method must implement the allocation and copy of constant memory to the GPU.""" diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index 6d571340f..012901ce5 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -88,6 +88,10 @@ def allocate_ndarray(self, array): alloc = cp.array(array) self.allocations.append(alloc) return alloc + + def free_mem(self, pointer): + self.allocations.remove(pointer) + del pointer # CuPy uses Python reference counter to free upon disuse def ready_argument_list(self, arguments): """Ready argument list to be passed to the kernel, allocates gpu mem. diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 13ec7fe20..1f2aff1cf 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -88,6 +88,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None def allocate_ndarray(self, array): return hip.hipMalloc(array.nbytes) + + def free_mem(self, pointer): + raise NotImplementedError("PyHIP currently does not have a free function") def ready_argument_list(self, arguments): """Ready argument list to be passed to the HIP function. diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index 77e554fc8..d9a8f945d 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -113,15 +113,19 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None def __del__(self): for device_memory in self.allocations: - if isinstance(device_memory, cuda.CUdeviceptr): - err = cuda.cuMemFree(device_memory) - cuda_error_check(err) + self.free_mem(device_memory) def allocate_ndarray(self, array): err, device_memory = cuda.cuMemAlloc(array.nbytes) cuda_error_check(err) self.allocations.append(device_memory) return device_memory + + def free_mem(self, pointer): + assert isinstance(pointer, cuda.CUdeviceptr) + self.allocations.remove(pointer) + err = cuda.cuMemFree(pointer) + cuda_error_check(err) def ready_argument_list(self, arguments): """Ready argument list to be passed to the kernel, allocates gpu mem. 
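
Note: with free_mem added as the counterpart of allocate_ndarray, core.py gains a complete backend-neutral buffer lifecycle, and the L2-flush logic in the following commits relies on exactly this pair. A hedged sketch of that lifecycle (`dev` and `flush_l2_sketch` are again hypothetical; in the actual code the size comes from dev.cache_size_L2):

    import numpy as np

    def flush_l2_sketch(dev, cache_size_l2):
        # Allocate a buffer as large as the L2 cache and overwrite it with
        # memset, displacing previously cached lines, then hand the buffer
        # back to the backend. Inspired by the NVBench l2flush helper
        # referenced in these patches.
        flush_array = np.zeros(cache_size_l2, dtype=np.uint8)
        alloc = dev.allocate_ndarray(flush_array)
        dev.memset(alloc, value=0, size=flush_array.nbytes)
        dev.free_mem(alloc)
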
diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py index 8c11925a3..0377cbb93 100644 --- a/kernel_tuner/backends/opencl.py +++ b/kernel_tuner/backends/opencl.py @@ -74,6 +74,10 @@ def __init__( def allocate_ndarray(self, array): return cl.Buffer(self.ctx, self.mf.READ_WRITE | self.mf.COPY_HOST_PTR, hostbuf=array) + + def free_mem(self, pointer): + assert isinstance(pointer, cl.Buffer) + pointer.release() def ready_argument_list(self, arguments): """Ready argument list to be passed to the kernel, allocates gpu mem. diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 512d39fbf..5afc9c090 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -152,12 +152,17 @@ def __del__(self): for gpu_mem in self.allocations: # if needed for when using mocks during testing if hasattr(gpu_mem, "free"): - gpu_mem.free() + self.free_mem(gpu_mem) def allocate_ndarray(self, array): alloc = drv.mem_alloc(array.nbytes) self.allocations.append(alloc) return alloc + + def free_mem(self, pointer): + assert hasattr(pointer, "free") + self.allocations.remove(pointer) + pointer.free() def ready_argument_list(self, arguments): """Ready argument list to be passed to the kernel, allocates gpu mem. diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 82472672a..611f50736 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -341,15 +341,18 @@ def __init__( if self.flush_possible: t = np.int32 self.flush_array = np.zeros((self.dev.cache_size_L2 // t(0).itemsize), order='F').astype(t) + self.flush_alloc = None if not quiet: print("Using: " + self.dev.name) def flush_cache(self): """This special function can be called to flush the L2 cache.""" + if self.flush_alloc is not None: + self.dev.free_mem(self.flush_alloc) if self.flush_possible: # inspired by https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh#L51 - alloc = self.dev.allocate_ndarray(self.flush_array) - self.dev.memset(alloc, value=0, size=self.flush_array.nbytes) + self.flush_alloc = self.dev.allocate_ndarray(self.flush_array) + self.dev.memset(self.flush_alloc, value=0, size=self.flush_array.nbytes) def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True): """Benchmark one kernel execution at a time. 
Run with `flush_cache=True` to avoid caching effects between iterations."""

From 47c2ccacce52e28cda9137c87b76199f0adf6b1e Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Mon, 4 Mar 2024 19:00:33 +0100
Subject: [PATCH 26/30] Circumvented an issue where list.remove(val) does not work for CuPy arrays, and moved the free of the previous allocation to after the check whether flushing is possible

---
 kernel_tuner/backends/cupy.py | 5 ++++-
 kernel_tuner/core.py          | 5 +++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index 012901ce5..8fcad0b91 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -90,7 +90,10 @@ def allocate_ndarray(self, array):
         return alloc
 
     def free_mem(self, pointer):
-        self.allocations.remove(pointer)
+        # comparing iteratively is required, as list.remove compares with ==, which CuPy arrays evaluate element-wise rather than as a single boolean
+        to_remove = [i for i, alloc in enumerate(self.allocations) if cp.array_equal(alloc, pointer)]
+        assert len(to_remove) == 1
+        self.allocations.pop(to_remove[0])
         del pointer # CuPy uses Python reference counter to free upon disuse
 
     def ready_argument_list(self, arguments):
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 611f50736..9e1aaec3e 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -347,9 +347,10 @@ def __init__(
 
     def flush_cache(self):
         """This special function can be called to flush the L2 cache."""
-        if self.flush_alloc is not None:
-            self.dev.free_mem(self.flush_alloc)
         if self.flush_possible:
+            # explicitly free the previous memory
+            if self.flush_alloc is not None:
+                self.dev.free_mem(self.flush_alloc)
             # inspired by https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh#L51
             self.flush_alloc = self.dev.allocate_ndarray(self.flush_array)
             self.dev.memset(self.flush_alloc, value=0, size=self.flush_array.nbytes)

From 157ca418172b20373dfff0d825534e9797a29de1 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 7 Mar 2024 11:14:13 +0100
Subject: [PATCH 27/30] Added the ability to recopy array arguments with every kernel launch, added interfacing for flushing L2 and recopying arguments

---
 kernel_tuner/core.py      | 14 +++++++++-----
 kernel_tuner/interface.py |  4 ++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 9e1aaec3e..73b2336c2 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -337,7 +337,7 @@ def __init__(
         self.units = dev.units
         self.name = dev.name
         self.max_threads = dev.max_threads
-        self.flush_possible = lang.upper() not in ['OPENCL', 'C', 'FORTRAN'] and isinstance(self.dev.cache_size_L2, int) and self.dev.cache_size_L2 > 0
+        self.flush_possible = lang.upper() not in ['OPENCL', 'HIP', 'C', 'FORTRAN'] and isinstance(self.dev.cache_size_L2, int) and self.dev.cache_size_L2 > 0
         if self.flush_possible:
             t = np.int32
             self.flush_array = np.zeros((self.dev.cache_size_L2 // t(0).itemsize), order='F').astype(t)
@@ -355,7 +355,7 @@ def flush_cache(self):
             self.flush_alloc = self.dev.allocate_ndarray(self.flush_array)
             self.dev.memset(self.flush_alloc, value=0, size=self.flush_array.nbytes)
 
-    def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True):
+    def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True, recopy_args=None):
         """Benchmark one kernel execution at a time.
Run with `flush_cache=True` to avoid caching effects between iterations.""" observers = [ obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver) @@ -365,6 +365,10 @@ def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=T for _ in range(self.iterations): if flush_cache: self.flush_cache() + if recopy_args is not None: + for i, arg in enumerate(recopy_args): + if isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor)): + self.dev.memcpy_htod(gpu_args[i], arg) for obs in observers: obs.before_start() self.dev.synchronize() @@ -422,7 +426,7 @@ def set_nvml_parameters(self, instance): if "nvml_mem_clock" in instance.params: self.nvml.mem_clock = instance.params["nvml_mem_clock"] - def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False): + def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False, flush_L2=True, rewrite_data=False): """Benchmark the kernel instance.""" logging.debug("benchmark " + instance.name) logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads) @@ -438,7 +442,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett result = {} try: self.benchmark_default( - func, gpu_args, instance.threads, instance.grid, result + func, gpu_args, instance.threads, instance.grid, result, flush_cache=flush_L2, recopy_args=instance.arguments if rewrite_data else None ) if self.continuous_observers: @@ -604,7 +608,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, self.set_nvml_parameters(instance) start_benchmark = time.perf_counter() result.update( - self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=False) + self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=False, flush_L2=to.flush_L2_cache, rewrite_data=to.always_rewrite_data) ) last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 96efe3ce8..5d57f216e 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -464,6 +464,8 @@ def __deepcopy__(self, _): ("metrics", ("specifies user-defined metrics, please see :ref:`metrics`.", "dict")), ("simulation_mode", ("Simulate an auto-tuning search from an existing cachefile", "bool")), ("observers", ("""A list of Observers to use during tuning, please see :ref:`observers`.""", "list")), + ("flush_L2_cache", ("""Whether to flush the GPU L2 cache between kernel launches. Defaults to True.""", "bool")), + ("always_rewrite_data", ("""Whether to rewrite the input arrays to the GPU between kernel launches. 
Defaults to False.""", "bool")),
     ]
 )
 
@@ -577,6 +579,8 @@ def tune_kernel(
     observers=None,
     objective=None,
     objective_higher_is_better=None,
+    flush_L2_cache=True,
+    always_rewrite_data=False,
 ):
     start_overhead_time = perf_counter()
     if log:

From 98afa6093614a7a428186f2dab705f1a8a8dd3be Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 7 Mar 2024 11:29:34 +0100
Subject: [PATCH 28/30] Renamed recopy_args to recopy_arrays for clarity, added a length check

---
 kernel_tuner/core.py      | 20 +++++++++++++-------
 kernel_tuner/interface.py |  4 ++--
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 73b2336c2..7c59accc0 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -355,8 +355,13 @@ def flush_cache(self):
             self.flush_alloc = self.dev.allocate_ndarray(self.flush_array)
             self.dev.memset(self.flush_alloc, value=0, size=self.flush_array.nbytes)
 
-    def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True, recopy_args=None):
-        """Benchmark one kernel execution at a time. Run with `flush_cache=True` to avoid caching effects between iterations."""
+    def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True, recopy_arrays=None):
+        """
+        Benchmark one kernel execution at a time.
+
+        Run with `flush_cache=True` to avoid caching effects between iterations.
+        Run with `recopy_arrays` to always write the input arrays to the GPU before each kernel launch. Must have the same length and order as `gpu_args`.
+        """
         observers = [
             obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
         ]
@@ -365,6 +370,9 @@ def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=T
         for _ in range(self.iterations):
             if flush_cache:
                 self.flush_cache()
-            if recopy_args is not None:
-                for i, arg in enumerate(recopy_args):
+            if recopy_arrays is not None:
+                assert len(recopy_arrays) == len(gpu_args), "The `recopy_arrays` must be the same length and order as `gpu_args`."
+ for i, arg in enumerate(recopy_arrays): if isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor)): self.dev.memcpy_htod(gpu_args[i], arg) for obs in observers: @@ -426,7 +432,7 @@ def set_nvml_parameters(self, instance): if "nvml_mem_clock" in instance.params: self.nvml.mem_clock = instance.params["nvml_mem_clock"] - def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False, flush_L2=True, rewrite_data=False): + def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False, flush_L2=True, recopy_arrays=False): """Benchmark the kernel instance.""" logging.debug("benchmark " + instance.name) logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads) @@ -442,7 +448,7 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett result = {} try: self.benchmark_default( - func, gpu_args, instance.threads, instance.grid, result, flush_cache=flush_L2, recopy_args=instance.arguments if rewrite_data else None + func, gpu_args, instance.threads, instance.grid, result, flush_cache=flush_L2, recopy_arrays=instance.arguments if recopy_arrays else None ) if self.continuous_observers: @@ -608,7 +614,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, self.set_nvml_parameters(instance) start_benchmark = time.perf_counter() result.update( - self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=False, flush_L2=to.flush_L2_cache, rewrite_data=to.always_rewrite_data) + self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=False, flush_L2=to.flush_L2_cache, recopy_arrays=to.recopy_arrays) ) last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 5d57f216e..bdfd26f43 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -465,7 +465,7 @@ def __deepcopy__(self, _): ("simulation_mode", ("Simulate an auto-tuning search from an existing cachefile", "bool")), ("observers", ("""A list of Observers to use during tuning, please see :ref:`observers`.""", "list")), ("flush_L2_cache", ("""Whether to flush the GPU L2 cache between kernel launches. Defaults to True.""", "bool")), - ("always_rewrite_data", ("""Whether to rewrite the input arrays to the GPU between kernel launches. Defaults to False.""", "bool")), + ("recopy_arrays", ("""Whether to rewrite the input arrays to the GPU between kernel launches. 
Defaults to False.""", "bool")), ] ) @@ -580,7 +580,7 @@ def tune_kernel( objective=None, objective_higher_is_better=None, flush_L2_cache=True, - always_rewrite_data=False, + recopy_arrays=False, ): start_overhead_time = perf_counter() if log: From cfecdc55a7a873b3fa9b1777307d10dae8846944 Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Fri, 12 Apr 2024 11:31:33 +0200 Subject: [PATCH 29/30] Improved getting L2 cache size --- kernel_tuner/backends/cupy.py | 2 +- kernel_tuner/backends/hip.py | 2 +- kernel_tuner/backends/nvcuda.py | 3 ++- kernel_tuner/backends/pycuda.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index 8fcad0b91..ca514f279 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -47,7 +47,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.devprops = dev.attributes self.cc = dev.compute_capability self.max_threads = self.devprops["MaxThreadsPerBlock"] - self.cache_size_L2 = self.devprops["L2CacheSize"] + self.cache_size_L2 = int(self.devprops["L2CacheSize"]) self.iterations = iterations self.current_module = None diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py index 1f2aff1cf..1973cfc91 100644 --- a/kernel_tuner/backends/hip.py +++ b/kernel_tuner/backends/hip.py @@ -59,7 +59,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.name = self.hipProps._name.decode('utf-8') self.max_threads = self.hipProps.maxThreadsPerBlock - self.cache_size_L2 = self.hipProps.l2CacheSize + self.cache_size_L2 = int(self.hipProps.l2CacheSize) self.device = device self.compiler_options = compiler_options or [] self.iterations = iterations diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index d9a8f945d..dd2653f04 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -72,6 +72,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device ) cuda_error_check(err) + self.cache_size_L2 = int(self.cache_size_L2) self.cc = f"{major}{minor}" self.iterations = iterations self.current_module = None @@ -330,7 +331,7 @@ def memset(allocation, value, size): :type size: int """ - err = cudart.cudaMemset(allocation, value, size) + err = cudart.cudaMemset(allocation.__init__(), value, size) cuda_error_check(err) @staticmethod diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 5afc9c090..a42ca8a70 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -101,7 +101,7 @@ def _finish_up(): str(k): v for (k, v) in self.context.get_device().get_attributes().items() } self.max_threads = devprops["MAX_THREADS_PER_BLOCK"] - self.cache_size_L2 = devprops["L2_CACHE_SIZE"] + self.cache_size_L2 = int(devprops["L2_CACHE_SIZE"]) cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str( devprops.get("COMPUTE_CAPABILITY_MINOR", "0") ) From 108e14c543cd5fa4b2d26868e004e2617f93321a Mon Sep 17 00:00:00 2001 From: fjwillemsen Date: Fri, 12 Apr 2024 11:35:06 +0200 Subject: [PATCH 30/30] Small improvements to flushing arrays --- kernel_tuner/core.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 7c59accc0..44b6e82e9 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -339,8 +339,10 @@ def __init__( self.max_threads = dev.max_threads 
self.flush_possible = lang.upper() not in ['OPENCL', 'HIP', 'C', 'FORTRAN'] and isinstance(self.dev.cache_size_L2, int) and self.dev.cache_size_L2 > 0 if self.flush_possible: - t = np.int32 - self.flush_array = np.zeros((self.dev.cache_size_L2 // t(0).itemsize), order='F').astype(t) + self.flush_type = np.uint8 + size = (self.dev.cache_size_L2 // self.flush_type(0).itemsize) + # self.flush_array = np.zeros((size), order='F', dtype=self.flush_type) + self.flush_array = np.empty((size), order='F', dtype=self.flush_type) self.flush_alloc = None if not quiet: print("Using: " + self.dev.name) @@ -353,7 +355,7 @@ def flush_cache(self): self.dev.free_mem(self.flush_alloc) # inspired by https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh#L51 self.flush_alloc = self.dev.allocate_ndarray(self.flush_array) - self.dev.memset(self.flush_alloc, value=0, size=self.flush_array.nbytes) + self.dev.memset(self.flush_alloc, value=1, size=self.flush_array.nbytes) def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True, recopy_arrays=None): """ @@ -367,7 +369,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=T ] self.dev.synchronize() - for _ in range(self.iterations): + for i in range(self.iterations): if flush_cache: self.flush_cache() if recopy_arrays is not None:
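
Note: taken together, this series surfaces two new user-facing tune_kernel options. A usage sketch (the vector_add kernel and its tuning space are invented for illustration; flush_L2_cache and recopy_arrays are the keyword arguments added in these patches):

    import numpy as np
    import kernel_tuner

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 10_000_000
    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(a)
    n = np.int32(size)

    results, env = kernel_tuner.tune_kernel(
        "vector_add", kernel_string, size, [c, a, b, n],
        {"block_size_x": [64, 128, 256, 512]},
        flush_L2_cache=True,   # flush the GPU L2 cache between kernel launches (default True)
        recopy_arrays=True,    # re-copy the input arrays before every launch (default False)
    )
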