-
Notifications
You must be signed in to change notification settings - Fork 50
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Flush cache #246
base: master
Are you sure you want to change the base?
Flush cache #246
Changes from all commits
81a68a4
943b3c4
1681730
f153945
7bd7c2b
9dea137
df54145
4cc4a13
e309bc1
d6aac8b
a020791
6e6e5fb
f15338f
00ac419
55ab074
e106bae
0cb5e3a
b682506
da907b1
2396bdf
eced775
143889f
651eea7
7d8d48f
9911f4c
47c2cca
157ca41
98afa60
cfecdc5
108e14c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
"""This module contains all Cupy specific kernel_tuner functions.""" | ||
from __future__ import print_function | ||
from warnings import warn | ||
|
||
import numpy as np | ||
|
||
|
@@ -46,6 +47,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None | |
self.devprops = dev.attributes | ||
self.cc = dev.compute_capability | ||
self.max_threads = self.devprops["MaxThreadsPerBlock"] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also cast this to |
||
self.cache_size_L2 = int(self.devprops["L2CacheSize"]) | ||
|
||
self.iterations = iterations | ||
self.current_module = None | ||
|
@@ -82,6 +84,18 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None | |
self.env = env | ||
self.name = env["device_name"] | ||
|
||
def allocate_ndarray(self, array): | ||
alloc = cp.array(array) | ||
self.allocations.append(alloc) | ||
return alloc | ||
|
||
def free_mem(self, pointer): | ||
# iteratively comparing is required as comparing with list.remove is not properly implemented | ||
to_remove = [i for i, alloc in enumerate(self.allocations) if cp.array_equal(alloc, pointer)] | ||
assert len(to_remove) == 1 | ||
self.allocations.pop(to_remove[0]) | ||
del pointer # CuPy uses Python reference counter to free upon disuse | ||
|
||
def ready_argument_list(self, arguments): | ||
"""Ready argument list to be passed to the kernel, allocates gpu mem. | ||
|
||
|
@@ -97,8 +111,7 @@ def ready_argument_list(self, arguments): | |
for arg in arguments: | ||
# if arg i is a numpy array copy to device | ||
if isinstance(arg, np.ndarray): | ||
alloc = cp.array(arg) | ||
self.allocations.append(alloc) | ||
alloc = self.allocate_ndarray(arg) | ||
gpu_args.append(alloc) | ||
# if not a numpy array, just pass argument along | ||
else: | ||
|
@@ -124,6 +137,7 @@ def compile(self, kernel_instance): | |
compiler_options = self.compiler_options | ||
if not any(["-std=" in opt for opt in self.compiler_options]): | ||
compiler_options = ["--std=c++11"] + self.compiler_options | ||
# CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/main/cupy/cuda/compiler.py#L145 | ||
|
||
options = tuple(compiler_options) | ||
|
||
|
@@ -132,6 +146,7 @@ def compile(self, kernel_instance): | |
) | ||
|
||
self.func = self.current_module.get_function(kernel_name) | ||
self.num_regs = self.func.num_regs | ||
return self.func | ||
|
||
def start_event(self): | ||
|
@@ -197,6 +212,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): | |
of the grid | ||
:type grid: tuple(int, int) | ||
""" | ||
if stream is None: | ||
stream = self.stream | ||
func(grid, threads, gpu_args, stream=stream, shared_mem=self.smem_size) | ||
|
||
def memset(self, allocation, value, size): | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -59,6 +59,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None | |||||||||||||
|
||||||||||||||
self.name = self.hipProps._name.decode('utf-8') | ||||||||||||||
self.max_threads = self.hipProps.maxThreadsPerBlock | ||||||||||||||
self.cache_size_L2 = int(self.hipProps.l2CacheSize) | ||||||||||||||
self.device = device | ||||||||||||||
self.compiler_options = compiler_options or [] | ||||||||||||||
self.iterations = iterations | ||||||||||||||
|
@@ -85,6 +86,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None | |||||||||||||
for obs in self.observers: | ||||||||||||||
obs.register_device(self) | ||||||||||||||
|
||||||||||||||
def allocate_ndarray(self, array): | ||||||||||||||
return hip.hipMalloc(array.nbytes) | ||||||||||||||
Comment on lines +89 to +90
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't you need to store the allocated memory?
Suggested change
|
||||||||||||||
|
||||||||||||||
def free_mem(self, pointer): | ||||||||||||||
raise NotImplementedError("PyHIP currently does not have a free function") | ||||||||||||||
|
||||||||||||||
def ready_argument_list(self, arguments): | ||||||||||||||
"""Ready argument list to be passed to the HIP function. | ||||||||||||||
|
@@ -106,7 +112,7 @@ def ready_argument_list(self, arguments): | |||||||||||||
# Allocate space on device for array and convert to ctypes | ||||||||||||||
if isinstance(arg, np.ndarray): | ||||||||||||||
if dtype_str in dtype_map.keys(): | ||||||||||||||
device_ptr = hip.hipMalloc(arg.nbytes) | ||||||||||||||
device_ptr = self.allocate_ndarray(arg) | ||||||||||||||
data_ctypes = arg.ctypes.data_as(ctypes.POINTER(dtype_map[dtype_str])) | ||||||||||||||
hip.hipMemcpy_htod(device_ptr, data_ctypes, arg.nbytes) | ||||||||||||||
# may be part of run_kernel, return allocations here instead | ||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,11 @@ | ||
"""This module contains all NVIDIA cuda-python specific kernel_tuner functions.""" | ||
from warnings import warn | ||
|
||
import numpy as np | ||
|
||
from kernel_tuner.backends.backend import GPUBackend | ||
from kernel_tuner.observers.nvcuda import CudaRuntimeObserver | ||
from kernel_tuner.util import SkippableFailure, cuda_error_check | ||
from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc | ||
|
||
# embedded in try block to be able to generate documentation | ||
# and run tests without cuda-python installed | ||
|
@@ -66,6 +68,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None | |
cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device | ||
) | ||
cuda_error_check(err) | ||
err, self.cache_size_L2 = cudart.cudaDeviceGetAttribute( | ||
cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device | ||
) | ||
cuda_error_check(err) | ||
self.cache_size_L2 = int(self.cache_size_L2) | ||
self.cc = f"{major}{minor}" | ||
self.iterations = iterations | ||
self.current_module = None | ||
|
@@ -107,9 +114,19 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None | |
|
||
def __del__(self): | ||
for device_memory in self.allocations: | ||
if isinstance(device_memory, cuda.CUdeviceptr): | ||
err = cuda.cuMemFree(device_memory) | ||
cuda_error_check(err) | ||
self.free_mem(device_memory) | ||
|
||
def allocate_ndarray(self, array): | ||
err, device_memory = cuda.cuMemAlloc(array.nbytes) | ||
cuda_error_check(err) | ||
self.allocations.append(device_memory) | ||
return device_memory | ||
|
||
def free_mem(self, pointer): | ||
assert isinstance(pointer, cuda.CUdeviceptr) | ||
self.allocations.remove(pointer) | ||
err = cuda.cuMemFree(pointer) | ||
cuda_error_check(err) | ||
|
||
def ready_argument_list(self, arguments): | ||
"""Ready argument list to be passed to the kernel, allocates gpu mem. | ||
|
@@ -126,9 +143,7 @@ def ready_argument_list(self, arguments): | |
for arg in arguments: | ||
# if arg is a numpy array copy it to device | ||
if isinstance(arg, np.ndarray): | ||
err, device_memory = cuda.cuMemAlloc(arg.nbytes) | ||
cuda_error_check(err) | ||
self.allocations.append(device_memory) | ||
device_memory = self.allocate_ndarray(arg) | ||
gpu_args.append(device_memory) | ||
self.memcpy_htod(device_memory, arg) | ||
# if not array, just pass along | ||
|
@@ -161,12 +176,12 @@ def compile(self, kernel_instance): | |
compiler_options.append(b"--std=c++11") | ||
if not any(["--std=" in opt for opt in self.compiler_options]): | ||
self.compiler_options.append("--std=c++11") | ||
if not any([b"--gpu-architecture=" in opt for opt in compiler_options]): | ||
if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]): | ||
compiler_options.append( | ||
f"--gpu-architecture=compute_{self.cc}".encode("UTF-8") | ||
f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8") | ||
) | ||
if not any(["--gpu-architecture=" in opt for opt in self.compiler_options]): | ||
self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}") | ||
if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]): | ||
self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}") | ||
|
||
err, program = nvrtc.nvrtcCreateProgram( | ||
str.encode(kernel_string), b"CUDAProgram", 0, [], [] | ||
|
@@ -192,6 +207,11 @@ def compile(self, kernel_instance): | |
) | ||
cuda_error_check(err) | ||
|
||
# get the number of registers per thread used in this kernel | ||
num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func) | ||
assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccesful: code {num_regs[0]}" | ||
Comment on lines +210 to +212
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it make sense to move this code to a helper function? |
||
self.num_regs = num_regs[1] | ||
|
||
except RuntimeError as re: | ||
_, n = nvrtc.nvrtcGetProgramLogSize(program) | ||
log = b" " * n | ||
|
@@ -273,6 +293,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): | |
of the grid | ||
:type grid: tuple(int, int) | ||
""" | ||
if stream is None: | ||
stream = self.stream | ||
arg_types = list() | ||
for arg in gpu_args: | ||
if isinstance(arg, cuda.CUdeviceptr): | ||
|
@@ -309,7 +331,7 @@ def memset(allocation, value, size): | |
:type size: int | ||
|
||
""" | ||
err = cudart.cudaMemset(allocation, value, size) | ||
err = cudart.cudaMemset(allocation.__init__(), value, size) | ||
cuda_error_check(err) | ||
|
||
@staticmethod | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.