
Flush cache #246

Status: Draft. Wants to merge 30 commits into master.

Commits (30; the file changes shown below are from 2 of these commits)
81a68a4  Added RegisterObserver with common interface among backends (fjwillemsen, Feb 8, 2024)
943b3c4  Added test for RegisterObserver, added clause in case of mocktest (fjwillemsen, Feb 8, 2024)
1681730  Added useful error message in case Register Observer is not supported (fjwillemsen, Feb 9, 2024)
f153945  Added tests for Register Observer for OpenCL and HIP backends (fjwillemsen, Feb 9, 2024)
7bd7c2b  Added instruction for pointing cache directory elsewhere (fjwillemsen, Feb 12, 2024)
9dea137  Non-argument streams are now correctly passed in the CuPy and NVCUDA … (fjwillemsen, Feb 12, 2024)
df54145  Fixed several issues pertaining to the setting of clocks, in particul… (fjwillemsen, Feb 15, 2024)
4cc4a13  Time spent setting NVML parameters (clock & memory frequency, power) … (fjwillemsen, Feb 15, 2024)
e309bc1  Time spent setting NVML parameters (clock & memory frequency, power) … (fjwillemsen, Feb 15, 2024)
d6aac8b  Removed redundant print statement (fjwillemsen, Feb 15, 2024)
a020791  Added L2 cache size property to CUDA backends (fjwillemsen, Feb 28, 2024)
6e6e5fb  Added specification to CUPY compiler options (fjwillemsen, Feb 28, 2024)
f15338f  Added L2 cache size property to OpenCL, HIP and mocked PyCUDA backends (fjwillemsen, Feb 28, 2024)
00ac419  Added function to check for compute capability validity, improved che… (fjwillemsen, Feb 28, 2024)
55ab074  Added a flush kernel to clear the L2 cache between runs (fjwillemsen, Feb 28, 2024)
e106bae  Added a flush kernel to clear the L2 cache between runs (fjwillemsen, Feb 28, 2024)
0cb5e3a  Made function for scaling the compute capability to a valid one, adde… (fjwillemsen, Feb 29, 2024)
b682506  Applied suggestions from comments by @csbnw (fjwillemsen, Mar 1, 2024)
da907b1  Removed redundant comments / printing (fjwillemsen, Mar 1, 2024)
2396bdf  Added L2 cache size information to backends (fjwillemsen, Mar 1, 2024)
eced775  Added L2 flush kernel (fjwillemsen, Mar 1, 2024)
143889f  Switched to new attempt for flushing L2 using memset (fjwillemsen, Mar 1, 2024)
651eea7  Added implementation of allocate numpy array function (fjwillemsen, Mar 1, 2024)
7d8d48f  Added new flush L2 cache method using memset (fjwillemsen, Mar 2, 2024)
9911f4c  Added a standard method for freeing memory from the GPU (fjwillemsen, Mar 4, 2024)
47c2cca  Circumvented an issue where list.remove(val) was not properly impleme… (fjwillemsen, Mar 4, 2024)
157ca41  Added the ability to recopy array arguments with every kernel launch,… (fjwillemsen, Mar 7, 2024)
98afa60  Renamed to for clarity, added check (fjwillemsen, Mar 7, 2024)
cfecdc5  Improved getting L2 cache size (fjwillemsen, Apr 12, 2024)
108e14c  Small improvements to flushing arrays (fjwillemsen, Apr 12, 2024)
kernel_tuner/backends/cupy.py (2 changes: 1 addition & 1 deletion)

@@ -47,7 +47,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.devprops = dev.attributes
         self.cc = dev.compute_capability
         self.max_threads = self.devprops["MaxThreadsPerBlock"]
-        self.cache_size_L2 = self.devprops["L2CacheSize"]
+        self.cache_size_L2 = int(self.devprops["L2CacheSize"])

         self.iterations = iterations
         self.current_module = None

Collaborator comment on this line: "Also cast this to int for consistency?"
kernel_tuner/backends/hip.py (2 changes: 1 addition & 1 deletion)

@@ -59,7 +59,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None

         self.name = self.hipProps._name.decode('utf-8')
         self.max_threads = self.hipProps.maxThreadsPerBlock
-        self.cache_size_L2 = self.hipProps.l2CacheSize
+        self.cache_size_L2 = int(self.hipProps.l2CacheSize)
         self.device = device
         self.compiler_options = compiler_options or []
         self.iterations = iterations
kernel_tuner/backends/nvcuda.py (3 changes: 2 additions & 1 deletion)

@@ -72,6 +72,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device
         )
         cuda_error_check(err)
+        self.cache_size_L2 = int(self.cache_size_L2)
         self.cc = f"{major}{minor}"
         self.iterations = iterations
         self.current_module = None

@@ -330,7 +331,7 @@ def memset(allocation, value, size):
         :type size: int

         """
-        err = cudart.cudaMemset(allocation, value, size)
+        err = cudart.cudaMemset(allocation.__init__(), value, size)
         cuda_error_check(err)

     @staticmethod
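As background for the memset wrapper above: cuda-python's runtime bindings return status codes inside tuples rather than raising exceptions, which is why every call in this backend is followed by cuda_error_check. A minimal standalone sketch of allocating and memsetting a device buffer with that API (the buffer size is illustrative, error handling reduced to asserts):

```python
# A minimal sketch, assuming the cuda-python package (pip install cuda-python).
# kernel_tuner routes the returned status through cuda_error_check instead of asserts.
from cuda import cudart

nbytes = 4 * 1024 * 1024                  # illustrative buffer size (4 MiB)
err, dev_ptr = cudart.cudaMalloc(nbytes)  # returns (status, device pointer)
assert err == cudart.cudaError_t.cudaSuccess

(err,) = cudart.cudaMemset(dev_ptr, 1, nbytes)  # write 0x01 into every byte
assert err == cudart.cudaError_t.cudaSuccess

(err,) = cudart.cudaFree(dev_ptr)
assert err == cudart.cudaError_t.cudaSuccess
```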
kernel_tuner/backends/pycuda.py (2 changes: 1 addition & 1 deletion)

@@ -101,7 +101,7 @@ def _finish_up():
             str(k): v for (k, v) in self.context.get_device().get_attributes().items()
         }
         self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
-        self.cache_size_L2 = devprops["L2_CACHE_SIZE"]
+        self.cache_size_L2 = int(devprops["L2_CACHE_SIZE"])
         cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
             devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
         )
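A note on why all four backends now cast the L2 cache size to int: the flush logic in core.py (below) is gated on `isinstance(self.dev.cache_size_L2, int)`, and device attribute queries often come back as NumPy or ctypes scalars that fail that check. A minimal sketch of the pitfall, assuming NumPy and an illustrative value:

```python
# A minimal sketch of the isinstance pitfall; the 4 MiB value is illustrative.
import numpy as np

l2_size = np.int32(4 * 1024 * 1024)   # device attribute returned as a NumPy scalar
print(isinstance(l2_size, int))       # False: np.int32 is not a Python int
print(isinstance(int(l2_size), int))  # True once cast, so flushing is enabled
```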
kernel_tuner/core.py (10 changes: 6 additions & 4 deletions)

@@ -339,8 +339,10 @@ def __init__(
         self.max_threads = dev.max_threads
         self.flush_possible = lang.upper() not in ['OPENCL', 'HIP', 'C', 'FORTRAN'] and isinstance(self.dev.cache_size_L2, int) and self.dev.cache_size_L2 > 0
         if self.flush_possible:
-            t = np.int32
-            self.flush_array = np.zeros((self.dev.cache_size_L2 // t(0).itemsize), order='F').astype(t)
+            self.flush_type = np.uint8
+            size = (self.dev.cache_size_L2 // self.flush_type(0).itemsize)
+            # self.flush_array = np.zeros((size), order='F', dtype=self.flush_type)
+            self.flush_array = np.empty((size), order='F', dtype=self.flush_type)
+            self.flush_alloc = None
         if not quiet:
             print("Using: " + self.dev.name)

Collaborator suggested change on this hunk: remove the leftover commented-out line
    # self.flush_array = np.zeros((size), order='F', dtype=self.flush_type)
@@ -353,7 +355,7 @@ def flush_cache(self):
             self.dev.free_mem(self.flush_alloc)
         # inspired by https://github.com/NVIDIA/nvbench/blob/main/nvbench/detail/l2flush.cuh#L51
         self.flush_alloc = self.dev.allocate_ndarray(self.flush_array)
-        self.dev.memset(self.flush_alloc, value=0, size=self.flush_array.nbytes)
+        self.dev.memset(self.flush_alloc, value=1, size=self.flush_array.nbytes)

     def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True, recopy_arrays=None):
         """
@@ -367,7 +369,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True, recopy_arrays=None):
         ]

         self.dev.synchronize()
-        for _ in range(self.iterations):
+        for i in range(self.iterations):
             if flush_cache:
                 self.flush_cache()
             if recopy_arrays is not None:

Collaborator comment on the changed loop: "i doesn't seem to be used below. the for-loop on line 377 even defines its own i."
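Taken together, the benchmark loop flushes the L2 cache (and optionally recopies input arrays) before every timed iteration so each run starts cold. A condensed sketch of that control flow; the names flush_l2, recopy_inputs, and run_and_time are hypothetical stand-ins, not kernel_tuner's actual API:

```python
# A condensed sketch of the cold-cache benchmark loop; flush_l2, recopy_inputs,
# and run_and_time are hypothetical stand-ins for the methods shown in the diff.
def benchmark(run_and_time, iterations=7, flush_l2=None, recopy_inputs=None):
    times = []
    for _ in range(iterations):
        if flush_l2 is not None:
            flush_l2()           # evict previous results from L2 before timing
        if recopy_inputs is not None:
            recopy_inputs()      # restore inputs the kernel may have overwritten
        times.append(run_and_time())
    return times
```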