-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from KernelTuner/updated_kt_example
add simple vector_add example
- Loading branch information
Showing
4 changed files
with
77 additions
and
103 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#include "kernel_float.h" | ||
namespace kf = kernel_float; | ||
|
||
// Element-wise vector addition: c = a + b.
//
// Each thread processes one kf::vec holding `elements_per_thread` values of
// type `float_type`. Neither name is defined in this file; presumably both are
// supplied at compile time by the tuner from its tunable parameters (confirm
// against the host script).
//
// Parameters:
//   c  output vectors
//   a  first input vectors
//   b  second input vectors
//   n  total number of scalar elements (not the number of vectors)
//
// NOTE(review): a thread whose vector only partially lies below `n` still
// loads and stores the whole vector, so `n` is assumed to be a multiple of
// `elements_per_thread` (or the buffers padded accordingly) - confirm with
// the caller.
__global__ void vector_add(
    kf::vec<float_type, elements_per_thread>* c,
    const kf::vec<float_type, elements_per_thread>* a,
    const kf::vec<float_type, elements_per_thread>* b,
    int n
) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Do the bounds check in 64-bit: `i * elements_per_thread` can exceed
    // INT_MAX when the grid holds more threads than there are vectors, and
    // signed 32-bit overflow is undefined behaviour.
    if (static_cast<long long>(i) * elements_per_thread < n) {
        c[i] = a[i] + b[i];
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/usr/bin/env python | ||
import os | ||
|
||
import numpy | ||
from kernel_tuner import tune_kernel | ||
from kernel_tuner.accuracy import TunablePrecision, AccuracyObserver | ||
|
||
# Compiler options handed to Kernel Tuner when it builds the kernel: an include
# path pointing at the `include` directory one level above this script, and the
# C++17 language standard.
ROOT_DIR = os.path.split(os.path.abspath(__file__))[0] + "/../"
flags = ["-I" + ROOT_DIR + "/include", "-std=c++17"]
|
||
def tune():
    """Tune the ``vector_add`` kernel and report its accuracy per configuration.

    Generates two random double-precision input vectors, uses their NumPy sum
    as the reference answer, and asks Kernel Tuner to benchmark every
    combination of thread block size, floating-point type and number of
    elements per thread.

    Returns:
        The ``(results, env)`` pair returned by ``tune_kernel``.
    """
    # Prepare input data
    size = 100000000
    n = numpy.int32(size)
    a = numpy.random.randn(size).astype(numpy.float64)
    b = numpy.random.randn(size).astype(numpy.float64)
    c = numpy.zeros_like(b)

    # Prepare the argument list of the kernel. The arrays are wrapped in
    # TunablePrecision and tied to the "float_type" tunable parameter.
    args = [
        TunablePrecision("float_type", c),
        TunablePrecision("float_type", a),
        TunablePrecision("float_type", b),
        n,
    ]

    # Define the reference answer to compare the kernel output against.
    # Only the first argument (the output `c`) is checked.
    answer = [a + b, None, None, None]

    # Define the tunable parameters, in this case thread block size,
    # the type to use for the input and output data of our kernel,
    # and the number of elements handled by each thread
    tune_params = dict()
    tune_params["block_size_x"] = [64, 128, 256, 512]
    tune_params["float_type"] = ["half", "float", "double"]
    tune_params["elements_per_thread"] = [1, 2, 4, 8]

    # Observers will measure the error using either RMSE or MRE as error metric
    observers = [
        AccuracyObserver("RMSE", "error_rmse"),
        AccuracyObserver("MRE", "error_relative"),
    ]

    # The metrics here are only to ensure Kernel Tuner prints them to the console
    metrics = dict(RMSE=lambda p: p["error_rmse"], MRE=lambda p: p["error_relative"])

    results, env = tune_kernel(
        "vector_add",
        "vector_add.cu",
        size,
        args,
        tune_params,
        # Every thread processes `elements_per_thread` elements, so the grid
        # must shrink by that factor as well as by the block size. With the
        # default divisor (block size only) most launched threads would be
        # idle, which distorts the timings of the larger vector widths.
        grid_div_x=["block_size_x", "elements_per_thread"],
        answer=answer,
        observers=observers,
        metrics=metrics,
        lang="cupy",
        compiler_options=flags,
    )

    return results, env
|
||
|
||
# Start the tuning run only when this file is executed as a script.
if __name__ == "__main__":
    tune()