-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding notebook that shows that we can run a kernel on the GPU.
- Loading branch information
Showing
2 changed files
with
254 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,254 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"This example works through the Overview section of the CUDA Python documentation.\n", | ||
"\n", | ||
"https://nvidia.github.io/cuda-python/overview.html" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from cuda import cuda, nvrtc\n", | ||
"import numpy as np" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def ASSERT_DRV(err):\n", | ||
" if isinstance(err, cuda.CUresult):\n", | ||
" if err != cuda.CUresult.CUDA_SUCCESS:\n", | ||
" raise RuntimeError(\"Cuda Error: {}\".format(err))\n", | ||
" elif isinstance(err, nvrtc.nvrtcResult):\n", | ||
" if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:\n", | ||
" raise RuntimeError(\"Nvrtc Error: {}\".format(err))\n", | ||
" else:\n", | ||
" raise RuntimeError(\"Unknown error type: {}\".format(err))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"saxpy = \"\"\"\\\n", | ||
"extern \"C\" __global__\n", | ||
"void saxpy(float a, float *x, float *y, float *out, size_t n)\n", | ||
"{\n", | ||
" size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n", | ||
" if (tid < n) {\n", | ||
" out[tid] = a * x[tid] + y[tid];\n", | ||
" }\n", | ||
"}\n", | ||
"\"\"\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create program\n", | ||
"err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b\"saxpy.cu\", 0, [], [])\n", | ||
"\n", | ||
"# Compile program\n", | ||
"opts = [b\"--fmad=false\", b\"--gpu-architecture=compute_75\"]\n", | ||
"err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)\n", | ||
"\n", | ||
"# Get PTX from compilation\n", | ||
"err, ptxSize = nvrtc.nvrtcGetPTXSize(prog)\n", | ||
"ptx = b\" \" * ptxSize\n", | ||
"err, = nvrtc.nvrtcGetPTX(prog, ptx)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Initialize CUDA Driver API\n", | ||
"err, = cuda.cuInit(0)\n", | ||
"\n", | ||
"# Retrieve handle for device 0\n", | ||
"err, cuDevice = cuda.cuDeviceGet(0)\n", | ||
"\n", | ||
"# Create context\n", | ||
"err, context = cuda.cuCtxCreate(0, cuDevice)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Load PTX as module data and retrieve function\n", | ||
"ptx = np.char.array(ptx)\n", | ||
"# Note: Incompatible --gpu-architecture would be detected here\n", | ||
"err, module = cuda.cuModuleLoadData(ptx.ctypes.data)\n", | ||
"ASSERT_DRV(err)\n", | ||
"err, kernel = cuda.cuModuleGetFunction(module, b\"saxpy\")\n", | ||
"ASSERT_DRV(err)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"NUM_THREADS = 512 # Threads per block\n", | ||
"NUM_BLOCKS = 32768 # Blocks per grid\n", | ||
"\n", | ||
"a = np.array([2.0], dtype=np.float32)51515\n", | ||
"n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32)\n", | ||
"bufferSize = n * a.itemsize\n", | ||
"\n", | ||
"hX = np.random.rand(n).astype(dtype=np.float32)\n", | ||
"hY = np.random.rand(n).astype(dtype=np.float32)\n", | ||
"hOut = np.zeros(n).astype(dtype=np.float32)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"err, dXclass = cuda.cuMemAlloc(bufferSize)\n", | ||
"err, dYclass = cuda.cuMemAlloc(bufferSize)\n", | ||
"err, dOutclass = cuda.cuMemAlloc(bufferSize)\n", | ||
"\n", | ||
"err, stream = cuda.cuStreamCreate(0)\n", | ||
"\n", | ||
"err, = cuda.cuMemcpyHtoDAsync(\n", | ||
" dXclass, hX.ctypes.data, bufferSize, stream\n", | ||
")\n", | ||
"err, = cuda.cuMemcpyHtoDAsync(\n", | ||
" dYclass, hY.ctypes.data, bufferSize, stream\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# The following code example is not intuitive \n", | ||
"# Subject to change in a future release\n", | ||
"dX = np.array([int(dXclass)], dtype=np.uint64)\n", | ||
"dY = np.array([int(dYclass)], dtype=np.uint64)\n", | ||
"dOut = np.array([int(dOutclass)], dtype=np.uint64)\n", | ||
"\n", | ||
"args = [a, dX, dY, dOut, n]\n", | ||
"args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"err, = cuda.cuLaunchKernel(\n", | ||
" kernel,\n", | ||
" NUM_BLOCKS, # grid x dim\n", | ||
" 1, # grid y dim\n", | ||
" 1, # grid z dim\n", | ||
" NUM_THREADS, # block x dim\n", | ||
" 1, # block y dim\n", | ||
" 1, # block z dim\n", | ||
" 0, # dynamic shared memory\n", | ||
" stream, # stream\n", | ||
" args.ctypes.data, # kernel arguments\n", | ||
" 0, # extra (ignore)\n", | ||
")\n", | ||
"\n", | ||
"err, = cuda.cuMemcpyDtoHAsync(\n", | ||
" hOut.ctypes.data, dOutclass, bufferSize, stream\n", | ||
")\n", | ||
"err, = cuda.cuStreamSynchronize(stream)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"The results of running Saxpy on the GPU are stored in the `hOut` array.\n", | ||
"\n", | ||
"Next we perform the same calculation in python (numpy) and confirm that the results match." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Assert values are same after running kernel\n", | ||
"hZ = a * hX + hY\n", | ||
"if not np.allclose(hOut, hZ):\n", | ||
" raise ValueError(\"Error outside tolerance for host-device vectors\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"print(hOut[:10])\n", | ||
"print(hZ[:10])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"err, = cuda.cuStreamDestroy(stream)\n", | ||
"err, = cuda.cuMemFree(dXclass)\n", | ||
"err, = cuda.cuMemFree(dYclass)\n", | ||
"err, = cuda.cuMemFree(dOutclass)\n", | ||
"err, = cuda.cuModuleUnload(module)\n", | ||
"err, = cuda.cuCtxDestroy(context)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "mapals", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file was deleted.
Oops, something went wrong.