Adding notebook that shows that we can run a kernel on the GPU.

lincc-frameworks · Jun 13, 2024 · 7cfb2ca · 7cfb2ca
1 parent 53f3f4c
commit 7cfb2ca
Show file tree

Hide file tree

Showing 2 changed files with 254 additions and 84 deletions.
diff --git a/docs/notebooks/example-cuda-python.ipynb b/docs/notebooks/example-cuda-python.ipynb
@@ -0,0 +1,254 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This example works through the Overview section of the CUDA Python documentation.\n",
+    "\n",
+    "https://nvidia.github.io/cuda-python/overview.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cuda import cuda, nvrtc\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def ASSERT_DRV(err):\n",
+    "    if isinstance(err, cuda.CUresult):\n",
+    "        if err != cuda.CUresult.CUDA_SUCCESS:\n",
+    "            raise RuntimeError(\"Cuda Error: {}\".format(err))\n",
+    "    elif isinstance(err, nvrtc.nvrtcResult):\n",
+    "        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:\n",
+    "            raise RuntimeError(\"Nvrtc Error: {}\".format(err))\n",
+    "    else:\n",
+    "        raise RuntimeError(\"Unknown error type: {}\".format(err))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "saxpy = \"\"\"\\\n",
+    "extern \"C\" __global__\n",
+    "void saxpy(float a, float *x, float *y, float *out, size_t n)\n",
+    "{\n",
+    " size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n",
+    " if (tid < n) {\n",
+    "   out[tid] = a * x[tid] + y[tid];\n",
+    " }\n",
+    "}\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create program\n",
+    "err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b\"saxpy.cu\", 0, [], [])\n",
+    "\n",
+    "# Compile program\n",
+    "opts = [b\"--fmad=false\", b\"--gpu-architecture=compute_75\"]\n",
+    "err, = nvrtc.nvrtcCompileProgram(prog, 2, opts)\n",
+    "\n",
+    "# Get PTX from compilation\n",
+    "err, ptxSize = nvrtc.nvrtcGetPTXSize(prog)\n",
+    "ptx = b\" \" * ptxSize\n",
+    "err, = nvrtc.nvrtcGetPTX(prog, ptx)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize CUDA Driver API\n",
+    "err, = cuda.cuInit(0)\n",
+    "\n",
+    "# Retrieve handle for device 0\n",
+    "err, cuDevice = cuda.cuDeviceGet(0)\n",
+    "\n",
+    "# Create context\n",
+    "err, context = cuda.cuCtxCreate(0, cuDevice)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load PTX as module data and retrieve function\n",
+    "ptx = np.char.array(ptx)\n",
+    "# Note: Incompatible --gpu-architecture would be detected here\n",
+    "err, module = cuda.cuModuleLoadData(ptx.ctypes.data)\n",
+    "ASSERT_DRV(err)\n",
+    "err, kernel = cuda.cuModuleGetFunction(module, b\"saxpy\")\n",
+    "ASSERT_DRV(err)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NUM_THREADS = 512  # Threads per block\n",
+    "NUM_BLOCKS = 32768  # Blocks per grid\n",
+    "\n",
+    "a = np.array([2.0], dtype=np.float32)51515\n",
+    "n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32)\n",
+    "bufferSize = n * a.itemsize\n",
+    "\n",
+    "hX = np.random.rand(n).astype(dtype=np.float32)\n",
+    "hY = np.random.rand(n).astype(dtype=np.float32)\n",
+    "hOut = np.zeros(n).astype(dtype=np.float32)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "err, dXclass = cuda.cuMemAlloc(bufferSize)\n",
+    "err, dYclass = cuda.cuMemAlloc(bufferSize)\n",
+    "err, dOutclass = cuda.cuMemAlloc(bufferSize)\n",
+    "\n",
+    "err, stream = cuda.cuStreamCreate(0)\n",
+    "\n",
+    "err, = cuda.cuMemcpyHtoDAsync(\n",
+    "   dXclass, hX.ctypes.data, bufferSize, stream\n",
+    ")\n",
+    "err, = cuda.cuMemcpyHtoDAsync(\n",
+    "   dYclass, hY.ctypes.data, bufferSize, stream\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The following code example is not intuitive \n",
+    "# Subject to change in a future release\n",
+    "dX = np.array([int(dXclass)], dtype=np.uint64)\n",
+    "dY = np.array([int(dYclass)], dtype=np.uint64)\n",
+    "dOut = np.array([int(dOutclass)], dtype=np.uint64)\n",
+    "\n",
+    "args = [a, dX, dY, dOut, n]\n",
+    "args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "err, = cuda.cuLaunchKernel(\n",
+    "   kernel,\n",
+    "   NUM_BLOCKS,  # grid x dim\n",
+    "   1,  # grid y dim\n",
+    "   1,  # grid z dim\n",
+    "   NUM_THREADS,  # block x dim\n",
+    "   1,  # block y dim\n",
+    "   1,  # block z dim\n",
+    "   0,  # dynamic shared memory\n",
+    "   stream,  # stream\n",
+    "   args.ctypes.data,  # kernel arguments\n",
+    "   0,  # extra (ignore)\n",
+    ")\n",
+    "\n",
+    "err, = cuda.cuMemcpyDtoHAsync(\n",
+    "   hOut.ctypes.data, dOutclass, bufferSize, stream\n",
+    ")\n",
+    "err, = cuda.cuStreamSynchronize(stream)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The results of running Saxpy on the GPU are stored in the `hOut` array.\n",
+    "\n",
+    "Next we perform the same calculation in python (numpy) and confirm that the results match."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Assert values are same after running kernel\n",
+    "hZ = a * hX + hY\n",
+    "if not np.allclose(hOut, hZ):\n",
+    "   raise ValueError(\"Error outside tolerance for host-device vectors\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(hOut[:10])\n",
+    "print(hZ[:10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "err, = cuda.cuStreamDestroy(stream)\n",
+    "err, = cuda.cuMemFree(dXclass)\n",
+    "err, = cuda.cuMemFree(dYclass)\n",
+    "err, = cuda.cuMemFree(dOutclass)\n",
+    "err, = cuda.cuModuleUnload(module)\n",
+    "err, = cuda.cuCtxDestroy(context)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mapals",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/notebooks/intro_notebook.ipynb b/docs/notebooks/intro_notebook.ipynb