Dropout-Analytics · gumdropsteve · Jun 4, 2020
diff --git a/distributed/distributed_kmeans.ipynb b/distributed/distributed_kmeans.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Distributed K-Means"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can use `LocalCUDACluster` to start a Dask cluster on a single machine with one worker mapped to each GPU. This is called one-process-per-GPU (OPG)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dask.distributed import Client\n",
+    "from dask_cuda import LocalCUDACluster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One Dask worker per GPU, each restricted to a single thread; the client\n",
+    "# created here is what the later cells use to talk to that cluster.\n",
+    "cluster = LocalCUDACluster(threads_per_worker=1)\n",
+    "client = Client(cluster)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Generate Data\n",
+    "We can generate a `dask_cudf.DataFrame` of synthetic data containing multiple clusters using `cuml.dask.datasets.make_blobs`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cuml.dask.datasets import make_blobs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_samples = 1_000_000\n",
+    "n_features = 2\n",
+    "# One partition per Dask worker: has_what() returns a mapping with one entry\n",
+    "# per worker, so its length is the worker count.\n",
+    "n_total_partitions = len(client.has_what())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generating 1000000 samples across 1 partitions on 1 workers (total=1000000 samples)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Seed the generator so the synthetic blobs (and every result derived from\n",
+    "# them below) are reproducible across Restart & Run All.\n",
+    "X, y = make_blobs(n_samples,\n",
+    "                  n_features,\n",
+    "                  centers=5,\n",
+    "                  n_parts=n_total_partitions,\n",
+    "                  cluster_std=0.1,\n",
+    "                  random_state=100,\n",
+    "                  verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fit & Predict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cuml.dask.cluster.kmeans import KMeans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1.84 s, sys: 303 ms, total: 2.14 s\n",
+      "Wall time: 3.16 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<cuml.dask.cluster.kmeans.KMeans at 0x7fe7d1e08250>"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# Configure a 5-cluster model with k-means|| initialisation and a fixed seed,\n",
+    "# then fit it on the generated data.\n",
+    "kmeans = KMeans(\n",
+    "    client=client,\n",
+    "    init=\"k-means||\",\n",
+    "    n_clusters=5,\n",
+    "    random_state=100,\n",
+    ")\n",
+    "\n",
+    "kmeans.fit(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 122 ms, sys: 17.3 ms, total: 139 ms\n",
+      "Wall time: 253 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "preds = kmeans.predict(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "999995    3\n",
+       "999996    3\n",
+       "999997    3\n",
+       "999998    3\n",
+       "999999    3\n",
+       "dtype: int32"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds.tail()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cuml.metrics import adjusted_rand_score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Materialise the lazy Dask collections and take their underlying arrays so\n",
+    "# the true and predicted labels can be passed to the scoring function.\n",
+    "labels_true = y.compute().values\n",
+    "labels_pred = preds.compute().values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/winston/anaconda3/envs/bsql/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning:  The dtype of ground truth is not int32 converting the ground truth to int32\n",
+      "  \"\"\"Entry point for launching an IPython kernel.\n",
+      "/home/winston/anaconda3/envs/bsql/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning:  The dtype of predicted labels is not int32 converting the predicted labels to int32\n",
+      "  \"\"\"Entry point for launching an IPython kernel.\n",
+      "/home/winston/anaconda3/envs/bsql/lib/python3.7/site-packages/cuml/utils/input_utils.py:188: UserWarning: Expected column ('F') major order, but got the opposite. Converting data, this will result in additional memory utilization.\n",
+      "  warnings.warn(\"Expected \" + order_to_str(order) + \" major order, \"\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "1.0"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "adjusted_rand_score(labels_true, labels_pred)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}