From 6e4acac1f1a4c778350fc54d86fdbae6a0f6b7a2 Mon Sep 17 00:00:00 2001 From: Daniel Dowler Date: Thu, 10 Nov 2022 14:08:00 -0700 Subject: [PATCH] added yaml-python hybrid example --- examples/v1beta1/sdk/yaml-python-submit.ipynb | 529 ++++++++++++++++++ 1 file changed, 529 insertions(+) create mode 100644 examples/v1beta1/sdk/yaml-python-submit.ipynb diff --git a/examples/v1beta1/sdk/yaml-python-submit.ipynb b/examples/v1beta1/sdk/yaml-python-submit.ipynb new file mode 100644 index 00000000000..8f39ff55f04 --- /dev/null +++ b/examples/v1beta1/sdk/yaml-python-submit.ipynb @@ -0,0 +1,529 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# YAML-Python Katib Example Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Katib SDK exposes the power of Python programming to Katib users; however, the code structure is significantly more complex than in the case of direct YAML submission using kubectl.\n", + "\n", + "In contrast, direct Katib Experiment submission via YAML offers an easy-to-read (and to maintain) structure, but it is very basic, and it lacks advantages that the SDK route provides.\n", + "\n", + "You can combine the strenghts of both approaches. This is accomplished in the following steps:\n", + "1. Use a YAML file for simplicity and clarity in defining the basic experiment (like a config or template).\n", + "2. Convert the YAML to a Python dictionary that is compatible with the SDK.\n", + "3. Make any runtime changes.\n", + "4. Submit the experiment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following steps run through a simple example." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 0. Required Packages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before you begin, it is assumed you have access to Katib on a Kubernetes cluster. You also need to have the `kubeflow-katib` library installed, which you can do by uncommenting and running the command below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install kubeflow-katib if needed\n", + "# !pip install kubeflow-katib==0.14.0" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T19:58:59.047720Z", + "start_time": "2022-11-10T19:58:58.530992Z" + } + }, + "outputs": [], + "source": [ + "import yaml\n", + "import requests\n", + "import time\n", + "import datetime as dt\n", + "import kubeflow.katib as katib" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Katib Experiment in YAML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You need an experiment to do a test run with. Use the `random.yaml` experiment from the examples in the Katib GitHub repository. [here](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/hp-tuning/random.yaml)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T19:58:59.425994Z", + "start_time": "2022-11-10T19:58:59.073058Z" + } + }, + "outputs": [], + "source": [ + "# download the random.yaml Katib Experiment file\n", + "url = \"https://raw.githubusercontent.com/kubeflow/katib/master/examples/v1beta1/hp-tuning/random.yaml\"\n", + "random_yaml = requests.get(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T19:58:59.456539Z", + "start_time": "2022-11-10T19:58:59.454277Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---\n", + "apiVersion: kubeflow.org/v1beta1\n", + "kind: Experiment\n", + "metadata:\n", + " namespace: kubeflow\n", + " name: random\n", + "spec:\n", + " objective:\n", + " type: maximize\n", + " goal: 0.99\n", + " objectiveMetricName: Validation-accuracy\n", + " \n" + ] + } + ], + "source": [ + "# inspect first lines\n", + "print(random_yaml.text[:200])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Load the Experiment into Python" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, turn the YAML text into a Python dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T19:58:59.702433Z", + "start_time": "2022-11-10T19:58:59.691538Z" + } + }, + "outputs": [], + "source": [ + "experiment = yaml.safe_load(random_yaml.text)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T19:59:00.427462Z", + "start_time": "2022-11-10T19:59:00.417890Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'apiVersion': 'kubeflow.org/v1beta1',\n", + " 'kind': 'Experiment',\n", + " 'metadata': {'namespace': 'kubeflow', 'name': 'random'},\n", + " 'spec': {'objective': {'type': 'maximize',\n", + " 'goal': 0.99,\n", + " 'objectiveMetricName': 'Validation-accuracy',\n", + " 'additionalMetricNames': ['Train-accuracy']},\n", + " 'algorithm': {'algorithmName': 'random'},\n", + " 'parallelTrialCount': 3,\n", + " 'maxTrialCount': 12,\n", + " 'maxFailedTrialCount': 3,\n", + " 'parameters': [{'name': 'lr',\n", + " 'parameterType': 'double',\n", + " 'feasibleSpace': {'min': '0.01', 'max': '0.03'}},\n", + " {'name': 'num-layers',\n", + " 'parameterType': 'int',\n", + " 'feasibleSpace': {'min': '2', 'max': '5'}},\n", + " {'name': 'optimizer',\n", + " 'parameterType': 'categorical',\n", + " 'feasibleSpace': {'list': ['sgd', 'adam', 'ftrl']}}],\n", + " 'trialTemplate': {'primaryContainerName': 'training-container',\n", + " 'trialParameters': [{'name': 'learningRate',\n", + " 'description': 'Learning rate for the training model',\n", + " 'reference': 'lr'},\n", + " {'name': 'numberLayers',\n", + " 'description': 'Number of training model layers',\n", + " 'reference': 'num-layers'},\n", + " {'name': 'optimizer',\n", + " 'description': 'Training model optimizer (sdg, adam or ftrl)',\n", + " 'reference': 'optimizer'}],\n", + " 'trialSpec': {'apiVersion': 'batch/v1',\n", + " 'kind': 'Job',\n", + " 'spec': {'template': {'spec': {'containers': [{'name': 'training-container',\n", + " 'image': 'docker.io/kubeflowkatib/mxnet-mnist:latest',\n", + " 'command': ['python3',\n", + " '/opt/mxnet-mnist/mnist.py',\n", + " '--batch-size=64',\n", + " '--lr=${trialParameters.learningRate}',\n", + " '--num-layers=${trialParameters.numberLayers}',\n", + " '--optimizer=${trialParameters.optimizer}']}],\n", + " 'restartPolicy': 'Never'}}}}}}}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Make any needed Edits" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's common for users (or apps) to need to change parameters at runtime. The following code appends the current timestamp to the Experiment name, and then it updates the `maxTrialCount` and `parallelTrialCount` values.\n", + "\n", + "The code also sets the namespace. If the namespace `kubeflow-user-example-com` doesn't exist on your cluster, you can create it prior to continuing, or you can update the line below with your own namespace. Make sure the namespace you use has the label `katib.kubeflow.org/metrics-collector-injection: enabled`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T19:59:01.475245Z", + "start_time": "2022-11-10T19:59:01.473655Z" + } + }, + "outputs": [], + "source": [ + "# uncomment and run commands if needed to create/label namespace\n", + "# !kubectl create namespace kubeflow-user-example-com\n", + "# !kubectl label namespace kubeflow-user-example-com katib.kubeflow.org/metrics-collector-injection=enabled" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T19:59:05.479741Z", + "start_time": "2022-11-10T19:59:05.475708Z" + } + }, + "outputs": [], + "source": [ + "ns = \"kubeflow-user-example-com\" # change namespace if needed\n", + "\n", + "dtime = dt.datetime.now().strftime(\"%Y-%m-%d-%M%H%S\")\n", + "exp_name = f\"{experiment['metadata']['name']}-{dtime}\"\n", + "experiment[\"metadata\"][\"name\"] = exp_name\n", + "experiment[\"metadata\"][\"namespace\"] = ns\n", + "experiment[\"metadata\"][\"labels\"] = {\"katib.kubeflow.org/metrics-collector-injection\": \"enabled\"}\n", + "experiment[\"spec\"][\"maxTrialCount\"] = 10\n", + "experiment[\"spec\"][\"parallelTrialCount\"] = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Submit Katib Experiment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Submit the updated experiment to Katib." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T19:59:08.144505Z", + "start_time": "2022-11-10T19:59:08.057260Z" + } + }, + "outputs": [], + "source": [ + "client = katib.KatibClient()\n", + "ns = \"kubeflow-user-example-com\" # change namespace if needed\n", + "result = client.create_experiment(experiment, namespace=ns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following code monitors the progress of the experiment until it either succeeds or fails, or the timeout is reached. You may also wish to visit the Katib UI to check on the experiment there." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T20:08:03.644480Z", + "start_time": "2022-11-10T19:59:32.148986Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "12:59:32 Experiment status: Created Trials Succeeded: None\n", + "12:59:52 Experiment status: Running Trials Succeeded: None\n", + "13:02:52 Experiment status: Running Trials Succeeded: 1\n", + "13:03:42 Experiment status: Running Trials Succeeded: 2\n", + "13:04:32 Experiment status: Running Trials Succeeded: 4\n", + "13:05:13 Experiment status: Running Trials Succeeded: 5\n", + "13:05:23 Experiment status: Running Trials Succeeded: 6\n", + "13:06:03 Experiment status: Running Trials Succeeded: 8\n", + "13:07:53 Experiment status: Running Trials Succeeded: 9\n", + "13:08:03 Experiment status: Succeeded Trials Succeeded: 10\n" + ] + } + ], + "source": [ + "timeout = 30*60 # 30 minutes in seconds\n", + "status = None\n", + "prev_time = dt.datetime.now()\n", + "last_msg = \"\"\n", + "while status not in [\"Succeeded\", \"Failed\"] and timeout > 0:\n", + " try:\n", + " status = client.get_experiment_status(exp_name, namespace=ns)\n", + " except IndexError:\n", + " status = None\n", + " exp = client.get_experiment(exp_name, namespace=ns)\n", + " trials_success = exp.get(\"status\", {}).get(\"trialsSucceeded\")\n", + " msg = f\"Experiment status: {status} Trials Succeeded: {trials_success}\"\n", + " tstamp = dt.datetime.now()\n", + " if msg != last_msg:\n", + " print(f\"{tstamp.strftime('%H:%M:%S')} {msg}\")\n", + " last_msg = msg\n", + " if status in [\"Succeeded\", \"Failed\"]: break\n", + " time.sleep(10)\n", + " cur_time = dt.datetime.now()\n", + " timeout -= (tstamp - prev_time).total_seconds()\n", + " prev_time = tstamp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can check that the experiment succeeded, and we can get the optimal hyperparameter results." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T20:08:03.710673Z", + "start_time": "2022-11-10T20:08:03.695496Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.is_experiment_succeeded(exp_name, ns)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2022-11-10T20:08:13.305142Z", + "start_time": "2022-11-10T20:08:13.284595Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'currentOptimalTrial': {'bestTrialName': 'random-2022-11-10-591205-bvk9hdk8',\n", + " 'observation': {'metrics': [{'latest': '0.979896',\n", + " 'max': '0.979896',\n", + " 'min': '0.958499',\n", + " 'name': 'Validation-accuracy'},\n", + " {'latest': '0.993737',\n", + " 'max': '0.993737',\n", + " 'min': '0.918094',\n", + " 'name': 'Train-accuracy'}]},\n", + " 'parameterAssignments': [{'name': 'lr', 'value': '0.024085346582757364'},\n", + " {'name': 'num-layers', 'value': '3'},\n", + " {'name': 'optimizer', 'value': 'sgd'}]}}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.get_optimal_hyperparameters(exp_name, namespace=ns)" + ] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + }, + "vscode": { + "interpreter": { + "hash": "2d81f1fd38f5c5fd601fceae64af1bda578cf2f895366126afd7299adc703a30" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}