From 0440fc383b7b900ed7623b6f8c96ce8680579003 Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Wed, 2 Aug 2023 10:53:44 -0400 Subject: [PATCH] [SDK] Enable resource specification for trial containers Co-authored-by: shipengcheng1230 Co-authored-by: Andrey Velichkevich --- .../kubeflow/katib/api/katib_client.py | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py index 0929303cff9..02f12104133 100644 --- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py +++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py @@ -16,7 +16,7 @@ import multiprocessing import textwrap import time -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Union import grpc import kubeflow.katib.katib_api_pb2 as katib_api_pb2 @@ -144,6 +144,7 @@ def tune( max_trial_count: int = None, parallel_trial_count: int = None, max_failed_trial_count: int = None, + resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None, retain_trials: bool = False, packages_to_install: List[str] = None, pip_index_url: str = "https://pypi.org/simple", @@ -177,6 +178,21 @@ def tune( values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec. parallel_trial_count: Number of Trials that Experiment runs in parallel. max_failed_trial_count: Maximum number of Trials allowed to fail. + resources_per_trial: A parameter that lets you specify how many + resources each trial container should have. You can either specify a + kubernetes.client.V1ResourceRequirements object (documented here: + https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md) + or a dictionary that includes one or more of the following keys: + `cpu`, `memory`, or `gpu` (`gpu` maps to `nvidia.com/gpu`; other keys are passed through as-is). 
Appropriate + values for these keys are documented here: + https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/. + For example: + { + "cpu": "1", + "gpu": "1", + "memory": "2Gi", + } + This parameter is optional and defaults to None. retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state. packages_to_install: List of Python packages to install in addition to the base image packages. These packages are installed before @@ -280,6 +296,20 @@ def tune( + exec_script ) + if isinstance(resources_per_trial, dict): + requests = { + "cpu": "200m", + "memory": "256Mi", + } + if "gpu" in resources_per_trial: + resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu") + requests.update(resources_per_trial) + + resources_per_trial = client.V1ResourceRequirements( + requests=requests, + limits=requests, + ) + # Create Trial specification. trial_spec = client.V1Job( api_version="batch/v1", @@ -297,6 +327,7 @@ def tune( image=base_image, command=["bash", "-c"], args=[exec_script], + resources=resources_per_trial, ) ], ),