From bdf0bf57aecf845bfb04b7d02eebad2e93de9f9d Mon Sep 17 00:00:00 2001
From: droctothorpe <alexander.perlman@capitalone.com>
Date: Wed, 2 Aug 2023 10:53:44 -0400
Subject: [PATCH] [SDK] Enable resource specification for trial containers

Co-authored-by: shipengcheng1230 <shipengcheng1230@gmail.com>
Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
---
 .../kubeflow/katib/api/katib_client.py        | 36 ++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
index 0929303cff9..99000fd427c 100644
--- a/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
+++ b/sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
@@ -16,7 +16,7 @@
 import multiprocessing
 import textwrap
 import time
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import grpc
 import kubeflow.katib.katib_api_pb2 as katib_api_pb2
@@ -144,6 +144,7 @@ def tune(
         max_trial_count: int = None,
         parallel_trial_count: int = None,
         max_failed_trial_count: int = None,
+        resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None,
         retain_trials: bool = False,
         packages_to_install: List[str] = None,
         pip_index_url: str = "https://pypi.org/simple",
@@ -177,6 +178,21 @@ def tune(
                 values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec.
             parallel_trial_count: Number of Trials that Experiment runs in parallel.
             max_failed_trial_count: Maximum number of Trials allowed to fail.
+            resources_per_trial: A parameter that lets you specify how many
+            resources each trial container should have. You can either specify a
+            kubernetes.client.V1ResourceRequirements object (documented here:
+            https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
+            or a dictionary that includes one or more of the following keys:
+            `cpu`, `memory`, or `gpu` (`gpu` is sent as `nvidia.com/gpu`; other keys
+            are passed through unchanged). Appropriate values are documented here:
+            https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
+            For example:
+                {
+                    "cpu": "1",
+                    "gpu": "1",
+                    "memory": "2Gi",
+                }
+            This parameter is optional and defaults to None.
             retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state.
             packages_to_install: List of Python packages to install in addition
                 to the base image packages. These packages are installed before
@@ -280,6 +296,23 @@ def tune(
                 + exec_script
             )
 
+        resources = client.V1ResourceRequirements()
+        if isinstance(resources_per_trial, dict):
+            requests = {
+                "cpu": "200m",
+                "memory": "256Mi",
+            }
+            if "gpu" in resources_per_trial:
+                resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu")
+            requests.update(resources_per_trial)
+
+            resources = client.V1ResourceRequirements(
+                requests=requests,
+                limits=requests,
+            )
+        else:
+            resources = resources_per_trial
+
         # Create Trial specification.
         trial_spec = client.V1Job(
             api_version="batch/v1",
@@ -297,6 +330,7 @@ def tune(
                                 image=base_image,
                                 command=["bash", "-c"],
                                 args=[exec_script],
+                                resources=resources,
                             )
                         ],
                     ),