Skip to content

Commit

Permalink
[SDK] Enable resource specification for trial containers
Browse files Browse the repository at this point in the history
Co-authored-by: shipengcheng1230 <[email protected]>
Co-authored-by: Andrey Velichkevich <[email protected]>
  • Loading branch information
3 people committed Aug 4, 2023
1 parent c749d27 commit bdf0bf5
Showing 1 changed file with 35 additions and 1 deletion.
36 changes: 35 additions & 1 deletion sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import multiprocessing
import textwrap
import time
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Callable, Dict, List, Optional, Union

import grpc
import kubeflow.katib.katib_api_pb2 as katib_api_pb2
Expand Down Expand Up @@ -144,6 +144,7 @@ def tune(
max_trial_count: int = None,
parallel_trial_count: int = None,
max_failed_trial_count: int = None,
resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None,
retain_trials: bool = False,
packages_to_install: List[str] = None,
pip_index_url: str = "https://pypi.org/simple",
Expand Down Expand Up @@ -177,6 +178,21 @@ def tune(
values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec.
parallel_trial_count: Number of Trials that Experiment runs in parallel.
max_failed_trial_count: Maximum number of Trials allowed to fail.
resources_per trial: A parameter that lets you specify how much
resources each trial container should have. You can either specify a
kubernetes.client.V1ResourceRequirements object (documented here:
https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
or a dictionary that includes one or more of the following keys:
`cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate
values for these keys are documented here:
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
For example:
{
"cpu": "1",
"gpu": "1",
"memory": "2Gi",
}
This parameter is optional and defaults to None.
retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state.
packages_to_install: List of Python packages to install in addition
to the base image packages. These packages are installed before
Expand Down Expand Up @@ -280,6 +296,23 @@ def tune(
+ exec_script
)

resources = client.V1ResourceRequirements()
if isinstance(resources_per_trial, dict):
requests = {
"cpu": "200m",
"memory": "256Mi",
}
if "gpu" in resources_per_trial:
resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu")
requests.update(resources_per_trial)

resources = client.V1ResourceRequirements(
requests=requests,
limits=requests,
)
else:
resources = resources_per_trial

# Create Trial specification.
trial_spec = client.V1Job(
api_version="batch/v1",
Expand All @@ -297,6 +330,7 @@ def tune(
image=base_image,
command=["bash", "-c"],
args=[exec_script],
resources=resources,
)
],
),
Expand Down

0 comments on commit bdf0bf5

Please sign in to comment.