Skip to content

Commit 64527de

Browse files
committed
Merge branch 'develop' into chzheng/docker_image_flag
2 parents c919384 + 504bf65 commit 64527de

File tree

11 files changed

+21
-33
lines changed

11 files changed

+21
-33
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,15 @@ XPK supports the following TPU types:
4141
* v5e
4242
* v5p
4343
* Trillium (v6e)
44+
* Ironwood (tpu7x)
4445

4546
and the following GPU types:
4647
* A100
4748
* A3-Highgpu (h100)
4849
* A3-Mega (h100-mega) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
4950
* A3-Ultra (h200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
5051
* A4 (b200) - [Create cluster](#provisioning-a3-ultra-a3-mega-and-a4-clusters-gpu-machines), [Create workloads](#workloads-for-a3-ultra-a3-mega-and-a4-clusters-gpu-machines)
52+
* A4X (gb200)
5153

5254
and the following CPU types:
5355
* n2-standard-32

golden_buddy.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ GREEN='\033[0;32m'
8383
YELLOW='\033[0;33m'
8484
NC='\033[0m'
8585

86+
if ! command -v yq &> /dev/null; then
87+
echo -e "${RED}Error: 'yq' command not found. Please install yq to continue.${NC}" >&2
88+
exit 1
89+
fi
90+
8691
if [[ "$MODE" != "update" && "$MODE" != "verify" ]]; then
8792
echo "Error: Unsupported mode '$MODE'. Must be 'update' or 'verify'." >&2
8893
exit 1

src/xpk/commands/run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def submit_job(args: Namespace) -> None:
126126
if args.time is not None:
127127
cmd += f' --time {args.time}'
128128

129-
return_code = run_command_with_full_controls(cmd, 'run task', args)
129+
return_code = run_command_with_full_controls(cmd, 'run task')
130130

131131
if return_code != 0:
132132
xpk_print(f'Running task returned ERROR {return_code}')

src/xpk/commands/shell.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@ def shell(args: Namespace):
3838
if exisitng_shell_pod_name is None:
3939
return_code = connect_to_new_interactive_shell(args)
4040
else:
41-
return_code = connect_to_existing_interactive_shell(
42-
exisitng_shell_pod_name, args
43-
)
41+
return_code = connect_to_existing_interactive_shell(exisitng_shell_pod_name)
4442

4543
if return_code != 0:
4644
xpk_print(f'The command failed with code {return_code}.')
@@ -94,21 +92,17 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
9492
return run_command_with_full_controls(
9593
command=cmd,
9694
task='Creating new interactive shell and entering it',
97-
global_args=args,
9895
instructions=exit_instructions,
9996
)
10097

10198

102-
def connect_to_existing_interactive_shell(
103-
pod_name: str, args: Namespace
104-
) -> int:
99+
def connect_to_existing_interactive_shell(pod_name: str) -> int:
105100
return run_command_with_full_controls(
106101
command=(
107102
f'kubectl exec --stdin --tty {pod_name} --'
108103
f' {get_pod_template_interactive_command()}'
109104
),
110105
task='Entering existing interactive shell',
111-
global_args=args,
112106
instructions=exit_instructions,
113107
)
114108

src/xpk/commands/workload.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,6 @@ def workload_delete(args) -> None:
732732
'Delete Workload',
733733
task_names,
734734
batch=100,
735-
dry_run=args.dry_run,
736735
)
737736

738737
if return_code != 0:

src/xpk/core/commands.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,21 @@
1818
import subprocess
1919
import sys
2020
import time
21-
from argparse import Namespace
2221

2322
from ..utils.objects import chunks
2423
from ..utils.file import make_tmp_files, write_tmp_file
2524
from ..utils.console import xpk_print
2625
from ..utils.execution_context import is_dry_run
2726

2827

29-
def run_commands(commands, jobname, per_command_name, batch=10, dry_run=False):
28+
def run_commands(commands, jobname, per_command_name, batch=10):
3029
"""Run commands in groups of `batch`.
3130
3231
Args:
3332
commands: list of command.
3433
jobname: the name of the job.
3534
per_command_name: list of command names.
3635
batch: number of commands to run in parallel.
37-
dry_run: enables dry_run if set to true.
3836
3937
Returns:
4038
0 if successful and 1 otherwise.
@@ -47,7 +45,7 @@ def run_commands(commands, jobname, per_command_name, batch=10, dry_run=False):
4745
f'Breaking up a total of {len(commands)} commands into'
4846
f' {len(commands_batched)} batches'
4947
)
50-
if dry_run:
48+
if is_dry_run():
5149
xpk_print('Pretending all the jobs succeeded')
5250
return 0
5351

@@ -302,7 +300,6 @@ def run_command_for_value(
302300
def run_command_with_full_controls(
303301
command: str,
304302
task: str,
305-
global_args: Namespace,
306303
instructions: str | None = None,
307304
) -> int:
308305
"""Run command in current shell with system out, in and error handles. Wait
@@ -311,13 +308,12 @@ def run_command_with_full_controls(
311308
Args:
312309
command: command to execute
313310
task: user-facing name of the task
314-
global_args: user provided arguments for running the command.
315311
verbose: shows stdout and stderr if set to true. Set to True by default.
316312
317313
Returns:
318314
0 if successful and 1 otherwise.
319315
"""
320-
if global_args.dry_run:
316+
if is_dry_run():
321317
xpk_print(
322318
f'Task: `{task}` is implemented by the following command'
323319
' not running since it is a dry run.'

src/xpk/core/docker_image.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import datetime
1818
import os
1919
import random
20-
import re
2120
import string
2221
import subprocess
2322

@@ -195,7 +194,10 @@ def setup_docker_image(args) -> tuple[int, str]:
195194
docker_image = DEFAULT_DOCKER_IMAGE
196195

197196
result = subprocess.run(
198-
["docker", "images", "-q", docker_image], capture_output=True, text=True
197+
['docker', 'images', '-q', docker_image],
198+
capture_output=True,
199+
text=True,
200+
check=True,
199201
)
200202
is_local = bool(result.stdout.strip())
201203

src/xpk/core/nap.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,6 @@ def enable_autoprovisioning_on_cluster(
156156
commands,
157157
'Update node pools with autoprovisioning support',
158158
task_names,
159-
dry_run=args.dry_run,
160159
)
161160
if max_return_code != 0:
162161
xpk_print(

src/xpk/core/nodepool.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,6 @@ def run_gke_node_pool_create_command(
212212
delete_commands,
213213
'Delete Nodepools',
214214
delete_task_names,
215-
dry_run=args.dry_run,
216215
)
217216
if max_return_code != 0:
218217
xpk_print(f'Delete Nodepools returned ERROR {max_return_code}')
@@ -240,7 +239,6 @@ def run_gke_node_pool_create_command(
240239
update_WI_commands,
241240
'Enable Workload Identity on existing Nodepools',
242241
update_WI_task_names,
243-
dry_run=args.dry_run,
244242
)
245243
if max_return_code != 0:
246244
xpk_print(
@@ -265,9 +263,7 @@ def run_gke_node_pool_create_command(
265263
)
266264
configmap_yml = {}
267265
configmap_yml[resources_configmap_name] = resources_yml
268-
return_code = create_or_update_cluster_configmap(
269-
configmap_yml, args.dry_run
270-
)
266+
return_code = create_or_update_cluster_configmap(configmap_yml)
271267
if return_code != 0:
272268
return 1
273269

@@ -369,7 +365,6 @@ def run_gke_node_pool_create_command(
369365
create_commands,
370366
'Create Nodepools',
371367
create_task_names,
372-
dry_run=args.dry_run,
373368
)
374369
if max_return_code != 0:
375370
xpk_print(f'Create Nodepools returned ERROR {max_return_code}')
@@ -582,7 +577,6 @@ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
582577
commands,
583578
'Update GKE node pools to default RAPID GKE version',
584579
task_names,
585-
dry_run=args.dry_run,
586580
)
587581
if max_return_code != 0:
588582
xpk_print(

src/xpk/core/pathways.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
326326
return_code = run_command_with_updates(commands[0], 'Delete Workload')
327327
else:
328328
return_code = run_commands(
329-
commands, 'Delete Workload', task_names, batch=100, dry_run=args.dry_run
329+
commands, 'Delete Workload', task_names, batch=100
330330
)
331331

332332
if return_code != 0:

0 commit comments

Comments
 (0)