From 6e5d23ab913d05aff7478097de011b259f1c85f2 Mon Sep 17 00:00:00 2001 From: Avritt Rohwer Date: Wed, 18 Sep 2024 04:51:55 +0000 Subject: [PATCH 1/2] Fix autoprovisioning with spot nodes --- src/xpk/commands/workload.py | 10 +++ src/xpk/core/core.py | 2 +- src/xpk/core/nap.py | 138 +++++++++++++++++++++++++++-------- 3 files changed, 117 insertions(+), 33 deletions(-) diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 3c6c6563..e41c7ba0 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -48,6 +48,7 @@ from ..core.kueue import LOCAL_QUEUE_NAME from ..core.nap import ( get_autoprovisioning_node_selector_args, + get_autoprovisioning_tolerations, is_autoprovisioning_enabled, ) from ..core.pathways import ( @@ -101,6 +102,8 @@ hostNetwork: true dnsPolicy: ClusterFirstWithHostNet terminationGracePeriodSeconds: {args.termination_grace_period_seconds} + tolerations: + {autoprovisioning_tolerations} containers: {container} volumes: @@ -395,6 +398,7 @@ def workload_create(args) -> None: # Currently autoprovisioning is not enabled for Pathways workloads. autoprovisioning_args = '' + autoprovisioning_tolerations = '' autoprovisioning_enabled, return_code = is_autoprovisioning_enabled( args, system ) @@ -407,6 +411,11 @@ def workload_create(args) -> None: ) if return_code != 0: xpk_exit(return_code) + autoprovisioning_tolerations, return_code = ( + get_autoprovisioning_tolerations(args) + ) + if return_code != 0: + xpk_exit(return_code) # Create the workload file based on accelerator type or workload type. if system.accelerator_type == AcceleratorType['GPU']: @@ -467,6 +476,7 @@ def workload_create(args) -> None: local_queue_name=LOCAL_QUEUE_NAME, autoprovisioning_args=autoprovisioning_args, volumes=get_volumes(args, system), + autoprovisioning_tolerations=autoprovisioning_tolerations, ) tmp = write_tmp_file(yml_string) command = f'kubectl apply -f {str(tmp.file.name)}' diff --git a/src/xpk/core/core.py b/src/xpk/core/core.py index 91db6298..e43fa62a 100644 --- a/src/xpk/core/core.py +++ b/src/xpk/core/core.py @@ -763,7 +763,7 @@ def get_capacity_node_selectors_from_capacity_type( case CapacityType.ON_DEMAND.name: node_selector = '' case CapacityType.SPOT.name: - node_selector = 'cloud.google.com/gke-spot="true"' + node_selector = 'cloud.google.com/gke-spot: "true"' case CapacityType.RESERVATION.name: node_selector = f'cloud.google.com/reservation-name: {args.reservation}' case _: diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index b6021e96..08aa37aa 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -240,6 +240,29 @@ def create_autoprovisioning_config( return autoprovisioning_config, 0 +def get_cluster_metadata_configmap(args) -> tuple[dict, int]: + """Gets the cluster metadata configmap. + + Args: + args: user provided arguments for running the command. + + Returns: + configmap and 0 if found, None and 1 otherwise. + """ + configmap = get_cluster_configmap( + args, f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' + ) + if configmap is None: + xpk_print( + 'Unable to find config map. Please specify a capacity type' + ' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue' + ' to use autoprovisioning (--enable-autoprovisioning).' + ) + return None, 1 + + return configmap, 0 + + def is_autoprovisioning_enabled( args, system: SystemCharacteristics ) -> tuple[bool, int]: @@ -285,6 +308,42 @@ def is_autoprovisioning_enabled( return False, 1 +def get_capacity_type_str_from_args_or_cluster_default(args) -> tuple[str, int]: + """Determine the capacity type based on user arguments or cluster default. + + Args: + args: user provided arguments for running the command. + + Returns: + Tuple with string with the system characteristics and + int of 0 if successful and 1 otherwise. + """ + # If the user doesn't specify args, then use the cluster settings. + capacity_type, return_code = get_capacity_type(args) + if return_code != 0: + xpk_print('Unable to get capacity type.') + return CapacityType.UNKNOWN.name, return_code + + if capacity_type != CapacityType.UNKNOWN: + return capacity_type.name, 0 + + # Use default settings from cluster creation. + # + # Error out if the metadata config map doesn't exist, and is attempting to use + # autoprovisioning. + cluster_config_map, return_code = get_cluster_metadata_configmap(args) + if return_code != 0: + return CapacityType.UNKNOWN.name, 1 + + return_code, capacity_type_str = get_value_from_map( + CAPACITY_TYPE_CONFIG_KEY, cluster_config_map + ) + if return_code != 0: + return CapacityType.UNKNOWN.name, return_code + + return capacity_type_str, 0 + + def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]: """Determine the capacity type when autoprovisioning is enabled. @@ -297,44 +356,26 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]: """ return_code = 0 node_selector_args = '' - # If the user doesn't specify args, then use the cluster settings. - capacity_type, return_code = get_capacity_type(args) - capacity_type_str = capacity_type.name + capacity_type_str, return_code = ( + get_capacity_type_str_from_args_or_cluster_default(args) + ) if return_code != 0: - xpk_print('Unable to get capacity type.') return node_selector_args, return_code - if capacity_type_str == CapacityType.UNKNOWN.name: - # Use default settings from cluster creation. - metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' - cluster_config_map = get_cluster_configmap(args, metadata_configmap_name) - - # Error out if the metadata config map doesn't exist, and is attempting to use - # autoprovisioning. - if cluster_config_map is None: - xpk_print( - 'Unable to find config map. Please specify a capacity type' - ' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue' - ' to use autoprovisioning (--enable-autoprovisioning).' - ) - return node_selector_args, 1 - - return_code, capacity_type_str = get_value_from_map( - CAPACITY_TYPE_CONFIG_KEY, cluster_config_map + cluster_config_map, return_code = get_cluster_metadata_configmap(args) + if return_code != 0: + return node_selector_args, 1 + + if capacity_type_str == CapacityType.RESERVATION.name: + return_code, args.reservation = get_value_from_map( + RESERVATION_CONFIG_KEY, cluster_config_map ) if return_code != 0: return node_selector_args, return_code - - if capacity_type_str == CapacityType.RESERVATION.name: - return_code, args.reservation = get_value_from_map( - RESERVATION_CONFIG_KEY, cluster_config_map - ) - if return_code != 0: - return node_selector_args, return_code - return_code = verify_reservation_exists(args) - if return_code > 0: - xpk_print('Unable to verify reservation name saved in config map.') - return node_selector_args, return_code + return_code = verify_reservation_exists(args) + if return_code > 0: + xpk_print('Unable to verify reservation name saved in config map.') + return node_selector_args, return_code # Check if reservation id is valid. Shared function with cluster creation. node_selector_args, return_code = ( @@ -345,3 +386,36 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]: return node_selector_args, return_code return node_selector_args, return_code + + +def get_autoprovisioning_tolerations(args) -> tuple[str, int]: + """Determine the pod tolerations when autoprovisioning is enabled. + + Args: + args: user provided arguments for running the command. + + Returns: + Tuple with string of autoprovisioning tolerations and + int of 0 if successful and 1 otherwise. + """ + capacity_type_str, return_code = ( + get_capacity_type_str_from_args_or_cluster_default(args) + ) + if return_code != 0: + return '', return_code + + if capacity_type_str == CapacityType.SPOT.name: + # https://cloud.google.com/kubernetes-engine/docs/concepts/node-auto-provisioning#support_for_spot_vms + # + # > Creating node pools based on Spot VMs is only considered if + # > unschedulable pods with a toleration for the + # > cloud.google.com/gke-spot="true":NoSchedule taint exist + return ( + '''- key: "cloud.google.com/gke-spot" + operator: "Equal" + value: "true" + effect: "NoSchedule"''', + 0, + ) + + return '', 0 From 0e37adffb9b4397eeda0b1fad602fd9be0d7b7ef Mon Sep 17 00:00:00 2001 From: Avritt Rohwer Date: Wed, 25 Sep 2024 14:34:20 +0000 Subject: [PATCH 2/2] Remove get_cluster_metadata_configmap --- src/xpk/core/nap.py | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/src/xpk/core/nap.py b/src/xpk/core/nap.py index 08aa37aa..a6192dec 100644 --- a/src/xpk/core/nap.py +++ b/src/xpk/core/nap.py @@ -240,29 +240,6 @@ def create_autoprovisioning_config( return autoprovisioning_config, 0 -def get_cluster_metadata_configmap(args) -> tuple[dict, int]: - """Gets the cluster metadata configmap. - - Args: - args: user provided arguments for running the command. - - Returns: - configmap and 0 if found, None and 1 otherwise. - """ - configmap = get_cluster_configmap( - args, f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' - ) - if configmap is None: - xpk_print( - 'Unable to find config map. Please specify a capacity type' - ' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue' - ' to use autoprovisioning (--enable-autoprovisioning).' - ) - return None, 1 - - return configmap, 0 - - def is_autoprovisioning_enabled( args, system: SystemCharacteristics ) -> tuple[bool, int]: @@ -331,8 +308,15 @@ def get_capacity_type_str_from_args_or_cluster_default(args) -> tuple[str, int]: # # Error out if the metadata config map doesn't exist, and is attempting to use # autoprovisioning. - cluster_config_map, return_code = get_cluster_metadata_configmap(args) - if return_code != 0: + cluster_config_map = get_cluster_configmap( + args, f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' + ) + if cluster_config_map is None: + xpk_print( + 'Unable to find config map. Please specify a capacity type' + ' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue' + ' to use autoprovisioning (--enable-autoprovisioning).' + ) return CapacityType.UNKNOWN.name, 1 return_code, capacity_type_str = get_value_from_map( @@ -362,8 +346,15 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]: if return_code != 0: return node_selector_args, return_code - cluster_config_map, return_code = get_cluster_metadata_configmap(args) - if return_code != 0: + cluster_config_map = get_cluster_configmap( + args, f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}' + ) + if cluster_config_map is None: + xpk_print( + 'Unable to find config map. Please specify a capacity type' + ' --on-demand, --spot, --reservation=$RESERVATION_ID) to continue' + ' to use autoprovisioning (--enable-autoprovisioning).' + ) return node_selector_args, 1 if capacity_type_str == CapacityType.RESERVATION.name: