|
25 | 25 |
|
26 | 26 |
|
27 | 27 | def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool: |
28 | | - """Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster). |
| 28 | + """Check if workload can schedule based on the cluster resources. |
| 29 | +
|
| 30 | + This function validates that the resource requested by the user exists in the |
| 31 | + cluster's resource manifest (a ConfigMap) and that the requested quantity |
| 32 | + (e.g., number of VMs) does not exceed the available quantity. |
29 | 33 |
|
30 | 34 | Args: |
31 | | - args: user provided arguments for running the command. |
32 | | - system: system characteristics |
| 35 | + args: User-provided arguments for running the command. |
| 36 | + system: System characteristics derived from the user's request. |
33 | 37 |
|
34 | 38 | Returns: |
35 | | - returns true if workload can schedule, otherwise returns false. |
| 39 | + True if the workload can be scheduled, otherwise False. |
36 | 40 | """ |
37 | 41 | resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}' |
38 | 42 | cluster_config_map = get_cluster_configmap(args, resources_configmap_name) |
39 | 43 |
|
40 | | - # Prevents workload creation failure for existing clusters with no ConfigMap |
| 44 | + # If no ConfigMap exists, we cannot validate, so we optimistically proceed. |
| 45 | + # This maintains compatibility with older cluster setups. |
41 | 46 | if cluster_config_map is None: |
42 | 47 | xpk_print( |
43 | | - 'No ConfigMap exist for cluster with the name' |
44 | | - f' {resources_configmap_name}.' |
| 48 | + f"Warning: Could not find resource ConfigMap '{resources_configmap_name}'. " |
| 49 | + "Proceeding without resource validation." |
45 | 50 | ) |
46 | 51 | return True |
47 | 52 |
|
48 | | - # Check for gke accelerator type: |
49 | | - missing_gke_accelerator_type = False |
50 | | - if not cluster_config_map.get(system.gke_accelerator): |
| 53 | + # The user-facing device type (e.g., 'v5litepod-32') is the single source |
| 54 | + # of truth for identifying the resource in the cluster's manifest. |
| 55 | + user_facing_device_type = args.tpu_type if args.tpu_type else args.device_type |
| 56 | + |
| 57 | + # --- Primary Validation --- |
| 58 | + # Check if the cluster's resource manifest contains an entry for the exact |
| 59 | + # device type the user requested. This is the only reliable existence check. |
| 60 | + if user_facing_device_type not in cluster_config_map: |
51 | 61 | xpk_print( |
52 | | - f'Gke Accelerator Type Check: {args.workload} is requesting' |
53 | | - f' {system.gke_accelerator} but cluster only contains' |
54 | | - f' {cluster_config_map.keys()}. ' |
| 62 | + f"Device Type Check Failed: Workload '{args.workload}' is requesting " |
| 63 | + f"device type '{user_facing_device_type}', but the cluster's resource " |
| 64 | + f"manifest only contains entries for: {list(cluster_config_map.keys())}. " |
| 65 | + "The cluster may not be provisioned with this hardware type." |
55 | 66 | ) |
56 | | - missing_gke_accelerator_type = True |
57 | | - elif ( |
58 | | - cluster_config_map[system.gke_accelerator] |
| 67 | + return False |
| 68 | + |
| 69 | + # --- Quantity Validation --- |
| 70 | + |
| 71 | + # Handle autoprovisioning capacity checks. |
| 72 | + if ( |
| 73 | + cluster_config_map[user_facing_device_type] |
59 | 74 | == AUTOPROVISIONING_CONFIG_VALUE |
60 | 75 | ): |
61 | | - # Run total chip check when in autoprovisioning mode. |
62 | 76 | max_chips_in_cluster = int( |
63 | | - cluster_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY] |
| 77 | + cluster_config_map.get(AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, 0) |
64 | 78 | ) |
65 | 79 | num_chips_in_workload = get_total_chips_requested_from_args(args, system) |
66 | 80 |
|
67 | 81 | if num_chips_in_workload > max_chips_in_cluster: |
68 | 82 | xpk_print( |
69 | | - f'{args.workload} is requesting {num_chips_in_workload} chips but' |
70 | | - f' the cluster {args.cluster} supports up to {max_chips_in_cluster}.' |
71 | | - ' Resize the cluster to support more chips with' |
72 | | - ' `xpk cluster create --autoprovisioning-max-chips=X ...`' |
| 83 | + f"Chip Request Exceeds Limit: Workload '{args.workload}' requests " |
| 84 | + f"{num_chips_in_workload} chips, but the autoprovisioning cluster " |
| 85 | + f"'{args.cluster}' is configured for a maximum of {max_chips_in_cluster} chips." |
73 | 86 | ) |
74 | 87 | return False |
75 | | - return True |
| 88 | + return True # For autoprovisioning, chip count is sufficient. |
76 | 89 |
|
77 | | - # Check for device type |
78 | | - missing_device_type = False |
79 | | - device_type = system.device_type |
80 | | - if device_type not in cluster_config_map: |
81 | | - xpk_print( |
82 | | - f'Device Type Check: {args.workload} is requesting {device_type} but ' |
83 | | - f'cluster only contains {cluster_config_map.keys()}. ' |
84 | | - ) |
85 | | - missing_device_type = True |
| 90 | + # For statically-sized clusters, check if the number of requested VMs fits. |
| 91 | + max_vm_in_cluster = int(cluster_config_map[user_facing_device_type]) |
| 92 | + if system.accelerator_type == AcceleratorType['GPU']: |
| 93 | + vm_required_by_workload = args.num_nodes |
| 94 | + else: |
| 95 | + vm_required_by_workload = args.num_slices * system.vms_per_slice |
86 | 96 |
|
87 | | - if missing_device_type and missing_gke_accelerator_type: |
| 97 | + if vm_required_by_workload > max_vm_in_cluster: |
88 | 98 | xpk_print( |
89 | | - 'Both Device Type and GKE Accelerator Type checks failed.' |
90 | | - f' XPK will not create the workload {args.workload}.' |
| 99 | + f"VM Request Exceeds Capacity: Workload '{args.workload}' requests " |
| 100 | + f"{vm_required_by_workload} VMs for {args.num_slices} slice(s) of type " |
| 101 | + f"'{user_facing_device_type}', but the cluster only has " |
| 102 | + f"{max_vm_in_cluster} VMs of that type available." |
91 | 103 | ) |
92 | 104 | return False |
93 | | - else: |
94 | | - # Check if the size of the workload will fit in the cluster. |
95 | | - max_vm_in_cluster = int(cluster_config_map[device_type]) |
96 | | - if system.accelerator_type == AcceleratorType['GPU']: |
97 | | - vm_required_by_workload = args.num_nodes |
98 | | - else: |
99 | | - vm_required_by_workload = args.num_slices * system.vms_per_slice |
100 | | - if vm_required_by_workload > max_vm_in_cluster: |
101 | | - xpk_print( |
102 | | - f'{args.workload} is requesting {args.num_slices} slice/slices of' |
103 | | - f' {device_type}, which is {vm_required_by_workload} VMs, but the' |
104 | | - f' cluster only contains {max_vm_in_cluster} VMs of {device_type}.' |
105 | | - ' XPK will not create this workload.' |
106 | | - ) |
107 | | - return False |
108 | 105 |
|
109 | 106 | return True |
110 | 107 |
|
|
0 commit comments