From 60a344e867e3b32bfdbbd9f57fde020c569add85 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 8 Jun 2024 10:55:42 -0700 Subject: [PATCH] [UX] Remove warning for gcp check (#3647) * remove warning for gcp check * Add detailed reason in operation failure * address comments * format * thread safty * Fix thread safety for gcp build * revert * revert * fix --- sky/adaptors/gcp.py | 5 ++-- sky/clouds/gcp.py | 8 ++--- sky/provision/gcp/instance_utils.py | 45 ++++++++++++++++------------- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/sky/adaptors/gcp.py b/sky/adaptors/gcp.py index 6465709d42c4..9f63bec87eea 100644 --- a/sky/adaptors/gcp.py +++ b/sky/adaptors/gcp.py @@ -21,8 +21,9 @@ def build(service_name: str, version: str, *args, **kwargs): service_name: GCP service name (e.g., 'compute', 'storagetransfer'). version: Service version (e.g., 'v1'). """ - from googleapiclient import discovery - return discovery.build(service_name, version, *args, **kwargs) + + return googleapiclient.discovery.build(service_name, version, *args, + **kwargs) @common.load_lazy_modules(_LAZY_MODULES) diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 206dbc4fdc33..95ed7eca7143 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -738,13 +738,13 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: # pylint: disable=import-outside-toplevel,unused-import import google.auth - import googleapiclient.discovery # This takes user's credential info from "~/.config/gcloud/application_default_credentials.json". # pylint: disable=line-too-long credentials, project = google.auth.default() - crm = googleapiclient.discovery.build('cloudresourcemanager', - 'v1', - credentials=credentials) + crm = gcp.build('cloudresourcemanager', + 'v1', + credentials=credentials, + cache_discovery=False) gcp_minimal_permissions = gcp_utils.get_minimal_permissions() permissions = {'permissions': gcp_minimal_permissions} request = crm.projects().testIamPermissions(resource=project, diff --git a/sky/provision/gcp/instance_utils.py b/sky/provision/gcp/instance_utils.py index dde0918274dc..be17861e9f82 100644 --- a/sky/provision/gcp/instance_utils.py +++ b/sky/provision/gcp/instance_utils.py @@ -100,19 +100,20 @@ def _generate_node_name(cluster_name: str, node_suffix: str, return node_name -def _log_errors(errors: List[Dict[str, str]], e: Any, - zone: Optional[str]) -> None: - """Format errors into a string.""" +def _format_and_log_message_from_errors(errors: List[Dict[str, str]], e: Any, + zone: Optional[str]) -> str: + """Format errors into a string and log it to the console.""" if errors: plural = 's' if len(errors) > 1 else '' codes = ', '.join(repr(e.get('code', 'N/A')) for e in errors) messages = '; '.join( repr(e.get('message', 'N/A').strip('.')) for e in errors) zone_str = f' in {zone}' if zone else '' - logger.warning(f'Got return code{plural} {codes}' - f'{zone_str}: {messages}') + msg = f'Got return code{plural} {codes}{zone_str}: {messages}' else: - logger.warning(f'create_instances: Failed with reason: {e}') + msg = f'create_instances: Failed with reason: {e}' + logger.warning(msg) + return msg def selflink_to_name(selflink: str) -> str: @@ -441,8 +442,10 @@ def call_operation(fn, timeout: int): logger.debug( 'wait_operations: Failed to create instances. Reason: ' f'{errors}') - _log_errors(errors, result, zone) + msg = _format_and_log_message_from_errors( + errors, result, zone) error = common.ProvisionerError('Operation failed') + setattr(error, 'detailed_reason', msg) error.errors = errors raise error return @@ -462,8 +465,10 @@ def call_operation(fn, timeout: int): 'message': f'Timeout waiting for operation {operation["name"]}', 'domain': 'wait_for_operation' }] - _log_errors(errors, None, zone) + msg = _format_and_log_message_from_errors(errors, None, zone) error = common.ProvisionerError('Operation timed out') + # Used for usage collection only, to include in the usage message. + setattr(error, 'detailed_reason', msg) error.errors = errors raise error @@ -819,7 +824,7 @@ def _handle_http_error(e): }) logger.debug( f'create_instances: googleapiclient.errors.HttpError: {e}') - _log_errors(errors, e, zone) + _format_and_log_message_from_errors(errors, e, zone) return errors # Allow Google Compute Engine instance templates. @@ -849,7 +854,7 @@ def _handle_http_error(e): if errors: logger.debug('create_instances: Failed to create instances. ' f'Reason: {errors}') - _log_errors(errors, operations, zone) + _format_and_log_message_from_errors(errors, operations, zone) return errors logger.debug('Waiting GCP instances to be ready ...') @@ -1257,7 +1262,7 @@ def create_instances( 'domain': 'create_instances', 'message': error_details, }) - _log_errors(errors, e, zone) + _format_and_log_message_from_errors(errors, e, zone) return errors, names for detail in error_details: # To be consistent with error messages returned by operation @@ -1276,7 +1281,7 @@ def create_instances( 'domain': violation.get('subject'), 'message': violation.get('description'), }) - _log_errors(errors, e, zone) + _format_and_log_message_from_errors(errors, e, zone) return errors, names errors = [] for operation in operations: @@ -1294,7 +1299,7 @@ def create_instances( if errors: logger.debug('create_instances: Failed to create instances. ' f'Reason: {errors}') - _log_errors(errors, operations, zone) + _format_and_log_message_from_errors(errors, operations, zone) return errors, names logger.debug('Waiting GCP instances to be ready ...') @@ -1336,7 +1341,7 @@ def create_instances( 'message': 'Timeout waiting for creation operation', 'domain': 'create_instances' }] - _log_errors(errors, None, zone) + _format_and_log_message_from_errors(errors, None, zone) return errors, names # NOTE: Error example: @@ -1353,7 +1358,7 @@ def create_instances( logger.debug( 'create_instances: Failed to create instances. Reason: ' f'{errors}') - _log_errors(errors, results, zone) + _format_and_log_message_from_errors(errors, results, zone) return errors, names assert all(success), ( 'Failed to create instances, but there is no error. ' @@ -1475,7 +1480,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str], 'https://console.cloud.google.com/iam-admin/quotas ' 'for more information.' }] - _log_errors(provisioner_err.errors, e, zone) + _format_and_log_message_from_errors(provisioner_err.errors, e, zone) raise provisioner_err from e if 'PERMISSION_DENIED' in stderr: @@ -1484,7 +1489,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str], 'domain': 'tpu', 'message': 'TPUs are not available in this zone.' }] - _log_errors(provisioner_err.errors, e, zone) + _format_and_log_message_from_errors(provisioner_err.errors, e, zone) raise provisioner_err from e if 'no more capacity in the zone' in stderr: @@ -1493,7 +1498,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str], 'domain': 'tpu', 'message': 'No more capacity in this zone.' }] - _log_errors(provisioner_err.errors, e, zone) + _format_and_log_message_from_errors(provisioner_err.errors, e, zone) raise provisioner_err from e if 'CloudTpu received an invalid AcceleratorType' in stderr: @@ -1506,7 +1511,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str], 'message': (f'TPU type {tpu_type} is not available in this ' f'zone {zone}.') }] - _log_errors(provisioner_err.errors, e, zone) + _format_and_log_message_from_errors(provisioner_err.errors, e, zone) raise provisioner_err from e # TODO(zhwu): Add more error code handling, if needed. @@ -1515,7 +1520,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str], 'domain': 'tpu', 'message': stderr }] - _log_errors(provisioner_err.errors, e, zone) + _format_and_log_message_from_errors(provisioner_err.errors, e, zone) raise provisioner_err from e