Skip to content

Commit

Permalink
[UX] Remove warning for gcp check (#3647)
Browse files Browse the repository at this point in the history
* remove warning for gcp check

* Add detailed reason in operation failure

* address comments

* format

* thread safty

* Fix thread safety for gcp build

* revert

* revert

* fix
  • Loading branch information
Michaelvll committed Aug 23, 2024
1 parent 158a62f commit 60a344e
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 26 deletions.
5 changes: 3 additions & 2 deletions sky/adaptors/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ def build(service_name: str, version: str, *args, **kwargs):
service_name: GCP service name (e.g., 'compute', 'storagetransfer').
version: Service version (e.g., 'v1').
"""
from googleapiclient import discovery
return discovery.build(service_name, version, *args, **kwargs)

return googleapiclient.discovery.build(service_name, version, *args,
**kwargs)


@common.load_lazy_modules(_LAZY_MODULES)
Expand Down
8 changes: 4 additions & 4 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,13 +738,13 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]:

# pylint: disable=import-outside-toplevel,unused-import
import google.auth
import googleapiclient.discovery

# This takes user's credential info from "~/.config/gcloud/application_default_credentials.json". # pylint: disable=line-too-long
credentials, project = google.auth.default()
crm = googleapiclient.discovery.build('cloudresourcemanager',
'v1',
credentials=credentials)
crm = gcp.build('cloudresourcemanager',
'v1',
credentials=credentials,
cache_discovery=False)
gcp_minimal_permissions = gcp_utils.get_minimal_permissions()
permissions = {'permissions': gcp_minimal_permissions}
request = crm.projects().testIamPermissions(resource=project,
Expand Down
45 changes: 25 additions & 20 deletions sky/provision/gcp/instance_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,20 @@ def _generate_node_name(cluster_name: str, node_suffix: str,
return node_name


def _log_errors(errors: List[Dict[str, str]], e: Any,
zone: Optional[str]) -> None:
"""Format errors into a string."""
def _format_and_log_message_from_errors(errors: List[Dict[str, str]], e: Any,
zone: Optional[str]) -> str:
"""Format errors into a string and log it to the console."""
if errors:
plural = 's' if len(errors) > 1 else ''
codes = ', '.join(repr(e.get('code', 'N/A')) for e in errors)
messages = '; '.join(
repr(e.get('message', 'N/A').strip('.')) for e in errors)
zone_str = f' in {zone}' if zone else ''
logger.warning(f'Got return code{plural} {codes}'
f'{zone_str}: {messages}')
msg = f'Got return code{plural} {codes}{zone_str}: {messages}'
else:
logger.warning(f'create_instances: Failed with reason: {e}')
msg = f'create_instances: Failed with reason: {e}'
logger.warning(msg)
return msg


def selflink_to_name(selflink: str) -> str:
Expand Down Expand Up @@ -441,8 +442,10 @@ def call_operation(fn, timeout: int):
logger.debug(
'wait_operations: Failed to create instances. Reason: '
f'{errors}')
_log_errors(errors, result, zone)
msg = _format_and_log_message_from_errors(
errors, result, zone)
error = common.ProvisionerError('Operation failed')
setattr(error, 'detailed_reason', msg)
error.errors = errors
raise error
return
Expand All @@ -462,8 +465,10 @@ def call_operation(fn, timeout: int):
'message': f'Timeout waiting for operation {operation["name"]}',
'domain': 'wait_for_operation'
}]
_log_errors(errors, None, zone)
msg = _format_and_log_message_from_errors(errors, None, zone)
error = common.ProvisionerError('Operation timed out')
# Used for usage collection only, to include in the usage message.
setattr(error, 'detailed_reason', msg)
error.errors = errors
raise error

Expand Down Expand Up @@ -819,7 +824,7 @@ def _handle_http_error(e):
})
logger.debug(
f'create_instances: googleapiclient.errors.HttpError: {e}')
_log_errors(errors, e, zone)
_format_and_log_message_from_errors(errors, e, zone)
return errors

# Allow Google Compute Engine instance templates.
Expand Down Expand Up @@ -849,7 +854,7 @@ def _handle_http_error(e):
if errors:
logger.debug('create_instances: Failed to create instances. '
f'Reason: {errors}')
_log_errors(errors, operations, zone)
_format_and_log_message_from_errors(errors, operations, zone)
return errors

logger.debug('Waiting GCP instances to be ready ...')
Expand Down Expand Up @@ -1257,7 +1262,7 @@ def create_instances(
'domain': 'create_instances',
'message': error_details,
})
_log_errors(errors, e, zone)
_format_and_log_message_from_errors(errors, e, zone)
return errors, names
for detail in error_details:
# To be consistent with error messages returned by operation
Expand All @@ -1276,7 +1281,7 @@ def create_instances(
'domain': violation.get('subject'),
'message': violation.get('description'),
})
_log_errors(errors, e, zone)
_format_and_log_message_from_errors(errors, e, zone)
return errors, names
errors = []
for operation in operations:
Expand All @@ -1294,7 +1299,7 @@ def create_instances(
if errors:
logger.debug('create_instances: Failed to create instances. '
f'Reason: {errors}')
_log_errors(errors, operations, zone)
_format_and_log_message_from_errors(errors, operations, zone)
return errors, names

logger.debug('Waiting GCP instances to be ready ...')
Expand Down Expand Up @@ -1336,7 +1341,7 @@ def create_instances(
'message': 'Timeout waiting for creation operation',
'domain': 'create_instances'
}]
_log_errors(errors, None, zone)
_format_and_log_message_from_errors(errors, None, zone)
return errors, names

# NOTE: Error example:
Expand All @@ -1353,7 +1358,7 @@ def create_instances(
logger.debug(
'create_instances: Failed to create instances. Reason: '
f'{errors}')
_log_errors(errors, results, zone)
_format_and_log_message_from_errors(errors, results, zone)
return errors, names
assert all(success), (
'Failed to create instances, but there is no error. '
Expand Down Expand Up @@ -1475,7 +1480,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'https://console.cloud.google.com/iam-admin/quotas '
'for more information.'
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

if 'PERMISSION_DENIED' in stderr:
Expand All @@ -1484,7 +1489,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'domain': 'tpu',
'message': 'TPUs are not available in this zone.'
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

if 'no more capacity in the zone' in stderr:
Expand All @@ -1493,7 +1498,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'domain': 'tpu',
'message': 'No more capacity in this zone.'
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

if 'CloudTpu received an invalid AcceleratorType' in stderr:
Expand All @@ -1506,7 +1511,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'message': (f'TPU type {tpu_type} is not available in this '
f'zone {zone}.')
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

# TODO(zhwu): Add more error code handling, if needed.
Expand All @@ -1515,7 +1520,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'domain': 'tpu',
'message': stderr
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e


Expand Down

0 comments on commit 60a344e

Please sign in to comment.