Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[UX] Remove warning for gcp check #3647

Merged
merged 9 commits into from
Jun 8, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sky/adaptors/gcp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""GCP cloud adaptors"""

# pylint: disable=import-outside-toplevel
import functools
import json

from sky.adaptors import common
Expand All @@ -13,6 +14,7 @@
_LAZY_MODULES = (google, googleapiclient)


@functools.lru_cache()
@common.load_lazy_modules(_LAZY_MODULES)
def build(service_name: str, version: str, *args, **kwargs):
"""Build a GCP service.
Expand Down
8 changes: 4 additions & 4 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,13 +736,13 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]:

# pylint: disable=import-outside-toplevel,unused-import
import google.auth
import googleapiclient.discovery

# This takes user's credential info from "~/.config/gcloud/application_default_credentials.json". # pylint: disable=line-too-long
credentials, project = google.auth.default()
crm = googleapiclient.discovery.build('cloudresourcemanager',
'v1',
credentials=credentials)
crm = gcp.build('cloudresourcemanager',
'v1',
credentials=credentials,
cache_discovery=False)
gcp_minimal_permissions = gcp_utils.get_minimal_permissions()
permissions = {'permissions': gcp_minimal_permissions}
request = crm.projects().testIamPermissions(resource=project,
Expand Down
45 changes: 25 additions & 20 deletions sky/provision/gcp/instance_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,20 @@ def _generate_node_name(cluster_name: str, node_suffix: str,
return node_name


def _log_errors(errors: List[Dict[str, str]], e: Any,
zone: Optional[str]) -> None:
"""Format errors into a string."""
def _format_and_log_message_from_errors(errors: List[Dict[str, str]], e: Any,
zone: Optional[str]) -> str:
"""Format errors into a string and log it to the console."""
if errors:
plural = 's' if len(errors) > 1 else ''
codes = ', '.join(repr(e.get('code', 'N/A')) for e in errors)
messages = '; '.join(
repr(e.get('message', 'N/A').strip('.')) for e in errors)
zone_str = f' in {zone}' if zone else ''
logger.warning(f'Got return code{plural} {codes}'
f'{zone_str}: {messages}')
msg = f'Got return code{plural} {codes}{zone_str}: {messages}'
else:
logger.warning(f'create_instances: Failed with reason: {e}')
msg = f'create_instances: Failed with reason: {e}'
logger.warning(msg)
return msg


def selflink_to_name(selflink: str) -> str:
Expand Down Expand Up @@ -441,8 +442,10 @@ def call_operation(fn, timeout: int):
logger.debug(
'wait_operations: Failed to create instances. Reason: '
f'{errors}')
_log_errors(errors, result, zone)
msg = _format_and_log_message_from_errors(
errors, result, zone)
error = common.ProvisionerError('Operation failed')
setattr(error, 'detailed_reason', msg)
error.errors = errors
raise error
return
Expand All @@ -462,8 +465,10 @@ def call_operation(fn, timeout: int):
'message': f'Timeout waiting for operation {operation["name"]}',
'domain': 'wait_for_operation'
}]
_log_errors(errors, None, zone)
msg = _format_and_log_message_from_errors(errors, None, zone)
error = common.ProvisionerError('Operation timed out')
# Used for usage collection only, to include in the usage message.
setattr(error, 'detailed_reason', msg)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add an attribute for the class and assign here, instead of setattr?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is only used for the usage collection, which uses hasattr to check detailed_reason. Maybe we can keep it this way?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh ic. LGTM!

error.errors = errors
raise error

Expand Down Expand Up @@ -819,7 +824,7 @@ def _handle_http_error(e):
})
logger.debug(
f'create_instances: googleapiclient.errors.HttpError: {e}')
_log_errors(errors, e, zone)
_format_and_log_message_from_errors(errors, e, zone)
return errors

# Allow Google Compute Engine instance templates.
Expand Down Expand Up @@ -849,7 +854,7 @@ def _handle_http_error(e):
if errors:
logger.debug('create_instances: Failed to create instances. '
f'Reason: {errors}')
_log_errors(errors, operations, zone)
_format_and_log_message_from_errors(errors, operations, zone)
return errors

logger.debug('Waiting GCP instances to be ready ...')
Expand Down Expand Up @@ -1257,7 +1262,7 @@ def create_instances(
'domain': 'create_instances',
'message': error_details,
})
_log_errors(errors, e, zone)
_format_and_log_message_from_errors(errors, e, zone)
return errors, names
for detail in error_details:
# To be consistent with error messages returned by operation
Expand All @@ -1276,7 +1281,7 @@ def create_instances(
'domain': violation.get('subject'),
'message': violation.get('description'),
})
_log_errors(errors, e, zone)
_format_and_log_message_from_errors(errors, e, zone)
return errors, names
errors = []
for operation in operations:
Expand All @@ -1294,7 +1299,7 @@ def create_instances(
if errors:
logger.debug('create_instances: Failed to create instances. '
f'Reason: {errors}')
_log_errors(errors, operations, zone)
_format_and_log_message_from_errors(errors, operations, zone)
return errors, names

logger.debug('Waiting GCP instances to be ready ...')
Expand Down Expand Up @@ -1336,7 +1341,7 @@ def create_instances(
'message': 'Timeout waiting for creation operation',
'domain': 'create_instances'
}]
_log_errors(errors, None, zone)
_format_and_log_message_from_errors(errors, None, zone)
return errors, names

# NOTE: Error example:
Expand All @@ -1353,7 +1358,7 @@ def create_instances(
logger.debug(
'create_instances: Failed to create instances. Reason: '
f'{errors}')
_log_errors(errors, results, zone)
_format_and_log_message_from_errors(errors, results, zone)
return errors, names
assert all(success), (
'Failed to create instances, but there is no error. '
Expand Down Expand Up @@ -1475,7 +1480,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'https://console.cloud.google.com/iam-admin/quotas '
'for more information.'
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

if 'PERMISSION_DENIED' in stderr:
Expand All @@ -1484,7 +1489,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'domain': 'tpu',
'message': 'TPUs are not available in this zone.'
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

if 'no more capacity in the zone' in stderr:
Expand All @@ -1493,7 +1498,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'domain': 'tpu',
'message': 'No more capacity in this zone.'
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

if 'CloudTpu received an invalid AcceleratorType' in stderr:
Expand All @@ -1506,7 +1511,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'message': (f'TPU type {tpu_type} is not available in this '
f'zone {zone}.')
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

# TODO(zhwu): Add more error code handling, if needed.
Expand All @@ -1515,7 +1520,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'domain': 'tpu',
'message': stderr
}]
_log_errors(provisioner_err.errors, e, zone)
_format_and_log_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e


Expand Down
Loading