Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[UX] Remove warning for gcp check #3647

Merged
merged 9 commits into from
Jun 8, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,8 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
credentials, project = google.auth.default()
crm = googleapiclient.discovery.build('cloudresourcemanager',
'v1',
credentials=credentials)
credentials=credentials,
cache_discovery=False)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we benchmark the performance degrade and make sure it is tolerable? I think it should be marginal though

Copy link
Collaborator Author

@Michaelvll Michaelvll Jun 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, the performance is similar. Tested with:
python test.py

import time
from sky.adaptors import gcp

N = 20
durations = []

for _ in range(N):
    start = time.time()
    gcp.build('compute', 'v1', credentials=None, cache_discovery=False) # or True
    end = time.time()
    durations.append(end - start)

print(f'Average time to build GCP service: {sum(durations) / N:.6f} seconds')

with True
Average time to build GCP service: 0.431246 seconds

with False
Average time to build GCP service: 0.431241 seconds

Also, tested with lru_cache for gcp.build and it seems not affecting the time much.

gcp_minimal_permissions = gcp_utils.get_minimal_permissions()
permissions = {'permissions': gcp_minimal_permissions}
request = crm.projects().testIamPermissions(resource=project,
Expand Down
41 changes: 22 additions & 19 deletions sky/provision/gcp/instance_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,20 @@ def _generate_node_name(cluster_name: str, node_suffix: str,
return node_name


def _log_errors(errors: List[Dict[str, str]], e: Any,
zone: Optional[str]) -> None:
def _format_message_from_errors(errors: List[Dict[str, str]], e: Any,
zone: Optional[str]) -> str:
"""Format errors into a string."""
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
if errors:
plural = 's' if len(errors) > 1 else ''
codes = ', '.join(repr(e.get('code', 'N/A')) for e in errors)
messages = '; '.join(
repr(e.get('message', 'N/A').strip('.')) for e in errors)
zone_str = f' in {zone}' if zone else ''
logger.warning(f'Got return code{plural} {codes}'
f'{zone_str}: {messages}')
msg = f'Got return code{plural} {codes}{zone_str}: {messages}'
else:
logger.warning(f'create_instances: Failed with reason: {e}')
msg = f'create_instances: Failed with reason: {e}'
logger.warning(msg)
return msg


def selflink_to_name(selflink: str) -> str:
Expand Down Expand Up @@ -441,8 +442,9 @@ def call_operation(fn, timeout: int):
logger.debug(
'wait_operations: Failed to create instances. Reason: '
f'{errors}')
_log_errors(errors, result, zone)
msg = _format_message_from_errors(errors, result, zone)
error = common.ProvisionerError('Operation failed')
setattr(error, 'detailed_reason', msg)
error.errors = errors
raise error
return
Expand All @@ -462,8 +464,9 @@ def call_operation(fn, timeout: int):
'message': f'Timeout waiting for operation {operation["name"]}',
'domain': 'wait_for_operation'
}]
_log_errors(errors, None, zone)
msg = _format_message_from_errors(errors, None, zone)
error = common.ProvisionerError('Operation timed out')
setattr(error, 'detailed_reason', msg)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add an attribute for the class and assign here, instead of setattr?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is only used for the usage collection, which uses hasattr to check detailed_reason. Maybe we can keep it this way?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh ic. LGTM!

error.errors = errors
raise error

Expand Down Expand Up @@ -819,7 +822,7 @@ def _handle_http_error(e):
})
logger.debug(
f'create_instances: googleapiclient.errors.HttpError: {e}')
_log_errors(errors, e, zone)
_format_message_from_errors(errors, e, zone)
return errors

# Allow Google Compute Engine instance templates.
Expand Down Expand Up @@ -849,7 +852,7 @@ def _handle_http_error(e):
if errors:
logger.debug('create_instances: Failed to create instances. '
f'Reason: {errors}')
_log_errors(errors, operations, zone)
_format_message_from_errors(errors, operations, zone)
return errors

logger.debug('Waiting GCP instances to be ready ...')
Expand Down Expand Up @@ -1257,7 +1260,7 @@ def create_instances(
'domain': 'create_instances',
'message': error_details,
})
_log_errors(errors, e, zone)
_format_message_from_errors(errors, e, zone)
return errors, names
for detail in error_details:
# To be consistent with error messages returned by operation
Expand All @@ -1276,7 +1279,7 @@ def create_instances(
'domain': violation.get('subject'),
'message': violation.get('description'),
})
_log_errors(errors, e, zone)
_format_message_from_errors(errors, e, zone)
return errors, names
errors = []
for operation in operations:
Expand All @@ -1294,7 +1297,7 @@ def create_instances(
if errors:
logger.debug('create_instances: Failed to create instances. '
f'Reason: {errors}')
_log_errors(errors, operations, zone)
_format_message_from_errors(errors, operations, zone)
return errors, names

logger.debug('Waiting GCP instances to be ready ...')
Expand Down Expand Up @@ -1336,7 +1339,7 @@ def create_instances(
'message': 'Timeout waiting for creation operation',
'domain': 'create_instances'
}]
_log_errors(errors, None, zone)
_format_message_from_errors(errors, None, zone)
return errors, names

# NOTE: Error example:
Expand All @@ -1353,7 +1356,7 @@ def create_instances(
logger.debug(
'create_instances: Failed to create instances. Reason: '
f'{errors}')
_log_errors(errors, results, zone)
_format_message_from_errors(errors, results, zone)
return errors, names
assert all(success), (
'Failed to create instances, but there is no error. '
Expand Down Expand Up @@ -1475,7 +1478,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'https://console.cloud.google.com/iam-admin/quotas '
'for more information.'
}]
_log_errors(provisioner_err.errors, e, zone)
_format_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

if 'PERMISSION_DENIED' in stderr:
Expand All @@ -1484,7 +1487,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'domain': 'tpu',
'message': 'TPUs are not available in this zone.'
}]
_log_errors(provisioner_err.errors, e, zone)
_format_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

if 'no more capacity in the zone' in stderr:
Expand All @@ -1493,7 +1496,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'domain': 'tpu',
'message': 'No more capacity in this zone.'
}]
_log_errors(provisioner_err.errors, e, zone)
_format_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

if 'CloudTpu received an invalid AcceleratorType' in stderr:
Expand All @@ -1506,7 +1509,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'message': (f'TPU type {tpu_type} is not available in this '
f'zone {zone}.')
}]
_log_errors(provisioner_err.errors, e, zone)
_format_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e

# TODO(zhwu): Add more error code handling, if needed.
Expand All @@ -1515,7 +1518,7 @@ def create_tpu_node(project_id: str, zone: str, tpu_node_config: Dict[str, str],
'domain': 'tpu',
'message': stderr
}]
_log_errors(provisioner_err.errors, e, zone)
_format_message_from_errors(provisioner_err.errors, e, zone)
raise provisioner_err from e


Expand Down
Loading