Skip to content

Commit

Permalink
fix comments
Browse files (browse the repository at this point in the history)
  • Loading branch information
suquark committed Nov 15, 2023
1 parent 3c2ab41 commit 6f4479d
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 17 deletions.
33 changes: 19 additions & 14 deletions sky/provision/gcp/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,6 @@
from sky.provision import common
from sky.provision.gcp import instance_utils

# Tag for user defined node types (e.g., m4xl_spot). This is used for multi
# node type clusters.
TAG_RAY_USER_NODE_TYPE = "ray-user-node-type"
# Hash of the node launch config, used to identify out-of-date nodes
TAG_RAY_LAUNCH_CONFIG = "ray-launch-config"
# Tag for autofilled node types for legacy cluster yamls without multi
# node type defined in the cluster configs.
NODE_TYPE_LEGACY_HEAD = "ray-legacy-head-node-type"
NODE_TYPE_LEGACY_WORKER = "ray-legacy-worker-node-type"

# Tag that reports the current state of the node (e.g. Updating, Up-to-date)
TAG_RAY_NODE_STATUS = "ray-node-status"

logger = sky_logging.init_logger(__name__)

MAX_POLLS = 12
Expand Down Expand Up @@ -140,6 +127,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
)
if not instances:
break
logger.info(
f'Waiting for {len(instances)} instances in STOPPING status')
time.sleep(POLL_INTERVAL)

exist_instances = resource.filter(
Expand Down Expand Up @@ -190,7 +179,8 @@ def get_order_key(node):

if stopping_instances:
raise RuntimeError(
f'Some instances are being stopped during provisioning.')
'Some instances are being stopped during provisioning. '
'Please wait a while and retry.')

if head_instance_id is None:
if running_instances:
Expand Down Expand Up @@ -266,6 +256,21 @@ def get_order_key(node):
if not instances:
break

# Check if the number of running instances matches the requested count.
instances = resource.filter(
project_id=project_id,
zone=availability_zone,
label_filters=filter_labels,
status_filters=['RUNNING'],
)
if len(instances) != config.count:
logger.warning('The number of running instances is different from '
'the requested number after provisioning '
f'(requested: {config.count}, '
f'observed: {len(instances)}). '
'This could be some instances failed to start '
'or some resource leak.')

return common.ProvisionRecord(provider_name='gcp',
region=region,
zone=availability_zone,
Expand Down
6 changes: 3 additions & 3 deletions sky/provision/gcp/instance_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,16 @@ def dec(func):
def wrapper(*args, **kwargs):
from googleapiclient.errors import HttpError

exception = HttpError
exception_type = HttpError

def try_catch_exc():
try:
value = func(*args, **kwargs)
return value
except Exception as e:
if not isinstance(e, exception) or (
if not isinstance(e, exception_type) or (
regex and not re.search(regex, str(e))):
raise e
raise
return e

for _ in range(max_retries):
Expand Down

0 comments on commit 6f4479d

Please sign in to comment.