100
100
CLUSTER_STATUS_LOCK_PATH = os .path .expanduser ('~/.sky/.{}.lock' )
101
101
CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
102
102
103
+ # Time that must elapse since the last status check before we should re-check if
104
+ # the cluster has been terminated or autostopped.
105
+ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
106
+
103
107
# Filelocks for updating cluster's file_mounts.
104
108
CLUSTER_FILE_MOUNTS_LOCK_PATH = os .path .expanduser (
105
109
'~/.sky/.{}_file_mounts.lock' )
@@ -1669,11 +1673,27 @@ def check_can_clone_disk_and_override_task(
1669
1673
1670
1674
def _update_cluster_status_no_lock (
1671
1675
cluster_name : str ) -> Optional [Dict [str , Any ]]:
1672
- """Updates the status of the cluster.
1676
+ """Update the cluster status.
1677
+
1678
+ The cluster status is updated by checking ray cluster and real status from
1679
+ cloud.
1680
+
1681
+ The function will update the cached cluster status in the global state. For
1682
+ the design of the cluster status and transition, please refer to the
1683
+ sky/design_docs/cluster_status.md
1684
+
1685
+ Returns:
1686
+ If the cluster is terminated or does not exist, return None. Otherwise
1687
+ returns the input record with status and handle potentially updated.
1673
1688
1674
1689
Raises:
1690
+ exceptions.ClusterOwnerIdentityMismatchError: if the current user is
1691
+ not the same as the user who created the cluster.
1692
+ exceptions.CloudUserIdentityError: if we fail to get the current user
1693
+ identity.
1675
1694
exceptions.ClusterStatusFetchingError: the cluster status cannot be
1676
- fetched from the cloud provider.
1695
+ fetched from the cloud provider or there are leaked nodes causing
1696
+ the node number larger than expected.
1677
1697
"""
1678
1698
record = global_user_state .get_cluster_from_name (cluster_name )
1679
1699
if record is None :
@@ -1893,52 +1913,22 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool:
1893
1913
return global_user_state .get_cluster_from_name (cluster_name )
1894
1914
1895
1915
1896
- def _update_cluster_status (
1897
- cluster_name : str ,
1898
- acquire_per_cluster_status_lock : bool ,
1899
- cluster_status_lock_timeout : int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
1900
- ) -> Optional [ Dict [ str , Any ]]:
1901
- """Update the cluster status.
1916
+ def _must_refresh_cluster_status (
1917
+ record : Dict [ str , Any ] ,
1918
+ force_refresh_statuses : Optional [ Set [ status_lib . ClusterStatus ]]
1919
+ ) -> bool :
1920
+ force_refresh_for_cluster = ( force_refresh_statuses is not None and
1921
+ record [ ' status' ] in force_refresh_statuses )
1902
1922
1903
- The cluster status is updated by checking ray cluster and real status from
1904
- cloud.
1923
+ use_spot = record ['handle' ].launched_resources .use_spot
1924
+ has_autostop = (record ['status' ] != status_lib .ClusterStatus .STOPPED and
1925
+ record ['autostop' ] >= 0 )
1926
+ recently_refreshed = (record ['status_updated_at' ] is not None and
1927
+ time .time () - record ['status_updated_at' ] <
1928
+ _CLUSTER_STATUS_CACHE_DURATION_SECONDS )
1929
+ is_stale = (use_spot or has_autostop ) and not recently_refreshed
1905
1930
1906
- The function will update the cached cluster status in the global state. For
1907
- the design of the cluster status and transition, please refer to the
1908
- sky/design_docs/cluster_status.md
1909
-
1910
- Args:
1911
- cluster_name: The name of the cluster.
1912
- acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
1913
- before updating the status.
1914
- cluster_status_lock_timeout: The timeout to acquire the per-cluster
1915
- lock.
1916
-
1917
- Returns:
1918
- If the cluster is terminated or does not exist, return None. Otherwise
1919
- returns the input record with status and handle potentially updated.
1920
-
1921
- Raises:
1922
- exceptions.ClusterOwnerIdentityMismatchError: if the current user is
1923
- not the same as the user who created the cluster.
1924
- exceptions.CloudUserIdentityError: if we fail to get the current user
1925
- identity.
1926
- exceptions.ClusterStatusFetchingError: the cluster status cannot be
1927
- fetched from the cloud provider or there are leaked nodes causing
1928
- the node number larger than expected.
1929
- """
1930
- if not acquire_per_cluster_status_lock :
1931
- return _update_cluster_status_no_lock (cluster_name )
1932
-
1933
- try :
1934
- with filelock .FileLock (CLUSTER_STATUS_LOCK_PATH .format (cluster_name ),
1935
- timeout = cluster_status_lock_timeout ):
1936
- return _update_cluster_status_no_lock (cluster_name )
1937
- except filelock .Timeout :
1938
- logger .debug ('Refreshing status: Failed get the lock for cluster '
1939
- f'{ cluster_name !r} . Using the cached status.' )
1940
- record = global_user_state .get_cluster_from_name (cluster_name )
1941
- return record
1931
+ return force_refresh_for_cluster or is_stale
1942
1932
1943
1933
1944
1934
def refresh_cluster_record (
@@ -1956,16 +1946,22 @@ def refresh_cluster_record(
1956
1946
1957
1947
Args:
1958
1948
cluster_name: The name of the cluster.
1959
- force_refresh_statuses: if specified, refresh the cluster if it has one of
1960
- the specified statuses. Additionally, clusters satisfying the
1961
- following conditions will always be refreshed no matter the
1962
- argument is specified or not:
1963
- 1. is a spot cluster, or
1964
- 2. is a non-spot cluster, is not STOPPED, and autostop is set.
1949
+ force_refresh_statuses: if specified, refresh the cluster if it has one
1950
+ of the specified statuses. Additionally, clusters satisfying the
1951
+ following conditions will be refreshed whether or not the argument
1952
+ is specified:
1953
+ - the latest available status update is more than
1954
+ _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
1955
+ 1. the cluster is a spot cluster, or
1956
+ 2. cluster autostop is set and the cluster is not STOPPED.
1965
1957
acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
1966
- before updating the status.
1958
+ before updating the status. Even if this is True, the lock may not be
1959
+ acquired if the status does not need to be refreshed.
1967
1960
cluster_status_lock_timeout: The timeout to acquire the per-cluster
1968
- lock. If timeout, the function will use the cached status.
1961
+ lock. If timeout, the function will use the cached status. If the
1962
+ value is <0, do not timeout (wait for the lock indefinitely). By
1963
+ default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
1964
+ if correctness is required, you must set this to -1.
1969
1965
1970
1966
Returns:
1971
1967
If the cluster is terminated or does not exist, return None.
@@ -1986,19 +1982,58 @@ def refresh_cluster_record(
1986
1982
return None
1987
1983
check_owner_identity (cluster_name )
1988
1984
1989
- handle = record ['handle' ]
1990
- if isinstance (handle , backends .CloudVmRayResourceHandle ):
1991
- use_spot = handle .launched_resources .use_spot
1992
- has_autostop = (record ['status' ] != status_lib .ClusterStatus .STOPPED and
1993
- record ['autostop' ] >= 0 )
1994
- force_refresh_for_cluster = (force_refresh_statuses is not None and
1995
- record ['status' ] in force_refresh_statuses )
1996
- if force_refresh_for_cluster or has_autostop or use_spot :
1997
- record = _update_cluster_status (
1998
- cluster_name ,
1999
- acquire_per_cluster_status_lock = acquire_per_cluster_status_lock ,
2000
- cluster_status_lock_timeout = cluster_status_lock_timeout )
2001
- return record
1985
+ if not isinstance (record ['handle' ], backends .CloudVmRayResourceHandle ):
1986
+ return record
1987
+
1988
+ # The loop logic allows us to notice if the status was updated in the
1989
+ # global_user_state by another process and stop trying to get the lock.
1990
+ # The core loop logic is adapted from FileLock's implementation.
1991
+ lock = filelock .FileLock (CLUSTER_STATUS_LOCK_PATH .format (cluster_name ))
1992
+ start_time = time .perf_counter ()
1993
+
1994
+ # Loop until we have an up-to-date status or until we acquire the lock.
1995
+ while True :
1996
+ # Check to see if we can return the cached status.
1997
+ if not _must_refresh_cluster_status (record , force_refresh_statuses ):
1998
+ return record
1999
+
2000
+ if not acquire_per_cluster_status_lock :
2001
+ return _update_cluster_status_no_lock (cluster_name )
2002
+
2003
+ # Try to acquire the lock so we can fetch the status.
2004
+ try :
2005
+ with lock .acquire (blocking = False ):
2006
+ # Lock acquired.
2007
+
2008
+ # Check the cluster status again, since it could have been
2009
+ # updated between our last check and acquiring the lock.
2010
+ record = global_user_state .get_cluster_from_name (cluster_name )
2011
+ if record is None or not _must_refresh_cluster_status (
2012
+ record , force_refresh_statuses ):
2013
+ return record
2014
+
2015
+ # Update and return the cluster status.
2016
+ return _update_cluster_status_no_lock (cluster_name )
2017
+ except filelock .Timeout :
2018
+ # lock.acquire() will throw a Timeout exception if the lock is not
2019
+ # available and we have blocking=False.
2020
+ pass
2021
+
2022
+ # Logic adapted from FileLock.acquire().
2023
+ # If cluster_status_lock_timeout is <0, we will never hit this. No timeout.
2024
+ # Otherwise, if we have timed out, return the cached status. This has
2025
+ # the potential to cause correctness issues, but if so it is the
2026
+ # caller's responsibility to set the timeout to -1.
2027
+ if 0 <= cluster_status_lock_timeout < time .perf_counter () - start_time :
2028
+ logger .debug ('Refreshing status: Failed get the lock for cluster '
2029
+ f'{ cluster_name !r} . Using the cached status.' )
2030
+ return record
2031
+ time .sleep (0.05 )
2032
+
2033
+ # Refresh for next loop iteration.
2034
+ record = global_user_state .get_cluster_from_name (cluster_name )
2035
+ if record is None :
2036
+ return None
2002
2037
2003
2038
2004
2039
@timeline .event
0 commit comments