diff --git a/dashboard/datacenter.py b/dashboard/datacenter.py index 1e4627eadc86..f7702e64411e 100644 --- a/dashboard/datacenter.py +++ b/dashboard/datacenter.py @@ -138,8 +138,9 @@ async def get_node_info(cls, node_id, get_summary=False): # Merge GcsNodeInfo to node physical stats node_info["raylet"].update(node) + death_info = node.get("deathInfo", {}) node_info["raylet"]["stateMessage"] = compose_state_message( - node.get("deathInfo", {}) + death_info.get("reason", None), death_info.get("reasonMessage", None) ) if not get_summary: diff --git a/dashboard/state_aggregator.py b/dashboard/state_aggregator.py index e3aa7a9f8c6a..5c8591484d32 100644 --- a/dashboard/state_aggregator.py +++ b/dashboard/state_aggregator.py @@ -11,6 +11,7 @@ from ray._private.profiling import chrome_tracing_dump import ray.dashboard.memory_utils as memory_utils +from ray.dashboard.utils import compose_state_message from ray.util.state.common import ( protobuf_message_to_dict, @@ -323,6 +324,10 @@ async def list_nodes(self, *, option: ListApiOptions) -> ListApiResponse: data["node_ip"] = data["node_manager_address"] data["start_time_ms"] = int(data["start_time_ms"]) data["end_time_ms"] = int(data["end_time_ms"]) + death_info = data.get("death_info", {}) + data["state_message"] = compose_state_message( + death_info.get("reason", None), death_info.get("reason_message", None) + ) result.append(data) diff --git a/dashboard/utils.py b/dashboard/utils.py index 5d5c4a0284c7..99c624803bbe 100644 --- a/dashboard/utils.py +++ b/dashboard/utils.py @@ -656,13 +656,17 @@ def get_address_for_submission_client(address: Optional[str]) -> str: return address -def compose_state_message(death_info_dict: dict) -> Optional[str]: +def compose_state_message( + death_reason: Optional[str], death_reason_message: Optional[str] +) -> Optional[str]: """Compose node state message based on death information. Args: - death_info_dict: the node_death field in GcsNodeInfo, in dict type. + death_reason: The reason of node death. + This is a string representation of `gcs_pb2.NodeDeathInfo.Reason`. + death_reason_message: The message of node death. + This corresponds to `gcs_pb2.NodeDeathInfo.ReasonMessage`. """ - death_reason = death_info_dict.get("reason", None) if death_reason == "EXPECTED_TERMINATION": state_message = "Expected termination" elif death_reason == "UNEXPECTED_TERMINATION": @@ -674,7 +678,6 @@ def compose_state_message(death_info_dict: dict) -> Optional[str]: else: state_message = None - death_reason_message = death_info_dict.get("reasonMessage", None) if death_reason_message: if state_message: state_message += f": {death_reason_message}" diff --git a/python/ray/tests/test_state_api.py b/python/ray/tests/test_state_api.py index b6555e1e9ab1..f9f582547f9c 100644 --- a/python/ray/tests/test_state_api.py +++ b/python/ray/tests/test_state_api.py @@ -2154,19 +2154,23 @@ def test_list_get_nodes(ray_start_cluster): cluster = ray_start_cluster cluster.add_node(num_cpus=1, node_name="head_node") ray.init(address=cluster.address) - cluster.add_node(num_cpus=1, node_name="worker_node") + worker_node = cluster.add_node(num_cpus=1, node_name="worker_node") + + cluster.remove_node(worker_node) def verify(): nodes = list_nodes(detail=True) for node in nodes: - assert node["state"] == "ALIVE" assert is_hex(node["node_id"]) - assert ( - node["is_head_node"] - if node["node_name"] == "head_node" - else not node["is_head_node"] - ) assert node["labels"] == {"ray.io/node_id": node["node_id"]} + if node["node_name"] == "head_node": + assert node["is_head_node"] + assert node["state"] == "ALIVE" + assert node["state_message"] is None + else: + assert not node["is_head_node"] + assert node["state"] == "DEAD" + assert node["state_message"] == "Expected termination: received SIGTERM" # Check with legacy API check_nodes = ray.nodes() diff --git a/python/ray/tests/test_task_events.py b/python/ray/tests/test_task_events.py index 89434049349d..b1210b59aff3 100644 --- a/python/ray/tests/test_task_events.py +++ b/python/ray/tests/test_task_events.py @@ -204,7 +204,8 @@ def driver_running(): verify_failed_task, name="node-killed", error_type="NODE_DIED", - error_message="Task failed due to the node dying", + error_message="Task failed due to the node (where this task was running) " + " was dead or unavailable", ) diff --git a/python/ray/util/state/common.py b/python/ray/util/state/common.py index 366648674ee4..9dbb56286c61 100644 --- a/python/ray/util/state/common.py +++ b/python/ray/util/state/common.py @@ -507,6 +507,9 @@ class NodeState(StateSchema): #: ALIVE: The node is alive. #: DEAD: The node is dead. state: TypeNodeStatus = state_column(filterable=True) + #: The state message of the node. + #: This provides more detailed information about the node's state. + state_message: Optional[str] = state_column(filterable=False) #: The name of the node if it is given by the name argument. node_name: str = state_column(filterable=True) #: The total resources of the node. diff --git a/src/ray/core_worker/transport/direct_task_transport.cc b/src/ray/core_worker/transport/direct_task_transport.cc index 54799a1acba0..b419fae2ffea 100644 --- a/src/ray/core_worker/transport/direct_task_transport.cc +++ b/src/ray/core_worker/transport/direct_task_transport.cc @@ -728,13 +728,16 @@ void CoreWorkerDirectTaskSubmitter::HandleGetTaskFailureCause( << " ip: " << addr.ip_address(); task_error_type = rpc::ErrorType::NODE_DIED; std::stringstream buffer; - buffer << "Task failed due to the node dying.\n\nThe node (IP: " << addr.ip_address() - << ", node ID: " << NodeID::FromBinary(addr.raylet_id()) - << ") where this task was running crashed unexpectedly. " - << "This can happen if: (1) the instance where the node was running failed, " - "(2) raylet crashes unexpectedly (OOM, preempted node, etc).\n\n" - << "To see more information about the crash, use `ray logs raylet.out -ip " - << addr.ip_address() << "`"; + buffer << "Task failed due to the node (where this task was running) " + << " was dead or unavailable.\n\nThe node IP: " << addr.ip_address() + << ", node ID: " << NodeID::FromBinary(addr.raylet_id()) << "\n\n" + << "This can happen if the instance where the node was running failed, " + << "the node was preempted, or raylet crashed unexpectedly " + << "(e.g., due to OOM) etc.\n\n" + << "To see node death information, use `ray list nodes --filter \"node_id=" + << NodeID::FromBinary(addr.raylet_id()) << "\"`, " + << "or check Ray dashboard cluster page, or search the node ID in GCS log, " + << "or use `ray logs raylet.out -ip " << addr.ip_address() << "`"; error_info = std::make_unique(); error_info->set_error_message(buffer.str()); error_info->set_error_type(rpc::ErrorType::NODE_DIED);