From 61107b6e6f0feaff5ceb84705590c0a62cd42e23 Mon Sep 17 00:00:00 2001 From: Alexander Zhipa Date: Fri, 17 Oct 2025 13:47:14 -0400 Subject: [PATCH] fix: handle kubernetes describe failures gracefully (#1150) --- scripts/setup_pyre.sh | 0 torchx/schedulers/kubernetes_scheduler.py | 21 ++++++++++----- .../test/kubernetes_scheduler_test.py | 26 +++++++++++++++++++ 3 files changed, 40 insertions(+), 7 deletions(-) mode change 100644 => 100755 scripts/setup_pyre.sh diff --git a/scripts/setup_pyre.sh b/scripts/setup_pyre.sh old mode 100644 new mode 100755 diff --git a/torchx/schedulers/kubernetes_scheduler.py b/torchx/schedulers/kubernetes_scheduler.py index 2ca799c02..627c85a1e 100644 --- a/torchx/schedulers/kubernetes_scheduler.py +++ b/torchx/schedulers/kubernetes_scheduler.py @@ -706,16 +706,23 @@ def _run_opts(self) -> runopts: return opts def describe(self, app_id: str) -> Optional[DescribeAppResponse]: + from kubernetes.client.rest import ApiException + namespace, name = app_id.split(":") roles = {} roles_statuses = {} - resp = self._custom_objects_api().get_namespaced_custom_object_status( - group="batch.volcano.sh", - version="v1alpha1", - namespace=namespace, - plural="jobs", - name=name, - ) + try: + resp = self._custom_objects_api().get_namespaced_custom_object_status( + group="batch.volcano.sh", + version="v1alpha1", + namespace=namespace, + plural="jobs", + name=name, + ) + except ApiException as e: + if e.status == 404: + return None + raise status = resp.get("status") if status: state_str = status["state"]["phase"] diff --git a/torchx/schedulers/test/kubernetes_scheduler_test.py b/torchx/schedulers/test/kubernetes_scheduler_test.py index c81e7d086..516966c13 100644 --- a/torchx/schedulers/test/kubernetes_scheduler_test.py +++ b/torchx/schedulers/test/kubernetes_scheduler_test.py @@ -714,6 +714,32 @@ def test_describe_unknown( ), ) + @patch("kubernetes.client.CustomObjectsApi.get_namespaced_custom_object_status") + def test_describe_api_exception_404( + self, get_namespaced_custom_object_status: MagicMock + ) -> None: + from kubernetes.client.rest import ApiException + + api_exc = ApiException(status=404, reason="Not Found") + get_namespaced_custom_object_status.side_effect = api_exc + app_id = "testnamespace:testid" + scheduler = create_scheduler("test") + info = scheduler.describe(app_id) + self.assertIsNone(info) + + @patch("kubernetes.client.CustomObjectsApi.get_namespaced_custom_object_status") + def test_describe_api_exception_other( + self, get_namespaced_custom_object_status: MagicMock + ) -> None: + from kubernetes.client.rest import ApiException + + api_exc = ApiException(status=500, reason="Internal Server Error") + get_namespaced_custom_object_status.side_effect = api_exc + app_id = "testnamespace:testid" + scheduler = create_scheduler("test") + with self.assertRaises(ApiException): + scheduler.describe(app_id) + def test_runopts(self) -> None: scheduler = kubernetes_scheduler.create_scheduler("foo") runopts = scheduler.run_opts()