[Dashboard] Turn on New Dashboard by Default (#11321)

ray-project · Oct 19, 2020 · f500292 · f500292
1 parent 202b185
commit f500292
Show file tree

Hide file tree

Showing 15 changed files with 123 additions and 814 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 # The build output should clearly not be checked in
+*test-output.xml
 /bazel-*
 /python/ray/core
 /python/ray/pickle5_files/
@@ -11,7 +12,7 @@
 /thirdparty/pkg/
 /build/java
 .jar
-
+/dashboard/client/build
 # Files generated by flatc should be ignored
 /src/ray/gcs/format/*_generated.h
 /src/ray/object_manager/format/*_generated.h

diff --git a/ci/travis/test-wheels.sh b/ci/travis/test-wheels.sh
@@ -22,7 +22,6 @@ if [ -z "${BUILD_DIR}" ]; then
 fi
 TEST_DIR="${BUILD_DIR}/python/ray/tests"
 TEST_SCRIPTS=("$TEST_DIR/test_microbenchmarks.py" "$TEST_DIR/test_basic.py")
-UI_TEST_SCRIPT="${BUILD_DIR}/python/ray/tests/test_webui.py"
 
 function retry {
   local n=1
@@ -77,9 +76,6 @@ if [[ "$platform" == "linux" ]]; then
     for SCRIPT in "${TEST_SCRIPTS[@]}"; do
         retry "$PYTHON_EXE" "$SCRIPT"
     done
-
-    # Run the UI test to make sure that the packaged UI works.
-    retry "$PYTHON_EXE" "$UI_TEST_SCRIPT"
   done
 
   # Check that the other wheels are present.
@@ -118,12 +114,6 @@ elif [[ "$platform" == "macosx" ]]; then
     for SCRIPT in "${TEST_SCRIPTS[@]}"; do
       retry "$PYTHON_EXE" "$SCRIPT"
     done
-
-    if (( $(echo "$PY_MM >= 3.0" | bc) )); then
-      # Run the UI test to make sure that the packaged UI works.
-      retry "$PYTHON_EXE" "$UI_TEST_SCRIPT"
-    fi
-
   done
 elif [ "${platform}" = windows ]; then
   echo "WARNING: Wheel testing not yet implemented for Windows."

diff --git a/dashboard/agent.py b/dashboard/agent.py
@@ -38,6 +38,7 @@
 class DashboardAgent(object):
     def __init__(self,
                  redis_address,
+                 dashboard_agent_port,
                  redis_password=None,
                  temp_dir=None,
                  log_dir=None,
@@ -51,6 +52,7 @@ def __init__(self,
         self.redis_password = redis_password
         self.temp_dir = temp_dir
         self.log_dir = log_dir
+        self.dashboard_agent_port = dashboard_agent_port
         self.metrics_export_port = metrics_export_port
         self.node_manager_port = node_manager_port
         self.object_store_name = object_store_name
@@ -59,7 +61,8 @@ def __init__(self,
         assert self.node_id, "Empty node id (RAY_NODE_ID)."
         self.ip = ray._private.services.get_node_ip_address()
         self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
-        self.grpc_port = self.server.add_insecure_port("[::]:0")
+        self.grpc_port = self.server.add_insecure_port(
+            f"[::]:{self.dashboard_agent_port}")
         logger.info("Dashboard agent grpc address: %s:%s", self.ip,
                     self.grpc_port)
         self.aioredis_client = None
@@ -186,6 +189,11 @@ async def _check_parent():
         required=True,
         type=int,
         help="The port to expose metrics through Prometheus.")
+    parser.add_argument(
+        "--dashboard-agent-port",
+        required=True,
+        type=int,
+        help="The port on which the dashboard agent will receive GRPCs.")
     parser.add_argument(
         "--node-manager-port",
         required=True,
@@ -288,6 +296,7 @@ async def _check_parent():
 
         agent = DashboardAgent(
             args.redis_address,
+            args.dashboard_agent_port,
             redis_password=args.redis_password,
             temp_dir=temp_dir,
             log_dir=log_dir,

diff --git a/dashboard/dashboard.py b/dashboard/dashboard.py
@@ -3,7 +3,6 @@
 except ImportError:
     print("The dashboard requires aiohttp to run.")
     import sys
-
     sys.exit(1)
 
 import argparse

diff --git a/dashboard/datacenter.py b/dashboard/datacenter.py
@@ -111,15 +111,13 @@ async def get_node_info(cls, node_id):
         node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
         node_stats = DataSource.node_stats.get(node_id, {})
         node = DataSource.nodes.get(node_id, {})
-
+        node_ip = DataSource.node_id_to_ip.get(node_id)
         # Merge node log count information into the payload
-        log_info = DataSource.ip_and_pid_to_logs.get(node_physical_stats["ip"],
-                                                     {})
+        log_info = DataSource.ip_and_pid_to_logs.get(node_ip, {})
         node_log_count = 0
         for entries in log_info.values():
             node_log_count += len(entries)
-        error_info = DataSource.ip_and_pid_to_errors.get(
-            node_physical_stats["ip"], {})
+        error_info = DataSource.ip_and_pid_to_errors.get(node_ip, {})
         node_err_count = 0
         for entries in error_info.values():
             node_err_count += len(entries)

diff --git a/dashboard/modules/logical_view/test_logical_view_head.py b/dashboard/modules/logical_view/test_logical_view_head.py
@@ -33,9 +33,8 @@ class InfeasibleActor:
     foo_actors = [Foo.remote(4), Foo.remote(5)]
     infeasible_actor = InfeasibleActor.remote()  # noqa
     results = [actor.do_task.remote() for actor in foo_actors]  # noqa
-    assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
-            is True)
     webui_url = ray_start_with_dashboard["webui_url"]
+    assert wait_until_server_available(webui_url)
     webui_url = format_web_url(webui_url)
 
     timeout_seconds = 5
@@ -75,5 +74,66 @@ class InfeasibleActor:
                 raise Exception(f"Timed out while testing, {ex_stack}")
 
 
+def test_kill_actor(ray_start_with_dashboard):
+    @ray.remote
+    class Actor:
+        def __init__(self):
+            pass
+
+        def f(self):
+            ray.show_in_dashboard("test")
+            return os.getpid()
+
+    a = Actor.remote()
+    worker_pid = ray.get(a.f.remote())  # noqa
+
+    webui_url = ray_start_with_dashboard["webui_url"]
+    assert wait_until_server_available(webui_url)
+    webui_url = format_web_url(webui_url)
+
+    def actor_killed(pid):
+        """Check For the existence of a unix pid."""
+        try:
+            os.kill(pid, 0)
+        except OSError:
+            return True
+        else:
+            return False
+
+    def get_actor():
+        resp = requests.get(f"{webui_url}/logical/actor_groups")
+        resp.raise_for_status()
+        actor_groups_resp = resp.json()
+        assert actor_groups_resp["result"] is True, actor_groups_resp["msg"]
+        actor_groups = actor_groups_resp["data"]["actorGroups"]
+        actor = actor_groups["Actor"]["entries"][0]
+        return actor
+
+    def kill_actor_using_dashboard(actor):
+        resp = requests.get(
+            webui_url + "/logical/kill_actor",
+            params={
+                "actorId": actor["actorId"],
+                "ipAddress": actor["ipAddress"],
+                "port": actor["port"]
+            })
+        resp.raise_for_status()
+        resp_json = resp.json()
+        assert resp_json["result"] is True, "msg" in resp_json
+
+    start = time.time()
+    last_exc = None
+    while time.time() - start <= 10:
+        try:
+            actor = get_actor()
+            kill_actor_using_dashboard(actor)
+            last_exc = None
+            break
+        except (KeyError, AssertionError) as e:
+            last_exc = e
+            time.sleep(.1)
+    assert last_exc is None
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))
diff --git a/dashboard/modules/reporter/reporter_agent.py b/dashboard/modules/reporter/reporter_agent.py
@@ -94,23 +94,15 @@ async def GetProfilingStats(self, request, context):
         return reporter_pb2.GetProfilingStatsReply(
             profiling_stats=profiling_stats, std_out=stdout, std_err=stderr)
 
-    async def ReportMetrics(self, request, context):
-        # NOTE: Exceptions are not propagated properly
-        # when we don't catch them here.
+    async def ReportOCMetrics(self, request, context):
+        # This function receives a GRPC containing OpenCensus (OC) metrics
+        # from a Ray process, then exposes those metrics to Prometheus.
         try:
-            metrcs_description_required = (
-                self._metrics_agent.record_metrics_points(
-                    request.metrics_points))
-        except Exception as e:
-            logger.error(e)
+            self._metrics_agent.record_metric_points_from_protobuf(
+                request.metrics)
+        except Exception:
             logger.error(traceback.format_exc())
-
-        # If metrics description is missing, we should notify cpp processes
-        # that we need them. Cpp processes will then report them to here.
-        # We need it when (1) a new metric is reported (application metric)
-        # (2) a reporter goes down and restarted (currently not implemented).
-        return reporter_pb2.ReportMetricsReply(
-            metrcs_description_required=metrcs_description_required)
+        return reporter_pb2.ReportOCMetricsReply()
 
     @staticmethod
     def _get_cpu_percent():
@@ -125,8 +117,7 @@ def _get_gpu_usage():
         try:
             gpus = gpustat.new_query().gpus
         except Exception as e:
-            logger.debug(
-                "gpustat failed to retrieve GPU information: {}".format(e))
+            logger.debug(f"gpustat failed to retrieve GPU information: {e}")
         for gpu in gpus:
             # Note the keys in this dict have periods which throws
             # off javascript so we change .s to _s
@@ -233,12 +224,8 @@ def _get_all_stats(self):
             "cmdline": self._get_raylet_cmdline(),
         }
 
-    async def _perform_iteration(self):
+    async def _perform_iteration(self, aioredis_client):
         """Get any changes to the log files and push updates to Redis."""
-        aioredis_client = await aioredis.create_redis_pool(
-            address=self._dashboard_agent.redis_address,
-            password=self._dashboard_agent.redis_password)
-
         while True:
             try:
                 stats = self._get_all_stats()
@@ -249,5 +236,8 @@ async def _perform_iteration(self):
                 reporter_consts.REPORTER_UPDATE_INTERVAL_MS / 1000)
 
     async def run(self, server):
+        aioredis_client = await aioredis.create_redis_pool(
+            address=self._dashboard_agent.redis_address,
+            password=self._dashboard_agent.redis_password)
         reporter_pb2_grpc.add_ReporterServiceServicer_to_server(self, server)
-        await self._perform_iteration()
+        await self._perform_iteration(aioredis_client)
diff --git a/dashboard/modules/tune/tune_head.py b/dashboard/modules/tune/tune_head.py
@@ -130,7 +130,7 @@ async def collect(self):
 
         # search through all the sub_directories in log directory
         analysis = Analysis(str(self._logdir))
-        df = analysis.dataframe(metric="episode_reward_mean", mode="max")
+        df = analysis.dataframe(metric=None, mode=None)
 
         if len(df) == 0 or "trial_id" not in df.columns:
             return

diff --git a/python/build-wheel-macos.sh b/python/build-wheel-macos.sh
@@ -39,7 +39,8 @@ source "$HOME"/.nvm/nvm.sh
 nvm use node
 
 # Build the dashboard so its static assets can be included in the wheel.
-pushd python/ray/dashboard/client
+# TODO(mfitton): switch this back when deleting old dashboard code.
+pushd python/ray/new_dashboard/client
   npm ci
   npm run build
 popd

diff --git a/python/build-wheel-manylinux1.sh b/python/build-wheel-manylinux1.sh
@@ -35,7 +35,8 @@ nvm install node
 nvm use node
 
 # Build the dashboard so its static assets can be included in the wheel.
-pushd python/ray/dashboard/client
+# TODO(mfitton): switch this back when deleting old dashboard code.
+pushd python/ray/new_dashboard/client
   npm ci
   npm run build
 popd

diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py
@@ -495,10 +495,18 @@ def preexec_fn():
             process.kill()
             raise
 
+    def _get_stream_name(stream):
+        if stream is not None:
+            try:
+                return stream.name
+            except AttributeError:
+                return str(stream)
+        return None
+
     return ProcessInfo(
         process=process,
-        stdout_file=stdout_file.name if stdout_file is not None else None,
-        stderr_file=stderr_file.name if stderr_file is not None else None,
+        stdout_file=_get_stream_name(stdout_file),
+        stderr_file=_get_stream_name(stderr_file),
         use_valgrind=use_valgrind,
         use_gdb=use_gdb,
         use_valgrind_profiler=use_valgrind_profiler,
@@ -1037,12 +1045,7 @@ def start_dashboard(require_dashboard,
             raise ValueError(
                 f"The given dashboard port {port} is already in use")
 
-    if "RAY_USE_NEW_DASHBOARD" in os.environ:
-        dashboard_dir = "new_dashboard"
-    else:
-        dashboard_dir = "dashboard"
-        logdir = None
-
+    dashboard_dir = "new_dashboard"
     dashboard_filepath = os.path.join(RAY_PATH, dashboard_dir, "dashboard.py")
     command = [
         sys.executable,
@@ -1058,21 +1061,20 @@ def start_dashboard(require_dashboard,
     if redis_password:
         command += ["--redis-password", redis_password]
 
-    webui_dependencies_present = True
+    dashboard_dependencies_present = True
     try:
         import aiohttp  # noqa: F401
         import grpc  # noqa: F401
     except ImportError:
-        webui_dependencies_present = False
+        dashboard_dependencies_present = False
         warning_message = (
             "Failed to start the dashboard. The dashboard requires Python 3 "
             "as well as 'pip install aiohttp grpcio'.")
         if require_dashboard:
             raise ImportError(warning_message)
         else:
             logger.warning(warning_message)
-
-    if webui_dependencies_present:
+    if dashboard_dependencies_present:
         process_info = start_ray_process(
             command,
             ray_constants.PROCESS_TYPE_DASHBOARD,
@@ -1319,12 +1321,13 @@ def start_raylet(redis_address,
         sys.executable,
         "-u",
         os.path.join(RAY_PATH, "new_dashboard/agent.py"),
-        "--redis-address={}".format(redis_address),
-        "--metrics-export-port={}".format(metrics_export_port),
-        "--node-manager-port={}".format(node_manager_port),
-        "--object-store-name={}".format(plasma_store_name),
-        "--raylet-name={}".format(raylet_name),
-        "--temp-dir={}".format(temp_dir),
+        f"--redis-address={redis_address}",
+        f"--metrics-export-port={metrics_export_port}",
+        f"--dashboard-agent-port={metrics_agent_port}",
+        f"--node-manager-port={node_manager_port}",
+        f"--object-store-name={plasma_store_name}",
+        f"--raylet-name={raylet_name}",
+        f"--temp-dir={temp_dir}",
     ]
 
     if redis_password is not None and len(redis_password) != 0:
@@ -1357,9 +1360,8 @@ def start_raylet(redis_address,
     if start_initial_python_workers_for_first_job:
         command.append("--num_initial_python_workers_for_first_job={}".format(
             resource_spec.num_cpus))
-    if "RAY_USE_NEW_DASHBOARD" in os.environ:
-        command.append("--agent_command={}".format(
-            subprocess.list2cmdline(agent_command)))
+    command.append("--agent_command={}".format(
+        subprocess.list2cmdline(agent_command)))
     if config.get("plasma_store_as_thread"):
         # command related to the plasma store
         command += [