Skip to content

Commit

Permalink
[Dashboard] Turn on New Dashboard by Default (#11321)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfitton authored Oct 19, 2020
1 parent 202b185 commit f500292
Show file tree
Hide file tree
Showing 15 changed files with 123 additions and 814 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# The build output should clearly not be checked in
*test-output.xml
/bazel-*
/python/ray/core
/python/ray/pickle5_files/
Expand All @@ -11,7 +12,7 @@
/thirdparty/pkg/
/build/java
.jar

/dashboard/client/build
# Files generated by flatc should be ignored
/src/ray/gcs/format/*_generated.h
/src/ray/object_manager/format/*_generated.h
Expand Down
10 changes: 0 additions & 10 deletions ci/travis/test-wheels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ if [ -z "${BUILD_DIR}" ]; then
fi
TEST_DIR="${BUILD_DIR}/python/ray/tests"
TEST_SCRIPTS=("$TEST_DIR/test_microbenchmarks.py" "$TEST_DIR/test_basic.py")
UI_TEST_SCRIPT="${BUILD_DIR}/python/ray/tests/test_webui.py"

function retry {
local n=1
Expand Down Expand Up @@ -77,9 +76,6 @@ if [[ "$platform" == "linux" ]]; then
for SCRIPT in "${TEST_SCRIPTS[@]}"; do
retry "$PYTHON_EXE" "$SCRIPT"
done

# Run the UI test to make sure that the packaged UI works.
retry "$PYTHON_EXE" "$UI_TEST_SCRIPT"
done

# Check that the other wheels are present.
Expand Down Expand Up @@ -118,12 +114,6 @@ elif [[ "$platform" == "macosx" ]]; then
for SCRIPT in "${TEST_SCRIPTS[@]}"; do
retry "$PYTHON_EXE" "$SCRIPT"
done

if (( $(echo "$PY_MM >= 3.0" | bc) )); then
# Run the UI test to make sure that the packaged UI works.
retry "$PYTHON_EXE" "$UI_TEST_SCRIPT"
fi

done
elif [ "${platform}" = windows ]; then
echo "WARNING: Wheel testing not yet implemented for Windows."
Expand Down
11 changes: 10 additions & 1 deletion dashboard/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
class DashboardAgent(object):
def __init__(self,
redis_address,
dashboard_agent_port,
redis_password=None,
temp_dir=None,
log_dir=None,
Expand All @@ -51,6 +52,7 @@ def __init__(self,
self.redis_password = redis_password
self.temp_dir = temp_dir
self.log_dir = log_dir
self.dashboard_agent_port = dashboard_agent_port
self.metrics_export_port = metrics_export_port
self.node_manager_port = node_manager_port
self.object_store_name = object_store_name
Expand All @@ -59,7 +61,8 @@ def __init__(self,
assert self.node_id, "Empty node id (RAY_NODE_ID)."
self.ip = ray._private.services.get_node_ip_address()
self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
self.grpc_port = self.server.add_insecure_port("[::]:0")
self.grpc_port = self.server.add_insecure_port(
f"[::]:{self.dashboard_agent_port}")
logger.info("Dashboard agent grpc address: %s:%s", self.ip,
self.grpc_port)
self.aioredis_client = None
Expand Down Expand Up @@ -186,6 +189,11 @@ async def _check_parent():
required=True,
type=int,
help="The port to expose metrics through Prometheus.")
parser.add_argument(
"--dashboard-agent-port",
required=True,
type=int,
help="The port on which the dashboard agent will receive GRPCs.")
parser.add_argument(
"--node-manager-port",
required=True,
Expand Down Expand Up @@ -288,6 +296,7 @@ async def _check_parent():

agent = DashboardAgent(
args.redis_address,
args.dashboard_agent_port,
redis_password=args.redis_password,
temp_dir=temp_dir,
log_dir=log_dir,
Expand Down
1 change: 0 additions & 1 deletion dashboard/dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
except ImportError:
print("The dashboard requires aiohttp to run.")
import sys

sys.exit(1)

import argparse
Expand Down
8 changes: 3 additions & 5 deletions dashboard/datacenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,15 +111,13 @@ async def get_node_info(cls, node_id):
node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
node_stats = DataSource.node_stats.get(node_id, {})
node = DataSource.nodes.get(node_id, {})

node_ip = DataSource.node_id_to_ip.get(node_id)
# Merge node log count information into the payload
log_info = DataSource.ip_and_pid_to_logs.get(node_physical_stats["ip"],
{})
log_info = DataSource.ip_and_pid_to_logs.get(node_ip, {})
node_log_count = 0
for entries in log_info.values():
node_log_count += len(entries)
error_info = DataSource.ip_and_pid_to_errors.get(
node_physical_stats["ip"], {})
error_info = DataSource.ip_and_pid_to_errors.get(node_ip, {})
node_err_count = 0
for entries in error_info.values():
node_err_count += len(entries)
Expand Down
64 changes: 62 additions & 2 deletions dashboard/modules/logical_view/test_logical_view_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,8 @@ class InfeasibleActor:
foo_actors = [Foo.remote(4), Foo.remote(5)]
infeasible_actor = InfeasibleActor.remote() # noqa
results = [actor.do_task.remote() for actor in foo_actors] # noqa
assert (wait_until_server_available(ray_start_with_dashboard["webui_url"])
is True)
webui_url = ray_start_with_dashboard["webui_url"]
assert wait_until_server_available(webui_url)
webui_url = format_web_url(webui_url)

timeout_seconds = 5
Expand Down Expand Up @@ -75,5 +74,66 @@ class InfeasibleActor:
raise Exception(f"Timed out while testing, {ex_stack}")


def test_kill_actor(ray_start_with_dashboard):
@ray.remote
class Actor:
def __init__(self):
pass

def f(self):
ray.show_in_dashboard("test")
return os.getpid()

a = Actor.remote()
worker_pid = ray.get(a.f.remote()) # noqa

webui_url = ray_start_with_dashboard["webui_url"]
assert wait_until_server_available(webui_url)
webui_url = format_web_url(webui_url)

def actor_killed(pid):
"""Check For the existence of a unix pid."""
try:
os.kill(pid, 0)
except OSError:
return True
else:
return False

def get_actor():
resp = requests.get(f"{webui_url}/logical/actor_groups")
resp.raise_for_status()
actor_groups_resp = resp.json()
assert actor_groups_resp["result"] is True, actor_groups_resp["msg"]
actor_groups = actor_groups_resp["data"]["actorGroups"]
actor = actor_groups["Actor"]["entries"][0]
return actor

def kill_actor_using_dashboard(actor):
resp = requests.get(
webui_url + "/logical/kill_actor",
params={
"actorId": actor["actorId"],
"ipAddress": actor["ipAddress"],
"port": actor["port"]
})
resp.raise_for_status()
resp_json = resp.json()
assert resp_json["result"] is True, "msg" in resp_json

start = time.time()
last_exc = None
while time.time() - start <= 10:
try:
actor = get_actor()
kill_actor_using_dashboard(actor)
last_exc = None
break
except (KeyError, AssertionError) as e:
last_exc = e
time.sleep(.1)
assert last_exc is None


if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))
36 changes: 13 additions & 23 deletions dashboard/modules/reporter/reporter_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,23 +94,15 @@ async def GetProfilingStats(self, request, context):
return reporter_pb2.GetProfilingStatsReply(
profiling_stats=profiling_stats, std_out=stdout, std_err=stderr)

async def ReportMetrics(self, request, context):
# NOTE: Exceptions are not propagated properly
# when we don't catch them here.
async def ReportOCMetrics(self, request, context):
# This function receives a GRPC containing OpenCensus (OC) metrics
# from a Ray process, then exposes those metrics to Prometheus.
try:
metrcs_description_required = (
self._metrics_agent.record_metrics_points(
request.metrics_points))
except Exception as e:
logger.error(e)
self._metrics_agent.record_metric_points_from_protobuf(
request.metrics)
except Exception:
logger.error(traceback.format_exc())

# If metrics description is missing, we should notify cpp processes
# that we need them. Cpp processes will then report them to here.
# We need it when (1) a new metric is reported (application metric)
# (2) a reporter goes down and restarted (currently not implemented).
return reporter_pb2.ReportMetricsReply(
metrcs_description_required=metrcs_description_required)
return reporter_pb2.ReportOCMetricsReply()

@staticmethod
def _get_cpu_percent():
Expand All @@ -125,8 +117,7 @@ def _get_gpu_usage():
try:
gpus = gpustat.new_query().gpus
except Exception as e:
logger.debug(
"gpustat failed to retrieve GPU information: {}".format(e))
logger.debug(f"gpustat failed to retrieve GPU information: {e}")
for gpu in gpus:
# Note the keys in this dict have periods which throws
# off javascript so we change .s to _s
Expand Down Expand Up @@ -233,12 +224,8 @@ def _get_all_stats(self):
"cmdline": self._get_raylet_cmdline(),
}

async def _perform_iteration(self):
async def _perform_iteration(self, aioredis_client):
"""Get any changes to the log files and push updates to Redis."""
aioredis_client = await aioredis.create_redis_pool(
address=self._dashboard_agent.redis_address,
password=self._dashboard_agent.redis_password)

while True:
try:
stats = self._get_all_stats()
Expand All @@ -249,5 +236,8 @@ async def _perform_iteration(self):
reporter_consts.REPORTER_UPDATE_INTERVAL_MS / 1000)

async def run(self, server):
aioredis_client = await aioredis.create_redis_pool(
address=self._dashboard_agent.redis_address,
password=self._dashboard_agent.redis_password)
reporter_pb2_grpc.add_ReporterServiceServicer_to_server(self, server)
await self._perform_iteration()
await self._perform_iteration(aioredis_client)
2 changes: 1 addition & 1 deletion dashboard/modules/tune/tune_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ async def collect(self):

# search through all the sub_directories in log directory
analysis = Analysis(str(self._logdir))
df = analysis.dataframe(metric="episode_reward_mean", mode="max")
df = analysis.dataframe(metric=None, mode=None)

if len(df) == 0 or "trial_id" not in df.columns:
return
Expand Down
3 changes: 2 additions & 1 deletion python/build-wheel-macos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ source "$HOME"/.nvm/nvm.sh
nvm use node

# Build the dashboard so its static assets can be included in the wheel.
pushd python/ray/dashboard/client
# TODO(mfitton): switch this back when deleting old dashboard code.
pushd python/ray/new_dashboard/client
npm ci
npm run build
popd
Expand Down
3 changes: 2 additions & 1 deletion python/build-wheel-manylinux1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ nvm install node
nvm use node

# Build the dashboard so its static assets can be included in the wheel.
pushd python/ray/dashboard/client
# TODO(mfitton): switch this back when deleting old dashboard code.
pushd python/ray/new_dashboard/client
npm ci
npm run build
popd
Expand Down
44 changes: 23 additions & 21 deletions python/ray/_private/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,10 +495,18 @@ def preexec_fn():
process.kill()
raise

def _get_stream_name(stream):
if stream is not None:
try:
return stream.name
except AttributeError:
return str(stream)
return None

return ProcessInfo(
process=process,
stdout_file=stdout_file.name if stdout_file is not None else None,
stderr_file=stderr_file.name if stderr_file is not None else None,
stdout_file=_get_stream_name(stdout_file),
stderr_file=_get_stream_name(stderr_file),
use_valgrind=use_valgrind,
use_gdb=use_gdb,
use_valgrind_profiler=use_valgrind_profiler,
Expand Down Expand Up @@ -1037,12 +1045,7 @@ def start_dashboard(require_dashboard,
raise ValueError(
f"The given dashboard port {port} is already in use")

if "RAY_USE_NEW_DASHBOARD" in os.environ:
dashboard_dir = "new_dashboard"
else:
dashboard_dir = "dashboard"
logdir = None

dashboard_dir = "new_dashboard"
dashboard_filepath = os.path.join(RAY_PATH, dashboard_dir, "dashboard.py")
command = [
sys.executable,
Expand All @@ -1058,21 +1061,20 @@ def start_dashboard(require_dashboard,
if redis_password:
command += ["--redis-password", redis_password]

webui_dependencies_present = True
dashboard_dependencies_present = True
try:
import aiohttp # noqa: F401
import grpc # noqa: F401
except ImportError:
webui_dependencies_present = False
dashboard_dependencies_present = False
warning_message = (
"Failed to start the dashboard. The dashboard requires Python 3 "
"as well as 'pip install aiohttp grpcio'.")
if require_dashboard:
raise ImportError(warning_message)
else:
logger.warning(warning_message)

if webui_dependencies_present:
if dashboard_dependencies_present:
process_info = start_ray_process(
command,
ray_constants.PROCESS_TYPE_DASHBOARD,
Expand Down Expand Up @@ -1319,12 +1321,13 @@ def start_raylet(redis_address,
sys.executable,
"-u",
os.path.join(RAY_PATH, "new_dashboard/agent.py"),
"--redis-address={}".format(redis_address),
"--metrics-export-port={}".format(metrics_export_port),
"--node-manager-port={}".format(node_manager_port),
"--object-store-name={}".format(plasma_store_name),
"--raylet-name={}".format(raylet_name),
"--temp-dir={}".format(temp_dir),
f"--redis-address={redis_address}",
f"--metrics-export-port={metrics_export_port}",
f"--dashboard-agent-port={metrics_agent_port}",
f"--node-manager-port={node_manager_port}",
f"--object-store-name={plasma_store_name}",
f"--raylet-name={raylet_name}",
f"--temp-dir={temp_dir}",
]

if redis_password is not None and len(redis_password) != 0:
Expand Down Expand Up @@ -1357,9 +1360,8 @@ def start_raylet(redis_address,
if start_initial_python_workers_for_first_job:
command.append("--num_initial_python_workers_for_first_job={}".format(
resource_spec.num_cpus))
if "RAY_USE_NEW_DASHBOARD" in os.environ:
command.append("--agent_command={}".format(
subprocess.list2cmdline(agent_command)))
command.append("--agent_command={}".format(
subprocess.list2cmdline(agent_command)))
if config.get("plasma_store_as_thread"):
# command related to the plasma store
command += [
Expand Down
Loading

0 comments on commit f500292

Please sign in to comment.