From 8bd8fca71b5fae38b1493c547d15e73da40b32e1 Mon Sep 17 00:00:00 2001
From: Mike Sarahan
Date: Thu, 10 Oct 2024 09:58:38 -0500
Subject: [PATCH] add opentelemetry helper scripts (#119)

---
Review notes (text between the three-dash line and the first diff is not
part of the commit message):

* tools/rapids-otel-wrap
  - Detect otel-cli by the exit status of `type`. The previous
    `[[ $(type ...) -eq 0 ]]` compared an empty string with 0, which is
    always true, so the instrumented branch ran even without otel-cli.
  - Send the wrapper's own diagnostics to stderr so that stdout holds only
    the wrapped command's output.
  - Run the uninstrumented command as "$@" instead of `eval "$*"` so that
    arguments are not re-split or re-interpreted by the shell.
* Added the missing trailing newline to the three new files that lacked
  one, docstrings to the tests, and comments to
  rapids-get-telemetry-trace-id.
* Leading whitespace was reconstructed from a whitespace-collapsed copy of
  this patch; use `git apply --ignore-whitespace` if a context line does
  not match. The `index` blob ids are carried over from the original commit.

 tests/test_rapids-get-telemetry-trace-id.py | 52 ++++++++++++++++++++
 tools/rapids-conda-retry                    |  2 +-
 tools/rapids-get-pr-conda-artifact          |  2 +-
 tools/rapids-get-telemetry-trace-id         |  8 ++++
 tools/rapids-get-telemetry-traceparent      | 30 ++++++++++++
 tools/rapids-mamba-retry                    |  2 +-
 tools/rapids-otel-wrap                      | 61 ++++++++++++++++++++++++
 tools/rapids-upload-conda-to-s3             |  2 +-
 tools/rapids-upload-wheels-to-s3            |  2 +-
 9 files changed, 156 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_rapids-get-telemetry-trace-id.py
 create mode 100755 tools/rapids-get-telemetry-trace-id
 create mode 100755 tools/rapids-get-telemetry-traceparent
 create mode 100755 tools/rapids-otel-wrap

diff --git a/tests/test_rapids-get-telemetry-trace-id.py b/tests/test_rapids-get-telemetry-trace-id.py
new file mode 100644
index 0000000..0da8b11
--- /dev/null
+++ b/tests/test_rapids-get-telemetry-trace-id.py
@@ -0,0 +1,52 @@
+import os.path
+import subprocess
+
+TOOLS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "tools")
+
+def test_rapids_compute_trace_id():
+    """The trace ID is 32 hex characters derived from the repository, run ID and run attempt."""
+    result = subprocess.run(
+        os.path.join(TOOLS_DIR, "rapids-get-telemetry-trace-id"),
+        env={
+            "GITHUB_REPOSITORY": "rapidsai/gha-tools",
+            "GITHUB_RUN_ID": "1123123",
+            "RUN_ATTEMPT": "1"
+        },
+        text=True,
+        capture_output=True,
+    )
+    assert result.stdout.strip() == "22ab4ec60f37f446b4a95917e86660df"
+    assert result.stderr == ""
+    assert result.returncode == 0
+
+def test_rapids_get_traceparent():
+    """With only a job name, the traceparent carries the job-level span ID."""
+    result = subprocess.run(
+        [os.path.join(TOOLS_DIR, "rapids-get-telemetry-traceparent"), "my_job"],
+        env={
+            "GITHUB_REPOSITORY": "rapidsai/gha-tools",
+            "GITHUB_RUN_ID": "1123123",
+            "RUN_ATTEMPT": "1"
+        },
+        text=True,
+        capture_output=True,
+    )
+    assert result.stdout.strip() == "00-22ab4ec60f37f446b4a95917e86660df-5f57388b5b07a3e8-01"
+    assert result.stderr == ""
+    assert result.returncode == 0
+
+def test_rapids_get_traceparent_with_step():
+    """With a job name and a step name, the traceparent carries the step-level span ID."""
+    result = subprocess.run(
+        [os.path.join(TOOLS_DIR, "rapids-get-telemetry-traceparent"), "my_job", "my step"],
+        env={
+            "GITHUB_REPOSITORY": "rapidsai/gha-tools",
+            "GITHUB_RUN_ID": "1123123",
+            "RUN_ATTEMPT": "1"
+        },
+        text=True,
+        capture_output=True,
+    )
+    assert result.stdout.strip() == "00-22ab4ec60f37f446b4a95917e86660df-a6e5bc57fad91889-01"
+    assert result.stderr == ""
+    assert result.returncode == 0
diff --git a/tools/rapids-conda-retry b/tools/rapids-conda-retry
index 9ce59ca..78e25b2 100755
--- a/tools/rapids-conda-retry
+++ b/tools/rapids-conda-retry
@@ -67,7 +67,7 @@ condaCmd=${RAPIDS_CONDA_EXE:=conda}
 # needToRetry: 1 if the command should be retried, 0 if it should not be
 function runConda {
     # shellcheck disable=SC2086
-    ${condaCmd} ${args} 2>&1| tee "${outfile}"
+    rapids-otel-wrap ${condaCmd} ${args} 2>&1| tee "${outfile}"
     exitcode=$?
     needToRetry=0
     needToClean=0
diff --git a/tools/rapids-get-pr-conda-artifact b/tools/rapids-get-pr-conda-artifact
index 3c3605d..2e9885b 100755
--- a/tools/rapids-get-pr-conda-artifact
+++ b/tools/rapids-get-pr-conda-artifact
@@ -33,4 +33,4 @@ if [[ -z "${commit}" ]]; then
   commit=$(git ls-remote https://github.com/rapidsai/"${repo}".git refs/heads/pull-request/"${pr}" | cut -c1-7)
 fi
 
-rapids-get-artifact "ci/${repo}/pull-request/${pr}/${commit}/${artifact_name}"
+rapids-otel-wrap rapids-get-artifact "ci/${repo}/pull-request/${pr}/${commit}/${artifact_name}"
diff --git a/tools/rapids-get-telemetry-trace-id b/tools/rapids-get-telemetry-trace-id
new file mode 100755
index 0000000..f69c777
--- /dev/null
+++ b/tools/rapids-get-telemetry-trace-id
@@ -0,0 +1,8 @@
+#!/bin/bash
+# This is a global, per-run identifier. It is the same across all jobs and all steps within all jobs.
+# It is constant from the source repo, to shared-workflows, to shared-actions.
+# NOTE: `echo` appends a trailing newline that is part of the hashed input; keep it so IDs stay stable.
+
+sha="$(echo "${GITHUB_REPOSITORY}+${GITHUB_RUN_ID}+${RUN_ATTEMPT}" | sha256sum | cut -f1 -d' ')"
+# A W3C trace-id is 32 hex characters, so keep only the first 32 characters of the digest.
+echo "${sha:0:32}"
diff --git a/tools/rapids-get-telemetry-traceparent b/tools/rapids-get-telemetry-traceparent
new file mode 100755
index 0000000..cf7d006
--- /dev/null
+++ b/tools/rapids-get-telemetry-traceparent
@@ -0,0 +1,30 @@
+#!/bin/bash
+# This emits a TRACEPARENT, which follows the w3c trace context standard.
+# https://www.w3.org/TR/trace-context/
+#
+# This script can operate for two purposes:
+# 1. The top level of a job, whether it is the job at the source repo (e.g. rmm) level, or
+#    the matrix job level
+# 2. The steps level within a job, which uses both the job name and the step name
+#
+# The job name must always be provided as the first argument.
+# A step name MAY be provided as the second argument. If it is specified, the output corresponds to
+# the step within the context of its job.
+
+JOB_NAME=$1
+STEP_NAME=${2:-}
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+TRACE_ID="$("${SCRIPT_DIR}"/rapids-get-telemetry-trace-id)"
+JOB_SPAN_ID="${TRACE_ID}-${JOB_NAME}"
+STEP_SPAN_ID="${JOB_SPAN_ID}-${STEP_NAME}"
+
+JOB_TRACEPARENT=$(echo -n "${JOB_SPAN_ID}" | sha256sum | cut -f1 -d' ')
+STEP_TRACEPARENT=$(echo -n "${STEP_SPAN_ID}" | sha256sum | cut -f1 -d' ')
+
+if [ "${STEP_NAME}" != "" ]; then
+    echo "00-${TRACE_ID}-${STEP_TRACEPARENT:0:16}-01"
+else
+    echo "00-${TRACE_ID}-${JOB_TRACEPARENT:0:16}-01"
+fi
diff --git a/tools/rapids-mamba-retry b/tools/rapids-mamba-retry
index 5db5d00..c0bb374 100755
--- a/tools/rapids-mamba-retry
+++ b/tools/rapids-mamba-retry
@@ -46,4 +46,4 @@ for arg in "$@"; do
     fi
 done
 
-rapids-conda-retry "$@"
+rapids-otel-wrap rapids-conda-retry "${args[@]}"
diff --git a/tools/rapids-otel-wrap b/tools/rapids-otel-wrap
new file mode 100755
index 0000000..7f2cfea
--- /dev/null
+++ b/tools/rapids-otel-wrap
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Wraps arbitrary commands with arbitrary args. Emits an OpenTelemetry span for tracing the command
+#
+# All diagnostics from this wrapper go to stderr so that stdout carries only the wrapped
+# command's output, which callers may capture with $(...).
+
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+
+RAPIDS_OTEL_TRACES_EXPORTER="${RAPIDS_OTEL_TRACES_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}"
+RAPIDS_OTEL_METRICS_EXPORTER="${RAPIDS_OTEL_METRICS_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}"
+RAPIDS_OTEL_LOGS_EXPORTER="${RAPIDS_OTEL_LOGS_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}"
+OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/traces}"
+OTEL_EXPORTER_OTLP_METRICS_ENDPOINT="${OTEL_EXPORTER_OTLP_METRICS_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/metrics}"
+OTEL_EXPORTER_OTLP_LOGS_ENDPOINT="${OTEL_EXPORTER_OTLP_LOGS_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/logs}"
+export TRACEPARENT="${TRACEPARENT}"
+
+# Test the exit status of `type` directly. Wrapping it in $(...) would capture its (empty)
+# stdout instead, and an empty string compared with `-eq 0` is always true.
+if type otel-cli >/dev/null 2>&1 && [ "$TRACEPARENT" != "" ]; then
+    echo "Running command with OpenTelemetry instrumentation" >&2
+
+    set -x
+    if [ "$OTEL_SERVICE_NAME" = "" ]; then
+        echo "WARNING: OTEL_SERVICE_NAME variable not provided. Traces from different steps may not be associated correctly." >&2
+    fi
+
+    # Some commands have instrumentation. For example, conda-build has monkey-patched instrumentation
+    # that can be activated with the opentelemetry-instrument command. For these commands,
+    # we replace the command with the wrapped command, quoted as a whole for the purposes
+    # of otel-cli exec, so that flags don't get confused.
+    case "$1" in
+        conda* )
+            echo "using opentelemetry-instrument for command" >&2
+            command="opentelemetry-instrument $*"
+            ;;
+        * )
+            command="$*"
+            ;;
+    esac
+
+    echo "TRACEPARENT prior to otel-cli exec is: \"${TRACEPARENT}\"" >&2
+    STEP_TRACEPARENT=$("${SCRIPT_DIR}/rapids-get-telemetry-traceparent" "${JOB_NAME}" "${OTEL_SERVICE_NAME}")
+
+    # otel-cli creates a span for us that bridges the traceparent from the parent process
+    # into the command we're wrapping
+    otel-cli exec \
+        --name "Run instrumented $*" \
+        --force-parent-span-id "$(cut -d'-' -f3 <<<"$STEP_TRACEPARENT")" \
+        --verbose \
+        -- "${command}"
+    RETURN_STATUS=$?
+else
+    echo "Skipping instrumentation, running \"${*}\"" >&2
+    # Run the command with its arguments exactly as received. Re-parsing them with
+    # `eval "$*"` would re-split quoted arguments and treat characters such as `>` in
+    # a version spec like "pkg>=1.0" as shell redirection.
+    "$@"
+    RETURN_STATUS=$?
+fi
+
+exit "${RETURN_STATUS}"
diff --git a/tools/rapids-upload-conda-to-s3 b/tools/rapids-upload-conda-to-s3
index 91a049f..f440298 100755
--- a/tools/rapids-upload-conda-to-s3
+++ b/tools/rapids-upload-conda-to-s3
@@ -30,4 +30,4 @@ pkg_name="$(rapids-package-name "$pkg_type")"
 # Where conda build artifacts are output
 path_to_tar_up="${RAPIDS_CONDA_BLD_OUTPUT_DIR}"
 
-rapids-upload-to-s3 "${pkg_name}" "${path_to_tar_up}"
+rapids-otel-wrap rapids-upload-to-s3 "${pkg_name}" "${path_to_tar_up}"
diff --git a/tools/rapids-upload-wheels-to-s3 b/tools/rapids-upload-wheels-to-s3
index 9b35b39..4671cd1 100755
--- a/tools/rapids-upload-wheels-to-s3
+++ b/tools/rapids-upload-wheels-to-s3
@@ -20,4 +20,4 @@ if [ "${CI:-false}" = "false" ]; then
   exit 0
 fi
 
-rapids-upload-to-s3 "${pkg_name}" "$@"
+rapids-otel-wrap rapids-upload-to-s3 "${pkg_name}" "$@"