Skip to content

Commit

Permalink
add opentelemetry helper scripts (#119) (#120)
Browse files Browse the repository at this point in the history
  • Loading branch information
msarahan authored Oct 30, 2024
1 parent 88a3c66 commit b2ab625
Show file tree
Hide file tree
Showing 8 changed files with 284 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .github/workflows/prs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,6 @@ jobs:
run: |
pip install pytest
pytest tests
- name: Run bash tests
run: |
bash tests/test_rapids-otel-wrap.sh
66 changes: 66 additions & 0 deletions tests/test_rapids-get-telemetry-trace-id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import sys
import os.path
import subprocess

# Path to the "tools" directory that sits next to this tests directory;
# the scripts under test are invoked from there.
TOOLS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "tools")

def test_rapids_compute_trace_id():
    """The trace id for a fixed repository/run/attempt is a stable 32-char hash."""
    github_env = {
        "GITHUB_REPOSITORY": "rapidsai/gha-tools",
        "GITHUB_RUN_ID": "1123123",
        "GITHUB_RUN_ATTEMPT": "1",
    }
    tool = os.path.join(TOOLS_DIR, "rapids-get-telemetry-trace-id")

    result = subprocess.run([tool], env=github_env, capture_output=True, text=True)

    assert result.stdout.strip() == "22ab4ec60f37f446b4a95917e86660df"
    assert result.stderr == ""
    assert result.returncode == 0

def test_rapids_get_traceparent():
    """Check the job-level traceparent, and that a missing job name is rejected."""
    tool = os.path.join(TOOLS_DIR, "rapids-get-telemetry-traceparent")
    github_env = {
        "GITHUB_REPOSITORY": "rapidsai/gha-tools",
        "GITHUB_RUN_ID": "1123123",
        "GITHUB_RUN_ATTEMPT": "1",
    }

    # Without a job name (first argument) the tool must fail.
    # subprocess.run only raises CalledProcessError when check=True is passed,
    # so the failure has to be asserted through the return code instead of a
    # try/except that can never trigger.
    failed = subprocess.run(
        [tool],
        env=github_env,
        text=True,
        capture_output=True,
    )
    assert failed.returncode != 0
    # Accept the diagnostic on either stream; only its presence matters here.
    assert "JOB_NAME" in failed.stdout + failed.stderr

    result = subprocess.run(
        [tool, "my_job"],
        env=github_env,
        text=True,
        capture_output=True,
    )
    expected_stderr = (
        'JOB_SPAN_ID pre-hash: "22ab4ec60f37f446b4a95917e86660df-my_job"\n'
        'STEP_SPAN_ID pre-hash: "22ab4ec60f37f446b4a95917e86660df-my_job-"\n'
    )
    assert result.stdout.strip() == "00-22ab4ec60f37f446b4a95917e86660df-5f57388b5b07a3e8-01"
    assert result.stderr == expected_stderr
    assert result.returncode == 0

def test_rapids_get_traceparent_with_step():
    """A step name (second argument) changes the span id of the traceparent."""
    command = [
        os.path.join(TOOLS_DIR, "rapids-get-telemetry-traceparent"),
        "my_job",
        "my step",
    ]
    github_env = {
        "GITHUB_REPOSITORY": "rapidsai/gha-tools",
        "GITHUB_RUN_ID": "1123123",
        "GITHUB_RUN_ATTEMPT": "1",
    }
    expected_stderr = (
        'JOB_SPAN_ID pre-hash: "22ab4ec60f37f446b4a95917e86660df-my_job"\n'
        'STEP_SPAN_ID pre-hash: "22ab4ec60f37f446b4a95917e86660df-my_job-my step"\n'
    )

    result = subprocess.run(command, env=github_env, capture_output=True, text=True)

    assert result.stdout.strip() == "00-22ab4ec60f37f446b4a95917e86660df-a6e5bc57fad91889-01"
    assert result.stderr == expected_stderr
    assert result.returncode == 0
54 changes: 54 additions & 0 deletions tests/test_rapids-otel-wrap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@

import sys
import os.path
import subprocess

# Path to the "tools" directory that sits next to this tests directory;
# rapids-otel-wrap is invoked from there.
TOOLS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "tools")

def test_wrap_otel():
    """A simple wrapped command passes its stdout and exit status through."""
    wrapper = os.path.join(TOOLS_DIR, "rapids-otel-wrap")

    result = subprocess.run([wrapper, "echo", "bob"], capture_output=True, text=True)

    assert result.stdout == "bob\n"
    assert result.returncode == 0

def test_wrap_otel_with_spaces():
    """An argument containing spaces reaches the wrapped command as one argument."""
    wrapper = os.path.join(TOOLS_DIR, "rapids-otel-wrap")
    command = [wrapper, "echo", "-n", "bob is here"]

    result = subprocess.run(command, capture_output=True, text=True)

    # Note: no newline here, because echo -n shouldn't end with a newline
    assert result.stdout == "bob is here"
    assert result.returncode == 0

def test_wrap_otel_with_spaces_and_parens():
    """An argument containing spaces, parentheses and semicolons survives wrapping."""
    result = subprocess.run(
        [
            os.path.join(TOOLS_DIR, "rapids-otel-wrap"),
            # Run the interpreter that is executing this test: the expected
            # output below is *this* interpreter's sys.version, and a bare
            # "python" on PATH may be a different version or missing entirely.
            sys.executable,
            "-c",
            "import sys; print(sys.version)",
        ],
        text=True,
        capture_output=True,
    )
    assert result.stdout == "{}\n".format(sys.version)
    assert result.returncode == 0

def test_wrap_otel_with_evil_comparison_operators():
    """A '<' inside an argument is passed through literally, not treated as a redirect."""
    wrapper = os.path.join(TOOLS_DIR, "rapids-otel-wrap")
    command = [wrapper, "python", "-c", "print(str(1<2))"]

    result = subprocess.run(command, capture_output=True, text=True)

    assert result.stdout == "True\n"
    assert result.returncode == 0

# Same check as the list-based comparison-operator test, except that the whole
# command line is a single string handed to a real shell (shell=True).
def test_wrap_otel_with_evil_comparison_operators_with_shell():
    """A quoted '<' survives both the outer shell and the wrapper."""
    wrapper = os.path.join(TOOLS_DIR, "rapids-otel-wrap")
    command_line = wrapper + ' python -c "print(str(1<2))"'

    result = subprocess.run(command_line, shell=True, capture_output=True, text=True)

    assert result.stdout == "True\n"
    assert result.returncode == 0
37 changes: 37 additions & 0 deletions tests/test_rapids-otel-wrap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
#
# Smoke tests for tools/rapids-otel-wrap.
# Each case runs a command through the wrapper and compares the captured
# stdout with the expected text. The script exits 1 on the first mismatch.

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Keep the wrapper path in one quoted variable so the tests still work when
# the checkout lives in a directory whose name contains whitespace.
WRAP="${SCRIPT_DIR}/../tools/rapids-otel-wrap"

out=$("${WRAP}" echo "abc")
if [ "$out" != "abc" ]; then
  echo "error on simple echo case"; exit 1;
fi

out=$("${WRAP}" echo "arg with a space")
if [ "$out" != "arg with a space" ]; then
  echo "error on space case"; exit 1;
fi

out=$("${WRAP}" echo "cmd" "arg with a space" --somearg '"blah blah"')
if [ "$out" != 'cmd arg with a space --somearg "blah blah"' ]; then
  echo "error on harder space case"; exit 1;
fi

# The wrapped command reads the heredoc on stdin; the text must come back unchanged.
out=$("${WRAP}" cat <<EOF
arg" with a space --somearg 'a<b'
EOF
);
if [ "$out" != "arg\" with a space --somearg 'a<b'" ]; then
  echo "error on inequality case as heredoc";
  echo "output was:"
  echo "$out"
  exit 1;
fi

out=$("${WRAP}" echo "a<b");
if [ "$out" != "a<b" ]; then
  echo "error on inequality case as arg";
  echo "output was:"
  echo "$out"
  exit 1;
fi
4 changes: 3 additions & 1 deletion tools/rapids-conda-retry
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@ condaCmd=${RAPIDS_CONDA_EXE:=conda}
# needToRetry: 1 if the command should be retried, 0 if it should not be
function runConda {
# shellcheck disable=SC2086
${condaCmd} ${args} 2>&1| tee "${outfile}"
# RAPIDS_OTEL_WRAPPER is optionally passed in as an env var. It is
# used to instrument conda-build or mambabuild for finer-grained spans.
${RAPIDS_OTEL_WRAPPER:-} ${condaCmd} ${args} 2>&1| tee "${outfile}"
exitcode=$?
needToRetry=0
needToClean=0
Expand Down
11 changes: 11 additions & 0 deletions tools/rapids-get-telemetry-trace-id
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# This is a global, per-run identifier. It is the same across all jobs and all steps within all jobs.
# It is constant from the source repo, to shared-workflows, to shared-actions.
#
# Inputs (environment variables, all required):
#   GITHUB_REPOSITORY, GITHUB_RUN_ID, GITHUB_RUN_ATTEMPT
# Output (stdout): the first 32 hex characters of the sha256 of
#   "<GITHUB_REPOSITORY>+<GITHUB_RUN_ID>+<GITHUB_RUN_ATTEMPT>"
# Exit status: 0 on success, 1 if any input is empty.

if [ "${GITHUB_REPOSITORY:-}" = "" ] || [ "${GITHUB_RUN_ID:-}" = "" ] || [ "${GITHUB_RUN_ATTEMPT:-}" = "" ]; then
  # Diagnostics go to stderr: callers capture stdout with $(...) and would
  # otherwise receive this text as if it were the trace id.
  echo "Error: one or more inputs to trace id is empty. The variables that must be set are:" 1>&2
  echo "  GITHUB_REPOSITORY, GITHUB_RUN_ID, and GITHUB_RUN_ATTEMPT" 1>&2
  exit 1
fi
# NOTE: echo appends a trailing newline that is part of the hashed input.
# Dropping it (e.g. echo -n) would change every trace id.
sha="$(echo "${GITHUB_REPOSITORY}+${GITHUB_RUN_ID}+${GITHUB_RUN_ATTEMPT}" | sha256sum | cut -f1 -d' ')"
echo "${sha:0:32}"
38 changes: 38 additions & 0 deletions tools/rapids-get-telemetry-traceparent
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
# This emits a TRACEPARENT, which follows the w3c trace context standard.
# https://www.w3.org/TR/trace-context/
#
# This script can operate for two purposes:
# 1. The top level of a job, whether it is the job at the source repo (e.g. rmm) level, or
#    the matrix job level
# 2. The steps level within a job, which uses both the job name and the step name
#
# The job name must always be provided as the first argument.
# A step name MAY be provided as the second argument. If it is specified, the output corresponds to
# the step within the context of its job.
#
# Output (stdout): "00-<trace id>-<first 16 hex chars of the span hash>-01"
# Diagnostics and the pre-hash debug lines are written to stderr so that
# callers capturing stdout with $(...) only ever receive the traceparent.
# Exit status: 0 on success, 1 if the job name is missing or the trace id
# cannot be computed.

JOB_NAME=${1:-}
STEP_NAME=${2:-}

if [ "$JOB_NAME" = "" ]; then
  echo "ERROR: JOB_NAME (first parameter) is empty. This means your trace doesn't identify anything." 1>&2
  exit 1
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Stop if the trace id helper fails; otherwise its error text (or an empty
# string) would be hashed and emitted as a valid-looking traceparent.
if ! TRACE_ID="$("${SCRIPT_DIR}"/rapids-get-telemetry-trace-id)"; then
  # Anything the helper printed on stdout in the failure case is a diagnostic.
  if [ "${TRACE_ID}" != "" ]; then
    echo "${TRACE_ID}" 1>&2
  fi
  echo "ERROR: unable to compute the trace id; cannot build a traceparent." 1>&2
  exit 1
fi

JOB_SPAN_ID="${TRACE_ID}-${JOB_NAME}"
STEP_SPAN_ID="${JOB_SPAN_ID}-${STEP_NAME}"

echo "JOB_SPAN_ID pre-hash: \"$JOB_SPAN_ID\"" 1>&2
echo "STEP_SPAN_ID pre-hash: \"$STEP_SPAN_ID\"" 1>&2

# Unlike the trace id, these hashes are taken without a trailing newline (echo -n).
JOB_TRACEPARENT=$(echo -n "${JOB_SPAN_ID}" | sha256sum | cut -f1 -d' ')
STEP_TRACEPARENT=$(echo -n "${STEP_SPAN_ID}" | sha256sum | cut -f1 -d' ')

if [ "${STEP_NAME}" != "" ]; then
  echo "00-${TRACE_ID}-${STEP_TRACEPARENT:0:16}-01"
else
  echo "00-${TRACE_ID}-${JOB_TRACEPARENT:0:16}-01"
fi
72 changes: 72 additions & 0 deletions tools/rapids-otel-wrap
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash
# Wraps arbitrary commands with arbitrary args. Emits an OpenTelemetry span for tracing the command
#
# Usage: rapids-otel-wrap <command> [args...]
#
# When otel-cli is found on PATH and TRACEPARENT is non-empty, the command is
# run through "otel-cli exec". Otherwise the command is run unmodified.
# The exit status of whichever was run is propagated.
set -x

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# NOTE(review): the six variables below are assigned but not exported here, so
# child processes only see these values if the variables were already exported
# by the caller. Confirm whether they are meant to be exported.
RAPIDS_OTEL_TRACES_EXPORTER="${RAPIDS_OTEL_TRACES_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}"
RAPIDS_OTEL_METRICS_EXPORTER="${RAPIDS_OTEL_METRICS_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}"
RAPIDS_OTEL_LOGS_EXPORTER="${RAPIDS_OTEL_LOGS_EXPORTER:-${RAPIDS_OTEL_EXPORTER:-"console"}}"
OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/traces}"
OTEL_EXPORTER_OTLP_METRICS_ENDPOINT="${OTEL_EXPORTER_OTLP_METRICS_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/metrics}"
OTEL_EXPORTER_OTLP_LOGS_ENDPOINT="${OTEL_EXPORTER_OTLP_LOGS_ENDPOINT:-${OTEL_EXPORTER_OTLP_ENDPOINT}/v1/logs}"
export TRACEPARENT
export OTEL_SERVICE_NAME

# Prints the traceparent for the current step, derived from OTEL_SERVICE_NAME
# (used as the job name) and STEP_NAME (may be empty).
get_step_traceparent() {
  "${SCRIPT_DIR}/rapids-get-telemetry-traceparent" "${OTEL_SERVICE_NAME}" "${STEP_NAME:-}"
}

# Test the exit status of "type" directly. The previous form,
# [[ $(type otel-cli >/dev/null 2>&1) -eq 0 ]], compared the substitution's
# (always empty) output with 0, which is always true, so a missing otel-cli
# was never detected.
if type otel-cli >/dev/null 2>&1 && [ "$TRACEPARENT" != "" ]; then
  if [ -n "${RAPIDS_TRACE_DEBUG}" ]; then
    rapids-echo-stderr "Running command with OpenTelemetry instrumentation";
  fi;

  if [ "$OTEL_SERVICE_NAME" = "" ]; then
    rapids-echo-stderr "WARNING: OTEL_SERVICE_NAME variable not provided. Traces from different steps may not be associated correctly."
  fi

  # Some commands have instrumentation. For example, conda-build has monkey-patched instrumentation
  # that can be activated with the opentelemetry-instrument command. For these commands,
  # we replace the command with the wrapped command, quoted as a whole for the purposes
  # of otel-cli exec, so that flags don't get confused.
  case "$1" in
    conda* )
      if [ -n "${RAPIDS_TRACE_DEBUG}" ]; then
        rapids-echo-stderr "using opentelemetry-instrument for command";
        rapids-echo-stderr "TRACEPARENT prior to otel-cli exec is: \"${TRACEPARENT}\"";
      fi;
      STEP_TRACEPARENT=$(get_step_traceparent)

      # otel-cli creates a span for us that bridges the traceparent from the parent process
      # shellcheck disable=SC2086,SC2048
      otel-cli exec \
        --name "Run instrumented \"$*\"" \
        --force-parent-span-id "$(cut -d'-' -f3 <<<"$STEP_TRACEPARENT")" \
        --verbose \
        -- \
        opentelemetry-instrument \
        "$@"
      ;;
    * )
      if [ -n "${RAPIDS_TRACE_DEBUG}" ]; then
        rapids-echo-stderr "No opentelemetry instrumentation known for command $*";
      fi;
      # This branch used STEP_TRACEPARENT without ever computing it, so the
      # parent span id was empty unless the caller exported the variable.
      # Keep a caller-provided value; otherwise compute it. If the helper
      # fails, fall back to the empty value this branch used before.
      if [ -z "${STEP_TRACEPARENT:-}" ]; then
        STEP_TRACEPARENT=$(get_step_traceparent) || STEP_TRACEPARENT=""
      fi
      # shellcheck disable=SC2086,SC2048
      otel-cli exec \
        --name "Run instrumented \"$*\"" \
        --force-parent-span-id "$(cut -d'-' -f3 <<<"$STEP_TRACEPARENT")" \
        --verbose \
        -- "$@"
      ;;
  esac
  RETURN_STATUS=$?
else
  if [ -n "${RAPIDS_TRACE_DEBUG}" ]; then
    rapids-echo-stderr "Telemetry disabled from lack of otel-cli on PATH or no TRACEPARENT set";
    rapids-echo-stderr "Running command unmodified";
  fi;
  "$@"
  RETURN_STATUS=$?
fi

exit "${RETURN_STATUS}"

0 comments on commit b2ab625

Please sign in to comment.