Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions .github/scripts/check-metrics.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/bin/sh
# Validates the metrics stored in a JSON file against thresholds that are
# supplied through environment variables.
#
# Usage: check-metrics.sh [metrics_file]
#   metrics_file  path to the metrics JSON (default: ./metrics.json)
set -e

# The first positional argument overrides the default location.
metrics_file="${1:-./metrics.json}"

# Without the input file there is nothing to validate, so stop right away.
[ -f "$metrics_file" ] || {
  echo "❌ Metrics file not found: $metrics_file"
  exit 1
}

echo "📊 Checking metrics in: $metrics_file"

# Flipped to 1 by check_metric when a metric breaches its threshold.
fail=0

# Validate the average of one metric against a threshold.
#
# Arguments:
#   $1 - key of the metric inside the JSON document
#   $2 - threshold; when empty the check is skipped with a warning
#   $3 - human-readable label used in the messages
#   $4 - fail_if_below_threshold (default) or fail_if_above_threshold
# Globals:
#   metrics_file (read) - path of the JSON file queried with jq
#   fail (written)      - set to 1 when the threshold is breached
# Returns:
#   0 when the check was skipped, passed, or breached the threshold.
#   Exits the whole script with status 1 when the metric has no usable
#   samples, the threshold is not a number, or the mode is unknown.
#
# The script targets /bin/sh, which has no "local": every working variable
# below is a global.
check_metric() {
  key="$1"
  threshold="$2"
  label="$3"
  mode="${4:-fail_if_below_threshold}"

  if [ -z "$threshold" ]; then
    echo "⚠️ No threshold for $label"
    return 0
  fi

  echo "🔍 Checking $label (mode = $mode, threshold = $threshold)..."

  # Second element of every sample of the first series, with "NaN" dropped.
  values=$(jq -r --arg key "$key" '.[$key][0]?.values[]? | select(.[1] != "NaN") | .[1]' "$metrics_file")
  if [ -z "$values" ]; then
    echo "❌ Metric '$key' is missing or has no valid values. Stopping checks."
    exit 1
  fi

  case "$mode" in
    fail_if_below_threshold) direction="below" ;;
    fail_if_above_threshold) direction="above" ;;
    *)
      echo "❌ Unknown mode: $mode"
      exit 1
      ;;
  esac

  # One awk pass averages the samples and compares the full-precision
  # average with the threshold. Unlike bc, awk parses exponent notation
  # (e.g. 1.5e3). Tokens that are not numeric literals (e.g. +Inf) are
  # ignored, just like NaN above.
  # Exit codes: 3 = no numeric sample, 4 = threshold is not a number.
  status=0
  result=$(printf '%s\n' "$values" | awk -v threshold="$threshold" -v direction="$direction" '
    BEGIN { number = "^[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][-+]?[0-9]+)?$" }
    {
      gsub(/"/, "")
      for (i = 1; i <= NF; i++) {
        if ($i ~ number) {
          sum += $i
          count++
        }
      }
    }
    END {
      if (count == 0) exit 3
      if (threshold !~ number) exit 4
      average = sum / count
      limit = threshold + 0
      breached = (direction == "below") ? (average < limit) : (average > limit)
      printf "%.6f %d\n", average, breached
    }
  ') || status=$?

  case "$status" in
    0) ;;
    3)
      echo "⚠️ No valid data points for $label."
      exit 1
      ;;
    4)
      echo "❌ Threshold for $label is not a number: $threshold"
      exit 1
      ;;
    *)
      echo "❌ Could not evaluate $label (awk exited with status $status)."
      exit 1
      ;;
  esac

  # awk printed "<average> <0|1>"; split the two fields.
  average=${result% *}
  breached=${result#* }

  if [ "$breached" -eq 1 ]; then
    echo "❌ $label ($average) is $direction threshold $threshold"
    fail=1
    return 0
  fi

  echo "✅ $label passed (average = $average)"
}

# Latency: the averaged value must not rise above the threshold.
check_metric "read_latency_ms" "$READ_LATENCY_MS_THRESHOLD" \
  "Read Latency P95" fail_if_above_threshold
check_metric "write_latency_ms" "$WRITE_LATENCY_MS_THRESHOLD" \
  "Write Latency P95" fail_if_above_threshold

# Throughput: the averaged value must not drop below the threshold.
check_metric "read_throughput" "$READ_THROUGHPUT_THRESHOLD" \
  "Read Throughput" fail_if_below_threshold
check_metric "write_throughput" "$WRITE_THROUGHPUT_THRESHOLD" \
  "Write Throughput" fail_if_below_threshold

# Attempts: the averaged value must not rise above the threshold.
check_metric "read_attempts" "$READ_ATTEMPTS_THRESHOLD" \
  "Read Attempts" fail_if_above_threshold
check_metric "write_attempts" "$WRITE_ATTEMPTS_THRESHOLD" \
  "Write Attempts" fail_if_above_threshold

# Availability: the averaged value must not drop below the threshold.
check_metric "read_availability" "$READ_AVAILABILITY_THRESHOLD" \
  "Read Availability" fail_if_below_threshold
check_metric "write_availability" "$WRITE_AVAILABILITY_THRESHOLD" \
  "Write Availability" fail_if_below_threshold

# Any recorded breach turns the whole run into a failure.
if [ "$fail" -eq 1 ]; then
  echo "❗ Some metrics did not meet thresholds."
  exit 1
fi

echo "🎉 All metrics validated successfully."
22 changes: 22 additions & 0 deletions .github/workflows/slo-report.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Workflow that runs the ydb-slo-action "report" step after the 'SLO' workflow.
name: slo-report

# Triggered when a run of the workflow named 'SLO' completes.
on:
workflow_run:
workflows: [ 'SLO' ]
types:
- completed

jobs:
test-ydb-slo-action:
runs-on: ubuntu-latest
name: Publish YDB SLO Report
# NOTE(review): pull-requests write access is presumably what lets the report
# action post to the pull request; confirm against the action's documentation.
permissions:
contents: read
pull-requests: write
# The job is skipped unless the triggering run concluded with 'success'.
if: github.event.workflow_run.conclusion == 'success'
steps:
- name: Publish YDB SLO Report
uses: ydb-platform/ydb-slo-action/report@main
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
# Id of the workflow run that triggered this workflow.
github_run_id: ${{ github.event.workflow_run.id }}
85 changes: 62 additions & 23 deletions .github/workflows/slo.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: SLO tests
name: SLO

on:
push:
Expand Down Expand Up @@ -31,30 +31,24 @@ env:

jobs:
ydb-slo-action-init:
# // https://github.com/ydb-platform/ydb-rs-sdk/issues/227
if: ${{ false }}
if: (!contains(github.event.pull_request.labels.*.name, 'no slo'))

name: Run YDB SLO Tests
runs-on: ubuntu-latest

strategy:
matrix:
example:
- native
rust_version:
- "RUST_VERSION_OLD"
- "RUST_VERSION_NEW"

concurrency:
group: slo-${{ github.ref }}-${{ matrix.example }}-${{ matrix.rust_version }}
group: slo-${{ github.ref }}-native-${{ matrix.rust_version }}
cancel-in-progress: true

steps:
- name: Checkout
- name: Checkout repository
uses: actions/checkout@v4
with:
submodules: true

- name: Install rust
uses: dtolnay/rust-toolchain@v1
Expand All @@ -73,33 +67,35 @@ jobs:
- name: Rust cache
uses: Swatinem/rust-cache@v2

- name: Prepare envs
run: |
REF=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
REF_SAFE=${REF//\//__}
echo "METRICS_REF=$REF_SAFE" >> $GITHUB_ENV
echo "METRICS_LABEL=native" >> $GITHUB_ENV
echo "METRICS_JOB_NAME=native" >> $GITHUB_ENV

- name: Initialize YDB SLO
uses: ydb-platform/ydb-slo-action/init@main
with:
github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }}
github_token: ${{ secrets.GITHUB_TOKEN }}
workload_name: ${{ matrix.example }}-${{ matrix.rust_version }}
workload_name: native-${{ matrix.rust_version }}
ydb_database_node_count: 5

- name: Prepare SLO Database
run: |
cargo run --example ${{ matrix.example }} grpc://localhost:2135 /Root/testdb tableName create
cargo run --bin ydb-slo-tests-native -- -t tableName grpc://localhost:2135 /Root/testdb create

- name: Run SLO Tests
env:
REF: '${{ github.head_ref || github.ref }}'
run: |
cargo run --example ${{ matrix.example }} grpc://localhost:2135 /Root/testdb tableName run \
cargo run --bin ydb-slo-tests-native -- -t tableName --write-timeout 1 grpc://localhost:2135 /Root/testdb run \
--prom-pgw localhost:9091 \
--report-period 1 \
--time ${{ inputs.slo_workload_duration_seconds || 600}} \
--read-rps ${{ inputs.slo_workload_read_max_rps || 1000}} \
--write-rps ${{ inputs.slo_workload_write_max_rps || 100}} \
--read-timeout 10000 \
--write-timeout 10000 || true

- if: always()
name: Cleanup SLO Database
run: |
cargo run --example ${{ matrix.example }} grpc://localhost:2135 /Root/testdb tableName cleanup
--read-timeout 1 || true

- if: always()
name: Store ydb chaos testing logs
Expand All @@ -109,6 +105,49 @@ jobs:
- if: always()
uses: actions/upload-artifact@v4
with:
name: ${{ matrix.example}}-${{ matrix.rust_version }}-chaos-ydb.log
name: native-${{ matrix.rust_version }}-chaos-ydb.log
path: ./chaos-ydb.log
retention-days: 1
retention-days: 1

- if: always()
name: Cleanup SLO Database
run: |
cargo run --bin ydb-slo-tests-native -- -t tableName grpc://localhost:2135 /Root/testdb cleanup || true
# Job that checks the collected SLO metrics against fixed thresholds.
# It starts only after the ydb-slo-action-init job (see "needs").
validate-slo-metrics:
name: Validate SLO metrics
needs: ydb-slo-action-init
runs-on: ubuntu-latest

# Thresholds read from the environment by .github/scripts/check-metrics.sh.
# That script fails the latency and attempts metrics when their average is
# above the threshold, and the throughput and availability metrics when their
# average is below it.
env:
READ_LATENCY_MS_THRESHOLD: "600" # 95th percentile read operations latency in milliseconds
WRITE_LATENCY_MS_THRESHOLD: "800" # 95th percentile write operations latency in milliseconds
READ_THROUGHPUT_THRESHOLD: "150" # Read operations throughput
WRITE_THROUGHPUT_THRESHOLD: "3" # Write operations throughput
READ_ATTEMPTS_THRESHOLD: "100" # Read attempts throughput
WRITE_ATTEMPTS_THRESHOLD: "100" # Write attempts throughput
READ_AVAILABILITY_THRESHOLD: "90" # Read operations availability
WRITE_AVAILABILITY_THRESHOLD: "80" # Write operations availability

# Same rust_version values as the matrix of ydb-slo-action-init; they are
# part of the artifact name downloaded below.
strategy:
matrix:
rust_version:
- "RUST_VERSION_OLD"
- "RUST_VERSION_NEW"

steps:
- name: Checkout repo
uses: actions/checkout@v4

# NOTE(review): no step uploading an artifact with this name is visible in
# this view; presumably it is produced by ydb-slo-action. Confirm, and confirm
# that the file inside the artifact carries the same name as the artifact.
- name: Download metrics artifact
uses: actions/download-artifact@v4
with:
name: native-${{ matrix.rust_version }}-metrics.json
path: ./artifacts

- name: Make script executable
run: chmod +x .github/scripts/check-metrics.sh

# The metrics file path is passed to the script as its first argument.
- name: Validate SLO thresholds
run: |
.github/scripts/check-metrics.sh \
./artifacts/native-${{ matrix.rust_version }}-metrics.json
Loading
Loading