Skip to content

Commit 3024da9

Browse files
committed
feat: add metrics collection for slo tests
1 parent bb6b7cb commit 3024da9

File tree

13 files changed

+1366
-286
lines changed

13 files changed

+1366
-286
lines changed

.github/scripts/check-metrics.sh

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#!/bin/sh
# Validates SLO metrics stored in a JSON file.
# Usage: check-metrics.sh [metrics-file]   (defaults to ./metrics.json)
set -e

metrics_file="${1:-./metrics.json}"

# Bail out early when there is nothing to check.
[ -f "$metrics_file" ] || {
  echo "❌ Metrics file not found: $metrics_file"
  exit 1
}

echo "📊 Checking metrics in: $metrics_file"

# Overall verdict; check_metric sets this to 1 when a metric misses its threshold.
fail=0
14+
15+
#######################################
# Averages the samples recorded for one metric and compares the average
# against a lower-bound threshold.
#
# Globals:
#   metrics_file (read)  - path to the metrics JSON file
#   fail         (write) - set to 1 when the average is below the threshold
# Arguments:
#   $1 - metric key inside the JSON document
#   $2 - minimum acceptable average; an empty value skips the check
#   $3 - human-readable metric name used in messages
# Outputs:
#   progress and verdict messages on stdout
# Returns:
#   0 when the check was skipped, passed or failed the threshold.
#   Exits the script with status 1 when the threshold is not a number or
#   the metric has no usable samples.
#######################################
check_metric() {
  key="$1"
  threshold="$2"
  label="$3"
  # Plain decimal or scientific notation (90, 0.5, 1.5e+06). Written without
  # backslashes so it survives `awk -v` escape processing unchanged.
  numeric_re='^[-+]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][-+]?[0-9]+)?$'

  if [ -z "$threshold" ]; then
    echo "⚠️ No threshold for $label"
    return
  fi

  # A malformed threshold must fail loudly instead of silently passing.
  if ! printf '%s\n' "$threshold" | grep -Eq "$numeric_re"; then
    echo "❌ Threshold for $label is not a number: $threshold"
    exit 1
  fi

  echo "🔍 Checking $label (threshold = $threshold)..."

  values=$(jq -r --arg key "$key" '.[$key][0]?.values[]? | select(.[1] != "NaN") | .[1]' "$metrics_file")
  if [ -z "$values" ]; then
    echo "❌ Metric '$key' is missing or has no valid values. Stopping checks."
    exit 1
  fi

  # One awk pass computes the average and the verdict. Only finite numeric
  # samples are counted, so values such as "+Inf" are skipped and samples in
  # scientific notation are handled correctly.
  stats=$(printf '%s\n' "$values" | LC_ALL=C awk -v re="$numeric_re" -v threshold="$threshold" '
    $0 ~ re { sum += $0; count++ }
    END {
      if (count == 0) exit
      average = sum / count
      below = (average < threshold + 0) ? 1 : 0
      printf "%.6f %d\n", average, below
    }')

  if [ -z "$stats" ]; then
    echo "⚠️ No valid data points for $label."
    exit 1
  fi

  average=${stats% *}
  below=${stats#* }

  echo " ➤ Average $label = $average"

  if [ "$below" -eq 1 ]; then
    echo "❌ Average $label ($average) is below threshold $threshold"
    fail=1
    return
  fi

  echo "$label passed (average = $average)"
}
57+
58+
# Throughput and availability are enforced; the latency and attempt checks
# below are kept commented out (disabled).
#check_metric "read_latency_ms_p95" "$READ_LATENCY_MS_THRESHOLD" "Read Latency P95"
#check_metric "write_latency_ms_p95" "$WRITE_LATENCY_MS_THRESHOLD" "Write Latency P95"
#check_metric "read_attempts" "$READ_ATTEMPTS_THRESHOLD" "Read Attempts"
#check_metric "write_attempts" "$WRITE_ATTEMPTS_THRESHOLD" "Write Attempts"
check_metric "read_throughput" "$READ_THROUGHPUT_THRESHOLD" "Read Throughput"
check_metric "write_throughput" "$WRITE_THROUGHPUT_THRESHOLD" "Write Throughput"
check_metric "read_availability" "$READ_AVAILABILITY_THRESHOLD" "Read Availability"
check_metric "write_availability" "$WRITE_AVAILABILITY_THRESHOLD" "Write Availability"

# Final verdict: any metric below its threshold fails the whole run.
case "$fail" in
  1)
    echo "❗ Some metrics did not meet thresholds."
    exit 1
    ;;
  *)
    echo "🎉 All metrics validated successfully."
    ;;
esac

.github/workflows/slo-report.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Publishes the YDB SLO report once a run of the "SLO" workflow has completed.
name: slo-report

on:
  workflow_run:
    workflows:
      - SLO
    types: [completed]

jobs:
  test-ydb-slo-action:
    name: Publish YDB SLO Report
    # Only publish when the triggering run finished successfully.
    if: github.event.workflow_run.conclusion == 'success'
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
    steps:
      - name: Publish YDB SLO Report
        uses: ydb-platform/ydb-slo-action/report@main
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          # Run id of the workflow run that triggered this workflow.
          github_run_id: ${{ github.event.workflow_run.id }}

.github/workflows/slo.yml

Lines changed: 69 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: SLO tests
1+
name: SLO
22

33
on:
44
push:
@@ -31,30 +31,27 @@ env:
3131

3232
jobs:
3333
ydb-slo-action-init:
34-
# // https://github.com/ydb-platform/ydb-rs-sdk/issues/227
35-
if: ${{ false }}
3634
if: (!contains(github.event.pull_request.labels.*.name, 'no slo'))
3735

3836
name: Run YDB SLO Tests
3937
runs-on: ubuntu-latest
4038

4139
strategy:
4240
matrix:
43-
example:
44-
- native
41+
sdk:
42+
- name: native
43+
label: native
4544
rust_version:
4645
- "RUST_VERSION_OLD"
4746
- "RUST_VERSION_NEW"
4847

4948
concurrency:
50-
group: slo-${{ github.ref }}-${{ matrix.example }}-${{ matrix.rust_version }}
49+
group: slo-${{ github.ref }}-${{ matrix.sdk.name }}-${{ matrix.rust_version }}
5150
cancel-in-progress: true
5251

5352
steps:
54-
- name: Checkout
53+
- name: Checkout repository
5554
uses: actions/checkout@v4
56-
with:
57-
submodules: true
5855

5956
- name: Install rust
6057
uses: dtolnay/rust-toolchain@v1
@@ -73,33 +70,36 @@ jobs:
7370
- name: Rust cache
7471
uses: Swatinem/rust-cache@v2
7572

73+
- name: Prepare envs
74+
run: |
75+
REF=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
76+
REF_SAFE=${REF//\//__}
77+
echo "METRICS_REF=$REF_SAFE" >> $GITHUB_ENV
78+
echo "METRICS_LABEL=${{ matrix.sdk.label }}" >> $GITHUB_ENV
79+
echo "METRICS_JOB_NAME=${{ matrix.sdk.name }}" >> $GITHUB_ENV
80+
7681
- name: Initialize YDB SLO
7782
uses: ydb-platform/ydb-slo-action/init@main
7883
with:
7984
github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }}
8085
github_token: ${{ secrets.GITHUB_TOKEN }}
81-
workload_name: ${{ matrix.example }}-${{ matrix.rust_version }}
86+
workload_name: ${{ matrix.sdk.name }}-${{ matrix.rust_version }}
8287
ydb_database_node_count: 5
8388

8489
- name: Prepare SLO Database
8590
run: |
86-
cargo run --example ${{ matrix.example }} grpc://localhost:2135 /Root/testdb tableName create
91+
cargo run --example ${{ matrix.sdk.name }} grpc://localhost:2135 /Root/testdb tableName create
8792
8893
- name: Run SLO Tests
89-
env:
90-
REF: '${{ github.head_ref || github.ref }}'
9194
run: |
92-
cargo run --example ${{ matrix.example }} grpc://localhost:2135 /Root/testdb tableName run \
95+
cargo run --example ${{ matrix.sdk.name }} grpc://localhost:2135 /Root/testdb tableName run \
96+
--prom-pgw localhost:9091 \
97+
--report-period 250 \
9398
--time ${{ inputs.slo_workload_duration_seconds || 600}} \
9499
--read-rps ${{ inputs.slo_workload_read_max_rps || 1000}} \
95100
--write-rps ${{ inputs.slo_workload_write_max_rps || 100}} \
96-
--read-timeout 10000 \
97-
--write-timeout 10000 || true
98-
99-
- if: always()
100-
name: Cleanup SLO Database
101-
run: |
102-
cargo run --example ${{ matrix.example }} grpc://localhost:2135 /Root/testdb tableName cleanup
101+
--read-timeout 1000 \
102+
--write-timeout 1000 || true
103103
104104
- if: always()
105105
name: Store ydb chaos testing logs
@@ -109,6 +109,52 @@ jobs:
109109
- if: always()
110110
uses: actions/upload-artifact@v4
111111
with:
112-
name: ${{ matrix.example}}-${{ matrix.rust_version }}-chaos-ydb.log
112+
name: ${{ matrix.sdk.name}}-${{ matrix.rust_version }}-chaos-ydb.log
113113
path: ./chaos-ydb.log
114-
retention-days: 1
114+
retention-days: 1
115+
116+
- if: always()
117+
name: Cleanup SLO Database
118+
run: |
119+
cargo run --example ${{ matrix.sdk.name }} grpc://localhost:2135 /Root/testdb tableName cleanup || true
120+
validate-slo-metrics:
121+
name: Validate SLO metrics
122+
needs: ydb-slo-action-init
123+
runs-on: ubuntu-latest
124+
125+
env:
126+
# READ_LATENCY_MS_THRESHOLD: "1" # 95th percentile read operations latency in milliseconds
127+
# WRITE_LATENCY_MS_THRESHOLD: "1" # 95th percentile write operations latency in milliseconds
128+
READ_THROUGHPUT_THRESHOLD: "150" # Read operations throughput
129+
WRITE_THROUGHPUT_THRESHOLD: "3" # Write operations throughput
130+
# READ_ATTEMPTS_THRESHOLD: "1" # Read attempts throughput
131+
# WRITE_ATTEMPTS_THRESHOLD: "1" # Write attempts throughput
132+
READ_AVAILABILITY_THRESHOLD: "90" # Read operations availability
133+
WRITE_AVAILABILITY_THRESHOLD: "90" # Write operations availability
134+
135+
strategy:
136+
matrix:
137+
sdk:
138+
- name: native
139+
label: native
140+
rust_version:
141+
- "RUST_VERSION_OLD"
142+
- "RUST_VERSION_NEW"
143+
144+
steps:
145+
- name: Checkout repo
146+
uses: actions/checkout@v4
147+
148+
- name: Download metrics artifact
149+
uses: actions/download-artifact@v4
150+
with:
151+
name: ${{ matrix.sdk.name }}-${{ matrix.rust_version }}-metrics.json
152+
path: ./artifacts
153+
154+
- name: Make script executable
155+
run: chmod +x .github/scripts/check-metrics.sh
156+
157+
- name: Validate SLO thresholds
158+
run: |
159+
.github/scripts/check-metrics.sh \
160+
./artifacts/${{ matrix.sdk.name }}-${{ matrix.rust_version }}-metrics.json

0 commit comments

Comments
 (0)