Skip to content

Commit

Permalink
ci: add chaos testing
Browse files Browse the repository at this point in the history
  • Loading branch information
polRk committed Oct 29, 2024
1 parent 17a901b commit 593ef60
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 8 deletions.
29 changes: 29 additions & 0 deletions .github/scripts/chaos-ydb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash

# Wait 30 seconds before injecting any faults.
# NOTE(review): presumably this gives the SLO workload time to start before
# containers begin restarting — confirm against the workflow that launches it.
sleep 30

# chaos_ydb_dynamic_containers [SIGNAL] [TIMEOUT]
#
# For 3 minutes (180 seconds), repeatedly restarts every running container
# whose name matches "ydb-dynamic", one at a time, pausing 30 seconds after
# each restart.
#
# Arguments:
#   $1 - signal passed to `docker restart --signal` (default: SIGTERM)
#   $2 - value passed to `docker restart --time`    (default: 5)
chaos_ydb_dynamic_containers() {
  # Arguments never change between iterations, so resolve them once.
  local signal=${1:-"SIGTERM"}
  local timeout=${2:-"5"}
  local pattern="ydb-dynamic"
  # SECONDS is bash's built-in count of seconds since the shell started.
  local end_time=$((SECONDS + 180))
  local containers container_id

  while (( SECONDS < end_time )); do
    # One container id per array element; empty output yields an empty array.
    mapfile -t containers < <(docker ps --filter "name=$pattern" -q)

    if (( ${#containers[@]} == 0 )); then
      # Without this pause the loop would spin and hammer the Docker daemon
      # until the deadline whenever no matching container is running.
      echo "No running containers match '$pattern'; retrying in 5 seconds"
      sleep 5
      continue
    fi

    for container_id in "${containers[@]}"; do
      # Check the deadline before every restart, not just between full
      # passes, so a phase does not overrun its 180-second budget.
      (( SECONDS < end_time )) || break

      echo "Restarting container with signal $signal: $container_id"
      docker restart --signal "$signal" --time "$timeout" "$container_id"

      sleep 30
    done
  done
}

# Phase 1: restart the matching containers with SIGTERM; --time falls back to
# the function's default of 5.
chaos_ydb_dynamic_containers "SIGTERM";

# Phase 2: same as above, but with SIGINT.
chaos_ydb_dynamic_containers "SIGINT";

# Phase 3: restart with SIGKILL and --time 0.
chaos_ydb_dynamic_containers "SIGKILL" 0;
40 changes: 34 additions & 6 deletions .github/workflows/slo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ jobs:
label: xorm

concurrency:
group: slo-${{ github.ref }}-${{matrix.sdk.name}}
group: slo-${{ github.ref }}-${{ matrix.sdk.name }}
cancel-in-progress: true

steps:
- name: Checkout repository
Expand All @@ -78,17 +79,44 @@ jobs:
github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }}
github_token: ${{ secrets.GITHUB_TOKEN }}
sdk_name: ${{ matrix.sdk.name }}
ydb_database_node_count: 5

- name: Run SLO Tests
- name: Prepare SLO Database
run: |
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 create grpc://localhost:2135 /Root/testdb
- name: Chaos YDB
run: |
chmod +x ./.github/scripts/chaos-ydb.sh
nohup ./.github/scripts/chaos-ydb.sh > chaos-ydb.log 2>&1 &
# - name: Chaos Network
# run: |
# sudo tc qdisc add dev lo root netem delay 100ms 50ms loss 5% corrupt 1%

- name: Run SLO Tests
run: |
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 run grpc://localhost:2135 /Root/testdb \
-prom-pgw localhost:9091 \
-report-period 250 \
-time ${{inputs.slo_workload_duration_seconds || 600}} \
-read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
-write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
-read-timeout 10000 \
-write-timeout 10000 \
-shutdown-time 30
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 cleanup grpc://localhost:2135 /Root/testdb
-read-timeout 1000 \
-write-timeout 1000 || true
# - if: always()
# run: |
# sudo tc qdisc del dev lo root

- if: always()
uses: actions/upload-artifact@v4
with:
name: ${{matrix.sdk.name}}-chaos-ydb.log
path: ./chaos-ydb.log
retention-days: 1

- if: always()
name: Cleanup SLO Database
run: |
./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 cleanup grpc://localhost:2135 /Root/testdb || true
16 changes: 14 additions & 2 deletions tests/slo/internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ type (
operationsFailureTotal *prometheus.CounterVec
operationLatencySeconds *prometheus.HistogramVec

retryAttempts *prometheus.GaugeVec
retryAttemptsTotal *prometheus.CounterVec
retriesSuccessTotal *prometheus.CounterVec
retriesFailureTotal *prometheus.CounterVec
Expand Down Expand Up @@ -107,6 +108,14 @@ func New(url, ref, label, jobName string) (*Metrics, error) {
[]string{"operation_type", "operation_status"},
)

m.retryAttempts = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "sdk_retry_attempts",
Help: "Current retry attempts, categorized by operation type.",
},
[]string{"operation_type"},
)

m.retryAttemptsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "sdk_retry_attempts_total",
Expand Down Expand Up @@ -147,6 +156,7 @@ func New(url, ref, label, jobName string) (*Metrics, error) {
Collector(m.operationsSuccessTotal).
Collector(m.operationsFailureTotal).
Collector(m.operationLatencySeconds).
Collector(m.retryAttempts).
Collector(m.retryAttemptsTotal).
Collector(m.retriesSuccessTotal).
Collector(m.retriesFailureTotal).
Expand All @@ -167,6 +177,7 @@ func (m *Metrics) Reset() error {
m.operationsFailureTotal.Reset()
m.operationLatencySeconds.Reset()

m.retryAttempts.Reset()
m.retryAttemptsTotal.Reset()
m.retriesSuccessTotal.Reset()
m.retriesFailureTotal.Reset()
Expand All @@ -192,17 +203,18 @@ func (j Span) Finish(err error, attempts int) {
latency := time.Since(j.start)
j.m.pendingOperations.WithLabelValues(j.name).Sub(1)

j.m.retryAttempts.WithLabelValues(j.name).Set(float64(attempts))
j.m.operationsTotal.WithLabelValues(j.name).Add(1)
j.m.retryAttemptsTotal.WithLabelValues(j.name).Add(float64(attempts))

if err != nil {
j.m.errorsTotal.WithLabelValues(err.Error()).Add(1)
// j.m.retriesFailureTotal.WithLabelValues(j.name).Add(1)
j.m.retriesFailureTotal.WithLabelValues(j.name).Add(float64(attempts))
j.m.operationsFailureTotal.WithLabelValues(j.name).Add(1)
j.m.operationLatencySeconds.WithLabelValues(j.name, OperationStatusFailue).Observe(latency.Seconds())
} else {
j.m.retriesSuccessTotal.WithLabelValues(j.name).Add(float64(attempts))
j.m.operationsSuccessTotal.WithLabelValues(j.name).Add(1)
// j.m.retriesSuccessTotal.WithLabelValues(j.name).Add(1)
j.m.operationLatencySeconds.WithLabelValues(j.name, OperationStatusSuccess).Observe(latency.Seconds())
}
}

0 comments on commit 593ef60

Please sign in to comment.