From 593ef6042d5d968091a8fb90e052e2267061f574 Mon Sep 17 00:00:00 2001 From: Vladislav Polyakov Date: Mon, 28 Oct 2024 13:03:30 +0300 Subject: [PATCH] ci: add chaos testing --- .github/scripts/chaos-ydb.sh | 29 +++++++++++++++++++ .github/workflows/slo.yml | 40 +++++++++++++++++++++++---- tests/slo/internal/metrics/metrics.go | 16 +++++++++-- 3 files changed, 77 insertions(+), 8 deletions(-) create mode 100755 .github/scripts/chaos-ydb.sh diff --git a/.github/scripts/chaos-ydb.sh b/.github/scripts/chaos-ydb.sh new file mode 100755 index 000000000..0fb676938 --- /dev/null +++ b/.github/scripts/chaos-ydb.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +sleep 30 + +chaos_ydb_dynamic_containers() { + # Set the end time to 3 minutes (180 seconds) from the start + end_time=$((SECONDS + 180)) + pattern="ydb-dynamic" + + while [ $SECONDS -lt $end_time ]; do + signal=${1:-"SIGTERM"} + time=${2:-"5"} + + containers=$(docker ps --filter "name=$pattern" -q) + + for container_id in $containers; do + echo "Restarting container with signal $signal: $container_id" + docker restart --signal $signal --time $time "$container_id" + + sleep 30 + done + done +} + +chaos_ydb_dynamic_containers "SIGTERM"; + +chaos_ydb_dynamic_containers "SIGINT"; + +chaos_ydb_dynamic_containers "SIGKILL" 0; diff --git a/.github/workflows/slo.yml b/.github/workflows/slo.yml index 483985d92..4e9296839 100644 --- a/.github/workflows/slo.yml +++ b/.github/workflows/slo.yml @@ -54,7 +54,8 @@ jobs: label: xorm concurrency: - group: slo-${{ github.ref }}-${{matrix.sdk.name}} + group: slo-${{ github.ref }}-${{ matrix.sdk.name }} + cancel-in-progress: true steps: - name: Checkout repository @@ -78,17 +79,44 @@ jobs: github_pull_request_number: ${{ github.event.inputs.github_pull_request_number }} github_token: ${{ secrets.GITHUB_TOKEN }} sdk_name: ${{ matrix.sdk.name }} + ydb_database_node_count: 5 - - name: Run SLO Tests + - name: Prepare SLO Database run: | ./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 create grpc://localhost:2135 /Root/testdb + + - name: Chaos YDB + run: | + chmod +x ./.github/scripts/chaos-ydb.sh + nohup ./.github/scripts/chaos-ydb.sh > chaos-ydb.log 2>&1 & + + # - name: Chaos Network + # run: | + # sudo tc qdisc add dev lo root netem delay 100ms 50ms loss 5% corrupt 1% + + - name: Run SLO Tests + run: | ./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 run grpc://localhost:2135 /Root/testdb \ -prom-pgw localhost:9091 \ -report-period 250 \ -time ${{inputs.slo_workload_duration_seconds || 600}} \ -read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \ -write-rps ${{inputs.slo_workload_write_max_rps || 100}} \ - -read-timeout 10000 \ - -write-timeout 10000 \ - -shutdown-time 30 - ./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 cleanup grpc://localhost:2135 /Root/testdb + -read-timeout 1000 \ + -write-timeout 1000 || true + + # - if: always() + # run: | + # sudo tc qdisc del dev lo root + + - if: always() + uses: actions/upload-artifact@v4 + with: + name: ${{matrix.sdk.name}}-chaos-ydb.log + path: ./chaos-ydb.log + retention-days: 1 + + - if: always() + name: Cleanup SLO Database + run: | + ./tests/slo/.bin/${{matrix.sdk.id}}_linux_amd64 cleanup grpc://localhost:2135 /Root/testdb || true diff --git a/tests/slo/internal/metrics/metrics.go b/tests/slo/internal/metrics/metrics.go index 4e79923ef..30c5e4bba 100644 --- a/tests/slo/internal/metrics/metrics.go +++ b/tests/slo/internal/metrics/metrics.go @@ -27,6 +27,7 @@ type ( operationsFailureTotal *prometheus.CounterVec operationLatencySeconds *prometheus.HistogramVec + retryAttempts *prometheus.GaugeVec retryAttemptsTotal *prometheus.CounterVec retriesSuccessTotal *prometheus.CounterVec retriesFailureTotal *prometheus.CounterVec @@ -107,6 +108,14 @@ func New(url, ref, label, jobName string) (*Metrics, error) { []string{"operation_type", "operation_status"}, ) + m.retryAttempts = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "sdk_retry_attempts", + Help: "Current retry attempts, categorized by operation type.", + }, + []string{"operation_type"}, + ) + m.retryAttemptsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "sdk_retry_attempts_total", @@ -147,6 +156,7 @@ func New(url, ref, label, jobName string) (*Metrics, error) { Collector(m.operationsSuccessTotal). Collector(m.operationsFailureTotal). Collector(m.operationLatencySeconds). + Collector(m.retryAttempts). Collector(m.retryAttemptsTotal). Collector(m.retriesSuccessTotal). Collector(m.retriesFailureTotal). @@ -167,6 +177,7 @@ func (m *Metrics) Reset() error { m.operationsFailureTotal.Reset() m.operationLatencySeconds.Reset() + m.retryAttempts.Reset() m.retryAttemptsTotal.Reset() m.retriesSuccessTotal.Reset() m.retriesFailureTotal.Reset() @@ -192,17 +203,18 @@ func (j Span) Finish(err error, attempts int) { latency := time.Since(j.start) j.m.pendingOperations.WithLabelValues(j.name).Sub(1) + j.m.retryAttempts.WithLabelValues(j.name).Set(float64(attempts)) j.m.operationsTotal.WithLabelValues(j.name).Add(1) j.m.retryAttemptsTotal.WithLabelValues(j.name).Add(float64(attempts)) if err != nil { j.m.errorsTotal.WithLabelValues(err.Error()).Add(1) - // j.m.retriesFailureTotal.WithLabelValues(j.name).Add(1) + j.m.retriesFailureTotal.WithLabelValues(j.name).Add(float64(attempts)) j.m.operationsFailureTotal.WithLabelValues(j.name).Add(1) j.m.operationLatencySeconds.WithLabelValues(j.name, OperationStatusFailue).Observe(latency.Seconds()) } else { + j.m.retriesSuccessTotal.WithLabelValues(j.name).Add(float64(attempts)) j.m.operationsSuccessTotal.WithLabelValues(j.name).Add(1) - // j.m.retriesSuccessTotal.WithLabelValues(j.name).Add(1) j.m.operationLatencySeconds.WithLabelValues(j.name, OperationStatusSuccess).Observe(latency.Seconds()) } }