# .github/workflows/dm_chaos.yaml

name: DM Chaos

# Triggers: an hourly schedule inside the window below, plus a manual dispatch
# that targets a specific pull request.
on:
  schedule:
    # GitHub evaluates cron expressions in UTC.
    - cron: '0 17-23 * * *' # run at minute 0 every hour from 01:00 ~ 07:00 UTC+8
  workflow_dispatch:
    inputs:
      # PR number; the dispatch-only checkout step uses it to build refs/pull/<pr>/head.
      pr:
        description: 'Which PR do you want to trigger'
        required: true
        default: ''

# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
# Runs that share the same ref and workflow name form one group; a newly
# queued run cancels the run of that group that is still in progress.
concurrency:
  group: ${{ github.ref }}-${{ github.workflow }}
  cancel-in-progress: true

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "base"; it is instantiated once
  # per `chaos-obj` matrix entry.
  base:
    # The type of runner that the job will run on.
    # NOTE: the GitHub-hosted ubuntu-20.04 image has been retired, so jobs
    # requesting it are never picked up; use the next LTS image instead.
    runs-on: ubuntu-22.04
    timeout-minutes: 50
    strategy:
      # Let the remaining chaos scenarios finish even when one of them fails.
      fail-fast: false
      matrix:
        # Each entry names a manifest, dm/chaos/manifests/<chaos-obj>.yaml,
        # which is encoded and handed to the chaos-mesh action later in the job.
        chaos-obj:
          - "pod-failure-dm"
          - "pod-kill-dm"
          - "network-partition-dm"
          - "network-emulation-dm"
          - "io-chaos-dm"

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Set up Go for building DM
      - name: Set up Go env
        uses: actions/setup-go@v3
        with:
          # Kept quoted so YAML treats the version as a string, not a float.
          go-version: '1.23'
      # Record the Go version in the job log.
      - name: Print Go version
        run: go version

      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - name: Check out code
        uses: actions/checkout@v4

      # Manual runs only: when a `pr` input is supplied, check out the head of
      # that pull request. The input is empty on scheduled runs, so the step
      # is skipped there.
      - name: Check out code by workflow dispatch
        if: ${{ github.event.inputs.pr != '' }}
        uses: actions/checkout@v4
        with:
          ref: refs/pull/${{ github.event.inputs.pr }}/head

      # Cache the Go module download directory; the key changes whenever
      # go.sum changes.
      # NOTE: actions/cache v1/v2 have been shut down by GitHub and fail the
      # job, hence v4. Inputs and cache keys are unchanged.
      - name: Cache go modules
        uses: actions/cache@v4
        with:
          path: ~/go/pkg/mod
          key: ${{ runner.os }}-ticdc-${{ hashFiles('go.sum') }}

      # Cache the contents of tools/bin, keyed on tools/check/go.sum.
      - name: Cache Tools
        id: cache-tools
        uses: actions/cache@v4
        with:
          path: tools/bin
          key: ${{ runner.os }}-ticdc-tools-${{ hashFiles('tools/check/go.sum') }}

      # Install k3s on the runner through the upstream install script.
      # - `--write-kubeconfig-mode 644` writes the kubeconfig with mode 644.
      # - `${k3s_disable_command:---disable}` expands to `--disable` unless the
      #   variable `k3s_disable_command` is set; metrics-server and traefik are
      #   the components being disabled.
      # - `--flannel-backend=none` and `--docker` are passed through to k3s.
      #   NOTE(review): `--docker` is presumably what lets pods use the
      #   `dm:chaos` image built with `docker build` in a later step; confirm.
      - name: install k3s
        run: |
          curl -fsSL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644 \
            "${k3s_disable_command:---disable}" metrics-server \
            "${k3s_disable_command:---disable}" traefik \
            --flannel-backend=none \
            --docker
        shell: bash

      # Append KUBECONFIG to $GITHUB_ENV so every later step sees the k3s kubeconfig path.
      - name: Export KUBECONFIG environment variable
        shell: bash
        run: echo 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml' >> "$GITHUB_ENV"

      # Dump kubeconfig, cluster, node, kube-system pod, storage-class and
      # kubectl/helm version information into the job log for debugging.
      - name: Print cluster information
        run: |
          kubectl config view
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -n kube-system
          kubectl get sc
          kubectl version
          helm version

      # Disable AppArmor for MySQL, see https://github.com/moby/moby/issues/7512#issuecomment-61787845
      # The symlink places the mysqld profile under /etc/apparmor.d/disable/,
      # and `apparmor_parser -R` removes the profile that is currently loaded.
      - name: Disable AppArmor for MySQL
        run: |
          sudo ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/
          sudo apparmor_parser -R /etc/apparmor.d/usr.sbin.mysqld

      # Build the DM binaries (the make targets listed below).
      - name: Build DM binary
        run: make dm-master dm-worker dmctl dm-chaos-case

      # NOTE: the config files are copied into the `bin` directory as well,
      # so `bin` alone can be sent to the docker daemon as the build context.
      - name: Build DM docker image
        run: |
          chaos_dir=$GITHUB_WORKSPACE/dm/chaos
          bin_dir=$GITHUB_WORKSPACE/bin
          cp -r $chaos_dir/cases/conf/ $bin_dir/
          docker build -f $chaos_dir/manifests/Dockerfile -t dm:chaos $bin_dir
          docker image list
      
      # Set up upstream instances: apply the sources manifest, then list and
      # describe what it created.
      - name: Set up sources
        run: |
          manifest=$GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
          for verb in apply get describe; do
            kubectl $verb -f $manifest
          done
      - name: Wait for sources ready # kubectl wait --all not working
        run: |
          pods="mysql57-0 mysql8-0 mariadb-0"
          # Best-effort wait: never abort here, so the diagnostics below are always printed.
          for pod in $pods; do
            kubectl wait --for=condition=Ready pod/$pod --timeout=300s || true
          done
          sleep 10
          echo show pvc
          kubectl get pvc -l app=sources -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=sources -o wide
          echo show sts
          kubectl get sts -l app=sources -o wide
          echo show po
          kubectl get po -l app=sources -o wide
          echo describe po
          kubectl describe po -l app=sources
          echo describe pvc
          kubectl describe pvc -l app=sources
          # Strict check: fail the step if any source pod is still not Ready.
          for pod in $pods; do
            kubectl wait --for=condition=Ready pod/$pod --timeout=0s
          done

      # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator)
      - name: Set up TiDB
        run: |
          manifest=$GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
          for verb in apply get describe; do
            kubectl $verb -f $manifest
          done
      # Wait (best effort, up to 300s) for pod tidb-0, print diagnostics for the
      # app=tidb resources, then fail the step if the pod is still not Ready.
      - name: Wait for TiDB ready
        run: |
          kubectl wait --for=condition=Ready pod/tidb-0 --timeout=300s || true
          echo show pvc
          kubectl get pvc -l app=tidb -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=tidb -o wide
          echo show sts
          kubectl get sts -l app=tidb -o wide
          echo show po
          kubectl get po -l app=tidb -o wide
          echo describe po
          kubectl describe po -l app=tidb
          echo describe pvc
          kubectl describe pvc -l app=tidb
          kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s

      # Apply the DM-master manifest, then list and describe what it created.
      - name: Set up DM-master
        run: |
          manifest=$GITHUB_WORKSPACE/dm/chaos/manifests/dm-master.yaml
          for verb in apply get describe; do
            kubectl $verb -f $manifest
          done
      # NOTE: even if some DM-master instances are not ready, we still continue
      # and let the chaos test cases check again.
      - name: Wait for DM-master ready
        run: |
          sleep 10
          kubectl wait --for=condition=Ready pod -l app=dm-master --all --timeout=300s || true
          echo "<<<<< show pvc >>>>>"
          kubectl get pvc -l app=dm-master -o wide
          echo "<<<<< show pv >>>>>"
          kubectl get pv -o wide
          echo "<<<<< show svc >>>>>"
          kubectl get svc -l app=dm-master -o wide
          echo "<<<<< show sts >>>>>"
          kubectl get sts -l app=dm-master -o wide
          echo "<<<<< show po >>>>>"
          kubectl get po -l app=dm-master -o wide
          echo "<<<<< describe po >>>>>"
          kubectl describe po -l app=dm-master
          echo "<<<<< describe pvc >>>>>"
          kubectl describe pvc -l app=dm-master
          # Current and previous container logs of each of the three DM-master pods.
          for i in 0 1 2; do
            echo "<<<<< show current log for dm-master-$i >>>>>"
            kubectl logs dm-master-$i || true
            echo "<<<<< show previous log for dm-master-$i >>>>>"
            kubectl logs dm-master-$i -p || true
          done

      # Apply the DM-worker manifest, then list and describe what it created.
      - name: Set up DM-worker
        run: |
          manifest=$GITHUB_WORKSPACE/dm/chaos/manifests/dm-worker.yaml
          for verb in apply get describe; do
            kubectl $verb -f $manifest
          done
      # NOTE: even if some DM-worker instances are not ready, we still continue
      # and let the chaos test cases check again.
      - name: Wait for DM-worker ready
        run: |
          sleep 10
          kubectl wait --for=condition=Ready pod -l app=dm-worker --all --timeout=300s || true
          echo "<<<<< show pvc >>>>>"
          kubectl get pvc -l app=dm-worker -o wide
          echo "<<<<< show pv >>>>>"
          kubectl get pv -o wide
          echo "<<<<< show svc >>>>>"
          kubectl get svc -l app=dm-worker -o wide
          echo "<<<<< show sts >>>>>"
          kubectl get sts -l app=dm-worker -o wide
          echo "<<<<< show po >>>>>"
          kubectl get po -l app=dm-worker -o wide
          echo "<<<<< describe po >>>>>"
          kubectl describe po -l app=dm-worker
          echo "<<<<< describe pvc >>>>>"
          kubectl describe pvc -l app=dm-worker
          # Current and previous container logs of each of the three DM-worker
          # pods. Generating the banner from the loop variable keeps every
          # label in sync with the pod actually queried (one banner used to
          # read "worker-master-1").
          for i in 0 1 2; do
            echo "<<<<< show current log for dm-worker-$i >>>>>"
            kubectl logs dm-worker-$i || true
            echo "<<<<< show previous log for dm-worker-$i >>>>>"
            kubectl logs dm-worker-$i -p || true
          done

      # NOTE: the cases themselves sleep for a while when checking that members
      # are ready, before any chaos operation is applied.
      - name: Set up chaos test cases
        run: |
          manifest=$GITHUB_WORKSPACE/dm/chaos/manifests/cases.yaml
          for verb in apply get describe; do
            kubectl $verb -f $manifest
          done
          sleep 60

      # Base64-encode (single line, `-w 0`) the manifest selected by the matrix
      # entry and export it to later steps as CFG_BASE64 via $GITHUB_ENV.
      - name: Encode chaos-mesh action
        run: |
          echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/dm/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV

      # Hand the encoded manifest to the chaos-mesh action.
      # NOTE(review): the action is referenced by a moving branch (`@master`);
      # consider pinning it to a release tag or commit SHA.
      - name: Run chaos mesh action
        uses: chaos-mesh/chaos-mesh-action@master
        env:
          CFG_BASE64: ${{ env.CFG_BASE64 }}

      # Check whether the chaos test case has completed (1m * 20 times).
      # NOTE(review): the polling interval and retry count live in
      # check-case.sh, which is not visible here; confirm against the script.
      - name: Wait for chaos test case complete
        run: |
          $GITHUB_WORKSPACE/dm/chaos/scripts/check-case.sh
      
      # Open an SSH debugging session when an earlier step failed.
      # The session is bounded by a step-level timeout: without one it stays
      # open until the job-level timeout cancels the job, and a cancelled job
      # skips the `failure()`-gated Slack notification further down. A step
      # timeout makes this step fail instead, so the later steps still run.
      - name: Setup tmate session
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3
        timeout-minutes: 15

      # Copy /log/<pod>.log out of every pod whose name matches "dm-" into
      # ./logs, then hand the directory to the `runner` user so it can be
      # uploaded. The copy is best-effort (`|| true`) because this step runs
      # even when the cluster never came up.
      - name: Copy logs to hack permission
        if: ${{ always() }}
        run: |
          # -p: do not fail this always-run step if ./logs already exists.
          mkdir -p ./logs
          kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "dm-"|xargs -I{} kubectl cp {}:/log/{}.log ./logs/{}.log || true
          sudo chown -R runner ./logs
      # Uploading logs as an artifact seems unstable, so we set `continue-on-error: true` here.
      - name: Upload logs
        if: always()
        uses: actions/upload-artifact@v4
        continue-on-error: true
        with:
          name: chaos-base-logs.${{ matrix.chaos-obj }}
          path: ./logs

      # Send a Slack notification if the job failed.
      # NOTE: With the exception of `GITHUB_TOKEN`, secrets are not passed to the runner when a workflow is triggered from a forked repository.
      - name: Slack notification
        if: failure()
        uses: Ilshidur/action-slack@2.1.0
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_NOTIFY }}
        with:
          args: "chaos job failed, see https://github.com/pingcap/tiflow/actions/runs/{{ GITHUB_RUN_ID }}"