.github/workflows/dataflow_engine_chaos.yaml

name: Dataflow Engine Chaos

on:
  schedule:
    - cron: '0 17-23 * * *' # run at minute 0 every hour from 01:00 ~ 07:00 UTC+8
  workflow_dispatch:
    inputs:
      pr:
        description: 'Which PR do you want to trigger (use PR number, such as 6127)'
        required: true
        default: ''

# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
concurrency:
  group: ${{ github.ref }}-${{ github.workflow }}
  cancel-in-progress: true

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "base"
  base:
    # The type of runner that the job will run on
    runs-on: ubuntu-20.04
    timeout-minutes: 50
    strategy:
      fail-fast: false
      matrix:
        chaos-obj:
          [
            "pod-failure-dataflow",
            "pod-kill-dataflow",
            "network-partition-dataflow",
            "network-emulation-dataflow",
            "time-shift-dataflow",
          ]

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - uses: actions/checkout@v2

      - name: check out code by workerflow dispatch PR
        if: ${{ github.event.inputs.pr != '' }}
        uses: actions/checkout@v2
        with:
          ref: refs/pull/${{ github.event.inputs.pr }}/head

      - uses: actions/setup-go@v3
        with:
          go-version: '1.23'

      - name: Cache go modules
        uses: actions/cache@v2
        with:
          path: ~/go/pkg/mod
          key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}

      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.4.0
        with:
          cluster_name: dataflow-engine-cluster
          config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml

      - name: Print cluster information
        run: |
          kubectl config view
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -n kube-system
          kubectl get sc
          kubectl version
          helm version

      - name: Build dataflow engine binary
        run: |
          make tiflow tiflow-chaos-case
          cp -r $GITHUB_WORKSPACE/engine/chaos/cases/conf/ $GITHUB_WORKSPACE/bin/engine-conf

      - name: Build Dataflow engine docker image
        run: |
          docker build -f $GITHUB_WORKSPACE/engine/chaos/manifests/Dockerfile -t dataflow:chaos $GITHUB_WORKSPACE/bin
          docker image list

      - name: Load docker image to kind cluster
        run: |
          kind load docker-image dataflow:chaos --name dataflow-engine-cluster

      # Set up upstream instances
      - name: Set up sources
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
          kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
          kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
      - name: Wait for sources ready # kubectl wait --all not working
        run: |
          kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true
          kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true
          kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true
          sleep 10
          echo show pvc
          kubectl get pvc -l app=sources -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=sources -o wide
          echo show sts
          kubectl get sts -l app=sources -o wide
          echo show po
          kubectl get po -l app=sources -o wide
          echo describe po
          kubectl describe po -l app=sources
          echo describe pvc
          kubectl describe pvc -l app=sources
          kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s
          kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s
          kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s
 
      # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator)
      - name: Set up TiDB
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
          kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
          kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
      - name: Wait for TiDB ready
        run: |
          kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true
          echo show pvc
          kubectl get pvc -l app=tidb -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=tidb -o wide
          echo show sts
          kubectl get sts -l app=tidb -o wide
          echo show po
          kubectl get po -l app=tidb -o wide
          echo describe po
          kubectl describe po -l app=tidb
          echo describe pvc
          kubectl describe pvc -l app=tidb
          kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s

      # Set up minio and create a bucket for tests
      - name: Set up minio
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml  
          kubectl get -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml 
          kubectl describe -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml 
      - name: Wait for minio ready
        run: |
          kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=10m || true
          echo show pvc
          kubectl get pvc -l app=minio -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=minio -o wide
          echo show sts
          kubectl get sts -l app=minio -o wide
          echo show po
          kubectl get po -l app=minio -o wide
          echo describe po
          kubectl describe po -l app=minio
          echo describe pvc
          kubectl describe pvc -l app=minio
          kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=0s
      - name: Set up minio-create-bucket job
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml 
          kubectl get -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
          kubectl describe -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
          kubectl wait --for=condition=Complete job/chaos-minio-create-bucket-job --timeout=2m

      # Set up metastore and basic services
      - name: Set up metastore and basic services
        run: |
          helm install -f $GITHUB_WORKSPACE/deployments/engine/helm/tiflow/values.yaml chaos $GITHUB_WORKSPACE/deployments/engine/helm/tiflow
          helm list
          sleep 5
          kubectl get pods
          
      - name: Wait for metastore ready
        run: |
          kubectl wait --for=condition=Ready pod/chaos-metastore-mysql-0 --timeout=60s || true
          kubectl wait --for=condition=Ready pod/chaos-metastore-etcd-0 --timeout=60s || true

          echo show pvc
          kubectl get pvc -l app=chaos-metastore-etcd -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=chaos-metastore-etcd -o wide
          echo show sts
          kubectl get sts -l app=chaos-metastore-etcd -o wide
          echo show po
          kubectl get po -l app=chaos-metastore-etcd -o wide
          echo describe po
          kubectl describe po -l app=chaos-metastore-etcd
          echo describe pvc
          kubectl describe pvc -l app=chaos-metastore-etcd

          echo show pvc
          kubectl get pvc -l app=chaos-metastore-mysql -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=chaos-metastore-mysql -o wide
          echo show sts
          kubectl get sts -l app=chaos-metastore-mysql -o wide
          echo show po
          kubectl get po -l app=chaos-metastore-mysql -o wide
          echo describe po
          kubectl describe po -l app=chaos-metastore-framework
          echo describe pvc
          kubectl describe pvc -l app=chaos-metastore-framework

      - name: Wait for server-master ready
        run: |
          kubectl wait --for=condition=Ready pod -l app=chaos-server-master --all --timeout=20s|| true
          echo "<<<<< show pvc >>>>>"
          kubectl get pvc -l app=chaos-server-master -o wide
          echo "<<<<< show pv >>>>>"
          kubectl get pv -o wide
          echo "<<<<< show svc >>>>>"
          kubectl get svc -l app=chaos-server-master -o wide
          echo "<<<<< show sts >>>>>"
          kubectl get sts -l app=chaos-server-master -o wide
          echo "<<<<< show po >>>>>"
          kubectl get po -l app=chaos-server-master -o wide
          echo "<<<<< describe po >>>>>"
          kubectl describe po -l app=chaos-server-master
          echo "<<<<< describe pvc >>>>>"
          kubectl describe pvc -l app=chaos-server-master
          echo "<<<<< show current log for chaos-server-master-0 >>>>>"
          kubectl logs chaos-server-master-0 || true
          echo "<<<<< show previous log for chaos-server-master-0 >>>>>"
          kubectl logs chaos-server-master-0 -p || true
          echo "<<<<< show current log for chaos-server-master-1 >>>>>"
          kubectl logs chaos-server-master-1 || true
          echo "<<<<< show previous log for chaos-server-master-1 >>>>>"
          kubectl logs chaos-server-master-1 -p || true
          echo "<<<<< show current log for chaos-server-master-2 >>>>>"
          kubectl logs chaos-server-master-2 || true
          echo "<<<<< show previous log for chaos-server-master-2 >>>>>"
          kubectl logs chaos-server-master-2 -p || true

          kubectl logs chaos-server-master-0 -c wait-mysql || true

      - name: Wait for executor ready
        run: |
          kubectl wait --for=condition=Ready pod -l app=chaos-executor --all --timeout=15s|| true
          echo "<<<<< show pvc >>>>>"
          kubectl get pvc -l app=chaos-executor -o wide
          echo "<<<<< show pv >>>>>"
          kubectl get pv -o wide
          echo "<<<<< show svc >>>>>"
          kubectl get svc -l app=chaos-executor -o wide
          echo "<<<<< show sts >>>>>"
          kubectl get sts -l app=chaos-executor -o wide
          echo "<<<<< show po >>>>>"
          kubectl get po -l app=chaos-executor -o wide
          echo "<<<<< describe po >>>>>"
          kubectl describe po -l app=chaos-executor
          echo "<<<<< describe pvc >>>>>"
          kubectl describe pvc -l app=chaos-executor
          echo "<<<<< show current log for chaos-executor-0 >>>>>"
          kubectl logs chaos-executor-0 || true
          echo "<<<<< show previous log for chaos-executor-0 >>>>>"
          kubectl logs chaos-executor-0 -p || true
          echo "<<<<< show current log for chaos-executor-1 >>>>>"
          kubectl logs chaos-executor-1 || true
          echo "<<<<< show previous log for worker-master-1 >>>>>"
          kubectl logs chaos-executor-1 -p || true
          echo "<<<<< show current log for chaos-executor-2 >>>>>"
          kubectl logs chaos-executor-2 || true
          echo "<<<<< show previous log for chaos-executor-2 >>>>>"
          kubectl logs chaos-executor-2 -p || true

          kubectl logs chaos-executor-0 -c wait-server-master || true

      - name: Set up chaos test cases
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
          kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
          kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
          kubectl get pods

      # FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304
      - name: Wait DM enter sync stage
        run: |
          for idx in $(seq 0 300); do
            echo "wait dm enter sync stage"
            if kubectl logs job.batch/chaos-test-case | grep "full mode of the task has completed" ; then
              break
            fi
            sleep 1
          done

      - name: Encode chaos-mesh action
        run: |
          echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV

      - name: Run chaos mesh action
        uses: chaos-mesh/chaos-mesh-action@master
        env:
          CFG_BASE64: ${{ env.CFG_BASE64 }}

      # check whether complete with 1m * 20 times.
      - name: Wait for chaos test case complete
        run: |
          $GITHUB_WORKSPACE/engine/chaos/scripts/check-case.sh

      - name: Pause all chaos
        if: ${{ always() }}
        run: |
          kubectl delete -f $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml

      - name: Dump goroutines
        if: ${{ failure() }}
        run: |
          # Add a delay if test fails, to check whether the cluster can recover after chaos is removed
          sleep 60
          kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "server-master"|xargs -I{} kubectl exec -i -c server-master {} -- wget http://127.0.0.1:10240/debug/pprof/goroutine?debug=2 -O /log/{}_goroutines.log || true
          kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "executor"|xargs -I{} kubectl exec -i -c executor {} -- wget http://127.0.0.1:10241/debug/pprof/goroutine?debug=2 -O /log/{}_goroutines.log || true

      - name: Copy logs to hack permission
        if: ${{ always() }}
        run: |
          mkdir ./logs
          kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "server-master|executor"|xargs -I{} kubectl cp {}:/log ./logs || true
          kind export logs ./logs/kind --name dataflow-engine-cluster
          sudo chown -R runner ./logs

      # Upload logs as artifact seems not stable, so we set `continue-on-error: true` here.
      - name: Upload logs
        continue-on-error: true
        uses: actions/upload-artifact@v2
        if: ${{ always() }}
        with:
          name: chaos-base-logs.${{ matrix.chaos-obj }}
          path: |
            ./logs

      # Send feishu notification if failed.
      - name: Feishu notification
        continue-on-error: true
        uses: foxundermoon/feishu-action@v2
        if: ${{ failure() }}
        with:
          url: ${{ secrets.ENGINE_FEISHU_NOTIFY_URL }}
          msg_type: text
          content: |
            text: |
              dataflow engine chaos job failed, see https://github.com/pingcap/tiflow/actions/runs/${{ github.run_id }}