Dataflow Engine Chaos #6438

Workflow file for this run

.github/workflows/dataflow_engine_chaos.yaml at 64decf6

	# Workflow identity and triggers.
	name: Dataflow Engine Chaos

	on:
	  # Scheduled window: minute 0 of every hour, 17:00-23:00 UTC (01:00-07:00 UTC+8).
	  schedule:
	    - cron: "0 17-23 * * *"
	  # Manual trigger; the caller supplies the PR number to test.
	  workflow_dispatch:
	    inputs:
	      pr:
	        description: "Which PR do you want to trigger (use PR number, such as 6127)"
	        required: true
	        default: ""

	# One active run per ref and workflow; a newer run cancels the one in progress.
	# Reference: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency
	concurrency:
	  cancel-in-progress: true
	  group: ${{ github.ref }}-${{ github.workflow }}

	# Job definitions for this workflow.
	jobs:
	  # "base" is the only job; it is instantiated once per chaos-obj matrix entry.
	  base:
	    # NOTE(review): confirm the ubuntu-20.04 hosted image is still offered by GitHub.
	    runs-on: ubuntu-20.04
	    timeout-minutes: 50
	    strategy:
	      # Let the remaining matrix entries keep running when one of them fails.
	      fail-fast: false
	      matrix:
	        # Each entry is the base name of a manifest under engine/chaos/manifests/.
	        chaos-obj:
	          - "pod-failure-dataflow"
	          - "pod-kill-dataflow"
	          - "network-partition-dataflow"
	          - "network-emulation-dataflow"
	          - "time-shift-dataflow"

	    # Ordered list of tasks executed by the job.
	    steps:
	# Default checkout of the ref that triggered the run.
	- uses: actions/checkout@v2

	# On manual dispatch, check out the head of the requested PR instead.
	# Skipped when no PR number is present (e.g. scheduled runs).
	- name: check out code by workflow dispatch PR
	  if: ${{ github.event.inputs.pr != '' }}
	  uses: actions/checkout@v2
	  with:
	    ref: refs/pull/${{ github.event.inputs.pr }}/head

	# Install Go 1.23; the version is quoted so it stays a string.
	- uses: actions/setup-go@v3
	  with:
	    go-version: "1.23"

	# Cache the Go module download directory, keyed on the runner OS and go.sum contents.
	# actions/cache v1/v2 have been retired by GitHub, so a supported major version is used.
	- name: Cache go modules
	  uses: actions/cache@v4
	  with:
	    path: ~/go/pkg/mod
	    key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}

	# Create the Kind cluster that hosts every component of the chaos test.
	# NOTE(review): the action reference was mangled to "helm/[email protected]" by
	# e-mail obfuscation; "kind-action" and the v1.4.0 tag are a best-effort
	# reconstruction - confirm against the repository history before merging.
	- name: Create k8s Kind Cluster
	  uses: helm/kind-action@v1.4.0
	  with:
	    cluster_name: dataflow-engine-cluster
	    config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml

	# Diagnostic dump of the freshly created cluster and the client tool versions.
	- name: Print cluster information
	  run: |
	    kubectl config view
	    kubectl cluster-info
	    kubectl get nodes
	    kubectl get pods -n kube-system
	    kubectl get sc
	    kubectl version
	    helm version

	# Build the binaries and copy the chaos-case configs into bin/, which is
	# the docker build context used by the next step.
	- name: Build dataflow engine binary
	  run: |
	    make tiflow tiflow-chaos-case
	    cp -r $GITHUB_WORKSPACE/engine/chaos/cases/conf/ $GITHUB_WORKSPACE/bin/engine-conf

	# Build the dataflow:chaos image from bin/ and list local images for the log.
	- name: Build Dataflow engine docker image
	  run: |
	    docker build -f $GITHUB_WORKSPACE/engine/chaos/manifests/Dockerfile -t dataflow:chaos $GITHUB_WORKSPACE/bin
	    docker image list

	# Load the locally built image into the Kind cluster.
	- name: Load docker image to kind cluster
	  run: |
	    kind load docker-image dataflow:chaos --name dataflow-engine-cluster

	# Set up upstream instances
	- name: Set up sources
	  run: |
	    kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
	    kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
	    kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
	# Each source pod is waited on by name (original note: `kubectl wait --all` not working).
	- name: Wait for sources ready # kubectl wait --all not working
	  run: |
	    # Best-effort waits: `|| true` lets the diagnostics below print even on timeout.
	    kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true
	    kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true
	    kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true
	    sleep 10
	    echo show pvc
	    kubectl get pvc -l app=sources -o wide
	    echo show pv
	    kubectl get pv -o wide
	    echo show svc
	    kubectl get svc -l app=sources -o wide
	    echo show sts
	    kubectl get sts -l app=sources -o wide
	    echo show po
	    kubectl get po -l app=sources -o wide
	    echo describe po
	    kubectl describe po -l app=sources
	    echo describe pvc
	    kubectl describe pvc -l app=sources
	    # Final gate: a zero timeout checks once, failing the step if a pod is still not Ready.
	    kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s
	    kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s
	    kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s

	# Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator)
	- name: Set up TiDB
	  run: |
	    kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
	    kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
	    kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
	- name: Wait for TiDB ready
	  run: |
	    # Best-effort wait: `|| true` lets the diagnostics below print even on timeout.
	    kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true
	    echo show pvc
	    kubectl get pvc -l app=tidb -o wide
	    echo show pv
	    kubectl get pv -o wide
	    echo show svc
	    kubectl get svc -l app=tidb -o wide
	    echo show sts
	    kubectl get sts -l app=tidb -o wide
	    echo show po
	    kubectl get po -l app=tidb -o wide
	    echo describe po
	    kubectl describe po -l app=tidb
	    echo describe pvc
	    kubectl describe pvc -l app=tidb
	    # Final gate: a zero timeout checks once, failing the step if the pod is still not Ready.
	    kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s

	# Set up minio and create a bucket for tests
	- name: Set up minio
	  run: |
	    kubectl apply -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml
	    kubectl get -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml
	    kubectl describe -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml
	- name: Wait for minio ready
	  run: |
	    # Best-effort wait: `|| true` lets the diagnostics below print even on timeout.
	    kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=10m || true
	    echo show pvc
	    kubectl get pvc -l app=minio -o wide
	    echo show pv
	    kubectl get pv -o wide
	    echo show svc
	    kubectl get svc -l app=minio -o wide
	    echo show sts
	    kubectl get sts -l app=minio -o wide
	    echo show po
	    kubectl get po -l app=minio -o wide
	    echo describe po
	    kubectl describe po -l app=minio
	    echo describe pvc
	    kubectl describe pvc -l app=minio
	    # Final gate: a zero timeout checks once, failing the step if the pod is still not Ready.
	    kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=0s
	# Apply the bucket-creation job and block (up to 2 minutes) until it completes.
	- name: Set up minio-create-bucket job
	  run: |
	    kubectl apply -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
	    kubectl get -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
	    kubectl describe -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
	    kubectl wait --for=condition=Complete job/chaos-minio-create-bucket-job --timeout=2m

	# Set up metastore and basic services
	# Installs the tiflow helm chart as release "chaos", then lists releases and pods for the log.
	- name: Set up metastore and basic services
	  run: |
	    helm install -f $GITHUB_WORKSPACE/deployments/engine/helm/tiflow/values.yaml chaos $GITHUB_WORKSPACE/deployments/engine/helm/tiflow
	    helm list
	    sleep 5
	    kubectl get pods

	# Best-effort wait for both metastore pods, followed by diagnostics for each.
	# This step never fails on an unready pod: both waits are guarded by `|| true`.
	- name: Wait for metastore ready
	  run: |
	    kubectl wait --for=condition=Ready pod/chaos-metastore-mysql-0 --timeout=60s || true
	    kubectl wait --for=condition=Ready pod/chaos-metastore-etcd-0 --timeout=60s || true

	    echo show pvc
	    kubectl get pvc -l app=chaos-metastore-etcd -o wide
	    echo show pv
	    kubectl get pv -o wide
	    echo show svc
	    kubectl get svc -l app=chaos-metastore-etcd -o wide
	    echo show sts
	    kubectl get sts -l app=chaos-metastore-etcd -o wide
	    echo show po
	    kubectl get po -l app=chaos-metastore-etcd -o wide
	    echo describe po
	    kubectl describe po -l app=chaos-metastore-etcd
	    echo describe pvc
	    kubectl describe pvc -l app=chaos-metastore-etcd

	    echo show pvc
	    kubectl get pvc -l app=chaos-metastore-mysql -o wide
	    echo show pv
	    kubectl get pv -o wide
	    echo show svc
	    kubectl get svc -l app=chaos-metastore-mysql -o wide
	    echo show sts
	    kubectl get sts -l app=chaos-metastore-mysql -o wide
	    echo show po
	    kubectl get po -l app=chaos-metastore-mysql -o wide
	    # The describe calls use the same label as the get calls above, so they
	    # report on the MySQL metastore objects that were just listed.
	    echo describe po
	    kubectl describe po -l app=chaos-metastore-mysql
	    echo describe pvc
	    kubectl describe pvc -l app=chaos-metastore-mysql

	# Best-effort wait for the server-master pods, then dump their state and logs.
	# Every log command is guarded by `|| true` so a missing pod or container does not fail the step.
	- name: Wait for server-master ready
	  run: |
	    kubectl wait --for=condition=Ready pod -l app=chaos-server-master --all --timeout=20s || true
	    echo "<<<<< show pvc >>>>>"
	    kubectl get pvc -l app=chaos-server-master -o wide
	    echo "<<<<< show pv >>>>>"
	    kubectl get pv -o wide
	    echo "<<<<< show svc >>>>>"
	    kubectl get svc -l app=chaos-server-master -o wide
	    echo "<<<<< show sts >>>>>"
	    kubectl get sts -l app=chaos-server-master -o wide
	    echo "<<<<< show po >>>>>"
	    kubectl get po -l app=chaos-server-master -o wide
	    echo "<<<<< describe po >>>>>"
	    kubectl describe po -l app=chaos-server-master
	    echo "<<<<< describe pvc >>>>>"
	    kubectl describe pvc -l app=chaos-server-master
	    echo "<<<<< show current log for chaos-server-master-0 >>>>>"
	    kubectl logs chaos-server-master-0 || true
	    echo "<<<<< show previous log for chaos-server-master-0 >>>>>"
	    kubectl logs chaos-server-master-0 -p || true
	    echo "<<<<< show current log for chaos-server-master-1 >>>>>"
	    kubectl logs chaos-server-master-1 || true
	    echo "<<<<< show previous log for chaos-server-master-1 >>>>>"
	    kubectl logs chaos-server-master-1 -p || true
	    echo "<<<<< show current log for chaos-server-master-2 >>>>>"
	    kubectl logs chaos-server-master-2 || true
	    echo "<<<<< show previous log for chaos-server-master-2 >>>>>"
	    kubectl logs chaos-server-master-2 -p || true

	    # Log of the wait-mysql container in the first server-master pod.
	    kubectl logs chaos-server-master-0 -c wait-mysql || true

	# Best-effort wait for the executor pods, then dump their state and logs.
	# Every log command is guarded by `|| true` so a missing pod or container does not fail the step.
	- name: Wait for executor ready
	  run: |
	    kubectl wait --for=condition=Ready pod -l app=chaos-executor --all --timeout=15s || true
	    echo "<<<<< show pvc >>>>>"
	    kubectl get pvc -l app=chaos-executor -o wide
	    echo "<<<<< show pv >>>>>"
	    kubectl get pv -o wide
	    echo "<<<<< show svc >>>>>"
	    kubectl get svc -l app=chaos-executor -o wide
	    echo "<<<<< show sts >>>>>"
	    kubectl get sts -l app=chaos-executor -o wide
	    echo "<<<<< show po >>>>>"
	    kubectl get po -l app=chaos-executor -o wide
	    echo "<<<<< describe po >>>>>"
	    kubectl describe po -l app=chaos-executor
	    echo "<<<<< describe pvc >>>>>"
	    kubectl describe pvc -l app=chaos-executor
	    echo "<<<<< show current log for chaos-executor-0 >>>>>"
	    kubectl logs chaos-executor-0 || true
	    echo "<<<<< show previous log for chaos-executor-0 >>>>>"
	    kubectl logs chaos-executor-0 -p || true
	    echo "<<<<< show current log for chaos-executor-1 >>>>>"
	    kubectl logs chaos-executor-1 || true
	    # The banner names the pod whose log is actually printed next.
	    echo "<<<<< show previous log for chaos-executor-1 >>>>>"
	    kubectl logs chaos-executor-1 -p || true
	    echo "<<<<< show current log for chaos-executor-2 >>>>>"
	    kubectl logs chaos-executor-2 || true
	    echo "<<<<< show previous log for chaos-executor-2 >>>>>"
	    kubectl logs chaos-executor-2 -p || true

	    # Log of the wait-server-master container in the first executor pod.
	    kubectl logs chaos-executor-0 -c wait-server-master || true

	# Apply the chaos test-case manifest and print the resulting objects and pod list.
	- name: Set up chaos test cases
	  run: |
	    kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
	    kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
	    kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
	    kubectl get pods

	# FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304
	# Polls the test-case job log once per second (at most 301 attempts) until the
	# full-mode-completed message appears. The step succeeds even if it never does.
	- name: Wait DM enter sync stage
	  run: |
	    for idx in $(seq 0 300); do
	      echo "wait dm enter sync stage"
	      if kubectl logs job.batch/chaos-test-case | grep "full mode of the task has completed" ; then
	        break
	      fi
	      sleep 1
	    done

	# Base64-encode (single line, `-w 0`) the manifest of the current matrix entry
	# and export it as CFG_BASE64 through $GITHUB_ENV for the following step.
	- name: Encode chaos-mesh action
	  run: |
	    echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV

	# Hand the encoded manifest to the chaos-mesh action via its CFG_BASE64 env var.
	- name: Run chaos mesh action
	  uses: chaos-mesh/chaos-mesh-action@master
	  env:
	    CFG_BASE64: ${{ env.CFG_BASE64 }}

	# check whether complete with 1m * 20 times.
	# NOTE(review): the polling schedule lives in check-case.sh; confirm it matches the note above.
	- name: Wait for chaos test case complete
	  run: |
	    $GITHUB_WORKSPACE/engine/chaos/scripts/check-case.sh

	# Runs regardless of earlier results and deletes the chaos objects of the current matrix entry.
	- name: Pause all chaos
	  if: ${{ always() }}
	  run: |
	    kubectl delete -f $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml

	# On failure only: fetch a goroutine dump from every server-master and executor
	# pod into that pod's /log directory. Both pipelines are best-effort (`|| true`).
	- name: Dump goroutines
	  if: ${{ failure() }}
	  run: |
	    # Add a delay if test fails, to check whether the cluster can recover after chaos is removed
	    sleep 60
	    # The URL is quoted so the shell never treats `?` as a glob character.
	    kubectl get pods --no-headers -o custom-columns=":metadata.name" | grep -E "server-master" | xargs -I{} kubectl exec -i -c server-master {} -- wget "http://127.0.0.1:10240/debug/pprof/goroutine?debug=2" -O /log/{}_goroutines.log || true
	    kubectl get pods --no-headers -o custom-columns=":metadata.name" | grep -E "executor" | xargs -I{} kubectl exec -i -c executor {} -- wget "http://127.0.0.1:10241/debug/pprof/goroutine?debug=2" -O /log/{}_goroutines.log || true

	# Always runs: copy /log out of every server-master and executor pod, export the
	# Kind cluster logs, then hand ownership to the runner user.
	- name: Copy logs to hack permission
	  if: ${{ always() }}
	  run: |
	    # -p keeps this step from failing if ./logs already exists.
	    mkdir -p ./logs
	    kubectl get pods --no-headers -o custom-columns=":metadata.name" | grep -E "server-master|executor" | xargs -I{} kubectl cp {}:/log ./logs || true
	    kind export logs ./logs/kind --name dataflow-engine-cluster
	    sudo chown -R runner ./logs

	# Upload logs as artifact seems not stable, so we set `continue-on-error: true` here.
	# The artifact name includes the matrix entry, giving each matrix job a distinct name.
	- name: Upload logs
	  continue-on-error: true
	  uses: actions/upload-artifact@v4
	  if: ${{ always() }}
	  with:
	    name: chaos-base-logs.${{ matrix.chaos-obj }}
	    path: |
	      ./logs

	# Send feishu notification if failed.
	# `content` is a literal block whose value is itself a small YAML document
	# ("text: | ..."), so the inner lines are indented one level deeper.
	- name: Feishu notification
	  continue-on-error: true
	  uses: foxundermoon/feishu-action@v2
	  if: ${{ failure() }}
	  with:
	    url: ${{ secrets.ENGINE_FEISHU_NOTIFY_URL }}
	    msg_type: text
	    content: |
	      text: |
	        dataflow engine chaos job failed, see https://github.com/pingcap/tiflow/actions/runs/${{ github.run_id }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Dataflow Engine Chaos #6438

Workflow file

Dataflow Engine Chaos #6438

Jobs

Run details

Workflow file for this run