diff --git a/cdc/Makefile b/cdc/Makefile index 80c09e88..afe8fa93 100644 --- a/cdc/Makefile +++ b/cdc/Makefile @@ -287,6 +287,7 @@ integration_test_by_group: prepare_test_binaries check_third_party_binary integr prepare_test_binaries: cd scripts && ./download-integration-test-binaries.sh master && cd .. + touch prepare_test_binaries check_third_party_binary: @which scripts/bin/tidb-server diff --git a/cdc/tests/integration_tests/_utils/check_count b/cdc/tests/integration_tests/_utils/check_count index 667644dd..5737e46e 100755 --- a/cdc/tests/integration_tests/_utils/check_count +++ b/cdc/tests/integration_tests/_utils/check_count @@ -3,7 +3,7 @@ # parameter 2: component name # parameter 3: pd addr # parameter 4: max retry -set -eu +set -eux expected=$1 name=$2 @@ -23,7 +23,8 @@ for ((i = 0; i <= $max_retry; i++)); do ;; pd) : - count=$(pd-ctl health --pd $pd_addr | grep '\"health\": true' | wc | awk '{print $1}') + # Need "timeout", as pd-ctl would be blocked for a long time due to kill -SIGSTOP + count=$(timeout -s SIGKILL 3s pd-ctl health --pd $pd_addr | grep '\"health\": true' | wc | awk '{print $1}') ;; tikv-cdc) : diff --git a/cdc/tests/integration_tests/_utils/check_sync_diff b/cdc/tests/integration_tests/_utils/check_sync_diff index 2a0ca28f..c64c1490 100755 --- a/cdc/tests/integration_tests/_utils/check_sync_diff +++ b/cdc/tests/integration_tests/_utils/check_sync_diff @@ -38,7 +38,7 @@ while [ $i -lt $check_time ]; do fi ((i++)) echo "check diff failed $i-th time, retry later" - sleep 1 + sleep 3 done if [ $i -ge $check_time ]; then diff --git a/cdc/tests/integration_tests/_utils/start_tidb_cluster b/cdc/tests/integration_tests/_utils/start_tidb_cluster index c9608b88..2b44005c 100755 --- a/cdc/tests/integration_tests/_utils/start_tidb_cluster +++ b/cdc/tests/integration_tests/_utils/start_tidb_cluster @@ -4,15 +4,15 @@ # --tidb-config: path to tidb config file # --retry: retry times -set -e +set -euxo pipefail OUT_DIR= tidb_config= pd_config= 
-retry_times=3 +retry_times=10 multiple_upstream_pd="false" -while [[ ${1} ]]; do +while [[ ${1-} ]]; do case "${1}" in --workdir) OUT_DIR=${2} @@ -47,24 +47,24 @@ while [[ ${1} ]]; do done CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -source $CUR/../_utils/test_prepare +source "$CUR"/../_utils/test_prepare set +e i=1 -while [ $i -le $retry_times ]; do +while [ $i -le "$retry_times" ]; do echo "The ${i} times to try to start tidb cluster..." if [[ "$tidb_config" != "" ]]; then - start_tidb_cluster_impl --workdir ${OUT_DIR} --multiple-upstream-pd ${multiple_upstream_pd} --tidb-config ${tidb_config} + start_tidb_cluster_impl --workdir "${OUT_DIR}" --multiple-upstream-pd "${multiple_upstream_pd}" --tidb-config "${tidb_config}" elif [[ "$pd_config" != "" ]]; then - start_tidb_cluster_impl --workdir ${OUT_DIR} --multiple-upstream-pd ${multiple_upstream_pd} --pd-config ${pd_config} + start_tidb_cluster_impl --workdir "${OUT_DIR}" --multiple-upstream-pd "${multiple_upstream_pd}" --pd-config "${pd_config}" else - start_tidb_cluster_impl --workdir ${OUT_DIR} --multiple-upstream-pd ${multiple_upstream_pd} + start_tidb_cluster_impl --workdir "${OUT_DIR}" --multiple-upstream-pd "${multiple_upstream_pd}" fi if [ $? 
-eq 0 ]; then break fi - let i++ + i=$((i + 1)) echo "start tidb cluster failed" done diff --git a/cdc/tests/integration_tests/_utils/start_tidb_cluster_impl b/cdc/tests/integration_tests/_utils/start_tidb_cluster_impl index 24e90cee..b793fb24 100755 --- a/cdc/tests/integration_tests/_utils/start_tidb_cluster_impl +++ b/cdc/tests/integration_tests/_utils/start_tidb_cluster_impl @@ -4,7 +4,7 @@ # --tidb-config: path to tidb config file # --multiple-upstream-pd: whether to deploy multiple pd severs in upstream -set -e +set -euxo pipefail OUT_DIR= tidb_config= @@ -26,7 +26,7 @@ randomGenSocketsConf() { echo "socket = \"/tmp/tidb-$random_str.sock\"" >>"$random_file_name" } -while [[ ${1} ]]; do +while [[ ${1-} ]]; do case "${1}" in --workdir) OUT_DIR=${2} diff --git a/cdc/tests/integration_tests/_utils/start_tls_tidb_cluster b/cdc/tests/integration_tests/_utils/start_tls_tidb_cluster index 5f3e715b..7a758496 100755 --- a/cdc/tests/integration_tests/_utils/start_tls_tidb_cluster +++ b/cdc/tests/integration_tests/_utils/start_tls_tidb_cluster @@ -4,13 +4,13 @@ # --tlsdir: certificates directory # --retry: retry times -set -e +set -euxo pipefail OUT_DIR= TLS_DIR= -retry_times=3 +retry_times=10 -while [[ ${1} ]]; do +while [[ ${1-} ]]; do case "${1}" in --workdir) OUT_DIR=${2} diff --git a/cdc/tests/integration_tests/_utils/start_tls_tidb_cluster_impl b/cdc/tests/integration_tests/_utils/start_tls_tidb_cluster_impl index 24a5d01d..ca3ada92 100755 --- a/cdc/tests/integration_tests/_utils/start_tls_tidb_cluster_impl +++ b/cdc/tests/integration_tests/_utils/start_tls_tidb_cluster_impl @@ -3,12 +3,12 @@ # --workdir: work directory # --tlsdir: certificates directory -set -e +set -euxo pipefail OUT_DIR= TLS_DIR= -while [[ ${1} ]]; do +while [[ ${1-} ]]; do case "${1}" in --workdir) OUT_DIR=${2} diff --git a/cdc/tests/integration_tests/_utils/stop_tidb_cluster b/cdc/tests/integration_tests/_utils/stop_tidb_cluster index 22e81507..86ac370a 100755 --- 
a/cdc/tests/integration_tests/_utils/stop_tidb_cluster +++ b/cdc/tests/integration_tests/_utils/stop_tidb_cluster @@ -3,6 +3,8 @@ # cdc server is ran by binary cdc.test, kill cdc server first to avoid too much # noise in cdc logs. +set -x + PKILL="killall -q -w -s 9 " if [ "$(uname)" == "Darwin" ]; then PKILL="pkill -9 " @@ -13,36 +15,43 @@ ${PKILL} tikv-cdc || true ${PKILL} cdc_state_checker || true ${PKILL} tidb-server || true ${PKILL} tikv-server || true -${PKILL} flash_cluster_manager || true ${PKILL} pd-server || true CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) source $CUR/../_utils/test_prepare -kill -9 $(lsof -i tcp:${UP_TIDB_PORT} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_TIDB_OTHER_PORT} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_TIDB_STATUS} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_TIDB_OTHER_STATUS} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${DOWN_TIDB_PORT} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${DOWN_TIDB_STATUS} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_PD_PORT_1} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_PD_PEER_PORT_1} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_PD_PORT_2} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_PD_PEER_PORT_2} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_PD_PORT_3} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_PD_PEER_PORT_3} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${DOWN_PD_PORT} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${DOWN_PD_PEER_PORT} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_TIKV_PORT_1} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_TIKV_STATUS_PORT_1} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_TIKV_PORT_2} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_TIKV_STATUS_PORT_2} -t 
2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_TIKV_PORT_3} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${UP_TIKV_STATUS_PORT_3} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${DOWN_TIKV_PORT} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:${DOWN_TIKV_STATUS_PORT} -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:9500 -t 2>/dev/null) &>/dev/null || true -kill -9 $(lsof -i tcp:17000 -t 2>/dev/null) &>/dev/null || true - -killall -9 tikv-cdc 2>/dev/null || true -killall -9 tikv-cdc.test 2>/dev/null || true +PORTS=( + "${UP_TIDB_STATUS}" + "${UP_TIDB_OTHER_STATUS}" + "${DOWN_TIDB_PORT}" + "${DOWN_TIDB_STATUS}" + "${UP_PD_PORT_1}" + "${UP_PD_PEER_PORT_1}" + "${UP_PD_PORT_2}" + "${UP_PD_PEER_PORT_2}" + "${UP_PD_PORT_3}" + "${UP_PD_PEER_PORT_3}" + "${DOWN_PD_PORT}" + "${DOWN_PD_PEER_PORT}" + "${UP_TIKV_PORT_1}" + "${UP_TIKV_STATUS_PORT_1}" + "${UP_TIKV_PORT_2}" + "${UP_TIKV_STATUS_PORT_2}" + "${UP_TIKV_PORT_3}" + "${UP_TIKV_STATUS_PORT_3}" + "${DOWN_TIKV_PORT}" + "${DOWN_TIKV_STATUS_PORT}" + 8600 + 8601 + 8602 +) + +# "lsof" is possibly blocked for a long time due to unknown reason (maybe container environment ?) +# And "lsof -bn" (See https://bugzilla.redhat.com/show_bug.cgi?id=171637) does not address this issue. 
+# So add "timeout" +KILL="kill -9 " +LSOF="timeout -s SIGKILL 3s lsof -bn -i " + +for port in "${PORTS[@]}"; do + ${KILL} $(${LSOF} tcp:"${port}" -t 2>/dev/null) &>/dev/null || true +done diff --git a/cdc/tests/integration_tests/cdc_hang_on/run.sh b/cdc/tests/integration_tests/cdc_hang_on/run.sh index 3e86ca68..d5810b52 100644 --- a/cdc/tests/integration_tests/cdc_hang_on/run.sh +++ b/cdc/tests/integration_tests/cdc_hang_on/run.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -eu +set -eux CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) source $CUR/../_utils/test_prepare @@ -13,7 +13,7 @@ RETRY_TIME=10 function restart_cdc() { local id=$1 - local count=$(ps -aux | grep "tikv-cdc.test" | grep "cdc$id.log" | wc | awk '{print $1}') + local count=$(pgrep -a "$CDC_BINARY" | grep "cdc$id.log" | wc -l) if [ "$count" -eq 0 ]; then echo "restart cdc$id" run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "$id" --addr "127.0.0.1:860$id" --pd "$UP_PD" @@ -27,17 +27,21 @@ function check_capture_count() { local i for ((i = 0; i <= $max_retry; i++)); do local captures=$(tikv-cdc cli capture list --pd=$UP_PD) - local count=$(echo $captures | jq '.|length') + # A tombstone tikv-cdc server leaves its capture record in etcd, so count unique addresses instead. 
+ local count=$(echo $captures | jq '.[] | .address' | sort -u | wc -l) if [[ "$count" == "$expected" ]]; then echo "check capture count successfully" break fi echo "failed to check capture count, expected: $expected, got: $count, retry: $i" echo "captures: $captures" + echo "tikv_cdc process:" + pgrep -a "$CDC_BINARY" || true if [ "$i" -eq "$max_retry" ]; then echo "failed to check capture count, max retires exceed" exit 1 fi + # when sent SIGSTOP to pd leader, cdc maybe exit that is expect, and we # shoule restart it restart_cdc 1 @@ -53,8 +57,8 @@ function run() { start_tidb_cluster --workdir $WORK_DIR --multiple-upstream-pd "true" cd $WORK_DIR - run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "1" --addr "127.0.0.1:8600" --pd "$UP_PD" - run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "2" --addr "127.0.0.1:8601" --pd "$UP_PD" + run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "1" --addr "127.0.0.1:8601" --pd "$UP_PD" + run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "2" --addr "127.0.0.1:8602" --pd "$UP_PD" local i=1 while [ $i -le 10 ]; do diff --git a/cdc/tests/integration_tests/sigstop/run.sh b/cdc/tests/integration_tests/sigstop/run.sh index c212eeba..acb703ba 100644 --- a/cdc/tests/integration_tests/sigstop/run.sh +++ b/cdc/tests/integration_tests/sigstop/run.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -eu +set -eux CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) source $CUR/../_utils/test_prepare @@ -99,7 +99,8 @@ function run_kill_downstream() { pd_pid=$(pgrep -f "pd-server" | sed -n "$n"p) kill -19 $pd_pid sleep 10 - check_count 2 "pd" $UP_PD + # PD would not recover when ETCD leader is stopped. So skip check_count here. 
+ # check_count 2 "pd" $UP_PD kill -18 $pd_pid check_count 3 "pd" $UP_PD diff --git a/cdc/tests/tests.Dockerfile b/cdc/tests/tests.Dockerfile index 39907d20..53b8ba71 100644 --- a/cdc/tests/tests.Dockerfile +++ b/cdc/tests/tests.Dockerfile @@ -32,7 +32,8 @@ RUN yum install -y \ tar \ psmisc \ mysql \ - python3 + python3 \ + lsof RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm RUN yum install -y epel-release-latest-7.noarch.rpm