Skip to content

Commit

Permalink
[close #334] tests: fix timeout of integration test sigstop (#335)
Browse files Browse the repository at this point in the history
Signed-off-by: Ping Yu <[email protected]>
  • Loading branch information
pingyu authored Feb 13, 2023
1 parent d7259ed commit 853a667
Show file tree
Hide file tree
Showing 11 changed files with 72 additions and 55 deletions.
1 change: 1 addition & 0 deletions cdc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ integration_test_by_group: prepare_test_binaries check_third_party_binary integr

prepare_test_binaries:
cd scripts && ./download-integration-test-binaries.sh master && cd ..
touch prepare_test_binaries

check_third_party_binary:
@which scripts/bin/tidb-server
Expand Down
5 changes: 3 additions & 2 deletions cdc/tests/integration_tests/_utils/check_count
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# parameter 2: component name
# parameter 3: pd addr
# parameter 4: max retry
set -eu
set -eux

expected=$1
name=$2
Expand All @@ -23,7 +23,8 @@ for ((i = 0; i <= $max_retry; i++)); do
;;
pd)
:
count=$(pd-ctl health --pd $pd_addr | grep '\"health\": true' | wc | awk '{print $1}')
# Need "timeout", as pd-ctl would be blocked for a long time due to kill -SIGSTOP
count=$(timeout -s SIGKILL 3s pd-ctl health --pd $pd_addr | grep '\"health\": true' | wc | awk '{print $1}')
;;
tikv-cdc)
:
Expand Down
2 changes: 1 addition & 1 deletion cdc/tests/integration_tests/_utils/check_sync_diff
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ while [ $i -lt $check_time ]; do
fi
((i++))
echo "check diff failed $i-th time, retry later"
sleep 1
sleep 3
done

if [ $i -ge $check_time ]; then
Expand Down
18 changes: 9 additions & 9 deletions cdc/tests/integration_tests/_utils/start_tidb_cluster
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
# --tidb-config: path to tidb config file
# --retry: retry times

set -e
set -euxo pipefail

OUT_DIR=
tidb_config=
pd_config=
retry_times=3
retry_times=10
multiple_upstream_pd="false"

while [[ ${1} ]]; do
while [[ ${1-} ]]; do
case "${1}" in
--workdir)
OUT_DIR=${2}
Expand Down Expand Up @@ -47,24 +47,24 @@ while [[ ${1} ]]; do
done

CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source $CUR/../_utils/test_prepare
source "$CUR"/../_utils/test_prepare

set +e
i=1
while [ $i -le $retry_times ]; do
while [ $i -le "$retry_times" ]; do
echo "The ${i} times to try to start tidb cluster..."

if [[ "$tidb_config" != "" ]]; then
start_tidb_cluster_impl --workdir ${OUT_DIR} --multiple-upstream-pd ${multiple_upstream_pd} --tidb-config ${tidb_config}
start_tidb_cluster_impl --workdir "${OUT_DIR}" --multiple-upstream-pd "${multiple_upstream_pd}" --tidb-config "${tidb_config}"
elif [[ "$pd_config" != "" ]]; then
start_tidb_cluster_impl --workdir ${OUT_DIR} --multiple-upstream-pd ${multiple_upstream_pd} --pd-config ${pd_config}
start_tidb_cluster_impl --workdir "${OUT_DIR}" --multiple-upstream-pd "${multiple_upstream_pd}" --pd-config "${pd_config}"
else
start_tidb_cluster_impl --workdir ${OUT_DIR} --multiple-upstream-pd ${multiple_upstream_pd}
start_tidb_cluster_impl --workdir "${OUT_DIR}" --multiple-upstream-pd "${multiple_upstream_pd}"
fi

if [ $? -eq 0 ]; then
break
fi
let i++
i=$((i + 1))
echo "start tidb cluster failed"
done
4 changes: 2 additions & 2 deletions cdc/tests/integration_tests/_utils/start_tidb_cluster_impl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# --tidb-config: path to tidb config file
# --multiple-upstream-pd: whether to deploy multiple pd servers in upstream

set -e
set -euxo pipefail

OUT_DIR=
tidb_config=
Expand All @@ -26,7 +26,7 @@ randomGenSocketsConf() {
echo "socket = \"/tmp/tidb-$random_str.sock\"" >>"$random_file_name"
}

while [[ ${1} ]]; do
while [[ ${1-} ]]; do
case "${1}" in
--workdir)
OUT_DIR=${2}
Expand Down
6 changes: 3 additions & 3 deletions cdc/tests/integration_tests/_utils/start_tls_tidb_cluster
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
# --tlsdir: certificates directory
# --retry: retry times

set -e
set -euxo pipefail

OUT_DIR=
TLS_DIR=
retry_times=3
retry_times=10

while [[ ${1} ]]; do
while [[ ${1-} ]]; do
case "${1}" in
--workdir)
OUT_DIR=${2}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
# --workdir: work directory
# --tlsdir: certificates directory

set -e
set -euxo pipefail

OUT_DIR=
TLS_DIR=

while [[ ${1} ]]; do
while [[ ${1-} ]]; do
case "${1}" in
--workdir)
OUT_DIR=${2}
Expand Down
65 changes: 37 additions & 28 deletions cdc/tests/integration_tests/_utils/stop_tidb_cluster
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# cdc server is run by binary cdc.test; kill cdc server first to avoid too much
# noise in cdc logs.

set -x

PKILL="killall -q -w -s 9 "
if [ "$(uname)" == "Darwin" ]; then
PKILL="pkill -9 "
Expand All @@ -13,36 +15,43 @@ ${PKILL} tikv-cdc || true
${PKILL} cdc_state_checker || true
${PKILL} tidb-server || true
${PKILL} tikv-server || true
${PKILL} flash_cluster_manager || true
${PKILL} pd-server || true

CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source $CUR/../_utils/test_prepare

kill -9 $(lsof -i tcp:${UP_TIDB_PORT} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_TIDB_OTHER_PORT} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_TIDB_STATUS} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_TIDB_OTHER_STATUS} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${DOWN_TIDB_PORT} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${DOWN_TIDB_STATUS} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_PD_PORT_1} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_PD_PEER_PORT_1} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_PD_PORT_2} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_PD_PEER_PORT_2} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_PD_PORT_3} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_PD_PEER_PORT_3} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${DOWN_PD_PORT} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${DOWN_PD_PEER_PORT} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_TIKV_PORT_1} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_TIKV_STATUS_PORT_1} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_TIKV_PORT_2} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_TIKV_STATUS_PORT_2} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_TIKV_PORT_3} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${UP_TIKV_STATUS_PORT_3} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${DOWN_TIKV_PORT} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:${DOWN_TIKV_STATUS_PORT} -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:9500 -t 2>/dev/null) &>/dev/null || true
kill -9 $(lsof -i tcp:17000 -t 2>/dev/null) &>/dev/null || true

killall -9 tikv-cdc 2>/dev/null || true
killall -9 tikv-cdc.test 2>/dev/null || true
PORTS=(
"${UP_TIDB_STATUS}"
"${UP_TIDB_OTHER_STATUS}"
"${DOWN_TIDB_PORT}"
"${DOWN_TIDB_STATUS}"
"${UP_PD_PORT_1}"
"${UP_PD_PEER_PORT_1}"
"${UP_PD_PORT_2}"
"${UP_PD_PEER_PORT_2}"
"${UP_PD_PORT_3}"
"${UP_PD_PEER_PORT_3}"
"${DOWN_PD_PORT}"
"${DOWN_PD_PEER_PORT}"
"${UP_TIKV_PORT_1}"
"${UP_TIKV_STATUS_PORT_1}"
"${UP_TIKV_PORT_2}"
"${UP_TIKV_STATUS_PORT_2}"
"${UP_TIKV_PORT_3}"
"${UP_TIKV_STATUS_PORT_3}"
"${DOWN_TIKV_PORT}"
"${DOWN_TIKV_STATUS_PORT}"
8600
8601
8602
)

# "lsof" may block for a long time for an unknown reason (possibly the container environment).
# "lsof -bn" (see https://bugzilla.redhat.com/show_bug.cgi?id=171637) does not address this issue,
# so wrap it in "timeout".
KILL="kill -9 "
LSOF="timeout -s SIGKILL 3s lsof -bn -i "

for port in "${PORTS[@]}"; do
${KILL} $(${LSOF} tcp:"${port}" -t 2>/dev/null) &>/dev/null || true
done
14 changes: 9 additions & 5 deletions cdc/tests/integration_tests/cdc_hang_on/run.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

set -eu
set -eux

CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source $CUR/../_utils/test_prepare
Expand All @@ -13,7 +13,7 @@ RETRY_TIME=10

function restart_cdc() {
local id=$1
local count=$(ps -aux | grep "tikv-cdc.test" | grep "cdc$id.log" | wc | awk '{print $1}')
local count=$(pgrep -a "$CDC_BINARY" | grep "cdc$id.log" | wc -l)
if [ "$count" -eq 0 ]; then
echo "restart cdc$id"
run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "$id" --addr "127.0.0.1:860$id" --pd "$UP_PD"
Expand All @@ -27,17 +27,21 @@ function check_capture_count() {
local i
for ((i = 0; i <= $max_retry; i++)); do
local captures=$(tikv-cdc cli capture list --pd=$UP_PD)
local count=$(echo $captures | jq '.|length')
# A tombstone tikv-cdc server will leave a capture record in etcd, so count unique addresses instead.
local count=$(echo $captures | jq '.[] | .address' | sort -u | wc -l)
if [[ "$count" == "$expected" ]]; then
echo "check capture count successfully"
break
fi
echo "failed to check capture count, expected: $expected, got: $count, retry: $i"
echo "captures: $captures"
echo "tikv_cdc process:"
pgrep -a "$CDC_BINARY" || true
if [ "$i" -eq "$max_retry" ]; then
echo "failed to check capture count, max retires exceed"
exit 1
fi

# when SIGSTOP is sent to the pd leader, cdc may exit; that is expected, and we
# should restart it
restart_cdc 1
Expand All @@ -53,8 +57,8 @@ function run() {
start_tidb_cluster --workdir $WORK_DIR --multiple-upstream-pd "true"
cd $WORK_DIR

run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "1" --addr "127.0.0.1:8600" --pd "$UP_PD"
run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "2" --addr "127.0.0.1:8601" --pd "$UP_PD"
run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "1" --addr "127.0.0.1:8601" --pd "$UP_PD"
run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix "2" --addr "127.0.0.1:8602" --pd "$UP_PD"

local i=1
while [ $i -le 10 ]; do
Expand Down
5 changes: 3 additions & 2 deletions cdc/tests/integration_tests/sigstop/run.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

set -eu
set -eux

CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source $CUR/../_utils/test_prepare
Expand Down Expand Up @@ -99,7 +99,8 @@ function run_kill_downstream() {
pd_pid=$(pgrep -f "pd-server" | sed -n "$n"p)
kill -19 $pd_pid
sleep 10
check_count 2 "pd" $UP_PD
# PD would not recover when ETCD leader is stopped. So skip check_count here.
# check_count 2 "pd" $UP_PD

kill -18 $pd_pid
check_count 3 "pd" $UP_PD
Expand Down
3 changes: 2 additions & 1 deletion cdc/tests/tests.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ RUN yum install -y \
tar \
psmisc \
mysql \
python3
python3 \
lsof

RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
RUN yum install -y epel-release-latest-7.noarch.rpm
Expand Down

0 comments on commit 853a667

Please sign in to comment.