From a12fe7473cfd500d093f033d0133ca294a5f870f Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 18 Dec 2023 18:08:57 +0100 Subject: [PATCH 1/4] add killed by timeout message --- docker/test/stateful/run.sh | 4 ++-- docker/test/stateless/run.sh | 4 ++-- docker/test/stateless/utils.lib | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index 806b57c4616a..82587efcb3d1 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -78,7 +78,7 @@ function start() tail -n1000 /var/log/clickhouse-server/clickhouse-server.log break fi - timeout 120 service clickhouse-server start + timeout_with_logging 120 service clickhouse-server start sleep 0.5 counter=$((counter + 1)) done @@ -163,7 +163,7 @@ function run_tests() } export -f run_tests -timeout "$MAX_RUN_TIME" bash -c run_tests ||: +timeout_with_logging "$MAX_RUN_TIME" bash -c run_tests ||: echo "Files in current directory" ls -la ./ diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index bfa9f9938abf..bd44d1724aee 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -216,11 +216,11 @@ export -f run_tests if [ "$NUM_TRIES" -gt "1" ]; then # We don't run tests with Ordinary database in PRs, only in master. # So run new/changed tests with Ordinary at least once in flaky check. - timeout "$MAX_RUN_TIME" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \ + timeout_with_logging "$MAX_RUN_TIME" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \ | sed 's/All tests have finished//' | sed 's/No tests were run//' ||: fi -timeout "$MAX_RUN_TIME" bash -c run_tests ||: +timeout_with_logging "$MAX_RUN_TIME" bash -c run_tests ||: echo "Files in current directory" ls -la ./ diff --git a/docker/test/stateless/utils.lib b/docker/test/stateless/utils.lib index 1204434d853c..a30e05b46ff4 100644 --- a/docker/test/stateless/utils.lib +++ b/docker/test/stateless/utils.lib @@ -35,4 +35,19 @@ function fn_exists() { declare -F "$1" > /dev/null; } +function timeout_with_logging() { + local exit_code=0 + + timeout "${@}" || exit_code="${?}" + + if [[ "${exit_code}" -eq "124" ]] + then + echo "The command 'timeout ${*}' has been killed by timeout" + fi + + return $exit_code +} + # vi: ft=bash + +} From 140f6dafd8c01b2fed08b58abdfbab400982a071 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 18 Dec 2023 19:16:50 +0100 Subject: [PATCH 2/4] hung_check check with lighter queries --- tests/clickhouse-test | 134 +++++++++++++++++++++++++++++------------- 1 file changed, 94 insertions(+), 40 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 1609d8f3c074..c1bc8c9e5599 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -32,7 +32,7 @@ from typing import Tuple, Union, Optional, Dict, Set, List import subprocess from subprocess import Popen from subprocess import PIPE -from datetime import datetime +from datetime import datetime, timedelta from time import time, sleep from errno import ESRCH @@ -278,37 +278,44 @@ def need_retry(args, stdout, stderr, total_time): msg in stderr for msg in MESSAGES_TO_RETRY ) - -def get_processlist_with_stacktraces(args): - try: - if args.replicated_database: - return clickhouse_execute( +def get_processlist_size(args): + if args.replicated_database: + return int( + clickhouse_execute( args, """ - SELECT materialize(hostName() || '::' || tcpPort()::String) as host_port, * - -- NOTE: view() here to do JOIN on shards, 
instead of initiator - FROM clusterAllReplicas('test_cluster_database_replicated', view( SELECT - p.*, - arrayStringConcat(groupArray('Thread ID ' || toString(s.thread_id) || '\n' || arrayStringConcat(arrayMap( - x -> concat(addressToLine(x), '::', demangle(addressToSymbol(x))), - s.trace), '\n') AS stacktrace - )) AS stacktraces - FROM system.processes p - JOIN system.stack_trace s USING (query_id) - WHERE query NOT LIKE '%system.processes%' - GROUP BY p.* - )) - ORDER BY elapsed DESC FORMAT Vertical - """, - settings={ - "allow_introspection_functions": 1, - }, - ) - else: - return clickhouse_execute( + count() + FROM + FROM system.processes + WHERE query NOT LIKE '%system.processes%' + )) + FORMAT Vertical + """, + ).strip() + ) + else: + return int( + clickhouse_execute( args, """ + SELECT + count() + FROM system.processes + WHERE query NOT LIKE '%system.processes%' + FORMAT Vertical + """, + ).strip() + ) + +def get_processlist_with_stacktraces(args): + if args.replicated_database: + return clickhouse_execute( + args, + """ + SELECT materialize(hostName() || '::' || tcpPort()::String) as host_port, * + -- NOTE: view() here to do JOIN on shards, instead of initiator + FROM clusterAllReplicas('test_cluster_database_replicated', view( SELECT p.*, arrayStringConcat(groupArray('Thread ID ' || toString(s.thread_id) || '\n' || arrayStringConcat(arrayMap( @@ -319,14 +326,36 @@ def get_processlist_with_stacktraces(args): JOIN system.stack_trace s USING (query_id) WHERE query NOT LIKE '%system.processes%' GROUP BY p.* - ORDER BY elapsed DESC FORMAT Vertical - """, - settings={ - "allow_introspection_functions": 1, - }, - ) - except Exception as e: - return "Failed to get processlist: " + str(e) + )) + ORDER BY elapsed DESC FORMAT Vertical + """, + settings={ + "allow_introspection_functions": 1, + }, + timeout=120, + ) + else: + return clickhouse_execute( + args, + """ + SELECT + p.*, + arrayStringConcat(groupArray('Thread ID ' || toString(s.thread_id) || '\n' || arrayStringConcat(arrayMap( + x -> concat(addressToLine(x), '::', demangle(addressToSymbol(x))), + s.trace), '\n') AS stacktrace + )) AS stacktraces + FROM system.processes p + JOIN system.stack_trace s USING (query_id) + WHERE query NOT LIKE '%system.processes%' + GROUP BY p.* + ORDER BY elapsed DESC FORMAT Vertical + """, + settings={ + "allow_introspection_functions": 1, + }, + timeout=120, + ) + def get_transactions_list(args): @@ -2420,11 +2449,36 @@ def main(args): if args.hung_check: # Some queries may execute in background for some time after test was finished. This is normal. - for _ in range(1, 60): - processlist = get_processlist_with_stacktraces(args) - if not processlist: - break - sleep(1) + print("Checking the hung queries: ", end='') + hung_count = 0 + try: + deadline = datetime.now() + timedelta(seconds=90) + while datetime.now() < deadline: + hung_count = get_processlist_size(args) + if hung_count == 0: + print(" done") + break + print(". ", end='') + except Exception as e: + print( + colored( + "\nHung check failed. 
Failed to get processlist size: " + str(e), args, "red", attrs=["bold"] + ) + ) + exit_code.value = 1 + + + processlist = "" + if hung_count > 0: + try: + processlist = get_processlist_with_stacktraces(args) + except Exception as e: + print( + colored( + "\nHung check failed, Failed to get processlist with stacktraces: " + str(e), args, "red", attrs=["bold"] + ) + ) + exit_code.value = 1 if processlist: print( From 939d602c3c83df7019d63fe4899b46a364dc26d5 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 18 Dec 2023 19:26:33 +0100 Subject: [PATCH 3/4] fix typo --- docker/test/stateless/utils.lib | 2 -- tests/clickhouse-test | 5 ++--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docker/test/stateless/utils.lib b/docker/test/stateless/utils.lib index a30e05b46ff4..9b6ab535a903 100644 --- a/docker/test/stateless/utils.lib +++ b/docker/test/stateless/utils.lib @@ -49,5 +49,3 @@ function timeout_with_logging() { } # vi: ft=bash - -} diff --git a/tests/clickhouse-test b/tests/clickhouse-test index c1bc8c9e5599..1ba67a3b2f43 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -288,8 +288,7 @@ def get_processlist_size(args): count() FROM FROM system.processes - WHERE query NOT LIKE '%system.processes%' - )) + WHERE query NOT LIKE '%system.processes%' FORMAT Vertical """, ).strip() @@ -2475,7 +2474,7 @@ def main(args): except Exception as e: print( colored( - "\nHung check failed, Failed to get processlist with stacktraces: " + str(e), args, "red", attrs=["bold"] + "\nHung check failed. Failed to get processlist with stacktraces: " + str(e), args, "red", attrs=["bold"] ) ) exit_code.value = 1 From b4fec61814f2a57aab18f5a15b9bdff4fd703848 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 18 Dec 2023 20:50:58 +0100 Subject: [PATCH 4/4] fix style and black --- docker/test/stateful/run.sh | 4 ++-- tests/clickhouse-test | 37 +++++++++++++++++++++---------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index 82587efcb3d1..806b57c4616a 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -78,7 +78,7 @@ function start() tail -n1000 /var/log/clickhouse-server/clickhouse-server.log break fi - timeout_with_logging 120 service clickhouse-server start + timeout 120 service clickhouse-server start sleep 0.5 counter=$((counter + 1)) done @@ -163,7 +163,7 @@ function run_tests() } export -f run_tests -timeout_with_logging "$MAX_RUN_TIME" bash -c run_tests ||: +timeout "$MAX_RUN_TIME" bash -c run_tests ||: echo "Files in current directory" ls -la ./ diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 1ba67a3b2f43..1df2bc8271fb 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -278,6 +278,7 @@ def need_retry(args, stdout, stderr, total_time): msg in stderr for msg in MESSAGES_TO_RETRY ) + def get_processlist_size(args): if args.replicated_database: return int( @@ -286,27 +287,26 @@ def get_processlist_size(args): """ SELECT count() - FROM + FROM FROM system.processes WHERE query NOT LIKE '%system.processes%' - FORMAT Vertical """, - ).strip() + ).strip() ) else: return int( - clickhouse_execute( + clickhouse_execute( args, """ - SELECT - count() - FROM system.processes - WHERE query NOT LIKE '%system.processes%' - FORMAT Vertical - """, + SELECT + count() + FROM system.processes + WHERE query NOT LIKE '%system.processes%' + """, ).strip() ) + def get_processlist_with_stacktraces(args): if args.replicated_database: return 
clickhouse_execute( @@ -356,7 +356,6 @@ def get_processlist_with_stacktraces(args): ) - def get_transactions_list(args): try: if args.replicated_database: @@ -2448,7 +2447,7 @@ def main(args): if args.hung_check: # Some queries may execute in background for some time after test was finished. This is normal. - print("Checking the hung queries: ", end='') + print("Checking the hung queries: ", end="") hung_count = 0 try: deadline = datetime.now() + timedelta(seconds=90) @@ -2457,16 +2456,18 @@ def main(args): if hung_count == 0: print(" done") break - print(". ", end='') + print(". ", end="") except Exception as e: print( colored( - "\nHung check failed. Failed to get processlist size: " + str(e), args, "red", attrs=["bold"] + "\nHung check failed. Failed to get processlist size: " + str(e), + args, + "red", + attrs=["bold"], ) ) exit_code.value = 1 - processlist = "" if hung_count > 0: try: @@ -2474,7 +2475,11 @@ def main(args): except Exception as e: print( colored( - "\nHung check failed. Failed to get processlist with stacktraces: " + str(e), args, "red", attrs=["bold"] + "\nHung check failed. Failed to get processlist with stacktraces: " + + str(e), + args, + "red", + attrs=["bold"], ) ) exit_code.value = 1
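
A brief, illustrative usage sketch (not part of the patch series itself): once the whole series is applied, the timeout_with_logging helper added to docker/test/stateless/utils.lib wraps GNU timeout, prints an explicit message when the wrapped command is killed (timeout exits with code 124 in that case), and propagates the original exit code to the caller. The sleep commands and the 2- and 5-second limits below are invented for the example.

    #!/usr/bin/env bash
    # Assumes the final (post-series) utils.lib, which is sourced for its
    # function definitions only.
    source docker/test/stateless/utils.lib

    # Command finishes within the limit: no extra output, exit code 0.
    timeout_with_logging 5 sleep 1

    # Command overruns the limit: GNU timeout kills it and returns 124, so the
    # helper prints "The command 'timeout 2 sleep 10' has been killed by timeout"
    # and propagates 124 to the caller.
    timeout_with_logging 2 sleep 10 || echo "wrapped command exited with $?"

This mirrors how docker/test/stateless/run.sh ends up wrapping its run_tests invocations after the series is applied.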