From 62b6b51532bc6abd229c23df2a232c988b56f2d7 Mon Sep 17 00:00:00 2001 From: David Boucher Date: Fri, 22 Nov 2024 13:30:34 +0100 Subject: [PATCH] fix(broker/sql): two issues in the mysql object * a possible segfault fixed. * an issue on errors raised by mariadb that can have errno=0 These issues are due to a change in mariadb. We can have a statement execution that fails while errno=0. REFS: MON-153670 --- .github/workflows/package-collect.yml | 2 +- broker/core/sql/src/mysql_connection.cc | 27 ++-- broker/core/sql/src/mysql_multi_insert.cc | 6 +- .../services-and-bulk-stmt.robot | 141 ++++++++++++++++++ tests/resources/Broker.py | 97 ++++++++++++ tests/resources/resources.robot | 15 +- 6 files changed, 270 insertions(+), 18 deletions(-) diff --git a/.github/workflows/package-collect.yml b/.github/workflows/package-collect.yml index 86142369cd0..112e953a22e 100644 --- a/.github/workflows/package-collect.yml +++ b/.github/workflows/package-collect.yml @@ -145,7 +145,7 @@ jobs: run: rm -rf *-debuginfo*.${{ matrix.package_extension }} # set condition to true if artifacts are needed - - if: ${{ false }} + - if: ${{ true }} name: Upload package artifacts uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 with: diff --git a/broker/core/sql/src/mysql_connection.cc b/broker/core/sql/src/mysql_connection.cc index 78aea399801..cbc4766ef27 100644 --- a/broker/core/sql/src/mysql_connection.cc +++ b/broker/core/sql/src/mysql_connection.cc @@ -17,6 +17,7 @@ */ #include +#include #include "com/centreon/broker/config/applier/init.hh" #include "com/centreon/broker/log_v2.hh" @@ -464,18 +465,26 @@ void mysql_connection::_statement(mysql_task* t) { "mysql_connection {:p}: execute statement {} attempt {}: {}", static_cast(this), task->statement_id, attempts, query); if (mysql_stmt_execute(stmt)) { - std::string err_msg( - fmt::format("{} errno={} {}", mysql_error::msg[task->error_code], - ::mysql_errno(_conn), ::mysql_stmt_error(stmt))); - SPDLOG_LOGGER_ERROR(log_v2::sql(), - "connection fail to execute statement {:p}: {}", - static_cast(this), err_msg); - if (_server_error(::mysql_stmt_errno(stmt))) { + int32_t err_code = ::mysql_stmt_errno(stmt); + std::string err_msg(fmt::format("{} errno={} {}", + mysql_error::msg[task->error_code], + err_code, ::mysql_stmt_error(stmt))); + if (err_code == 0) { + SPDLOG_LOGGER_ERROR(log_v2::sql(), + "mysql_connection: errno=0, so we simulate a " + "server error CR_SERVER_LOST"); + err_code = CR_SERVER_LOST; + } else { + SPDLOG_LOGGER_ERROR(log_v2::sql(), + "connection fail to execute statement {:p}: {}", + static_cast(this), err_msg); + } + if (_server_error(err_code)) { set_error_message(err_msg); break; } - if (mysql_stmt_errno(stmt) != 1213 && - mysql_stmt_errno(stmt) != 1205) // Dead Lock error + if (err_code != ER_LOCK_DEADLOCK && + err_code != ER_LOCK_WAIT_TIMEOUT) // Dead Lock error attempts = MAX_ATTEMPTS; if (mysql_commit(_conn)) { diff --git a/broker/core/sql/src/mysql_multi_insert.cc b/broker/core/sql/src/mysql_multi_insert.cc index cafc020e386..7d375cb82cd 100644 --- a/broker/core/sql/src/mysql_multi_insert.cc +++ b/broker/core/sql/src/mysql_multi_insert.cc @@ -132,7 +132,11 @@ void bulk_or_multi::execute(mysql& connexion, my_error::code ec, int thread_id) { if (_bulk_stmt) { - if (!_bulk_bind->empty()) { + /* If the database connection is lost, we can have this issue */ + if (!_bulk_bind) { + _bulk_bind = _bulk_stmt->create_bind(); + _bulk_bind->reserve(_bulk_row); + } else if (!_bulk_bind->empty()) { _bulk_stmt->set_bind(std::move(_bulk_bind)); connexion.run_statement(*_bulk_stmt, ec, thread_id); _bulk_bind = _bulk_stmt->create_bind(); diff --git a/tests/broker-engine/services-and-bulk-stmt.robot b/tests/broker-engine/services-and-bulk-stmt.robot index e3d2fc38675..2d03b5d74e5 100644 --- a/tests/broker-engine/services-and-bulk-stmt.robot +++ b/tests/broker-engine/services-and-bulk-stmt.robot @@ -63,6 +63,7 @@ EBBPS1 IF "${output}" == "((0,),)" BREAK END Should Be Equal As Strings ${output} ((0,),) + Disconnect From Database FOR ${i} IN RANGE ${1000} Ctn Process Service Check Result host_1 service_${i+1} 2 warning${i} @@ -100,6 +101,7 @@ EBBPS1 IF "${output}" == "((0,),)" BREAK END Should Be Equal As Strings ${output} ((0,),) + Disconnect From Database EBBPS2 [Documentation] 1000 service check results are sent to the poller. The test is done with the unified_sql stream, no service status is lost, we find the 1000 results in the database: table services. @@ -146,6 +148,7 @@ EBBPS2 IF "${output}" == "((0,),)" BREAK END Should Be Equal As Strings ${output} ((0,),) + Disconnect From Database FOR ${i} IN RANGE ${1000} Ctn Process Service Check Result host_1 service_${i+1} 2 critical${i} @@ -182,6 +185,7 @@ EBBPS2 IF "${output}" == "((0,),)" BREAK END Should Be Equal As Strings ${output} ((0,),) + Disconnect From Database EBMSSM [Documentation] 1000 services are configured with 100 metrics each. The rrd output is removed from the broker configuration. GetSqlManagerStats is called to measure writes into data_bin. @@ -228,6 +232,7 @@ EBMSSM Sleep 1s END Should Be True ${output[0][0]} >= 100000 + Disconnect From Database EBPS2 [Documentation] 1000 services are configured with 20 metrics each. The rrd output is removed from the broker configuration to avoid to write too many rrd files. While metrics are written in bulk, the database is stopped. This must not crash broker. @@ -390,6 +395,142 @@ metric_mapping ${grep_res} Grep File /tmp/test.log name: metric1 corresponds to metric id Should Not Be Empty ${grep_res} metric name "metric1" not found +EBMSSMDBD + [Documentation] 1000 services are configured with 100 metrics each. + ... The rrd output is removed from the broker configuration. + ... While metrics are written in the database, we stop the database and then restart it. + ... Broker must recover its connection to the database and continue to write metrics. + [Tags] broker engine unified_sql MON-153320 + Ctn Clear Metrics + Ctn Config Engine ${1} ${1} ${1000} + # We want all the services to be passive to avoid parasite checks during our test. + Ctn Set Services Passive ${0} service_.* + Ctn Config Broker central + Ctn Config Broker rrd + Ctn Config Broker module ${1} + Ctn Config BBDO3 1 + Ctn Broker Config Log central core error + Ctn Broker Config Log central tcp error + Ctn Broker Config Log central sql debug + Ctn Config Broker Sql Output central unified_sql + Ctn Config Broker Remove Rrd Output central + Ctn Clear Retention + ${start} Get Current Date + Ctn Start Broker + Ctn Start Engine + + Ctn Wait For Engine To Be Ready ${start} 1 + + ${start} Ctn Get Round Current Date + # Let's wait for one "INSERT INTO data_bin" to appear in stats. + Log To Console Many service checks with 100 metrics each are processed. + FOR ${i} IN RANGE ${1000} + Ctn Process Service Check Result With Metrics host_1 service_${i+1} 1 warning${i} 100 + END + + Log To Console We wait for at least one metric to be written in the database. + # Let's wait for all force checks to be in the storage database. + Connect To Database pymysql ${DBName} ${DBUser} ${DBPass} ${DBHost} ${DBPort} + FOR ${i} IN RANGE ${500} + ${output} Query + ... SELECT COUNT(s.last_check) FROM metrics m LEFT JOIN index_data i ON m.index_id = i.id LEFT JOIN services s ON s.host_id = i.host_id AND s.service_id = i.service_id WHERE metric_name LIKE "metric_%%" AND s.last_check >= ${start} + IF ${output[0][0]} >= 1 BREAK + Sleep 1s + END + Disconnect From Database + + Log To Console Let's start some database manipulation... + ${start} Get Current Date + + FOR ${i} IN RANGE ${3} + Ctn Stop Mysql + Sleep 10s + Ctn Start Mysql + ${content} Create List could not insert data in data_bin + ${result} Ctn Find In Log With Timeout ${centralLog} ${start} ${content} 10 + Log To Console ${result} + END + +EBMSSMPART + [Documentation] 1000 services are configured with 100 metrics each. + ... The rrd output is removed from the broker configuration. + ... The data_bin table is configured with two partitions p1 and p2 such + ... that p1 contains old data and p2 contains current data. + ... While metrics are written in the database, we remove the p2 partition. + ... Once the p2 partition is recreated, broker must recover its connection + ... to the database and continue to write metrics. + ... To check that last point, we force a last service check and we check + ... that its metrics are written in the database. + [Tags] broker engine unified_sql MON-153320 + Ctn Clear Metrics + Ctn Config Engine ${1} ${1} ${1000} + # We want all the services to be passive to avoid parasite checks during our test. + Ctn Set Services Passive ${0} service_.* + Ctn Config Broker central + Ctn Config Broker rrd + Ctn Config Broker module ${1} + Ctn Config BBDO3 1 + Ctn Broker Config Log central core error + Ctn Broker Config Log central tcp error + Ctn Broker Config Log central sql trace + Ctn Config Broker Sql Output central unified_sql + Ctn Config Broker Remove Rrd Output central + Ctn Clear Retention + + Ctn Prepare Partitions For Data Bin + ${start} Get Current Date + Ctn Start Broker + Ctn Start Engine + + Ctn Wait For Engine To Be Ready ${start} 1 + + ${start} Ctn Get Round Current Date + # Let's wait for one "INSERT INTO data_bin" to appear in stats. + Log To Console Many service checks with 100 metrics each are processed. + FOR ${i} IN RANGE ${1000} + Ctn Process Service Check Result With Metrics host_1 service_${i+1} 1 warning${i} 100 + END + + Log To Console We wait for at least one metric to be written in the database. + # Let's wait for all force checks to be in the storage database. + Connect To Database pymysql ${DBName} ${DBUser} ${DBPass} ${DBHost} ${DBPort} + FOR ${i} IN RANGE ${500} + ${output} Query + ... SELECT COUNT(s.last_check) FROM metrics m LEFT JOIN index_data i ON m.index_id = i.id LEFT JOIN services s ON s.host_id = i.host_id AND s.service_id = i.service_id WHERE metric_name LIKE "metric_%%" AND s.last_check >= ${start} + IF ${output[0][0]} >= 1 BREAK + Sleep 1s + END + Disconnect From Database + + Log To Console Let's start some database manipulation... + Ctn Remove P2 From Data Bin + ${start} Get Current Date + + ${content} Create List errno= + FOR ${i} IN RANGE ${6} + ${result} Ctn Find In Log With Timeout ${centralLog} ${start} ${content} 10 + IF ${result} BREAK + END + + Log To Console Let's recreate the p2 partition... + Ctn Add P2 To Data Bin + + ${start} Ctn Get Round Current Date + Ctn Process Service Check Result With Metrics host_1 service_1 0 Last Output OK 100 + + Log To Console Let's wait for the last service check to be in the database... + Connect To Database pymysql ${DBName} ${DBUser} ${DBPass} ${DBHost} ${DBPort} + FOR ${i} IN RANGE ${120} + ${output} Query SELECT count(*) FROM data_bin WHERE ctime >= ${start} - 10 + Log To Console ${output} + IF ${output[0][0]} >= 100 BREAK + Sleep 1s + END + Log To Console ${output} + Should Be True ${output[0][0]} >= 100 + Disconnect From Database + + *** Keywords *** Ctn Test Clean Ctn Stop Engine diff --git a/tests/resources/Broker.py b/tests/resources/Broker.py index 92b7ea6e11b..36ad5c36249 100755 --- a/tests/resources/Broker.py +++ b/tests/resources/Broker.py @@ -2734,3 +2734,100 @@ def ctn_broker_get_ba(port: int, ba_id: int, output_file: str, timeout=TIMEOUT): except: logger.console("gRPC server not ready") return res + + +def ctn_prepare_partitions_for_data_bin(): + """ + Create two partitions for the data_bin table. + The first one named p1 contains data with ctime older than now - 60. + The second one named p2 contains data with ctime older than now + 3600. + """ + connection = pymysql.connect(host=DB_HOST, + user=DB_USER, + password=DB_PASS, + database=DB_NAME_STORAGE, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + + now = int(time.time()) + before = now - 60 + after = now + 3600 + with connection: + with connection.cursor() as cursor: + cursor.execute("DROP TABLE IF EXISTS data_bin") + sql = f"""CREATE TABLE `data_bin` ( + `id_metric` int(11) DEFAULT NULL, + `ctime` int(11) DEFAULT NULL, + `value` float DEFAULT NULL, + `status` enum('0','1','2','3','4') DEFAULT NULL, + KEY `index_metric` (`id_metric`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 + PARTITION BY RANGE (`ctime`) +(PARTITION `p1` VALUES LESS THAN ({before}) ENGINE = InnoDB, + PARTITION `p2` VALUES LESS THAN ({after}) ENGINE = InnoDB)""" + cursor.execute(sql) + connection.commit() + + +def ctn_remove_p2_from_data_bin(): + """ + Remove the partition p2 from the data_bin table. + """ + connection = pymysql.connect(host=DB_HOST, + user=DB_USER, + password=DB_PASS, + database=DB_NAME_STORAGE, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + + with connection: + with connection.cursor() as cursor: + cursor.execute("ALTER TABLE data_bin DROP PARTITION p2") + connection.commit() + + +def ctn_add_p2_to_data_bin(): + """ + Add the partition p2 the the data_bin table. + """ + connection = pymysql.connect(host=DB_HOST, + user=DB_USER, + password=DB_PASS, + database=DB_NAME_STORAGE, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + + after = int(time.time()) + 3600 + with connection: + with connection.cursor() as cursor: + cursor.execute( + f"ALTER TABLE data_bin ADD PARTITION (PARTITION p2 VALUES LESS THAN ({after}))") + connection.commit() + + +def ctn_init_data_bin_without_partition(): + """ + Recreate the data_bin table without partition. + """ + connection = pymysql.connect(host=DB_HOST, + user=DB_USER, + password=DB_PASS, + database=DB_NAME_STORAGE, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor) + + now = int(time.time()) + before = now - 60 + after = now + 3600 + with connection: + with connection.cursor() as cursor: + cursor.execute("DROP TABLE IF EXISTS data_bin") + sql = f"""CREATE TABLE `data_bin` ( + `id_metric` int(11) DEFAULT NULL, + `ctime` int(11) DEFAULT NULL, + `value` float DEFAULT NULL, + `status` enum('0','1','2','3','4') DEFAULT NULL, + KEY `index_metric` (`id_metric`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1""" + cursor.execute(sql) + connection.commit() diff --git a/tests/resources/resources.robot b/tests/resources/resources.robot index 259febc2774..1a2f771afaf 100644 --- a/tests/resources/resources.robot +++ b/tests/resources/resources.robot @@ -363,13 +363,14 @@ Ctn Dump Ba On Error Ctn Process Service Result Hard [Arguments] ${host} ${svc} ${state} ${output} - Repeat Keyword - ... 3 times - ... Ctn Process Service Check Result - ... ${host} - ... ${svc} - ... ${state} - ... ${output} + FOR ${idx} IN RANGE 3 + Ctn Process Service Check Result + ... ${host} + ... ${svc} + ... ${state} + ... ${output} + Sleep 1s + END Ctn Wait For Engine To Be Ready [Arguments] ${start} ${nbEngine}=1