From d2852eba4c698075924c0c7260143c0736311f29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Thu, 18 Jul 2024 14:49:55 +0800
Subject: [PATCH 01/11] clog update

---
 handler/rca/scene/clog_disk_full_scene.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/handler/rca/scene/clog_disk_full_scene.py b/handler/rca/scene/clog_disk_full_scene.py
index be45fa15..6079e808 100644
--- a/handler/rca/scene/clog_disk_full_scene.py
+++ b/handler/rca/scene/clog_disk_full_scene.py
@@ -137,6 +137,7 @@ def __init__(self, context, tenant_id, ls_id, work_path, stdio, record=None):
             os.makedirs(work_path)
         self.stdio.verbose("work_path is {0}".format(self.work_path))
         self.stdio = stdio
+        self.input_parameters = context.get_variable("input_parameters") or {}

     def execute(self):
         try:
@@ -151,6 +152,9 @@ def execute(self):
             self.gather_log.grep("{0}".format(self.tenant_id))
             self.gather_log.grep("{0}".format(self.ls_id))
             self.gather_log.grep("clog checkpoint no change")
+            if self.input_parameters.get("since") is not None:
+                since = self.input_parameters.get("since")
+                self.gather_log.set_parameters("since", since)
             logs_name = self.gather_log.execute(save_path=work_path_checkpoint)
             if logs_name is None or len(logs_name) <= 0:
                 self.record.add_record("no log_disk_full about checkpoint")
@@ -192,6 +196,9 @@ def execute(self):
             self.gather_log.grep("{0}".format(self.tenant_id))
             self.gather_log.grep("{0}".format(self.ls_id))
             self.gather_log.grep("ObLSTxService::get_rec_scn")
+            if self.input_parameters.get("since") is not None:
+                since = self.input_parameters.get("since")
+                self.gather_log.set_parameters("since", since)
             logs_name = self.gather_log.execute(save_path=work_path_get_min_ckpt_type)
             check_min_ckpt_type = False
             for log_name in logs_name:
@@ -222,6 +229,9 @@ def execute(self):
             self.gather_log.grep("{0}".format(self.tenant_id))
             self.gather_log.grep("{0}".format(self.ls_id))
             self.gather_log.grep("get_min_unreplayed_log_info")
+            if self.input_parameters.get("since") is not None:
+                since = self.input_parameters.get("since")
+                self.gather_log.set_parameters("since", since)
             logs_name = self.gather_log.execute(save_path=work_path_check_replay_stack)
             check_replay_stuck = False
             for log_name in logs_name:
@@ -253,6 +263,9 @@ def execute(self):
             self.gather_log.grep("{0}".format(self.tenant_id))
             self.gather_log.grep("log_frozen_memstore_info_if_need_")
             self.gather_log.grep("[TenantFreezer] oldest frozen memtable")
+            if self.input_parameters.get("since") is not None:
+                since = self.input_parameters.get("since")
+                self.gather_log.set_parameters("since", since)
             logs_name = self.gather_log.execute(save_path=work_path_check_dump_stuck)
             check_dump_stuck = False
             for log_name in logs_name:
@@ -287,6 +300,9 @@ def execute(self):
             self.gather_log.set_parameters("scope", "observer")
             self.gather_log.grep("{0}".format(self.tenant_id))
             self.gather_log.grep("Server out of disk space")
+            if self.input_parameters.get("since") is not None:
+                since = self.input_parameters.get("since")
+                self.gather_log.set_parameters("since", since)
             logs_name = self.gather_log.execute(save_path=work_path_check_data_disk_full)
             for log_name in logs_name:
                 if check_data_disk_full:
@@ -309,6 +325,9 @@ def execute(self):
             self.gather_log.set_parameters("scope", "observer")
             self.gather_log.grep("{0}".format(self.tenant_id))
             self.gather_log.grep("Too many sstables in tablet, cannot schdule mini compaction, retry later")
+            if self.input_parameters.get("since") is not None:
+                since = self.input_parameters.get("since")
+                self.gather_log.set_parameters("since", since)
             logs_name = self.gather_log.execute(save_path=work_path_check_too_many_sstable)
             for log_name in logs_name:
                 if check_too_many_sstable:

From 1c62bb073e9bc5dc734061244d03603b9b285fab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Thu, 18 Jul 2024 19:46:45 +0800
Subject: [PATCH 02/11] build

---
 .github/workflows/build_package.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml
index d6a8be8c..a863fe32 100644
--- a/.github/workflows/build_package.yml
+++ b/.github/workflows/build_package.yml
@@ -7,6 +7,7 @@ on:
   push:
     branches:
       - master
+      - 2.3.0-qulei_tmp

 env:
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

From b4b841b05b83075b196513316762b4aa5fc382d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Fri, 19 Jul 2024 10:23:02 +0800
Subject: [PATCH 03/11] update

---
 handler/rca/scene/clog_disk_full_scene.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/handler/rca/scene/clog_disk_full_scene.py b/handler/rca/scene/clog_disk_full_scene.py
index 6079e808..772baa1f 100644
--- a/handler/rca/scene/clog_disk_full_scene.py
+++ b/handler/rca/scene/clog_disk_full_scene.py
@@ -247,7 +247,11 @@ def execute(self):
                         replay_scn = self.parse_replay_scn(line)
                         replay_scn_time = datetime.datetime.fromtimestamp(float(replay_scn) / 1000000000)
                         log_time = self.parse_log_time(line)
+                        self.record.add_record("log_time:{0}, replay_scn_time:{1}")
                         check_replay_stuck = log_time - replay_scn_time > datetime.timedelta(minutes=0.5)
+                        self.record.add_record("log_time - replay_scn_time : {0} - {1}".format(log_time, replay_scn_time))
+                        self.record.add_record("datetime.timedelta(minutes=0.5): {0}".format(datetime.timedelta(minutes=0.5)))
+                        self.record.add_record("log_time - replay_scn_time > datetime.timedelta(minutes=0.5) is {0}".format(check_replay_stuck))
                         break
                 self.record.add_record("check_replay_stuck is {0}".format(check_replay_stuck))
                 if check_replay_stuck:

From 1fe80f57c975d464f829e72d927e4fc3715925f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Fri, 19 Jul 2024 10:25:08 +0800
Subject: [PATCH 04/11] update

---
 .github/workflows/build_package.yml       |  1 -
 handler/rca/scene/clog_disk_full_scene.py | 11 ++++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml
index 9a676812..f2079d4e 100644
--- a/.github/workflows/build_package.yml
+++ b/.github/workflows/build_package.yml
@@ -7,7 +7,6 @@ on:
   push:
     branches:
       - master
-      - 2.3.0-qulei_tmp

 env:
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

diff --git a/handler/rca/scene/clog_disk_full_scene.py b/handler/rca/scene/clog_disk_full_scene.py
index 772baa1f..b6926c62 100644
--- a/handler/rca/scene/clog_disk_full_scene.py
+++ b/handler/rca/scene/clog_disk_full_scene.py
@@ -242,7 +242,7 @@ def execute(self):
                 for line in lines:
                     if check_replay_stuck:
                         break
-                    if "get_min_unreplayed_log_info" in line and self.get_stuck_mod(line).get('role_') is not None:
+                    if "get_min_unreplayed_log_info" in line and self.get_stuck_modV2(line).get('role_') is not None:
                         self.record.add_record("get min unreplayed log info is {0}".format(line))
                         replay_scn = self.parse_replay_scn(line)
                         replay_scn_time = datetime.datetime.fromtimestamp(float(replay_scn) / 1000000000)
@@ -362,6 +362,15 @@ def get_stuck_mod(self, line):
             d[i.group('key')] = i.group('value')
         return d

+    def get_stuck_modV2(self,line):
+        d = dict()
+        # service_type="TRANS_SERVICE"
+        p = '(?P<key>[\w|_]+):(?P<value>\w+)'
+        m = re.finditer(p, line)
+        for i in m:
+            d[i.group('key')] = i.group('value')
+        return d
+
     def parse_checkpoint_scn(self, line):
         p = "checkpoint_scn=\{val:(?P\d+)\},"
         p1 = "checkpoint_scn=\{val:(?P\d+),"

From b915ec79d272bcd89ac12026ddeb42e5930670e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Fri, 19 Jul 2024 10:33:19 +0800
Subject: [PATCH 05/11] update

---
 handler/rca/scene/clog_disk_full_scene.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/handler/rca/scene/clog_disk_full_scene.py b/handler/rca/scene/clog_disk_full_scene.py
index b6926c62..c4a42182 100644
--- a/handler/rca/scene/clog_disk_full_scene.py
+++ b/handler/rca/scene/clog_disk_full_scene.py
@@ -243,15 +243,22 @@ def execute(self):
                     if check_replay_stuck:
                         break
                     if "get_min_unreplayed_log_info" in line and self.get_stuck_modV2(line).get('role_') is not None:
-                        self.record.add_record("get min unreplayed log info is {0}".format(line))
+
                         replay_scn = self.parse_replay_scn(line)
                         replay_scn_time = datetime.datetime.fromtimestamp(float(replay_scn) / 1000000000)
                         log_time = self.parse_log_time(line)
-                        self.record.add_record("log_time:{0}, replay_scn_time:{1}")
                         check_replay_stuck = log_time - replay_scn_time > datetime.timedelta(minutes=0.5)
-                        self.record.add_record("log_time - replay_scn_time : {0} - {1}".format(log_time, replay_scn_time))
-                        self.record.add_record("datetime.timedelta(minutes=0.5): {0}".format(datetime.timedelta(minutes=0.5)))
-                        self.record.add_record("log_time - replay_scn_time > datetime.timedelta(minutes=0.5) is {0}".format(check_replay_stuck))
+                        if check_replay_stuck:
+                            self.record.add_record("check_replay_stuck is True. the line: {0}".format(line))
+                            self.record.add_record("get min unreplayed log info is {0}".format(line))
+                            self.record.add_record(
+                                "log_time - replay_scn_time : {0} - {1}".format(log_time, replay_scn_time))
+                            self.record.add_record(
+                                "datetime.timedelta(minutes=0.5): {0}".format(datetime.timedelta(minutes=0.5)))
+                            self.record.add_record(
+                                "log_time - replay_scn_time > datetime.timedelta(minutes=0.5) is {0}".format(
+                                    check_replay_stuck))
+                            self.record.add_record("log_time:{0}, replay_scn_time:{1}")
                         break
                 self.record.add_record("check_replay_stuck is {0}".format(check_replay_stuck))
                 if check_replay_stuck:

From 198d2d79e7e9cfcc69fec50797e1e54cad1ef820 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Mon, 22 Jul 2024 16:24:03 +0800
Subject: [PATCH 06/11] update rca clog_disk_full_scene

---
 handler/rca/scene/clog_disk_full_scene.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/handler/rca/scene/clog_disk_full_scene.py b/handler/rca/scene/clog_disk_full_scene.py
index c4a42182..679e9f04 100644
--- a/handler/rca/scene/clog_disk_full_scene.py
+++ b/handler/rca/scene/clog_disk_full_scene.py
@@ -251,14 +251,9 @@ def execute(self):
                         if check_replay_stuck:
                            self.record.add_record("check_replay_stuck is True. the line: {0}".format(line))
                            self.record.add_record("get min unreplayed log info is {0}".format(line))
-                            self.record.add_record(
-                                "log_time - replay_scn_time : {0} - {1}".format(log_time, replay_scn_time))
-                            self.record.add_record(
-                                "datetime.timedelta(minutes=0.5): {0}".format(datetime.timedelta(minutes=0.5)))
-                            self.record.add_record(
-                                "log_time - replay_scn_time > datetime.timedelta(minutes=0.5) is {0}".format(
-                                    check_replay_stuck))
-                            self.record.add_record("log_time:{0}, replay_scn_time:{1}")
+                            self.record.add_record("log_time - replay_scn_time : {0} - {1}".format(log_time, replay_scn_time))
+                            self.record.add_record("datetime.timedelta(minutes=0.5): {0}".format(datetime.timedelta(minutes=0.5)))
+                            self.record.add_record("log_time - replay_scn_time > datetime.timedelta(minutes=0.5) is {0}".format(check_replay_stuck))
                         break
                 self.record.add_record("check_replay_stuck is {0}".format(check_replay_stuck))
                 if check_replay_stuck:
@@ -369,7 +364,7 @@ def get_stuck_mod(self, line):
             d[i.group('key')] = i.group('value')
         return d

-    def get_stuck_modV2(self,line):
+    def get_stuck_modV2(self, line):
         d = dict()
         # service_type="TRANS_SERVICE"
         p = '(?P<key>[\w|_]+):(?P<value>\w+)'

From 7445f70d44eff01c2869c863facb1c471cbf9aed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Mon, 22 Jul 2024 16:43:36 +0800
Subject: [PATCH 07/11] del SsherClient SafeStdio super init func

---
 common/ssh_client/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/common/ssh_client/base.py b/common/ssh_client/base.py
index 4ac6deed..870f73b5 100644
--- a/common/ssh_client/base.py
+++ b/common/ssh_client/base.py
@@ -22,7 +22,6 @@ class SsherClient(SafeStdio):
     def __init__(self, context, node):
-        super().__init__()
         self.context = context
         if context is not None:
             self.stdio = self.context.stdio

From 7d6b470735f04e445ae6d92e1a3eeb4c186248ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Mon, 22 Jul 2024 18:01:36 +0800
Subject: [PATCH 08/11] update lock_conflict_scene

---
 handler/rca/scene/lock_conflict_scene.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/handler/rca/scene/lock_conflict_scene.py b/handler/rca/scene/lock_conflict_scene.py
index c2de51be..b5a1b5d0 100644
--- a/handler/rca/scene/lock_conflict_scene.py
+++ b/handler/rca/scene/lock_conflict_scene.py
@@ -102,6 +102,7 @@ def __execute_4_2(self):
                 audit_switch_value = cursor_check_switch.fetchone().get("value")
                 if audit_switch_value.strip().upper() == "TRUE":
                     holding_lock_sql_info_cursor = self.ob_connector.execute_sql_return_cursor_dictionary('SELECT * FROM oceanbase.gv$OB_SQL_AUDIT where SID="{0}";'.format(holding_lock_session_id))
+                    trans_record.add_record('exec sql: SELECT * FROM oceanbase.gv$OB_SQL_AUDIT where SID="{0}"; to get holding_lock_sql_info.'.format(holding_lock_session_id))
                     holding_lock_sql_info = holding_lock_sql_info_cursor.fetchall()
                     if len(holding_lock_sql_info) == 0:
                         trans_record.add_record("holding_lock_session_id: {0}; not find sql_info on gv$OB_SQL_AUDIT".format(holding_lock_session_id))

From 373c7a6a27bf70f4aebaca96219db780d8c0b6b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Tue, 23 Jul 2024 11:44:36 +0800
Subject: [PATCH 09/11] update clog_disk_full_scene

---
 handler/rca/scene/clog_disk_full_scene.py | 26 ++++++++++++-----------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/handler/rca/scene/clog_disk_full_scene.py b/handler/rca/scene/clog_disk_full_scene.py
index 679e9f04..65bb50aa 100644
--- a/handler/rca/scene/clog_disk_full_scene.py
+++ b/handler/rca/scene/clog_disk_full_scene.py
@@ -94,6 +94,7 @@ def execute(self):
         self.verbose("tenant_ls_datas is {0}".format(tenant_ls_datas))
         self.record.add_record("tenant_ls_datas is {0}".format(tenant_ls_datas))
         self.record.add_suggest("init data end. Please check the other record.")
+        self.record.add_suggest("If you want to learn more or get help, you can package the folder '{0}' and upload it to the OceanBase community forum.".format(self.work_path))
         for tenant_ls_data in tenant_ls_datas:
             record = RCA_ResultRecord(self.stdio)
             record.add_record("check error tenant_ls_data is {0}".format(tenant_ls_data))
@@ -145,7 +146,7 @@ def execute(self):
             if not os.path.exists(work_path):
                 os.makedirs(work_path)
             # __check_checkpoint
-            self.record.add_record("__check_checkpoint")
+            self.record.add_record("check_checkpoint")
             work_path_checkpoint = work_path + "/checkpoint/"
             # gather log about tenant_id, ls, "clog checkpoint no change".
             self.gather_log.set_parameters("scope", "observer")
@@ -184,8 +185,8 @@ def execute(self):
             if is_clog_checkpoint_stuck is False:
                 self.record.add_record("is_clog_checkpoint_stuck is {0}".format(is_clog_checkpoint_stuck))
                 return False
-            self.record.add_record("__check_checkpoint end")
-            self.record.add_record("__get_min_ckpt_type start")
+            self.record.add_record("check_checkpoint end")
+            self.record.add_record("get_min_ckpt_type start")
             if stuck_service_type != "" and stuck_service_type != 'TRANS_SERVICE':
                 self.record.add_record("stuck_service_type is {0}, not 'TRANS_SERVICE'. pass __get_min_ckpt_type".format(stuck_service_type))
                 pass
@@ -217,8 +218,8 @@ def execute(self):
                         self.record.add_suggest("min_checkpoint_tx_log_type is {0}. please check it.".format(min_checkpoint_tx_log_type))
                         break
             self.record.add_record("check_min_ckpt_type is {0}".format(check_min_ckpt_type))
-            self.record.add_record("__get_min_ckpt_type end")
-            self.record.add_record("__check_replay_stuck start")
+            self.record.add_record("get_min_ckpt_type end")
+            self.record.add_record("check_replay_stuck start")
             if stuck_service_type != 'TRANS_SERVICE' and stuck_service_type != 'MAX_DECIDED_SCN':
                 self.record.add_record("stuck_service_type is {0} (not TRANS_SERVICE or MAX_DECIDED_SCN). pass __check_replay_stuck. ".format(stuck_service_type))
                 pass
@@ -258,8 +259,9 @@ def execute(self):
             self.record.add_record("check_replay_stuck is {0}".format(check_replay_stuck))
             if check_replay_stuck:
                 self.record.add_record("check_replay_stuck is True. Please check replay status")
-            self.record.add_record("__check_replay_stuck end")
-            self.record.add_record("__check_dump_stuck start")
+                self.record.add_suggest("check_replay_stuck is True. Please check replay status")
+            self.record.add_record("check_replay_stuck end")
+            self.record.add_record("check_dump_stuck start")
             if stuck_service_type != 'TRANS_SERVICE':
                 self.record.add_record("stuck_service_type is {0} (not TRANS_SERVICE ). pass __check_dump_stuck.".format(stuck_service_type))
             else:
@@ -298,8 +300,8 @@ def execute(self):
             self.record.add_record("check_dump_stuck is {0}".format(check_dump_stuck))
             if check_dump_stuck:
                 self.record.add_suggest("Dump stuck, please check dump status.")
-            self.record.add_record("__check_dump_stuck end")
-            self.record.add_record("__check_data_disk_full start")
+            self.record.add_record("check_dump_stuck end")
+            self.record.add_record("check_data_disk_full start")
             check_data_disk_full = False
             work_path_check_data_disk_full = work_path + "/check_data_disk_full/"
             # gather log about tenant_id, "Server out of disk space"
@@ -323,8 +325,8 @@ def execute(self):
             self.record.add_record("check_data_disk_full is {0}".format(check_data_disk_full))
             if check_data_disk_full:
                 self.record.add_suggest("Data disk full, please check data disk usage.")
-            self.record.add_record("__check_data_disk_full end")
-            self.record.add_record("__check_too_many_sstable start")
+            self.record.add_record("check_data_disk_full end")
+            self.record.add_record("check_too_many_sstable start")
             check_too_many_sstable = False
             work_path_check_too_many_sstable = work_path + "/check_too_many_sstable/"
             # gather log about tenant_id, "Too many sstables in tablet, cannot schdule mini compaction, retry later"
@@ -348,7 +350,7 @@ def execute(self):
             self.record.add_record("check_too_many_sstable is {0}".format(check_too_many_sstable))
             if check_too_many_sstable:
                 self.record.add_suggest("Too many sstables in tablet, please check the number of sstables in the tablet.")
-            self.record.add_record("__check_too_many_sstable end")
+            self.record.add_record("check_too_many_sstable end")
             self.record.add_record("check end")
             return True
         except Exception as e:

From 23ae53aee8baeca91134bbba2a0216bcfc8fb928 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Wed, 24 Jul 2024 15:27:51 +0800
Subject: [PATCH 10/11] fix analyze_log offline

---
 handler/analyzer/analyze_log.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/handler/analyzer/analyze_log.py b/handler/analyzer/analyze_log.py
index 434211e6..3860d9fe 100644
--- a/handler/analyzer/analyze_log.py
+++ b/handler/analyzer/analyze_log.py
@@ -31,6 +31,7 @@
 from common.tool import DirectoryUtil
 from common.tool import FileUtil
 from common.tool import TimeUtils
+import common.ssh_client.local_client as ssh_client_local_client


 class AnalyzeLogHandler(BaseShellHandler):
@@ -279,13 +280,13 @@ def __pharse_offline_log_file(self, ssh_client, log_name, local_store_dir):
         :param ssh_helper, log_name
         :return:
         """
+
+        ssh_client = ssh_client_local_client.LocalClient(context=self.context, node={"ssh_type": "local"})
         local_store_path = "{0}/{1}".format(local_store_dir, str(log_name).strip(".").replace("/", "_"))
         if self.grep_args is not None:
             grep_cmd = "grep -e '{grep_args}' {log_name} >> {local_store_path} ".format(grep_args=self.grep_args, log_name=log_name, local_store_path=local_store_path)
             self.stdio.verbose("grep files, run cmd = [{0}]".format(grep_cmd))
-            ssh_client.exec_cmd(ssh_client, grep_cmd)
-        else:
-            download_file(ssh_client, log_name, local_store_path, self.stdio)
+            ssh_client.exec_cmd(grep_cmd)

     def __get_observer_ret_code(self, log_line):
         """

From 04c9947d4a4a5a549995a5e43d11569693c47212 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Wed, 24 Jul 2024 15:37:16 +0800
Subject: [PATCH 11/11] fix analyze_log offline

---
 handler/analyzer/analyze_log.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/handler/analyzer/analyze_log.py b/handler/analyzer/analyze_log.py
index 3860d9fe..d018f36f 100644
--- a/handler/analyzer/analyze_log.py
+++ b/handler/analyzer/analyze_log.py
@@ -287,6 +287,8 @@ def __pharse_offline_log_file(self, ssh_client, log_name, local_store_dir):
             grep_cmd = "grep -e '{grep_args}' {log_name} >> {local_store_path} ".format(grep_args=self.grep_args, log_name=log_name, local_store_path=local_store_path)
             self.stdio.verbose("grep files, run cmd = [{0}]".format(grep_cmd))
             ssh_client.exec_cmd(grep_cmd)
+        else:
+            download_file(ssh_client, log_name, local_store_path, self.stdio)

     def __get_observer_ret_code(self, log_line):
         """