From 2f79808724147f375aff6513842ad91cd324bb09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Fri, 25 Oct 2024 15:11:53 +0800
Subject: [PATCH 01/32] update version 3.0.0

---
 rpm/build.sh                       | 2 +-
 rpm/oceanbase-diagnostic-tool.spec | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rpm/build.sh b/rpm/build.sh
index 726cfb3f..0cbd886d 100755
--- a/rpm/build.sh
+++ b/rpm/build.sh
@@ -2,7 +2,7 @@
 python_bin='python'
 W_DIR=`pwd`
-VERSION=${VERSION:-'2.5.0'}
+VERSION=${VERSION:-'3.0.0'}
 
 
 function python_version()
diff --git a/rpm/oceanbase-diagnostic-tool.spec b/rpm/oceanbase-diagnostic-tool.spec
index 2dda99bf..8f53317d 100644
--- a/rpm/oceanbase-diagnostic-tool.spec
+++ b/rpm/oceanbase-diagnostic-tool.spec
@@ -1,5 +1,5 @@
 Name: oceanbase-diagnostic-tool
-Version:2.5.0
+Version:3.0.0
 Release: %(echo $RELEASE)%{?dist}
 Summary: oceanbase diagnostic tool program
 Group: Development/Tools

From 614e6a545b517159c6e7fd74de9cebbb5c5675e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Wed, 6 Nov 2024 15:14:01 +0800
Subject: [PATCH 02/32] obdiag update 3.0.0 version

---
 rpm/build.sh                       | 2 +-
 rpm/oceanbase-diagnostic-tool.spec | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rpm/build.sh b/rpm/build.sh
index 726cfb3f..5d80a834 100755
--- a/rpm/build.sh
+++ b/rpm/build.sh
@@ -2,7 +2,7 @@
 python_bin='python'
 W_DIR=`pwd`
-VERSION=${VERSION:-'2.5.0'}
+VERSION=${VERSION:-'2.6.0'}
 
 
 function python_version()
diff --git a/rpm/oceanbase-diagnostic-tool.spec b/rpm/oceanbase-diagnostic-tool.spec
index 2dda99bf..9fa93dd5 100644
--- a/rpm/oceanbase-diagnostic-tool.spec
+++ b/rpm/oceanbase-diagnostic-tool.spec
@@ -1,5 +1,5 @@
 Name: oceanbase-diagnostic-tool
-Version:2.5.0
+Version:2.6.0
 Release: %(echo $RELEASE)%{?dist}
 Summary: oceanbase diagnostic tool program
 Group: Development/Tools

From f7f09e89145a89f227689f99478863d922bfdfa2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Wed, 6 Nov 2024 15:33:57 +0800
Subject: [PATCH 03/32] obdiag update 3.0.0 version

---
 rpm/build.sh                       | 2 +-
 rpm/oceanbase-diagnostic-tool.spec | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rpm/build.sh b/rpm/build.sh
index 5d80a834..0cbd886d 100755
--- a/rpm/build.sh
+++ b/rpm/build.sh
@@ -2,7 +2,7 @@
 python_bin='python'
 W_DIR=`pwd`
-VERSION=${VERSION:-'2.6.0'}
+VERSION=${VERSION:-'3.0.0'}
 
 
 function python_version()
diff --git a/rpm/oceanbase-diagnostic-tool.spec b/rpm/oceanbase-diagnostic-tool.spec
index 9fa93dd5..8f53317d 100644
--- a/rpm/oceanbase-diagnostic-tool.spec
+++ b/rpm/oceanbase-diagnostic-tool.spec
@@ -1,5 +1,5 @@
 Name: oceanbase-diagnostic-tool
-Version:2.6.0
+Version:3.0.0
 Release: %(echo $RELEASE)%{?dist}
 Summary: oceanbase diagnostic tool program
 Group: Development/Tools

From b27dea7de719531d6c9af0d56e349539e88b3c49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Fri, 8 Nov 2024 10:30:45 +0800
Subject: [PATCH 04/32] gather log merge

---
 handler/gather/gather_component_log.py | 373 +++++++++++++++++++++++++
 1 file changed, 373 insertions(+)
 create mode 100644 handler/gather/gather_component_log.py

diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py
new file mode 100644
index 00000000..20759e5c
--- /dev/null
+++ b/handler/gather/gather_component_log.py
@@ -0,0 +1,373 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+# Copyright (c) 2022 OceanBase
+# OceanBase Diagnostic Tool is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +import os +import threading +import uuid + +from common.command import get_file_start_time, get_file_size +from common.constant import const +from common.ssh_client.ssh import SshClient +from common.tool import FileUtil, TimeUtils +from handler.base_shell_handler import BaseShellHandler +from handler.gather.plugins.redact import Redact +from result_type import ObdiagResult + + +class GatherComponentLogHandler(BaseShellHandler): + + # log_scope_list + log_scope_list = {"observer": ["observer", "rootservice", "election"], "obproxy": ["obproxy", "obproxy_digest", "obproxy_stat", "obproxy_slow", "obproxy_limit"], "oms": ["connector", "error"]} + + def __init__(self, *args, **kwargs): + super().__init__() + self.redact_dir = None + self.gather_log_conf_dict = None + self.thread_nums = None + self.oms_log_path = None + self.is_scene = None + self.inner_config = None + self.stdio = None + self.context = None + self.target = None + self.target_dir = None + self.from_option = None + self.to_option = None + self.since_option = None + self.scope = None + self.grep = None + self.encrypt = None + self.store_dir = None + self.temp_dir = None + self.redact = None + self.nodes = None + + def init(self, context, *args, **kwargs): + try: + self.context = context + self.stdio = self.context.stdio + self.inner_config = self.context.inner_config + self.target = kwargs.get('target', None) + self.target_dir = kwargs.get('target_dir', None) + self.from_option = kwargs.get('from', None) + self.to_option = kwargs.get('to', None) + self.since_option = kwargs.get('since', None) + self.scope = kwargs.get('scope', None) + self.grep = kwargs.get('grep', None) + self.encrypt = kwargs.get('encrypt', None) + self.store_dir = kwargs.get('store_dir', None) + self.temp_dir = kwargs.get('temp_dir', None) + self.redact = kwargs.get('redact', None) + self.nodes = kwargs.get('nodes', None) + self.is_scene = kwargs.get('is_scene', False) + self.oms_log_path = kwargs.get('oms_log_path', None) + self.thread_nums = kwargs.get('thread_nums', 3) + self._check_option() + # build config dict for gather log on node + self.gather_log_conf_dict = {"target": self.target, "tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT} + except Exception as e: + self.stdio.error("init GatherComponentLogHandler failed, error: {0}".format(str(e))) + return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, "init GatherComponentLogHandler failed, error: {0}".format(str(e))) + + def _check_option(self): + # target check + if self.target is None or self.target == "": + self.target = 'observer' + else: + self.target = self.target.lower().strip() + if not isinstance(self.target, str): + raise Exception("target option can only be string") + self.target = self.target.lower().strip() + if self.target != 'observer' and self.target != 'obproxy' and self.target != 'oms': + raise Exception("target option can only be observer or obproxy or oms") + + # target_dir check + if self.target_dir is None or self.target_dir == "": + self.target_dir = 'obdiag_gather_{0}'.format(self.target) + else: + self.target_dir = self.target_dir.strip() + if not 
isinstance(self.target, str): + raise Exception("target option can only be string") + + # check nodes + if self.nodes is None or len(self.nodes) == 0: + # if self.nodes not input, use default nodes by self.target + if self.target == 'observer': + self.nodes = self.context.cluster_config.get("servers") + elif self.target == 'obproxy': + self.nodes = self.context.obproxy_config.get("servers") + elif self.target == 'oms': + self.nodes = self.context.oms_config.get("servers") + else: + raise Exception("can not get nodes by target: {0}".format(self.target)) + if len(self.nodes) == 0: + raise Exception("can not get nodes by target: {0}, nodes's len is 0.".format(self.target)) + # build ssh_client for every node + new_nodes = [] + for node in self.nodes: + new_node = node + ssh_client = SshClient(self.context, node) + new_node["ssh_client"] = ssh_client + new_nodes.append(new_node) + self.nodes = new_nodes + # check scope + if self.scope is None or self.scope == "" or self.scope == "all": + self.scope = "all" + self.scope = self.log_scope_list[self.target] + else: + self.scope = self.scope.strip() + if self.scope not in self.log_scope_list[self.target]: + raise Exception("scope option can only be {0},the {1} just support {2}".format(log_scope, self.target, log_scope)) + + # check inner_config + if self.inner_config is None: + self.file_number_limit = 20 + self.file_size_limit = 2 * 1024 * 1024 * 1024 + else: + basic_config = self.inner_config['obdiag']['basic'] + self.file_number_limit = int(basic_config["file_number_limit"]) + self.file_size_limit = int(FileUtil.size(basic_config["file_size_limit"])) + self.config_path = basic_config['config_path'] + + # check thread_nums + if self.thread_nums is None or not isinstance(self.thread_nums, int) or self.thread_nums <= 0: + self.thread_nums = int(self.context.inner_config.get("obdiag", {}).get("gather", {}).get("thread_nums") or 3) + + def handler(self): + try: + # run on every node + def run_on_node(context, conf_dict, node, pool_sema): + with pool_sema: + try: + task = GatherLogOnNode(context, node, conf_dict, pool_sema) + task.handle() + except Exception as e: + self.stdio.exception(e) + self.stdio.error("gather log failed: {0}".format(str(e))) + return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="gather log failed: {0}".format(str(e))) + + self.stdio.start_loading("gather start") + pool_sema = threading.BoundedSemaphore(value=self.thread_nums) + node_threads = [] + for node in self.nodes: + next_context = self.context + next_context.stdio = self.stdio.sub_io() + node_thread = threading.Thread( + target=run_on_node, + args=(next_context, self.gather_log_conf_dict, node, pool_sema), + ) + node_thread.start() + node_threads.append(node_threads) + for node_thread in node_threads: + node_thread.join() + self.stdio.stop_loading("gather successes") + last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(pack_dir_this_command, "result_summary.txt")) + self.stdio.print(last_info) + try: + if self.redact and len(self.redact) > 0: + self.stdio.start_loading("gather redact start") + self.stdio.verbose("redact_option is {0}".format(self.redact)) + redact_dir = "{0}_redact".format(self.store_dir) + self.redact_dir = redact_dir + redact = Redact(self.context, self.store_dir, redact_dir, zip_password=self.zip_password) + redact.redact_files(self.redact) + self.stdio.print("redact success the log save on {0}".format(self.redact_dir)) + self.stdio.stop_loading("gather redact successes") + return 
ObdiagResult(ObdiagResult.SUCCESS_CODE, data={"store_dir": redact_dir, "redact_dir": self.redact_dir}) + except Exception as e: + self.stdio.error("redact failed {0}".format(e)) + return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="redact failed {0}".format(e)) + return ObdiagResult(ObdiagResult.SUCCESS_CODE, data={"store_dir": self.store_dir}) + except Exception as e: + self.stdio.exception(e) + self.stdio.error("gather log failed: {0}".format(str(e))) + return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="gather log failed: {0}".format(str(e))) + + +# if target need add, you should check def about *_by_target +class GatherLogOnNode: + def __init__(self, context, node, config, pool_sema): + self.ssh_client = node["ssh_client"] + self.context = context + self.stdio = context.stdio + self.config = config + self.node = node + self.pool_sema = pool_sema + self.target = self.config.get("target") + + # mkdir tmp_dir + self.tmp_dir = self.config.get("tmp_dir") + self.tmp_dir = os.path.join(self.tmp_dir, "obdiag_gather_{0}".format(str(uuid.uuid4()))) + self.ssh_client.exec_cmd("mkdir -p {0}".format(self.tmp_dir)) + + self.scope = self.config.get("scope") + # todo log_path for oms + if self.target == "oms": + self.log_path = os.path.join( + node.get("home_path"), + ) + else: + self.log_path = os.path.join(node.get("home_path"), "log") + + self.from_time_str = self.config.get("from_time") + self.to_time_str = self.config.get("to_time") + self.grep_option = self.config.get("grep_option") + self.store_dir = self.config.get("store_dir") + # + self.file_size_limit = self.config.get("file_size_limit") + + def __find_logs_name(self): + try: + logs_scope = "" + for scope in self.scope: + if logs_scope == "": + logs_scope = scope + continue + logs_scope = logs_scope + "|" + scope + self.stdio.verbose("gather_log_on_node {0} find logs scope: {1}".format(self.ssh_client.get_ip(), logs_scope)) + find_cmd = "ls -1 -F {0} |grep -E '{1}'| awk -F '/' ".format(self.log_path, logs_scope) + "'{print $NF}'" + self.stdio.verbose("gather_log_on_node {0} find logs cmd: {1}".format(self.ssh_client.get_ip(), find_cmd)) + logs_name = self.ssh_client.exec_cmd(find_cmd) + return logs_name + except Exception as e: + raise Exception("gather_log_on_node {0} find logs failed: {1}".format(self.ssh_client.get_ip(), str(e))) + + def handle(self): + + from_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.from_time_str)) + to_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.to_time_str)) + + tmp_log_dir = os.path.join(self.tmp_dir, "ob_log_{0}_{1}_{2}_{3}".format(self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, uuid.uuid4())) + # mkdir tmp_log_dir + self.ssh_client.exec_cmd("mkdir -p {0}".format(tmp_log_dir)) + self.stdio.verbose("gather_log_on_node {0} tmp_log_dir: {1}".format(self.ssh_client.get_ip(), tmp_log_dir)) + try: + # find logs + logs_name = self.__find_logs_name() + if logs_name is None or len(logs_name) == 0: + self.stdio.warn("gather_log_on_node {0} failed: no log found".format(self.ssh_client.get_ip())) + return + # gather log to remote tmp_dir + self.__grep_log_to_tmp(logs_name, tmp_log_dir) + # build tar file + tar_file = os.path.join(self.tmp_dir, "{0}.tar.gz".format(tmp_log_dir)) + tar_cmd = "cd {0} && tar -czf {1}.tar.gz {1}/*".format(self.tmp_dir, tmp_log_dir) + self.stdio.verbose("gather_log_on_node {0} tar_cmd: {1}".format(self.ssh_client.get_ip(), tar_cmd)) + 
self.ssh_client.exec_cmd(tar_cmd) + + # download log to local store_dir + if int(get_file_size(self.ssh_client, tar_file)) > self.file_size_limit: + self.stdio.error("gather_log_on_node {0} failed: File too large over gather.file_size_limit".format(self.ssh_client.get_ip())) + raise Exception("gather_log_on_node {0} failed: File too large over gather.file_size_limit".format(self.ssh_client.get_ip())) + else: + self.stdio.verbose("gather_log_on_node {0} download log to local store_dir: {1}".format(self.ssh_client.get_ip(), self.store_dir)) + self.ssh_client.download(tar_file, self.store_dir) + except Exception as e: + self.stdio.error("gather_log_on_node {0} failed: {1}".format(self.ssh_client.get_ip(), str(e))) + raise Exception("gather_log_on_node {0} failed: {1}".format(self.ssh_client.get_ip(), str(e))) + finally: + self.ssh_client.exec_cmd("rm -rf {0}".format(tmp_log_dir)) + self.stdio.verbose("gather_log_on_node {0} finished".format(self.ssh_client.get_ip())) + + def __grep_log_to_tmp(self, logs_name, tmp_log_dir): + grep_cmd = "" + if self.grep_option: + self.stdio.verbose("grep files, grep_option = [{0}]".format(self.grep_option)) + for grep_option in self.grep_option: + if grep_cmd == "": + grep_cmd = "grep -e '{0}' ".format(grep_option) + grep_cmd += "| grep -e '{0}'".format(grep_option) + for log_name in logs_name: + source_log_name = "{0}/{1}".format(self.log_path, log_name) + target_log_name = "{0}/{1}".format(tmp_log_dir, log_name) + self.stdio.verbose("grep files, source_log_name = [{0}], target_log_name = [{1}]".format(source_log_name, target_log_name)) + if log_name.endswith(".gz"): + log_grep_cmd = "cp {0} {1}".format(source_log_name, target_log_name) + self.stdio.verbose("grep files, run cmd = [{0}]".format(log_grep_cmd)) + self.ssh_client.exec_cmd(log_grep_cmd) + continue + log_grep_cmd = "" + if grep_cmd == "": + log_grep_cmd = "cp {0} {1}".format(source_log_name, target_log_name) + else: + log_grep_cmd = grep_cmd + " {0}".format(source_log_name) + log_grep_cmd += " > {0} ".format(target_log_name) + self.stdio.verbose("grep files, run cmd = [{0}]".format(log_grep_cmd)) + self.ssh_client.exec_cmd(log_grep_cmd) + + def __find_logs_name(self): + try: + logs_scope = "" + for scope in self.scope: + if logs_scope == "": + logs_scope = scope + continue + logs_scope = logs_scope + "|" + scope + self.stdio.verbose("gather_log_on_node {0} find logs scope: {1}".format(self.ssh_client.get_ip(), logs_scope)) + + find_cmd = "ls -1 -F {0} |grep -E '{1}'| awk -F '/' ".format(self.log_path, logs_scope) + "'{print $NF}'" + self.stdio.verbose("gather_log_on_node {0} find logs cmd: {1}".format(self.ssh_client.get_ip(), find_cmd)) + logs_name = self.ssh_client.exec_cmd(find_cmd) + if logs_name is not None and len(logs_name) != 0: + log_name_list = self.__get_logfile_name_list(self.from_time_str, self.to_time_str, self.log_path, logs_name) + else: + self.stdio.warn("gather_log_on_node {0} failed: no log found".format(self.ssh_client.get_ip())) + return logs_name + except Exception as e: + raise Exception("gather_log_on_node {0} find logs failed: {1}".format(self.ssh_client.get_ip(), str(e))) + + def __get_logfile_name_list(self, from_time_str, to_time_str, log_dir, log_files): + # TODO oms get all log file name list + if self.target == "oms": + return log_files + self.stdio.verbose("get log file name list, from time {0}, to time {1}, log dir {2}, log files {3}".format(from_time_str, to_time_str, log_dir, log_files)) + log_name_list = [] + last_file_dict = {"prefix_file_name": "", 
"file_name": "", "file_end_time": ""} + for file_name in log_files.split('\n'): + if file_name == "": + self.stdio.verbose("existing file name is empty") + continue + if not file_name.endswith("log") and not file_name.endswith("wf"): + file_start_time_str = "" + prefix_name = file_name[:-14] if len(file_name) > 24 else "" + file_end_time_str = TimeUtils.filename_time_to_datetime(TimeUtils.extract_filename_time_from_log_name(file_name, self.stdio), self.stdio) + if last_file_dict["prefix_file_name"] != "" and last_file_dict["prefix_file_name"] == prefix_name: + file_start_time_str = last_file_dict["file_end_time"] + elif last_file_dict["prefix_file_name"] != "" and last_file_dict["prefix_file_name"] != prefix_name: + file_start_time_str = "" + file_end_time_str = "" + elif last_file_dict["prefix_file_name"] == "": + file_start_time_str = get_file_start_time(self.ssh_client, file_name, log_dir, self.stdio) + # When two time intervals overlap, need to add the file + if (file_end_time_str != "") and (file_start_time_str != "") and (file_start_time_str <= to_time_str) and (file_end_time_str >= from_time_str): + log_name_list.append(file_name) + last_file_dict = {"prefix_file_name": prefix_name, "file_name": file_name, "file_end_time": file_end_time_str} + elif file_name.endswith("log") or file_name.endswith("wf"): + # Get the first and last lines of text of the file. Here, use a command + get_first_line_cmd = "head -n 1 {0}/{1} && tail -n 1 {0}/{1}".format(log_dir, file_name) + first_and_last_line_text = self.ssh_client.exec_cmd(get_first_line_cmd) + # Split the first and last lines of text + first_and_last_line_text_list = str(first_and_last_line_text).splitlines() + if len(first_and_last_line_text_list) >= 2: + first_line_text = first_and_last_line_text_list[0] + last_line_text = first_and_last_line_text_list[-1] + # Time to parse the first and last lines of text + file_start_time_str = TimeUtils.extract_time_from_log_file_text(first_line_text, self.stdio) + file_end_time = TimeUtils.extract_time_from_log_file_text(last_line_text, self.stdio) + self.stdio.verbose("The log file {0} starts at {1} ends at {2}".format(file_name, file_start_time_str, file_end_time)) + self.stdio.verbose("to_time_str {0} from_time_str {1}".format(to_time_str, from_time_str)) + if (file_start_time_str <= to_time_str) and (file_end_time >= from_time_str): + log_name_list.append(file_name) + if len(log_name_list) > 0: + self.stdio.verbose("Find the qualified log file {0} on Server [{1}], " "wait for the next step".format(log_name_list, self.ssh_client.get_ip())) + else: + self.stdio.warn("No found the qualified log file on Server [{0}]".format(self.ssh_client.get_name())) + return log_name_list From ed2455054dc98ea5cad5865b6ca86388900318f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Fri, 8 Nov 2024 15:39:44 +0800 Subject: [PATCH 05/32] gather log merge --- handler/gather/gather_component_log.py | 142 ++++++++++++++++++------- 1 file changed, 102 insertions(+), 40 deletions(-) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index 20759e5c..d593f002 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -9,26 +9,29 @@ # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. # See the Mulan PSL v2 for more details. 
+import datetime import os import threading import uuid -from common.command import get_file_start_time, get_file_size +from prettytable import PrettyTable +from common.command import get_file_start_time, get_file_size, is_empty_dir from common.constant import const from common.ssh_client.ssh import SshClient -from common.tool import FileUtil, TimeUtils +from common.tool import FileUtil, TimeUtils, Util from handler.base_shell_handler import BaseShellHandler from handler.gather.plugins.redact import Redact from result_type import ObdiagResult class GatherComponentLogHandler(BaseShellHandler): - # log_scope_list log_scope_list = {"observer": ["observer", "rootservice", "election"], "obproxy": ["obproxy", "obproxy_digest", "obproxy_stat", "obproxy_slow", "obproxy_limit"], "oms": ["connector", "error"]} def __init__(self, *args, **kwargs): super().__init__() + self.oms_module_id = None + self.zip_encrypt = None self.redact_dir = None self.gather_log_conf_dict = None self.thread_nums = None @@ -38,7 +41,6 @@ def __init__(self, *args, **kwargs): self.stdio = None self.context = None self.target = None - self.target_dir = None self.from_option = None self.to_option = None self.since_option = None @@ -56,7 +58,6 @@ def init(self, context, *args, **kwargs): self.stdio = self.context.stdio self.inner_config = self.context.inner_config self.target = kwargs.get('target', None) - self.target_dir = kwargs.get('target_dir', None) self.from_option = kwargs.get('from', None) self.to_option = kwargs.get('to', None) self.since_option = kwargs.get('since', None) @@ -70,9 +71,11 @@ def init(self, context, *args, **kwargs): self.is_scene = kwargs.get('is_scene', False) self.oms_log_path = kwargs.get('oms_log_path', None) self.thread_nums = kwargs.get('thread_nums', 3) + self.zip_encrypt = kwargs.get('zip_encrypt', False) + self.oms_module_id = kwargs.get('oms_module_id', None) self._check_option() # build config dict for gather log on node - self.gather_log_conf_dict = {"target": self.target, "tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT} + self.gather_log_conf_dict = {"target": self.target, "tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT, "zip_password": self.zip_password} except Exception as e: self.stdio.error("init GatherComponentLogHandler failed, error: {0}".format(str(e))) return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, "init GatherComponentLogHandler failed, error: {0}".format(str(e))) @@ -89,13 +92,15 @@ def _check_option(self): if self.target != 'observer' and self.target != 'obproxy' and self.target != 'oms': raise Exception("target option can only be observer or obproxy or oms") - # target_dir check - if self.target_dir is None or self.target_dir == "": - self.target_dir = 'obdiag_gather_{0}'.format(self.target) - else: - self.target_dir = self.target_dir.strip() - if not isinstance(self.target, str): - raise Exception("target option can only be string") + # check store_dir + if not os.path.exists(self.store_dir): + raise Exception("store_dir: {0} is not exist".format(self.store_dir)) + if self.is_scene is False: + target_dir = 'obdiag_gather_{0}'.format(self.target) + self.store_dir = os.path.join(self.inner_config.get("store_dir"), target_dir) + if not os.path.exists(self.store_dir): + os.makedirs(self.store_dir) + self.stdio.verbose("store_dir rebase: {0}".format(self.store_dir)) # check nodes if self.nodes is None or len(self.nodes) == 0: @@ -125,7 +130,18 @@ def _check_option(self): else: self.scope = self.scope.strip() if self.scope not in self.log_scope_list[self.target]: - raise 
Exception("scope option can only be {0},the {1} just support {2}".format(log_scope, self.target, log_scope)) + raise Exception("scope option can only be {0},the {1} just support {2}".format(self.scope, self.target, self.log_scope_list)) + # check encrypt + if self.zip_encrypt: + self.zip_password = Util.gen_password(16) + self.stdio.verbose("zip_encrypt is True, zip_password is {0}".format(self.zip_password)) + # check redact + if self.redact: + if self.redact != "" and len(self.redact) != 0: + if "," in self.redact and isinstance(self.redact, str): + self.redact = self.redact.split(",") + else: + self.redact = [self.redact] # check inner_config if self.inner_config is None: @@ -136,18 +152,20 @@ def _check_option(self): self.file_number_limit = int(basic_config["file_number_limit"]) self.file_size_limit = int(FileUtil.size(basic_config["file_size_limit"])) self.config_path = basic_config['config_path'] + self.stdio.verbose("file_number_limit: {0}, file_size_limit: {1}, gather log config_path: {2}".format(self.file_number_limit, self.file_size_limit, self.config_path)) # check thread_nums if self.thread_nums is None or not isinstance(self.thread_nums, int) or self.thread_nums <= 0: self.thread_nums = int(self.context.inner_config.get("obdiag", {}).get("gather", {}).get("thread_nums") or 3) + self.stdio.verbose("thread_nums: {0}".format(self.thread_nums)) def handler(self): try: # run on every node - def run_on_node(context, conf_dict, node, pool_sema): + def run_on_node(context, conf_dict, node, pool_sema, gather_tuple): with pool_sema: try: - task = GatherLogOnNode(context, node, conf_dict, pool_sema) + task = GatherLogOnNode(context, node, conf_dict, pool_sema, gather_tuple) task.handle() except Exception as e: self.stdio.exception(e) @@ -157,19 +175,29 @@ def run_on_node(context, conf_dict, node, pool_sema): self.stdio.start_loading("gather start") pool_sema = threading.BoundedSemaphore(value=self.thread_nums) node_threads = [] + gather_tuples = [] for node in self.nodes: next_context = self.context next_context.stdio = self.stdio.sub_io() + gather_tuple = {} node_thread = threading.Thread( target=run_on_node, - args=(next_context, self.gather_log_conf_dict, node, pool_sema), + args=(next_context, self.gather_log_conf_dict, node, pool_sema, gather_tuple), ) node_thread.start() node_threads.append(node_threads) + gather_tuples.append(gather_tuple) for node_thread in node_threads: node_thread.join() + self.stdio.verbose("gather_tuples: {0}".format(gather_tuples)) self.stdio.stop_loading("gather successes") - last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(pack_dir_this_command, "result_summary.txt")) + # save result + summary_tuples = self.__get_overall_summary(gather_tuples) + self.stdio.print(summary_tuples) + with open(os.path.join(self.store_dir, "result_details.txt"), 'a', encoding='utf-8') as fileobj: + fileobj.write(summary_tuples.get_string()) + + last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(self.store_dir, "result_summary.txt")) self.stdio.print(last_info) try: if self.redact and len(self.redact) > 0: @@ -191,16 +219,34 @@ def run_on_node(context, conf_dict, node, pool_sema): self.stdio.error("gather log failed: {0}".format(str(e))) return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="gather log failed: {0}".format(str(e))) + def __get_overall_summary(self, node_summary_tuple): + """ + generate overall summary from all node summary tuples + :param node_summary_tuple: 
(node, is_err, err_msg, size, consume_time, node_summary) for each node + :return: a string indicating the overall summary + """ + summary_tb = PrettyTable() + summary_tb.title = "{0} Gather Ob Log Summary on {1}".format(self.target, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + if self.zip_password: + summary_tb.field_names = ["Node", "Status", "Size", "Password"] + try: + for tup in node_summary_tuple: + summary_tb.add_row([tup["node"], tup["success"], tup["info"], tup["file_size"]]) + except Exception as e: + self.stdio.error("gather log __get_overall_summary failed: {0}".format(str(e))) + return summary_tb + # if target need add, you should check def about *_by_target class GatherLogOnNode: - def __init__(self, context, node, config, pool_sema): + def __init__(self, context, node, config, pool_sema, gather_tuple): self.ssh_client = node["ssh_client"] self.context = context self.stdio = context.stdio self.config = config self.node = node self.pool_sema = pool_sema + self.gather_tuple = gather_tuple self.target = self.config.get("target") # mkdir tmp_dir @@ -221,24 +267,16 @@ def __init__(self, context, node, config, pool_sema): self.to_time_str = self.config.get("to_time") self.grep_option = self.config.get("grep_option") self.store_dir = self.config.get("store_dir") + self.zip_password = self.config.get("zip_password") # + self.file_number_limit = self.config.get("file_number_limit") self.file_size_limit = self.config.get("file_size_limit") - - def __find_logs_name(self): - try: - logs_scope = "" - for scope in self.scope: - if logs_scope == "": - logs_scope = scope - continue - logs_scope = logs_scope + "|" + scope - self.stdio.verbose("gather_log_on_node {0} find logs scope: {1}".format(self.ssh_client.get_ip(), logs_scope)) - find_cmd = "ls -1 -F {0} |grep -E '{1}'| awk -F '/' ".format(self.log_path, logs_scope) + "'{print $NF}'" - self.stdio.verbose("gather_log_on_node {0} find logs cmd: {1}".format(self.ssh_client.get_ip(), find_cmd)) - logs_name = self.ssh_client.exec_cmd(find_cmd) - return logs_name - except Exception as e: - raise Exception("gather_log_on_node {0} find logs failed: {1}".format(self.ssh_client.get_ip(), str(e))) + self.gather_tuple = { + "node": self.ssh_client.get_name(), + "success": False, + "info": "", + "file_size": 0, + } def handle(self): @@ -253,26 +291,50 @@ def handle(self): # find logs logs_name = self.__find_logs_name() if logs_name is None or len(logs_name) == 0: - self.stdio.warn("gather_log_on_node {0} failed: no log found".format(self.ssh_client.get_ip())) + self.stdio.error("gather_log_on_node {0} failed: no log found".format(self.ssh_client.get_ip())) + self.gather_tuple["info"] = "no log found" + return + elif len(logs_name) > self.file_number_limit: + self.stdio.error('{0} The number of log files is {1}, out of range (0,{2}], ' "Please adjust the query limit".format(self.ssh_client.get_name(), len(logs_name), self.file_number_limit)) + self.gather_tuple["info"] = "too many files {0} > {1}".format(len(logs_name), self.file_number_limit) return - # gather log to remote tmp_dir + + # gather log to remote tmp_dir ,if grep is exit, with grep self.__grep_log_to_tmp(logs_name, tmp_log_dir) + # build tar file + if is_empty_dir(self.ssh_client, tmp_log_dir, self.stdio): + # if remote tmp_log_dir is empty, rm the dir and return + self.ssh_client.exec_cmd("rm -rf {0}".format(tmp_log_dir)) + self.stdio.error("gather_log_on_node {0} failed: tmp_log_dir({1}) no log found".format(self.ssh_client.get_name(), tmp_log_dir)) + 
self.gather_tuple["info"] = "tmp_log_dir({0}) no log found".format(tmp_log_dir) + return + tar_file = os.path.join(self.tmp_dir, "{0}.tar.gz".format(tmp_log_dir)) tar_cmd = "cd {0} && tar -czf {1}.tar.gz {1}/*".format(self.tmp_dir, tmp_log_dir) self.stdio.verbose("gather_log_on_node {0} tar_cmd: {1}".format(self.ssh_client.get_ip(), tar_cmd)) self.ssh_client.exec_cmd(tar_cmd) # download log to local store_dir - if int(get_file_size(self.ssh_client, tar_file)) > self.file_size_limit: + tar_file_size = int(get_file_size(self.ssh_client, tar_file)) + if tar_file_size > self.file_size_limit: self.stdio.error("gather_log_on_node {0} failed: File too large over gather.file_size_limit".format(self.ssh_client.get_ip())) - raise Exception("gather_log_on_node {0} failed: File too large over gather.file_size_limit".format(self.ssh_client.get_ip())) + self.gather_tuple["info"] = "File too large over gather.file_size_limit" + return else: self.stdio.verbose("gather_log_on_node {0} download log to local store_dir: {1}".format(self.ssh_client.get_ip(), self.store_dir)) self.ssh_client.download(tar_file, self.store_dir) + + # tar to zip + tar_file_name = "{0}.tar.gz".format(tmp_log_dir) + local_zip_store_path = os.path.join(self.store_dir, "{0}.zip".format(tmp_log_dir)) + FileUtil.tar_gz_to_zip(self.store_dir, tar_file_name, local_zip_store_path, self.zip_password, self.stdio) + self.gather_tuple["file_size"] = tar_file_size + self.gather_tuple["info"] = "file save in {0}".format(local_zip_store_path) + self.gather_tuple["success"] = True except Exception as e: self.stdio.error("gather_log_on_node {0} failed: {1}".format(self.ssh_client.get_ip(), str(e))) - raise Exception("gather_log_on_node {0} failed: {1}".format(self.ssh_client.get_ip(), str(e))) + self.gather_tuple["info"] = str(e) finally: self.ssh_client.exec_cmd("rm -rf {0}".format(tmp_log_dir)) self.stdio.verbose("gather_log_on_node {0} finished".format(self.ssh_client.get_ip())) From ee558b3cfba8dab43a387114356584d71b751053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Fri, 8 Nov 2024 17:39:49 +0800 Subject: [PATCH 06/32] gather log merge --- handler/gather/gather_component_log.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index d593f002..86bee8a6 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -76,6 +76,8 @@ def init(self, context, *args, **kwargs): self._check_option() # build config dict for gather log on node self.gather_log_conf_dict = {"target": self.target, "tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT, "zip_password": self.zip_password} + if self.oms_module_id: + self.gather_log_conf_dict["oms_module_id"] = self.oms_module_id except Exception as e: self.stdio.error("init GatherComponentLogHandler failed, error: {0}".format(str(e))) return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, "init GatherComponentLogHandler failed, error: {0}".format(str(e))) @@ -256,10 +258,11 @@ def __init__(self, context, node, config, pool_sema, gather_tuple): self.scope = self.config.get("scope") # todo log_path for oms + self.oms_module_id = self.config.get("oms_module_id") if self.target == "oms": - self.log_path = os.path.join( - node.get("home_path"), - ) + if self.oms_module_id is None: + raise Exception("gather log on oms, but oms_module_id is None") + self.log_path = os.path.join(node.get("run_path"), self.oms_module_id, "logs") else: self.log_path = 
os.path.join(node.get("home_path"), "log") From a6f2d1ad725aba2cebdf950bfa46657b6fc05f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Fri, 8 Nov 2024 17:49:32 +0800 Subject: [PATCH 07/32] gather log merge --- core.py | 16 +++++++++++++++- handler/gather/gather_component_log.py | 10 ++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/core.py b/core.py index 902fd089..d74f7223 100644 --- a/core.py +++ b/core.py @@ -24,6 +24,7 @@ from common.ssh_client.remote_client import dis_rsa_algorithms from handler.gather.gather_ash_report import GatherAshReportHandler +from handler.gather.gather_component_log import GatherComponentLogHandler from handler.rca.rca_handler import RCAHandler from handler.rca.rca_list import RcaScenesListHandler from common.ssh import SshClient, SshConfig @@ -242,13 +243,26 @@ def gather_function(self, function_type, opt): self._call_stdio('error', 'No such custum config') return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, error_data='No such custum config') else: + options = self.context.options self.stdio.print("{0} start ...".format(function_type)) self.update_obcluster_nodes(config) self.set_context(function_type, 'gather', config) timestamp = TimeUtils.get_current_us_timestamp() self.context.set_variable('gather_timestamp', timestamp) if function_type == 'gather_log': - handler = GatherLogHandler(self.context) + handler = GatherComponentLogHandler( + self.context, + target="observer", + from_option=Util.get_option(options, 'from'), + to_option=Util.get_option(options, 'to'), + since=Util.get_option(options, 'since'), + scope=Util.get_option(options, 'scope'), + grep=Util.get_option(options, 'grep'), + encrypt=Util.get_option(options, 'encrypt'), + store_dir=Util.get_option(options, 'store_dir'), + temp_dir=Util.get_option(options, 'temp_dir'), + redact=Util.get_option(options, 'redact'), + ) return handler.handle() elif function_type == 'gather_awr': handler = GatherAwrHandler(self.context) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index 86bee8a6..ed558638 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -31,7 +31,6 @@ class GatherComponentLogHandler(BaseShellHandler): def __init__(self, *args, **kwargs): super().__init__() self.oms_module_id = None - self.zip_encrypt = None self.redact_dir = None self.gather_log_conf_dict = None self.thread_nums = None @@ -58,8 +57,8 @@ def init(self, context, *args, **kwargs): self.stdio = self.context.stdio self.inner_config = self.context.inner_config self.target = kwargs.get('target', None) - self.from_option = kwargs.get('from', None) - self.to_option = kwargs.get('to', None) + self.from_option = kwargs.get('from_option', None) + self.to_option = kwargs.get('to_option', None) self.since_option = kwargs.get('since', None) self.scope = kwargs.get('scope', None) self.grep = kwargs.get('grep', None) @@ -71,7 +70,6 @@ def init(self, context, *args, **kwargs): self.is_scene = kwargs.get('is_scene', False) self.oms_log_path = kwargs.get('oms_log_path', None) self.thread_nums = kwargs.get('thread_nums', 3) - self.zip_encrypt = kwargs.get('zip_encrypt', False) self.oms_module_id = kwargs.get('oms_module_id', None) self._check_option() # build config dict for gather log on node @@ -134,7 +132,7 @@ def _check_option(self): if self.scope not in self.log_scope_list[self.target]: raise Exception("scope option can only be {0},the {1} just support {2}".format(self.scope, self.target, 
self.log_scope_list)) # check encrypt - if self.zip_encrypt: + if self.encrypt: self.zip_password = Util.gen_password(16) self.stdio.verbose("zip_encrypt is True, zip_password is {0}".format(self.zip_password)) # check redact @@ -161,7 +159,7 @@ def _check_option(self): self.thread_nums = int(self.context.inner_config.get("obdiag", {}).get("gather", {}).get("thread_nums") or 3) self.stdio.verbose("thread_nums: {0}".format(self.thread_nums)) - def handler(self): + def handle(self): try: # run on every node def run_on_node(context, conf_dict, node, pool_sema, gather_tuple): From cb4330cf8b1c8754f8473890b9d6c053aa438d2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Mon, 11 Nov 2024 20:51:09 +0800 Subject: [PATCH 08/32] update GatherComponentLogHandler --- common/ssh_client/remote_client.py | 3 + common/ssh_client/ssh.py | 1 + handler/gather/gather_component_log.py | 99 +++++++++++++++++++------- 3 files changed, 79 insertions(+), 24 deletions(-) diff --git a/common/ssh_client/remote_client.py b/common/ssh_client/remote_client.py index 537eebdc..cddea08d 100644 --- a/common/ssh_client/remote_client.py +++ b/common/ssh_client/remote_client.py @@ -151,3 +151,6 @@ def ssh_invoke_shell_switch_user(self, new_user, cmd, time_out): def get_name(self): return "remote_{0}".format(self.host_ip) + + def get_ip(self): + return self.host_ip diff --git a/common/ssh_client/ssh.py b/common/ssh_client/ssh.py index 8333039a..69486cdc 100644 --- a/common/ssh_client/ssh.py +++ b/common/ssh_client/ssh.py @@ -84,6 +84,7 @@ def exec_cmd(self, cmd): return self.client.exec_cmd(cmd).strip() def download(self, remote_path, local_path): + self.stdio.verbose("download file: {} to {}".format(remote_path, local_path)) return self.client.download(remote_path, local_path) def upload(self, remote_path, local_path): diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index ed558638..ca38e586 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -50,6 +50,7 @@ def __init__(self, *args, **kwargs): self.temp_dir = None self.redact = None self.nodes = None + self.zip_password = None def init(self, context, *args, **kwargs): try: @@ -72,10 +73,24 @@ def init(self, context, *args, **kwargs): self.thread_nums = kwargs.get('thread_nums', 3) self.oms_module_id = kwargs.get('oms_module_id', None) self._check_option() - # build config dict for gather log on node - self.gather_log_conf_dict = {"target": self.target, "tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT, "zip_password": self.zip_password} if self.oms_module_id: self.gather_log_conf_dict["oms_module_id"] = self.oms_module_id + # build config dict for gather log on node + self.gather_log_conf_dict = { + "target": self.target, + "tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT, + "zip_password": self.zip_password, + "scope": self.scope, + "grep": self.grep, + "encrypt": self.encrypt, + "store_dir": self.store_dir, + "from_time": self.from_time_str, + "to_time": self.to_time_str, + "file_number_limit": self.file_number_limit, + "file_size_limit": self.file_size_limit, + "oms_module_id": self.oms_module_id, + } + except Exception as e: self.stdio.error("init GatherComponentLogHandler failed, error: {0}".format(str(e))) return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, "init GatherComponentLogHandler failed, error: {0}".format(str(e))) @@ -96,8 +111,8 @@ def _check_option(self): if not os.path.exists(self.store_dir): raise Exception("store_dir: {0} is not 
exist".format(self.store_dir)) if self.is_scene is False: - target_dir = 'obdiag_gather_{0}'.format(self.target) - self.store_dir = os.path.join(self.inner_config.get("store_dir"), target_dir) + target_dir = os.path.join("obdiag_gather_pack_{0}".format(TimeUtils.timestamp_to_filename_time(TimeUtils.get_current_us_timestamp()))) + self.store_dir = os.path.join(self.store_dir or "./", target_dir) if not os.path.exists(self.store_dir): os.makedirs(self.store_dir) self.stdio.verbose("store_dir rebase: {0}".format(self.store_dir)) @@ -131,10 +146,39 @@ def _check_option(self): self.scope = self.scope.strip() if self.scope not in self.log_scope_list[self.target]: raise Exception("scope option can only be {0},the {1} just support {2}".format(self.scope, self.target, self.log_scope_list)) + # check since from_option and to_option + from_timestamp = None + to_timestamp = None + if self.from_option is not None and self.to_option is not None: + try: + from_timestamp = TimeUtils.parse_time_str(self.from_option) + to_timestamp = TimeUtils.parse_time_str(self.to_option) + self.from_time_str = self.from_option + self.to_time_str = self.to_option + except Exception as e: + raise Exception('Error: Datetime is invalid. Must be in format "yyyy-mm-dd hh:mm:ss". from_datetime={0}, to_datetime={1}'.format(self.from_option, self.to_option)) + if to_timestamp <= from_timestamp: + raise Exception('Error: from datetime is larger than to datetime, please check.') + elif (self.from_option is None or self.to_option is None) and self.since_option is not None: + now_time = datetime.datetime.now() + self.to_time_str = (now_time + datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') + self.from_time_str = (now_time - datetime.timedelta(seconds=TimeUtils.parse_time_length_to_sec(self.since_option))).strftime('%Y-%m-%d %H:%M:%S') + self.stdio.print('gather log from_time: {0}, to_time: {1}'.format(self.from_time_str, self.to_time_str)) + else: + self.stdio.print('No time option provided, default processing is based on the last 30 minutes') + now_time = datetime.datetime.now() + self.to_time_str = (now_time + datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') + if self.since_option: + self.from_time_str = (now_time - datetime.timedelta(seconds=TimeUtils.parse_time_length_to_sec(self.since_option))).strftime('%Y-%m-%d %H:%M:%S') + else: + self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S') + self.stdio.print('gather log from_time: {0}, to_time: {1}'.format(self.from_time_str, self.to_time_str)) # check encrypt if self.encrypt: - self.zip_password = Util.gen_password(16) - self.stdio.verbose("zip_encrypt is True, zip_password is {0}".format(self.zip_password)) + if self.encrypt.strip().upper() == "TRUE": + self.encrypt = True + self.zip_password = Util.gen_password(16) + self.stdio.verbose("zip_encrypt is True, zip_password is {0}".format(self.zip_password)) # check redact if self.redact: if self.redact != "" and len(self.redact) != 0: @@ -162,11 +206,14 @@ def _check_option(self): def handle(self): try: # run on every node - def run_on_node(context, conf_dict, node, pool_sema, gather_tuple): + def run_on_node(context, conf_dict, node, pool_sema, gather_tuples): with pool_sema: try: - task = GatherLogOnNode(context, node, conf_dict, pool_sema, gather_tuple) + gather_tuple = {} + task = GatherLogOnNode(context, node, conf_dict, pool_sema) task.handle() + gather_tuple = task.get_result() + gather_tuples.append(gather_tuple) except Exception as e: self.stdio.exception(e) 
self.stdio.error("gather log failed: {0}".format(str(e))) @@ -179,14 +226,12 @@ def run_on_node(context, conf_dict, node, pool_sema, gather_tuple): for node in self.nodes: next_context = self.context next_context.stdio = self.stdio.sub_io() - gather_tuple = {} node_thread = threading.Thread( target=run_on_node, - args=(next_context, self.gather_log_conf_dict, node, pool_sema, gather_tuple), + args=(next_context, self.gather_log_conf_dict, node, pool_sema, gather_tuples), ) node_thread.start() - node_threads.append(node_threads) - gather_tuples.append(gather_tuple) + node_threads.append(node_thread) for node_thread in node_threads: node_thread.join() self.stdio.verbose("gather_tuples: {0}".format(gather_tuples)) @@ -199,6 +244,8 @@ def run_on_node(context, conf_dict, node, pool_sema, gather_tuple): last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(self.store_dir, "result_summary.txt")) self.stdio.print(last_info) + if self.zip_password: + self.stdio.print("zip password is {0}".format(self.zip_password)) try: if self.redact and len(self.redact) > 0: self.stdio.start_loading("gather redact start") @@ -227,26 +274,23 @@ def __get_overall_summary(self, node_summary_tuple): """ summary_tb = PrettyTable() summary_tb.title = "{0} Gather Ob Log Summary on {1}".format(self.target, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - if self.zip_password: - summary_tb.field_names = ["Node", "Status", "Size", "Password"] + summary_tb.field_names = ["Node", "Status", "Size", "info"] try: for tup in node_summary_tuple: - summary_tb.add_row([tup["node"], tup["success"], tup["info"], tup["file_size"]]) + summary_tb.add_row([tup["node"], tup["success"], tup["file_size"], tup["info"]]) except Exception as e: self.stdio.error("gather log __get_overall_summary failed: {0}".format(str(e))) return summary_tb -# if target need add, you should check def about *_by_target class GatherLogOnNode: - def __init__(self, context, node, config, pool_sema, gather_tuple): + def __init__(self, context, node, config, pool_sema): self.ssh_client = node["ssh_client"] self.context = context self.stdio = context.stdio self.config = config self.node = node self.pool_sema = pool_sema - self.gather_tuple = gather_tuple self.target = self.config.get("target") # mkdir tmp_dir @@ -268,17 +312,20 @@ def __init__(self, context, node, config, pool_sema, gather_tuple): self.to_time_str = self.config.get("to_time") self.grep_option = self.config.get("grep_option") self.store_dir = self.config.get("store_dir") - self.zip_password = self.config.get("zip_password") + self.zip_password = self.config.get("zip_password") or None # self.file_number_limit = self.config.get("file_number_limit") self.file_size_limit = self.config.get("file_size_limit") self.gather_tuple = { "node": self.ssh_client.get_name(), - "success": False, + "success": "Fail", "info": "", "file_size": 0, } + def get_result(self): + return self.gather_tuple + def handle(self): from_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.from_time_str)) @@ -328,11 +375,14 @@ def handle(self): # tar to zip tar_file_name = "{0}.tar.gz".format(tmp_log_dir) - local_zip_store_path = os.path.join(self.store_dir, "{0}.zip".format(tmp_log_dir)) + local_zip_store_path = os.path.join(self.store_dir, os.path.basename("{0}.zip".format(tmp_log_dir))) FileUtil.tar_gz_to_zip(self.store_dir, tar_file_name, local_zip_store_path, self.zip_password, self.stdio) - self.gather_tuple["file_size"] = 
tar_file_size + self.gather_tuple["file_size"] = FileUtil.size_format(num=int(os.path.getsize(local_zip_store_path) or 0), output_str=True) self.gather_tuple["info"] = "file save in {0}".format(local_zip_store_path) - self.gather_tuple["success"] = True + self.gather_tuple["success"] = "Success" + local_tar_file_name = os.path.join(self.store_dir, os.path.basename("{0}".format(tar_file_name))) + self.stdio.verbose("clear tar file: {0}".format(local_tar_file_name)) + os.remove(local_tar_file_name) except Exception as e: self.stdio.error("gather_log_on_node {0} failed: {1}".format(self.ssh_client.get_ip(), str(e))) self.gather_tuple["info"] = str(e) @@ -381,9 +431,10 @@ def __find_logs_name(self): logs_name = self.ssh_client.exec_cmd(find_cmd) if logs_name is not None and len(logs_name) != 0: log_name_list = self.__get_logfile_name_list(self.from_time_str, self.to_time_str, self.log_path, logs_name) + return log_name_list else: self.stdio.warn("gather_log_on_node {0} failed: no log found".format(self.ssh_client.get_ip())) - return logs_name + return [] except Exception as e: raise Exception("gather_log_on_node {0} find logs failed: {1}".format(self.ssh_client.get_ip(), str(e))) From 7efe13b8f2447cfffb55fecd8957d4acf8868dea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Mon, 11 Nov 2024 21:02:57 +0800 Subject: [PATCH 09/32] update GatherComponentLogHandler --- core.py | 21 ++++++++++++++++++--- handler/rca/plugins/gather.py | 2 -- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/core.py b/core.py index d74f7223..b96b4f61 100644 --- a/core.py +++ b/core.py @@ -243,14 +243,15 @@ def gather_function(self, function_type, opt): self._call_stdio('error', 'No such custum config') return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, error_data='No such custum config') else: - options = self.context.options self.stdio.print("{0} start ...".format(function_type)) self.update_obcluster_nodes(config) self.set_context(function_type, 'gather', config) + options = self.context.options timestamp = TimeUtils.get_current_us_timestamp() self.context.set_variable('gather_timestamp', timestamp) if function_type == 'gather_log': - handler = GatherComponentLogHandler( + handler = GatherComponentLogHandler() + handler.init( self.context, target="observer", from_option=Util.get_option(options, 'from'), @@ -324,7 +325,21 @@ def gather_obproxy_log(self, opt): return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, error_data='No such custum config') else: self.set_context_skip_cluster_conn('gather_obproxy_log', 'gather', config) - handler = GatherObProxyLogHandler(self.context) + options = self.context.options + handler = GatherComponentLogHandler() + handler.init( + self.context, + target="obproxy", + from_option=Util.get_option(options, 'from'), + to_option=Util.get_option(options, 'to'), + since=Util.get_option(options, 'since'), + scope=Util.get_option(options, 'scope'), + grep=Util.get_option(options, 'grep'), + encrypt=Util.get_option(options, 'encrypt'), + store_dir=Util.get_option(options, 'store_dir'), + temp_dir=Util.get_option(options, 'temp_dir'), + redact=Util.get_option(options, 'redact'), + ) return handler.handle() def gather_scenes_list(self, opt): diff --git a/handler/rca/plugins/gather.py b/handler/rca/plugins/gather.py index 4a2ddec1..80eda0e4 100644 --- a/handler/rca/plugins/gather.py +++ b/handler/rca/plugins/gather.py @@ -110,7 +110,6 @@ def execute(self, save_path=""): for zip_file in zip_files: if "zip" not in zip_file: continue - # open zip file 
self.stdio.verbose("open zip file: {0}".format(os.path.join(gather_result, zip_file))) with zipfile.ZipFile(os.path.join(gather_result, zip_file), 'r') as zip_ref: @@ -122,7 +121,6 @@ def execute(self, save_path=""): for log_file in os.listdir(log_dir): result_log_files.append(os.path.join(log_dir, log_file)) self.stdio.verbose("result_log_files add {0}".format(os.path.join(log_dir, log_file))) - self.reset() return result_log_files From 3ecb6c06ab35ad8d1a4de2caa48786229b95c6b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 12 Nov 2024 16:15:15 +0800 Subject: [PATCH 10/32] update GatherComponentLogHandler --- common/ssh_client/local_client.py | 2 +- common/tool.py | 3 +- core.py | 34 ++++++++++++++++--- handler/gather/gather_component_log.py | 21 +++++++----- handler/gather/scenes/cpu_high.py | 5 +-- handler/gather/scenes/px_collect_log.py | 29 ++++++++++------ handler/gather/scenes/sql_problem.py | 17 ++++------ handler/gather/step/base.py | 15 +++++---- handler/rca/plugins/gather.py | 45 +++++++++++++++++-------- 9 files changed, 113 insertions(+), 58 deletions(-) diff --git a/common/ssh_client/local_client.py b/common/ssh_client/local_client.py index 00e03e3e..71e55b6d 100644 --- a/common/ssh_client/local_client.py +++ b/common/ssh_client/local_client.py @@ -78,4 +78,4 @@ def get_name(self): return "local" def get_ip(self): - return self.client.get_ip() + return "127.0.0.1" diff --git a/common/tool.py b/common/tool.py index aa12d4a2..a790474a 100644 --- a/common/tool.py +++ b/common/tool.py @@ -31,6 +31,7 @@ import json import hashlib import datetime +import uuid import tabulate import tarfile import socket @@ -660,7 +661,7 @@ def write_append(filename, result, stdio=None): fileobj.write(u'{}'.format(result)) def tar_gz_to_zip(temp_dir, tar_gz_file, output_zip, password, stdio): - extract_dir = os.path.join(temp_dir, 'extracted_files') + extract_dir = os.path.join(temp_dir, 'extracted_files_{0}'.format(str(uuid.uuid4())[:6])) try: # 1. 
Extract the tar.gz file diff --git a/core.py b/core.py index b96b4f61..c550ca01 100644 --- a/core.py +++ b/core.py @@ -40,9 +40,7 @@ from handler.analyzer.analyze_index_space import AnalyzeIndexSpaceHandler from handler.checker.check_handler import CheckHandler from handler.checker.check_list import CheckListHandler -from handler.gather.gather_log import GatherLogHandler from handler.gather.gather_awr import GatherAwrHandler -from handler.gather.gather_obproxy_log import GatherObProxyLogHandler from handler.gather.gather_sysstat import GatherOsInfoHandler from handler.gather.gather_obstack2 import GatherObstack2Handler from handler.gather.gather_obadmin import GatherObAdminHandler @@ -292,9 +290,35 @@ def gather_function(self, function_type, opt): handler_stack.handle() handler_perf = GatherPerfHandler(self.context) handler_perf.handle() - handler_log = GatherLogHandler(self.context) - handler_log.handle() - handler_obproxy = GatherObProxyLogHandler(self.context) + handler_observer_log = GatherComponentLogHandler() + handler_observer_log.init( + self.context, + target="observer", + from_option=Util.get_option(options, 'from'), + to_option=Util.get_option(options, 'to'), + since=Util.get_option(options, 'since'), + scope=Util.get_option(options, 'scope'), + grep=Util.get_option(options, 'grep'), + encrypt=Util.get_option(options, 'encrypt'), + store_dir=Util.get_option(options, 'store_dir'), + temp_dir=Util.get_option(options, 'temp_dir'), + redact=Util.get_option(options, 'redact'), + ) + handler_observer_log.handle() + handler_obproxy = GatherComponentLogHandler() + handler_obproxy.init( + self.context, + target="obproxy", + from_option=Util.get_option(options, 'from'), + to_option=Util.get_option(options, 'to'), + since=Util.get_option(options, 'since'), + scope=Util.get_option(options, 'scope'), + grep=Util.get_option(options, 'grep'), + encrypt=Util.get_option(options, 'encrypt'), + store_dir=Util.get_option(options, 'store_dir'), + temp_dir=Util.get_option(options, 'temp_dir'), + redact=Util.get_option(options, 'redact'), + ) return handler_obproxy.handle() elif function_type == 'gather_sysstat': handler = GatherOsInfoHandler(self.context) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index ca38e586..f776341a 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -12,6 +12,7 @@ import datetime import os import threading +import traceback import uuid from prettytable import PrettyTable @@ -72,7 +73,7 @@ def init(self, context, *args, **kwargs): self.oms_log_path = kwargs.get('oms_log_path', None) self.thread_nums = kwargs.get('thread_nums', 3) self.oms_module_id = kwargs.get('oms_module_id', None) - self._check_option() + self.__check_option() if self.oms_module_id: self.gather_log_conf_dict["oms_module_id"] = self.oms_module_id # build config dict for gather log on node @@ -95,7 +96,7 @@ def init(self, context, *args, **kwargs): self.stdio.error("init GatherComponentLogHandler failed, error: {0}".format(str(e))) return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, "init GatherComponentLogHandler failed, error: {0}".format(str(e))) - def _check_option(self): + def __check_option(self): # target check if self.target is None or self.target == "": self.target = 'observer' @@ -209,14 +210,12 @@ def handle(self): def run_on_node(context, conf_dict, node, pool_sema, gather_tuples): with pool_sema: try: - gather_tuple = {} task = GatherLogOnNode(context, node, conf_dict, pool_sema) task.handle() 
gather_tuple = task.get_result() gather_tuples.append(gather_tuple) except Exception as e: - self.stdio.exception(e) - self.stdio.error("gather log failed: {0}".format(str(e))) + self.stdio.error("gather log run_on_node failed: {0}".format(str(e))) return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="gather log failed: {0}".format(str(e))) self.stdio.start_loading("gather start") @@ -258,11 +257,12 @@ def run_on_node(context, conf_dict, node, pool_sema, gather_tuples): self.stdio.stop_loading("gather redact successes") return ObdiagResult(ObdiagResult.SUCCESS_CODE, data={"store_dir": redact_dir, "redact_dir": self.redact_dir}) except Exception as e: + self.stdio.verbose(traceback.format_exc()) self.stdio.error("redact failed {0}".format(e)) return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="redact failed {0}".format(e)) return ObdiagResult(ObdiagResult.SUCCESS_CODE, data={"store_dir": self.store_dir}) except Exception as e: - self.stdio.exception(e) + self.stdio.verbose(traceback.format_exc()) self.stdio.error("gather log failed: {0}".format(str(e))) return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="gather log failed: {0}".format(str(e))) @@ -275,9 +275,14 @@ def __get_overall_summary(self, node_summary_tuple): summary_tb = PrettyTable() summary_tb.title = "{0} Gather Ob Log Summary on {1}".format(self.target, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) summary_tb.field_names = ["Node", "Status", "Size", "info"] + if self.zip_password: + summary_tb.field_names = ["Node", "Status", "Size", "info", "zip_password"] try: for tup in node_summary_tuple: - summary_tb.add_row([tup["node"], tup["success"], tup["file_size"], tup["info"]]) + if self.zip_password: + summary_tb.add_row([tup["node"], tup["success"], tup["file_size"], tup["info"], self.zip_password]) + else: + summary_tb.add_row([tup["node"], tup["success"], tup["file_size"], tup["info"]]) except Exception as e: self.stdio.error("gather log __get_overall_summary failed: {0}".format(str(e))) return summary_tb @@ -331,7 +336,7 @@ def handle(self): from_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.from_time_str)) to_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.to_time_str)) - tmp_log_dir = os.path.join(self.tmp_dir, "ob_log_{0}_{1}_{2}_{3}".format(self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, uuid.uuid4())) + tmp_log_dir = os.path.join(self.tmp_dir, "{4}_log_{0}_{1}_{2}_{3}".format(self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, str(uuid.uuid4())[:6], self.target)) # mkdir tmp_log_dir self.ssh_client.exec_cmd("mkdir -p {0}".format(tmp_log_dir)) self.stdio.verbose("gather_log_on_node {0} tmp_log_dir: {1}".format(self.ssh_client.get_ip(), tmp_log_dir)) diff --git a/handler/gather/scenes/cpu_high.py b/handler/gather/scenes/cpu_high.py index abf52d2e..b6eaa752 100644 --- a/handler/gather/scenes/cpu_high.py +++ b/handler/gather/scenes/cpu_high.py @@ -18,10 +18,10 @@ import os from common.ssh_client.ssh import SshClient +from handler.gather.gather_component_log import GatherComponentLogHandler from stdio import SafeStdio from handler.gather.gather_obstack2 import GatherObstack2Handler from handler.gather.gather_perf import GatherPerfHandler -from handler.gather.gather_log import GatherLogHandler class CPUHighScene(SafeStdio): @@ -74,7 +74,8 @@ def __gather_current_clocksource(self): def __gather_log(self): try: self.stdio.print("gather 
observer log start") - handler = GatherLogHandler(self.context, self.report_path, is_scene=True) + handler = GatherComponentLogHandler() + handler.init(self.context, store_dir=self.report_path, target="observer", is_scene=True) handler.handle() self.stdio.print("gather observer log end") except Exception as e: diff --git a/handler/gather/scenes/px_collect_log.py b/handler/gather/scenes/px_collect_log.py index 437e04a7..1f70f28b 100644 --- a/handler/gather/scenes/px_collect_log.py +++ b/handler/gather/scenes/px_collect_log.py @@ -15,14 +15,17 @@ @file: px_collect_log.py @desc: """ +import os +import shutil + from common.ssh_client.ssh import SshClient -from handler.gather.gather_log import GatherLogHandler -from common.command import uzip_dir_local, analyze_log_get_sqc_addr, delete_file_in_folder, find_home_path_by_port +from handler.gather.gather_component_log import GatherComponentLogHandler +from common.command import uzip_dir_local, analyze_log_get_sqc_addr, find_home_path_by_port import datetime class SQLPXCollectLogScene(object): - def __init__(self, context, scene_name, report_path, task_variable_dict=None, env={}): + def __init__(self, context, scene_name, report_path, task_variable_dict=None, env=None): self.context = context self.stdio = context.stdio if task_variable_dict is None: @@ -31,6 +34,8 @@ def __init__(self, context, scene_name, report_path, task_variable_dict=None, en self.task_variable_dict = task_variable_dict self.report_path = report_path self.env = env + if self.env is None: + self.env = {} self.is_ssh = True self.scene_name = scene_name self.db_conn = {} @@ -56,7 +61,7 @@ def execute(self): # 否则不存在,则删除被解压的目录 if len(self.sql_task_node) != 0: self.stdio.verbose("delete file start") - delete_file_in_folder(False, None, self.report_path, self.stdio) + shutil.rmtree(self.report_path) self.stdio.verbose("delete file end") self.__gather_log() uzip_dir_local(self.report_path, self.stdio) @@ -64,14 +69,18 @@ def execute(self): def __gather_log(self): try: self.stdio.verbose("gather observer log start, trace id: {0}".format(self.trace_id)) - handler = GatherLogHandler(self.context, gather_pack_dir=self.report_path, is_scene=True) - self.context.set_variable('filter_nodes_list', self.sql_task_node) - self.context.set_variable('gather_grep', self.trace_id) - self.context.set_variable('gather_mode', 'trace_id_log') from_time_str = (self.search_time - datetime.timedelta(days=3)).strftime('%Y-%m-%d %H:%M:%S') to_time_str = (self.search_time + datetime.timedelta(minutes=1)).strftime('%Y-%m-%d %H:%M:%S') - self.context.set_variable("gather_from", from_time_str) - self.context.set_variable("gather_to", to_time_str) + handler = GatherComponentLogHandler() + handler.init( + self.context, + target="observer", + from_option=from_time_str, + to_option=to_time_str, + grep=[self.trace_id], + is_scene=True, + store_dir=self.report_path, + ) handler.handle() self.stdio.verbose("gather observer log end") except Exception as e: diff --git a/handler/gather/scenes/sql_problem.py b/handler/gather/scenes/sql_problem.py index 8d9c43b2..804f0b1a 100644 --- a/handler/gather/scenes/sql_problem.py +++ b/handler/gather/scenes/sql_problem.py @@ -15,10 +15,8 @@ @file: sql_problem.py @desc: """ - +from handler.gather.gather_component_log import GatherComponentLogHandler from stdio import SafeStdio -from handler.gather.gather_log import GatherLogHandler -from handler.gather.gather_obproxy_log import GatherObProxyLogHandler from handler.gather.gather_plan_monitor import GatherPlanMonitorHandler from 
common.tool import StringUtils from common.ssh_client.ssh import SshClient @@ -76,9 +74,8 @@ def __gather_log(self): self.task_nodes.append(node) break self.stdio.verbose("gather observer log start") - handler = GatherLogHandler(self.context, self.report_path, is_scene=True) - self.context.set_variable('filter_nodes_list', self.task_nodes) - self.context.set_variable('gather_grep', self.trace_id) + handler = GatherComponentLogHandler() + handler.init(self.context, target="observer", grep=[self.trace_id], nodes=self.task_nodes, store_dir=self.report_path, is_scene=True) handler.handle() self.stdio.verbose("gather observer log end") except Exception as e: @@ -88,15 +85,15 @@ def __gather_log(self): def __gather_obproxy_log(self): try: self.stdio.verbose("gather obproxy log start") - handler = GatherObProxyLogHandler(self.context, gather_pack_dir=self.report_path, is_scene=True) + handler = GatherComponentLogHandler() if self.scene_name: if self.scene_name == "observer.sql_err" or self.scene_name == "observer.perf_sql": - self.context.set_variable('gather_grep', self.trace_id) + handler.init(self.context, target="obproxy", grep=[self.trace_id], store_dir=self.report_path, is_scene=True) + self.stdio.verbose("gather obproxy log end") + return handler.handle() else: self.stdio.warn("unsupported scene {0}".format(self.scene_name)) return - handler.handle() - self.stdio.verbose("gather obproxy log end") else: self.stdio.warn("scene is None") return diff --git a/handler/gather/step/base.py b/handler/gather/step/base.py index ee1bb9c2..d23dadae 100644 --- a/handler/gather/step/base.py +++ b/handler/gather/step/base.py @@ -16,16 +16,17 @@ @desc: """ from common.ssh_client.ssh import SshClient +from handler.gather.gather_component_log import GatherComponentLogHandler from stdio import SafeStdio from handler.gather.step.ssh import SshHandler from handler.gather.step.sql import StepSQLHandler -from handler.gather.gather_log import GatherLogHandler -from handler.gather.gather_obproxy_log import GatherObProxyLogHandler from handler.gather.gather_sysstat import GatherOsInfoHandler class Base(SafeStdio): - def __init__(self, context, step, node, cluster, report_path, task_variable_dict=None, env={}, node_number=1): + def __init__(self, context, step, node, cluster, report_path, task_variable_dict=None, env=None, node_number=1): + if env is None: + env = {} self.context = context self.stdio = context.stdio if task_variable_dict is None: @@ -67,17 +68,17 @@ def execute(self): handler.execute() elif self.step["type"] == "log" and (skip_type != "ssh"): if self.node.get("host_type") and self.node.get("host_type") == "OBSERVER": - handler = GatherLogHandler(self.context, gather_pack_dir=self.report_path, is_scene=True) - self.context.set_variable('filter_nodes_list', [self.node]) - self.context.set_variable('gather_grep', self.step.get("grep")) + handler = GatherComponentLogHandler() + handler.init(self.context, target="observer", grep=self.step.get("grep"), nodes=[self.node], store_dir=self.report_path, is_scene=True) handler.handle() else: self.stdio.verbose("node host_type is {0} not OBSERVER, skipping gather log".format(self.node.get("host_type"))) elif self.step["type"] == "obproxy_log" and (skip_type != "ssh"): if self.node.get("host_type") and self.node.get("host_type") == "OBPROXY": - handler = GatherObProxyLogHandler(self.context, gather_pack_dir=self.report_path, is_scene=True) self.context.set_variable('filter_nodes_list', [self.node]) self.context.set_variable('gather_grep', self.step.get("grep")) + 
handler = GatherComponentLogHandler() + handler.init(self.context, target="obproxy", grep=self.step.get("grep"), nodes=[self.node], store_dir=self.report_path, is_scene=True) handler.handle() else: self.stdio.verbose("node host_type is {0} not OBPROXY, skipping gather log".format(self.node.get("host_type"))) diff --git a/handler/rca/plugins/gather.py b/handler/rca/plugins/gather.py index 80eda0e4..39e1a057 100644 --- a/handler/rca/plugins/gather.py +++ b/handler/rca/plugins/gather.py @@ -18,8 +18,7 @@ import os.path import zipfile -from handler.gather.gather_log import GatherLogHandler -from handler.gather.gather_obproxy_log import GatherObProxyLogHandler +from handler.gather.gather_component_log import GatherComponentLogHandler class Gather_log: @@ -66,15 +65,10 @@ def execute(self, save_path=""): if len(self.greps_key) == 0: self.stdio.error("The keyword cannot be empty!") raise Exception("The keyword cannot be empty!") - self.context.set_variable("gather_grep", self.greps_key) self.stdio.verbose("gather_grep is {0}".format(self.greps_key)) nodes_list = [] - if not self.conf_map["filter_nodes_list"] or len(self.conf_map["filter_nodes_list"]) == 0: - self.context.set_variable("filter_nodes_list", self.conf_map["filter_nodes_list"]) - # execute on all nodes_list - handle = None - for conf in self.conf_map: - self.context.set_variable(conf, self.conf_map[conf]) + # execute on all nodes_list + handler = None if self.conf_map["gather_target"] == 'observer': all_node = self.context.cluster_config.get("servers") if self.conf_map["filter_nodes_list"] and len(self.conf_map["filter_nodes_list"] > 0): @@ -85,7 +79,19 @@ def execute(self, save_path=""): nodes_list.append(node) self.stdio.verbose("{0} is in the nodes list".format(node.get("ip"))) self.conf_map["filter_nodes_list"] = nodes_list - handle = GatherLogHandler(self.context) + handler = GatherComponentLogHandler() + handler.init( + self.context, + target="observer", + nodes=nodes_list, + from_option=self.conf_map.get("gather_from"), + to_option=self.conf_map.get("gather_to"), + since=self.conf_map.get("gather_since"), + scope=self.conf_map.get("gather_scope"), + grep=self.greps_key, + store_dir=self.work_path, + ) + elif self.conf_map["gather_target"] == 'obproxy': all_node = self.context.get_variable('obproxy_nodes') if self.conf_map["filter_nodes_list"]: @@ -97,14 +103,25 @@ def execute(self, save_path=""): else: nodes_list.append(node) self.conf_map["filter_nodes_list"] = nodes_list - handle = GatherObProxyLogHandler(self.context) + handler = GatherComponentLogHandler() + handler.init( + self.context, + target="obproxy", + nodes=nodes_list, + from_option=self.conf_map.get("gather_from"), + to_option=self.conf_map.get("gather_to"), + since=self.conf_map.get("gather_since"), + scope=self.conf_map.get("gather_scope"), + grep=self.greps_key, + store_dir=self.work_path, + ) - if handle is None: + if handler is None: self.stdio.error("rca gather handle the target cannot be empty!") raise Exception("rca gather handle the target cannot be empty!") else: - handle.handle() - gather_result = handle.pack_dir_this_command + handler.handle() + gather_result = handler.store_dir zip_files = os.listdir(gather_result) result_log_files = [] for zip_file in zip_files: From bd3c8ab3859051d7e57b2403be9a4535e57e6b89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 12 Nov 2024 16:36:08 +0800 Subject: [PATCH 11/32] update GatherComponentLogHandler --- handler/gather/gather_component_log.py | 9 +++++++-- result_type.py | 3 +++ 2 files 
changed, 10 insertions(+), 2 deletions(-) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index f776341a..b73cf480 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -52,6 +52,7 @@ def __init__(self, *args, **kwargs): self.redact = None self.nodes = None self.zip_password = None + self.result = ObdiagResult(ObdiagResult.SUCCESS_CODE, data={}) def init(self, context, *args, **kwargs): try: @@ -94,7 +95,7 @@ def init(self, context, *args, **kwargs): except Exception as e: self.stdio.error("init GatherComponentLogHandler failed, error: {0}".format(str(e))) - return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, "init GatherComponentLogHandler failed, error: {0}".format(str(e))) + self.result=ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, error_data="init GatherComponentLogHandler failed, error: {0}".format(str(e))) def __check_option(self): # target check @@ -206,6 +207,10 @@ def __check_option(self): def handle(self): try: + if not self.result.is_success(): + return self.result + + # run on every node def run_on_node(context, conf_dict, node, pool_sema, gather_tuples): with pool_sema: @@ -216,7 +221,7 @@ def run_on_node(context, conf_dict, node, pool_sema, gather_tuples): gather_tuples.append(gather_tuple) except Exception as e: self.stdio.error("gather log run_on_node failed: {0}".format(str(e))) - return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="gather log failed: {0}".format(str(e))) + raise Exception("gather log run_on_node failed: {0}".format(str(e))) self.stdio.start_loading("gather start") pool_sema = threading.BoundedSemaphore(value=self.thread_nums) diff --git a/result_type.py b/result_type.py index 71f30b01..fd84f82b 100644 --- a/result_type.py +++ b/result_type.py @@ -43,3 +43,6 @@ def get_result(self): def get_code(self): return self.code + + def is_success(self): + return self.code == self.SUCCESS_CODE From b7aa4fd0c953717c03078f41b174e0463995d3bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 12 Nov 2024 16:36:17 +0800 Subject: [PATCH 12/32] update GatherComponentLogHandler --- handler/gather/gather_component_log.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index b73cf480..1c69a841 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -95,7 +95,7 @@ def init(self, context, *args, **kwargs): except Exception as e: self.stdio.error("init GatherComponentLogHandler failed, error: {0}".format(str(e))) - self.result=ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, error_data="init GatherComponentLogHandler failed, error: {0}".format(str(e))) + self.result = ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, error_data="init GatherComponentLogHandler failed, error: {0}".format(str(e))) def __check_option(self): # target check @@ -210,7 +210,6 @@ def handle(self): if not self.result.is_success(): return self.result - # run on every node def run_on_node(context, conf_dict, node, pool_sema, gather_tuples): with pool_sema: From 5e3267f4d558a7af2eb661b8f6692f8c9fca99ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 12 Nov 2024 16:45:54 +0800 Subject: [PATCH 13/32] build test rpm --- .github/workflows/build_package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml index c595dc05..06a7658f 100644 --- 
a/.github/workflows/build_package.yml +++ b/.github/workflows/build_package.yml @@ -7,6 +7,7 @@ on: push: branches: - master + - v3.0-dev env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true From ee700f922689dd0f5d009b977e3737aa1d73fc69 Mon Sep 17 00:00:00 2001 From: xuyan wang <35394786+wayyoungboy@users.noreply.github.com> Date: Tue, 12 Nov 2024 20:28:27 +0800 Subject: [PATCH 14/32] 3.0.0 check list (#542) * support check list * support check list * support check list * support check list * support check list * support check list --- diag_cmd.py | 36 +++++++++++++++++++++++++------ handler/checker/check_handler.py | 10 ++++++--- handler/display/display_scenes.py | 2 +- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/diag_cmd.py b/diag_cmd.py index 544ae539..7b305502 100644 --- a/diag_cmd.py +++ b/diag_cmd.py @@ -991,28 +991,44 @@ def _do_command(self, obdiag): return obdiag.analyze_fuction('analyze_sql_review', self.opts) -class ObdiagCheckCommand(ObdiagOriginCommand): +class ObdiagCheckRunCommand(ObdiagOriginCommand): def __init__(self): - super(ObdiagCheckCommand, self).__init__('check', 'check OceanBase cluster') + super(ObdiagCheckRunCommand, self).__init__('run', 'check OceanBase cluster') self.parser.add_option('--cases', type='string', help="check observer's cases on package_file") self.parser.add_option('--obproxy_cases', type='string', help="check obproxy's cases on package_file") self.parser.add_option('--store_dir', type='string', help='the dir to store check result, current dir by default.', default='./check_report/') - self.parser.add_option('--report_type', type='string', help='The type of the check report, support "table", "json", "xml", "yaml", "html". default table', default='table') + self.parser.add_option('--report_type', type='string', help='The type of the check report, support "table", "json", "xml", "yaml",
"html", default table', default='table') self.parser.add_option('-c', type='string', help='obdiag custom config', default=os.path.expanduser('~/.obdiag/config.yml')) self.parser.add_option('--config', action="append", type="string", help='config options Format: --config key=value') + self.parser.add_option('--env', action="append", type='string', help='env of scene') def init(self, cmd, args): - super(ObdiagCheckCommand, self).init(cmd, args) + super(ObdiagCheckRunCommand, self).init(cmd, args) self.parser.set_usage('%s [options]' % self.prev_cmd) return self def _do_command(self, obdiag): - if 'list' in self.args: - return obdiag.check_list(self.opts) + return obdiag.check(self.opts) +class ObdiagCheckListCommand(ObdiagOriginCommand): + + def __init__(self): + super(ObdiagCheckListCommand, self).__init__('list', 'check list') + self.parser.add_option('-c', type='string', help='obdiag custom config', default=os.path.expanduser('~/.obdiag/config.yml')) + self.parser.add_option('--config', action="append", type="string", help='config options Format: --config key=value') + + def init(self, cmd, args): + super(ObdiagCheckListCommand, self).init(cmd, args) + self.parser.set_usage('%s [options]' % self.prev_cmd) + return self + + def _do_command(self, obdiag): + return obdiag.check_list(self.opts) + + class ObdiagRCARunCommand(ObdiagOriginCommand): def __init__(self): @@ -1206,6 +1222,14 @@ def __init__(self): self.register_command(ObdiagRCAListCommand()) +class ObdiagCheckCommand(MajorCommand): + + def __init__(self): + super(ObdiagCheckCommand, self).__init__('check', 'Check OceanBase cluster info') + self.register_command(ObdiagCheckRunCommand()) + self.register_command(ObdiagCheckListCommand()) + + class MainCommand(MajorCommand): def __init__(self): diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py index 4a11a756..9fcd3d04 100644 --- a/handler/checker/check_handler.py +++ b/handler/checker/check_handler.py @@ -53,11 +53,14 @@ def __init__(self, context, check_target_type="observer"): self.nodes = self.context.obproxy_config.get("servers") self.tasks_base_path = os.path.expanduser(self.work_path + "/tasks/") self.check_target_type = check_target_type - + self.options = self.context.options + env_option = Util.get_option(self.options, 'env') + self.input_env = StringUtils.parse_env_display(env_option) or {} + # init output parameters self.stdio.verbose( "CheckHandler input. 
ignore_version is {0} , cluster is {1} , nodes is {2}, " "export_report_path is {3}, export_report_type is {4} , check_target_type is {5}, " - " tasks_base_path is {6}.".format( + " tasks_base_path is {6}, input_env is {7}".format( self.ignore_version, self.cluster.get("ob_cluster_name") or self.cluster.get("obproxy_cluster_name"), StringUtils.node_cut_passwd_for_log(self.nodes), @@ -65,6 +68,7 @@ def __init__(self, context, check_target_type="observer"): self.export_report_type, self.check_target_type, self.tasks_base_path, + self.input_env, ) ) @@ -219,7 +223,7 @@ def execute_one(self, task_name): if version: self.cluster["version"] = version self.stdio.verbose("cluster.version is {0}".format(self.cluster["version"])) - task = TaskBase(self.context, self.tasks[task_name]["task"], self.nodes, self.cluster, report) + task = TaskBase(self.context, self.tasks[task_name]["task"], self.nodes, self.cluster, report, task_variable_dict=self.input_env) self.stdio.verbose("{0} execute!".format(task_name)) task.execute() self.stdio.verbose("execute tasks end : {0}".format(task_name)) diff --git a/handler/display/display_scenes.py b/handler/display/display_scenes.py index 3e67b662..bc1c998d 100644 --- a/handler/display/display_scenes.py +++ b/handler/display/display_scenes.py @@ -97,7 +97,7 @@ def __init_db_conn(self, cli_connection_string): if StringUtils.validate_db_info(self.db_conn): self.__init_db_connector() else: - self.stdio.error("db connection information requird [db_connect = '-hxx -Pxx -uxx -pxx -Dxx'] but provided {0}, please check the --env {0}".format(env_dict)) + self.stdio.error("db connection information required [db_connect = '-hxx -Pxx -uxx -pxx -Dxx'] but provided {0}, please check the --env db_connect={0}".format(cli_connection_string)) self.db_connector = self.sys_connector except Exception as e: self.stdio.exception("init db connector, error: {0}, please check --env option ") From 46f7b5f2bd4ebd3d770c19c2492b2e2ed3aec10e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Tue, 12 Nov 2024 20:55:51 +0800 Subject: [PATCH 15/32] ObdiagResult update --- core.py | 53 +++++++++++++++++--------------- handler/checker/check_handler.py | 16 +++++----- handler/checker/check_list.py | 1 + handler/rca/rca_handler.py | 4 ++- 4 files changed, 39 insertions(+), 35 deletions(-) diff --git a/core.py b/core.py index c550ca01..5fc52d72 100644 --- a/core.py +++ b/core.py @@ -454,31 +454,34 @@ def check(self, opts): self._call_stdio('error', 'No such custum config') return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, error_data='No such custum config') else: - self.stdio.print("check start ...") - self.update_obcluster_nodes(config) - self.set_context('check', 'check', config) - obproxy_check_handler = None - observer_check_handler = None - result_data = {} - if self.context.obproxy_config.get("servers") is not None and len(self.context.obproxy_config.get("servers")) > 0: - obproxy_check_handler = CheckHandler(self.context, check_target_type="obproxy") - obproxy_check_handler.handle() - obproxy_result = obproxy_check_handler.execute() - result_data['obproxy'] = obproxy_result - if self.context.cluster_config.get("servers") is not None and len(self.context.cluster_config.get("servers")) > 0: - observer_check_handler = CheckHandler(self.context, check_target_type="observer") - observer_check_handler.handle() - observer_result = observer_check_handler.execute() - result_data['observer'] = observer_result - if obproxy_check_handler is not None: - obproxy_report_path =
os.path.expanduser(obproxy_check_handler.report.get_report_path()) - if os.path.exists(obproxy_report_path): - self.stdio.print("Check obproxy finished. For more details, please run cmd '" + Fore.YELLOW + " cat {0} ".format(obproxy_check_handler.report.get_report_path()) + Style.RESET_ALL + "'") - if observer_check_handler is not None: - observer_report_path = os.path.expanduser(observer_check_handler.report.get_report_path()) - if os.path.exists(observer_report_path): - self.stdio.print("Check observer finished. For more details, please run cmd'" + Fore.YELLOW + " cat {0} ".format(observer_check_handler.report.get_report_path()) + Style.RESET_ALL + "'") - return ObdiagResult(ObdiagResult.SUCCESS_CODE, data=result_data) + try: + self.stdio.print("check start ...") + self.update_obcluster_nodes(config) + self.set_context('check', 'check', config) + obproxy_check_handler = None + observer_check_handler = None + result_data = {} + if self.context.obproxy_config.get("servers") is not None and len(self.context.obproxy_config.get("servers")) > 0: + obproxy_check_handler = CheckHandler(self.context, check_target_type="obproxy") + obproxy_result = obproxy_check_handler.handle() + result_data['obproxy'] = obproxy_result + if self.context.cluster_config.get("servers") is not None and len(self.context.cluster_config.get("servers")) > 0: + observer_check_handler = CheckHandler(self.context, check_target_type="observer") + observer_result = observer_check_handler.handle() + result_data['observer'] = observer_result + if obproxy_check_handler is not None: + obproxy_report_path = os.path.expanduser(obproxy_check_handler.report.get_report_path()) + if os.path.exists(obproxy_report_path): + self.stdio.print("Check obproxy finished. For more details, please run cmd '" + Fore.YELLOW + " cat {0} ".format(obproxy_check_handler.report.get_report_path()) + Style.RESET_ALL + "'") + if observer_check_handler is not None: + observer_report_path = os.path.expanduser(observer_check_handler.report.get_report_path()) + if os.path.exists(observer_report_path): + self.stdio.print("Check observer finished. For more details, please run cmd'" + Fore.YELLOW + " cat {0} ".format(observer_check_handler.report.get_report_path()) + Style.RESET_ALL + "'") + return ObdiagResult(ObdiagResult.SUCCESS_CODE, data=result_data) + except Exception as e: + self.stdio.error("check Exception: {0}".format(e)) + self.stdio.verbose(traceback.format_exc()) + return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="check Exception: {0}".format(e)) def check_list(self, opts): config = self.config_manager diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py index 4a11a756..290a9aca 100644 --- a/handler/checker/check_handler.py +++ b/handler/checker/check_handler.py @@ -30,7 +30,6 @@ from handler.checker.check_task import TaskBase import re from common.tool import Util -from common.tool import YamlUtils from common.tool import StringUtils @@ -167,11 +166,11 @@ def handle(self): new_tasks[task_name] = task_value self.tasks = new_tasks self.stdio.verbose("tasks is {0}".format(self.tasks.keys())) - return True + return self.__execute() except Exception as e: self.stdio.error("Get package tasks failed. 
Error info is {0}".format(e)) self.stdio.verbose(traceback.format_exc()) - return False + raise CheckException("Internal error :{0}".format(e)) # get all tasks def get_all_tasks(self): @@ -209,7 +208,7 @@ def get_package_tasks(self, package_name): return packege_tasks[package_name].get("tasks") # execute task - def execute_one(self, task_name): + def __execute_one(self, task_name): try: self.stdio.verbose("execute tasks is {0}".format(task_name)) # Verify if the version is within a reasonable range @@ -232,22 +231,21 @@ def execute_one(self, task_name): self.stdio.error("execute_one Exception : {0}".format(e)) raise CheckException("execute_one Exception : {0}".format(e)) - def execute(self): + def __execute(self): try: self.stdio.verbose("execute_all_tasks. the number of tasks is {0} ,tasks is {1}".format(len(self.tasks.keys()), self.tasks.keys())) self.report = CheckReport(self.context, export_report_path=self.export_report_path, export_report_type=self.export_report_type, report_target=self.check_target_type) # one of tasks to execute for task in self.tasks: - t_report = self.execute_one(task) + t_report = self.__execute_one(task) self.report.add_task_report(t_report) self.report.export_report() return self.report.report_tobeMap() except CheckrReportException as e: self.stdio.error("Report error :{0}".format(e)) - self.stdio.verbose(traceback.format_exc()) + raise CheckException("Report error :{0}".format(e)) except Exception as e: - self.stdio.error("Internal error :{0}".format(e)) - self.stdio.verbose(traceback.format_exc()) + raise CheckException("Internal error :{0}".format(e)) class checkOBConnectorPool: diff --git a/handler/checker/check_list.py b/handler/checker/check_list.py index 8a5d6451..fc016939 100644 --- a/handler/checker/check_list.py +++ b/handler/checker/check_list.py @@ -82,4 +82,5 @@ def handle(self): Util.print_scene(cases_map, stdio=self.stdio) return ObdiagResult(ObdiagResult.SUCCESS_CODE, data=result_map) except Exception as e: + return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data=str(e)) diff --git a/handler/rca/rca_handler.py b/handler/rca/rca_handler.py index 1c2797be..8a0bfef2 100644 --- a/handler/rca/rca_handler.py +++ b/handler/rca/rca_handler.py @@ -171,11 +171,13 @@ def handle(self): except Exception as e: raise Exception("rca_scene.init err: {0}".format(e)) self.stdio.verbose("{0} init success".format(scene_name)) + return self.__execute() else: + self.stdio.error("rca_scene :{0} is not exist or not input".format(scene_name)) raise Exception("rca_scene :{0} is not exist or not input".format(scene_name)) # get all tasks - def execute(self): + def __execute(self): try: self.rca_scene.execute() except RCANotNeedExecuteException as e: From 942e028d0ba16adcf42ad8e59b84d5288a5d6aa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Wed, 13 Nov 2024 15:57:39 +0800 Subject: [PATCH 16/32] update GatherComponentLogHandler --- handler/gather/gather_component_log.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index 1c69a841..f2e37a81 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -380,11 +380,13 @@ def handle(self): return else: self.stdio.verbose("gather_log_on_node {0} download log to local store_dir: {1}".format(self.ssh_client.get_ip(), self.store_dir)) - self.ssh_client.download(tar_file, self.store_dir) - + self.ssh_client.download(tar_file, os.path.join(self.store_dir, 
os.path.basename("{0}".format(tar_file)))) # tar to zip - tar_file_name = "{0}.tar.gz".format(tmp_log_dir) - local_zip_store_path = os.path.join(self.store_dir, os.path.basename("{0}.zip".format(tmp_log_dir))) + tar_file_name = os.path.basename("{0}".format(tar_file)) + self.stdio.verbose("tar_file_name: {0}".format(tar_file_name)) + local_tar_file_path = os.path.join(self.store_dir, tar_file_name) + local_zip_store_path = os.path.join(self.store_dir, os.path.basename("{0}.zip".format(tar_file))) + self.stdio.verbose("local_tar_file_path: {0}; local_zip_store_path: {1}".format(local_tar_file_path, local_zip_store_path)) FileUtil.tar_gz_to_zip(self.store_dir, tar_file_name, local_zip_store_path, self.zip_password, self.stdio) self.gather_tuple["file_size"] = FileUtil.size_format(num=int(os.path.getsize(local_zip_store_path) or 0), output_str=True) self.gather_tuple["info"] = "file save in {0}".format(local_zip_store_path) @@ -393,6 +395,7 @@ def handle(self): self.stdio.verbose("clear tar file: {0}".format(local_tar_file_name)) os.remove(local_tar_file_name) except Exception as e: + self.stdio.verbose(traceback.format_exc()) self.stdio.error("gather_log_on_node {0} failed: {1}".format(self.ssh_client.get_ip(), str(e))) self.gather_tuple["info"] = str(e) finally: From 7867b26db18a7fbcd577d5d2a166235af3d818b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Wed, 13 Nov 2024 16:32:52 +0800 Subject: [PATCH 17/32] update GatherComponentLogHandler --- handler/gather/gather_component_log.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index f2e37a81..fb7b6888 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -278,9 +278,11 @@ def __get_overall_summary(self, node_summary_tuple): """ summary_tb = PrettyTable() summary_tb.title = "{0} Gather Ob Log Summary on {1}".format(self.target, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - summary_tb.field_names = ["Node", "Status", "Size", "info"] + self.stdio.verbose("node_summary_tuple: {0}".format(node_summary_tuple)) if self.zip_password: summary_tb.field_names = ["Node", "Status", "Size", "info", "zip_password"] + else: + summary_tb.field_names = ["Node", "Status", "Size", "info"] try: for tup in node_summary_tuple: if self.zip_password: @@ -385,20 +387,18 @@ def handle(self): tar_file_name = os.path.basename("{0}".format(tar_file)) self.stdio.verbose("tar_file_name: {0}".format(tar_file_name)) local_tar_file_path = os.path.join(self.store_dir, tar_file_name) - local_zip_store_path = os.path.join(self.store_dir, os.path.basename("{0}.zip".format(tar_file))) + local_zip_store_path = os.path.join(self.store_dir, os.path.basename("{0}.zip".format(tmp_log_dir))) self.stdio.verbose("local_tar_file_path: {0}; local_zip_store_path: {1}".format(local_tar_file_path, local_zip_store_path)) - FileUtil.tar_gz_to_zip(self.store_dir, tar_file_name, local_zip_store_path, self.zip_password, self.stdio) + FileUtil.tar_gz_to_zip(self.store_dir, local_tar_file_path, local_zip_store_path, self.zip_password, self.stdio) self.gather_tuple["file_size"] = FileUtil.size_format(num=int(os.path.getsize(local_zip_store_path) or 0), output_str=True) self.gather_tuple["info"] = "file save in {0}".format(local_zip_store_path) self.gather_tuple["success"] = "Success" - local_tar_file_name = os.path.join(self.store_dir, os.path.basename("{0}".format(tar_file_name))) - self.stdio.verbose("clear tar 
file: {0}".format(local_tar_file_name)) - os.remove(local_tar_file_name) except Exception as e: self.stdio.verbose(traceback.format_exc()) self.stdio.error("gather_log_on_node {0} failed: {1}".format(self.ssh_client.get_ip(), str(e))) self.gather_tuple["info"] = str(e) finally: + self.stdio.verbose("clear tmp_log_dir: {0}".format(tmp_log_dir)) self.ssh_client.exec_cmd("rm -rf {0}".format(tmp_log_dir)) self.stdio.verbose("gather_log_on_node {0} finished".format(self.ssh_client.get_ip())) From ae8d218d50052733c911fcf7e5f47aa6e3b4d580 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Wed, 13 Nov 2024 20:07:19 +0800 Subject: [PATCH 18/32] threading change to multiprocessing --- handler/gather/gather_component_log.py | 56 +++++++++----------------- 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index fb7b6888..4e649e23 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -11,9 +11,9 @@ # See the Mulan PSL v2 for more details. import datetime import os -import threading import traceback import uuid +import multiprocessing as mp from prettytable import PrettyTable from common.command import get_file_start_time, get_file_size, is_empty_dir @@ -132,14 +132,6 @@ def __check_option(self): raise Exception("can not get nodes by target: {0}".format(self.target)) if len(self.nodes) == 0: raise Exception("can not get nodes by target: {0}, nodes's len is 0.".format(self.target)) - # build ssh_client for every node - new_nodes = [] - for node in self.nodes: - new_node = node - ssh_client = SshClient(self.context, node) - new_node["ssh_client"] = ssh_client - new_nodes.append(new_node) - self.nodes = new_nodes # check scope if self.scope is None or self.scope == "" or self.scope == "all": self.scope = "all" @@ -211,39 +203,30 @@ def handle(self): return self.result # run on every node - def run_on_node(context, conf_dict, node, pool_sema, gather_tuples): - with pool_sema: - try: - task = GatherLogOnNode(context, node, conf_dict, pool_sema) - task.handle() - gather_tuple = task.get_result() - gather_tuples.append(gather_tuple) - except Exception as e: - self.stdio.error("gather log run_on_node failed: {0}".format(str(e))) - raise Exception("gather log run_on_node failed: {0}".format(str(e))) - - self.stdio.start_loading("gather start") - pool_sema = threading.BoundedSemaphore(value=self.thread_nums) node_threads = [] gather_tuples = [] + tasks = [] + self.stdio.start_loading("gather redact start") for node in self.nodes: - next_context = self.context - next_context.stdio = self.stdio.sub_io() - node_thread = threading.Thread( - target=run_on_node, - args=(next_context, self.gather_log_conf_dict, node, pool_sema, gather_tuples), - ) - node_thread.start() - node_threads.append(node_thread) - for node_thread in node_threads: - node_thread.join() + new_context = self.context + new_context.stdio = self.stdio.sub_io() + tasks.append(GatherLogOnNode(new_context, node, self.gather_log_conf_dict)) + with mp.Pool(processes=self.thread_nums) as pool: + for task in tasks: + node_threads.append(pool.apply_async(task.handle())) + pool.close() + pool.join() # wait for all task to finish + for task in tasks: + gather_tuple = task.get_result() + gather_tuples.append(gather_tuple) self.stdio.verbose("gather_tuples: {0}".format(gather_tuples)) - self.stdio.stop_loading("gather successes") + self.stdio.stop_loading("succeed") # save result summary_tuples = 
self.__get_overall_summary(gather_tuples) self.stdio.print(summary_tuples) with open(os.path.join(self.store_dir, "result_details.txt"), 'a', encoding='utf-8') as fileobj: fileobj.write(summary_tuples.get_string()) + self.stdio.stop_loading("succeed") last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(self.store_dir, "result_summary.txt")) self.stdio.print(last_info) @@ -258,7 +241,7 @@ def run_on_node(context, conf_dict, node, pool_sema, gather_tuples): redact = Redact(self.context, self.store_dir, redact_dir, zip_password=self.zip_password) redact.redact_files(self.redact) self.stdio.print("redact success the log save on {0}".format(self.redact_dir)) - self.stdio.stop_loading("gather redact successes") + self.stdio.stop_loading("succeed") return ObdiagResult(ObdiagResult.SUCCESS_CODE, data={"store_dir": redact_dir, "redact_dir": self.redact_dir}) except Exception as e: self.stdio.verbose(traceback.format_exc()) @@ -295,13 +278,12 @@ def __get_overall_summary(self, node_summary_tuple): class GatherLogOnNode: - def __init__(self, context, node, config, pool_sema): - self.ssh_client = node["ssh_client"] + def __init__(self, context, node, config): self.context = context + self.ssh_client = SshClient(context, node) self.stdio = context.stdio self.config = config self.node = node - self.pool_sema = pool_sema self.target = self.config.get("target") # mkdir tmp_dir From 1ebe588da2208ec5c308e48315e4d5a155a5a40c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Wed, 13 Nov 2024 21:10:00 +0800 Subject: [PATCH 19/32] threading change to multiprocessing --- handler/gather/gather_component_log.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index 4e649e23..a662837d 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -206,16 +206,20 @@ def handle(self): node_threads = [] gather_tuples = [] tasks = [] - self.stdio.start_loading("gather redact start") + self.stdio.start_loading("gather start") + semaphore = mp.Semaphore(self.thread_nums) for node in self.nodes: new_context = self.context new_context.stdio = self.stdio.sub_io() - tasks.append(GatherLogOnNode(new_context, node, self.gather_log_conf_dict)) - with mp.Pool(processes=self.thread_nums) as pool: - for task in tasks: - node_threads.append(pool.apply_async(task.handle())) - pool.close() - pool.join() # wait for all task to finish + tasks.append(GatherLogOnNode(new_context, node, self.gather_log_conf_dict, semaphore)) + file_queue = [] + for task in tasks: + semaphore.acquire() + file_thread = mp.Process(target=task.handle()) + file_thread.start() + file_queue.append(file_thread) + for file_thread in file_queue: + file_thread.join() for task in tasks: gather_tuple = task.get_result() gather_tuples.append(gather_tuple) @@ -278,13 +282,14 @@ def __get_overall_summary(self, node_summary_tuple): class GatherLogOnNode: - def __init__(self, context, node, config): + def __init__(self, context, node, config, semaphore): self.context = context self.ssh_client = SshClient(context, node) self.stdio = context.stdio self.config = config self.node = node self.target = self.config.get("target") + self.semaphore = semaphore # mkdir tmp_dir self.tmp_dir = self.config.get("tmp_dir") @@ -383,6 +388,7 @@ def handle(self): self.stdio.verbose("clear tmp_log_dir: {0}".format(tmp_log_dir)) self.ssh_client.exec_cmd("rm -rf 
{0}".format(tmp_log_dir)) self.stdio.verbose("gather_log_on_node {0} finished".format(self.ssh_client.get_ip())) + self.semaphore.release() def __grep_log_to_tmp(self, logs_name, tmp_log_dir): grep_cmd = "" From 3e5d4bf0f6e9b3c6b9cb94a2dcc21f983e9572a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 14 Nov 2024 11:39:15 +0800 Subject: [PATCH 20/32] tmp delete test package --- .github/workflows/build_package.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml index 06a7658f..c595dc05 100644 --- a/.github/workflows/build_package.yml +++ b/.github/workflows/build_package.yml @@ -7,7 +7,6 @@ on: push: branches: - master - - v3.0-dev env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true From e1ca7ebb5001d2a08a3a81fe181745beed568927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 14 Nov 2024 16:30:00 +0800 Subject: [PATCH 21/32] update GatherComponentLogHandler --- handler/gather/gather_component_log.py | 33 +++++++++++++++++--------- handler/rca/plugins/gather.py | 2 +- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index a662837d..4b609b81 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -61,7 +61,11 @@ def init(self, context, *args, **kwargs): self.inner_config = self.context.inner_config self.target = kwargs.get('target', None) self.from_option = kwargs.get('from_option', None) + if self.from_option: + self.from_option = self.from_option.strip() self.to_option = kwargs.get('to_option', None) + if self.to_option: + self.to_option = self.to_option.strip() self.since_option = kwargs.get('since', None) self.scope = kwargs.get('scope', None) self.grep = kwargs.get('grep', None) @@ -213,22 +217,22 @@ def handle(self): new_context.stdio = self.stdio.sub_io() tasks.append(GatherLogOnNode(new_context, node, self.gather_log_conf_dict, semaphore)) file_queue = [] + result_list = mp.Queue() for task in tasks: semaphore.acquire() - file_thread = mp.Process(target=task.handle()) + file_thread = mp.Process(target=task.handle, args=(result_list,)) file_thread.start() file_queue.append(file_thread) for file_thread in file_queue: file_thread.join() - for task in tasks: - gather_tuple = task.get_result() - gather_tuples.append(gather_tuple) + for _ in range(result_list.qsize()): + gather_tuples.append(result_list.get()) self.stdio.verbose("gather_tuples: {0}".format(gather_tuples)) self.stdio.stop_loading("succeed") # save result summary_tuples = self.__get_overall_summary(gather_tuples) self.stdio.print(summary_tuples) - with open(os.path.join(self.store_dir, "result_details.txt"), 'a', encoding='utf-8') as fileobj: + with open(os.path.join(self.store_dir, "result_summary.txt"), 'a', encoding='utf-8') as fileobj: fileobj.write(summary_tuples.get_string()) self.stdio.stop_loading("succeed") @@ -284,7 +288,7 @@ def __get_overall_summary(self, node_summary_tuple): class GatherLogOnNode: def __init__(self, context, node, config, semaphore): self.context = context - self.ssh_client = SshClient(context, node) + self.ssh_client = None self.stdio = context.stdio self.config = config self.node = node @@ -293,8 +297,6 @@ def __init__(self, context, node, config, semaphore): # mkdir tmp_dir self.tmp_dir = self.config.get("tmp_dir") - self.tmp_dir = os.path.join(self.tmp_dir, "obdiag_gather_{0}".format(str(uuid.uuid4()))) - self.ssh_client.exec_cmd("mkdir 
-p {0}".format(self.tmp_dir)) self.scope = self.config.get("scope") # todo log_path for oms @@ -315,17 +317,23 @@ def __init__(self, context, node, config, semaphore): self.file_number_limit = self.config.get("file_number_limit") self.file_size_limit = self.config.get("file_size_limit") self.gather_tuple = { - "node": self.ssh_client.get_name(), + "node": "", "success": "Fail", "info": "", "file_size": 0, } + self.result_list = None def get_result(self): return self.gather_tuple - def handle(self): - + def handle(self, result_list=None): + self.result_list = result_list + self.ssh_client = SshClient(self.context, self.node) + self.gather_tuple["node"] = self.ssh_client.get_name() + self.tmp_dir = os.path.join(self.tmp_dir, "obdiag_gather_{0}".format(str(uuid.uuid4()))) + self.ssh_client.exec_cmd("mkdir -p {0}".format(self.tmp_dir)) + self.stdio.verbose("do it") from_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.from_time_str)) to_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.to_time_str)) @@ -388,6 +396,9 @@ def handle(self): self.stdio.verbose("clear tmp_log_dir: {0}".format(tmp_log_dir)) self.ssh_client.exec_cmd("rm -rf {0}".format(tmp_log_dir)) self.stdio.verbose("gather_log_on_node {0} finished".format(self.ssh_client.get_ip())) + self.stdio.verbose("gather_log_on_node {0} gather_tuple: {1}".format(self.ssh_client.get_ip(), self.gather_tuple)) + if self.result_list: + self.result_list.put(self.gather_tuple) self.semaphore.release() def __grep_log_to_tmp(self, logs_name, tmp_log_dir): diff --git a/handler/rca/plugins/gather.py b/handler/rca/plugins/gather.py index 39e1a057..83d1fd4a 100644 --- a/handler/rca/plugins/gather.py +++ b/handler/rca/plugins/gather.py @@ -133,7 +133,7 @@ def execute(self, save_path=""): # Extract all files to the current directory zip_ref.extractall(gather_result) for file_name in os.listdir(gather_result): - if "zip" not in file_name and "result_summary.txt" not in file_name: + if "zip" not in file_name and not file_name.endswith(".txt"): log_dir = os.path.join(gather_result, file_name) for log_file in os.listdir(log_dir): result_log_files.append(os.path.join(log_dir, log_file)) From 05d684ee98f3fd5620ed0b442fb61c493ad9e0f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 14 Nov 2024 17:15:31 +0800 Subject: [PATCH 22/32] build test package --- .github/workflows/build_package.yml | 1 + core.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml index de04b0e9..118d115b 100644 --- a/.github/workflows/build_package.yml +++ b/.github/workflows/build_package.yml @@ -7,6 +7,7 @@ on: push: branches: - master + - v3.0-dev env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true diff --git a/core.py b/core.py index 1bf1a008..c287f11c 100644 --- a/core.py +++ b/core.py @@ -508,8 +508,7 @@ def rca_run(self, opts): self.set_context('rca_run', 'rca_run', config) try: handler = RCAHandler(self.context) - handler.handle() - return handler.execute() + return handler.handle() except Exception as e: self.stdio.error("rca run Exception: {0}".format(e)) self.stdio.verbose(traceback.format_exc()) From ce0eb2e20e819937aa092ff2f632d4654bb8cfc7 Mon Sep 17 00:00:00 2001 From: xiaodong-ji Date: Fri, 15 Nov 2024 17:55:56 +0800 Subject: [PATCH 23/32] replace execute for handler (#557) * replace execute with handle * update core --- core.py | 2 +- update/update.py | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/core.py b/core.py index c287f11c..c1d71c26 100644 --- a/core.py +++ b/core.py @@ -533,7 +533,7 @@ def update(self, opts): self.stdio.print("update start ...") self.set_offline_context('update', 'update') handler = UpdateHandler(self.context) - return handler.execute() + return handler.handle() def config(self, opt): config = self.config_manager diff --git a/update/update.py b/update/update.py index c9b73ae6..0390bad3 100644 --- a/update/update.py +++ b/update/update.py @@ -46,7 +46,7 @@ def __init__(self, context): self.file_path = Util.get_option(self.options, 'file', default="") self.force = Util.get_option(self.options, 'force', default=False) - def execute(self): + def handle(self): try: file_path = self.file_path force = self.force From 2a64229e95378f00a60310c31e91e1af1251142e Mon Sep 17 00:00:00 2001 From: xuyan wang <35394786+wayyoungboy@users.noreply.github.com> Date: Thu, 21 Nov 2024 17:14:37 +0800 Subject: [PATCH 24/32] V3.0 dev (#567) * build test package * build test package * update GatherComponentLogHandler * update tar_gz_to_zip * delete zip on gather * delete zip on gather * delete zip on gather * delete zip on gather * delete zip on gather * gather use find to get remote log * rca add oms_full_trans * rca add oms_full_trans * update --- common/tool.py | 10 +- config.py | 32 ++++ context.py | 3 +- core.py | 2 + handler/gather/gather_ash_report.py | 4 + handler/gather/gather_component_log.py | 196 ++++++++++++++-------- handler/gather/plugins/redact.py | 70 +++----- handler/rca/plugins/gather.py | 57 ++++--- handler/rca/rca_handler.py | 16 +- handler/rca/scene/oms_full_trans_scene.py | 152 +++++++++++++++++ 10 files changed, 394 insertions(+), 148 deletions(-) create mode 100644 handler/rca/scene/oms_full_trans_scene.py diff --git a/common/tool.py b/common/tool.py index f26d81c3..788c518c 100644 --- a/common/tool.py +++ b/common/tool.py @@ -682,20 +682,14 @@ def tar_gz_to_zip(temp_dir, tar_gz_file, output_zip, password, stdio): base_paths.append(base_path) stdio.verbose("start pyminizip compress_multiple") # 3. Compress the extracted files into a (possibly) encrypted zip file - zip_process = None if password: # Use pyminizip to create the encrypted zip file - zip_process = mp.Process(target=pyminizip.compress_multiple, args=(files_to_compress, base_paths, output_zip, password, 5)) - # pyminizip.compress_multiple(files_to_compress, base_paths, output_zip, password, 5) # 5 is the compression level + pyminizip.compress_multiple(files_to_compress, base_paths, output_zip, password, 5) # 5 is the compression level stdio.verbose("extracted files compressed into encrypted {0}".format(output_zip)) else: # Create an unencrypted zip file - zip_process = mp.Process(target=pyminizip.compress_multiple, args=(files_to_compress, base_paths, output_zip, None, 5)) - # pyminizip.compress_multiple(files_to_compress, base_paths, output_zip, None, 5) + pyminizip.compress_multiple(files_to_compress, base_paths, output_zip, None, 5) stdio.verbose("extracted files compressed into unencrypted {0}".format(output_zip)) - zip_process.start() - if zip_process is not None: - zip_process.join() # 4. 
Remove the extracted directory shutil.rmtree(extract_dir) diff --git a/config.py b/config.py index 4d08ed10..fd871781 100644 --- a/config.py +++ b/config.py @@ -255,6 +255,38 @@ def create_ob_proxy_node(node_config, global_config): 'servers': ob_proxy_nodes, } + @property + def get_oms_config(self): + oms = self.config_data.get('oms', {}) + nodes = oms.get('servers', {}).get('nodes', []) + + def create_oms_node(node_config, global_config): + return { + 'ip': node_config.get('ip'), + 'ssh_username': node_config.get('ssh_username', global_config.get('ssh_username', '')), + 'ssh_password': node_config.get('ssh_password', global_config.get('ssh_password', '')), + 'ssh_port': node_config.get('ssh_port', global_config.get('ssh_port', 22)), + 'home_path': node_config.get('home_path', global_config.get('home_path', '/root/obproxy')), + 'log_path': node_config.get('log_path', global_config.get('log_path', '/home/admin/logs')), + 'run_path': node_config.get('run_path', global_config.get('run_path', '/home/admin/run')), + 'store_path': node_config.get('store_path', global_config.get('store_path', '/home/admin/store')), + 'ssh_key_file': node_config.get('ssh_key_file', global_config.get('ssh_key_file', '')), + 'ssh_type': node_config.get('ssh_type', global_config.get('ssh_type', 'remote')), + 'container_name': node_config.get('container_name', global_config.get('container_name')), + 'namespace': node_config.get('namespace', global_config.get('namespace', '')), + 'pod_name': node_config.get('pod_name', global_config.get('pod_name', '')), + "kubernetes_config_file": node_config.get('kubernetes_config_file', global_config.get('kubernetes_config_file', '')), + 'host_type': 'OMS', + } + + global_config = oms.get('servers', {}).get('global', {}) + oms_nodes = [create_oms_node(node, global_config) for node in nodes] + + return { + 'oms_cluster_name': oms.get('oms_cluster_name'), + 'servers': oms_nodes, + } + @property def get_node_config(self, type, node_ip, config_item): if type == 'ob_cluster': diff --git a/context.py b/context.py index 7d9d5126..d084a6ca 100644 --- a/context.py +++ b/context.py @@ -102,12 +102,13 @@ def return_false(self, *args, **kwargs): class HandlerContext(object): - def __init__(self, handler_name=None, namespace=None, namespaces=None, cluster_config=None, obproxy_config=None, ocp_config=None, inner_config=None, cmd=None, options=None, stdio=None): + def __init__(self, handler_name=None, namespace=None, namespaces=None, cluster_config=None, obproxy_config=None, oms_config=None, ocp_config=None, inner_config=None, cmd=None, options=None, stdio=None): self.namespace = HandlerContextNamespace(namespace) self.namespaces = namespaces self.handler_name = handler_name self.cluster_config = cluster_config self.obproxy_config = obproxy_config + self.oms_config = oms_config self.ocp_config = ocp_config self.inner_config = inner_config self.cmds = cmd diff --git a/core.py b/core.py index c1d71c26..2c6cddd1 100644 --- a/core.py +++ b/core.py @@ -133,6 +133,7 @@ def set_context(self, handler_name, namespace, config): namespace=namespace, cluster_config=config.get_ob_cluster_config, obproxy_config=config.get_obproxy_config, + oms_config=config.get_oms_config, ocp_config=config.get_ocp_config, cmd=self.cmds, options=self.options, @@ -147,6 +148,7 @@ def set_context_skip_cluster_conn(self, handler_name, namespace, config): namespace=namespace, cluster_config=config.get_ob_cluster_config, obproxy_config=config.get_obproxy_config, + oms_config=config.get_oms_config, ocp_config=config.get_ocp_config, 
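# (oms_config now rides alongside the cluster/obproxy configs; get_oms_config,
#  added above, merges each node entry with the servers.global defaults and
#  yields server dicts shaped roughly like this -- values illustrative:
#      {'ip': '10.0.0.1', 'ssh_port': 22, 'ssh_type': 'remote',
#       'log_path': '/home/admin/logs', 'run_path': '/home/admin/run',
#       'store_path': '/home/admin/store', 'host_type': 'OMS'})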
cmd=self.cmds, options=self.options, diff --git a/handler/gather/gather_ash_report.py b/handler/gather/gather_ash_report.py index 6cd91510..410f1d55 100644 --- a/handler/gather/gather_ash_report.py +++ b/handler/gather/gather_ash_report.py @@ -17,6 +17,7 @@ """ import datetime import os +import traceback from common.command import get_observer_version from common.ob_connector import OBConnector @@ -74,6 +75,7 @@ def version_check(self): try: observer_version = get_observer_version(self.context) except Exception as e: + self.stdio.verbose(traceback.format_exc()) self.stdio.warn("RCAHandler Failed to get observer version:{0}".format(e)) return False self.stdio.verbose("RCAHandler.init get observer version: {0}".format(observer_version)) @@ -108,6 +110,7 @@ def execute(self): f.write(self.ash_report_file_name) except Exception as e: + self.stdio.verbose(traceback.format_exc()) self.stdio.error("ash report gather failed, error message: {0}".format(e)) def __init_report_path(self): @@ -116,6 +119,7 @@ def __init_report_path(self): self.stdio.verbose("Use {0} as pack dir.".format(self.report_path)) DirectoryUtil.mkdir(path=self.report_path, stdio=self.stdio) except Exception as e: + self.stdio.verbose(traceback.format_exc()) self.stdio.error("init_report_path failed, error:{0}".format(e)) def init_option(self): diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py index 4b609b81..091843b4 100644 --- a/handler/gather/gather_component_log.py +++ b/handler/gather/gather_component_log.py @@ -11,15 +11,17 @@ # See the Mulan PSL v2 for more details. import datetime import os +import tarfile import traceback import uuid import multiprocessing as mp +import shutil from prettytable import PrettyTable from common.command import get_file_start_time, get_file_size, is_empty_dir from common.constant import const from common.ssh_client.ssh import SshClient -from common.tool import FileUtil, TimeUtils, Util +from common.tool import FileUtil, TimeUtils from handler.base_shell_handler import BaseShellHandler from handler.gather.plugins.redact import Redact from result_type import ObdiagResult @@ -27,11 +29,17 @@ class GatherComponentLogHandler(BaseShellHandler): # log_scope_list - log_scope_list = {"observer": ["observer", "rootservice", "election"], "obproxy": ["obproxy", "obproxy_digest", "obproxy_stat", "obproxy_slow", "obproxy_limit"], "oms": ["connector", "error"]} + log_scope_list = { + "observer": {"observer": {"key": "*observer*"}, "rootservice": {"key": "*rootservice*"}, "election": {"key": "*election*"}}, + "obproxy": {"obproxy": {"key": "*obproxy*"}, "obproxy_digest": {"key": "*obproxy_digest*"}, "obproxy_stat": {"key": "*obproxy_stat*"}, "obproxy_slow": {"key": "*obproxy_slow*"}, "obproxy_limit": {"key": "*obproxy_limit*"}}, + "oms": {"connector": {"key": "*connector.*"}, "error": {"key": "error"}, "trace.log": {"key": "trace.log"}, "metrics": {"key": "metrics*"}}, + } def __init__(self, *args, **kwargs): super().__init__() - self.oms_module_id = None + self.all_files = None + self.gather_tuples = None + self.oms_component_id = None self.redact_dir = None self.gather_log_conf_dict = None self.thread_nums = None @@ -51,7 +59,6 @@ def __init__(self, *args, **kwargs): self.temp_dir = None self.redact = None self.nodes = None - self.zip_password = None self.result = ObdiagResult(ObdiagResult.SUCCESS_CODE, data={}) def init(self, context, *args, **kwargs): @@ -77,15 +84,12 @@ def init(self, context, *args, **kwargs): self.is_scene = kwargs.get('is_scene', False) 
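# For orientation, a sketch of a typical call into this init/handle pair;
# values are illustrative, and from_option/to_option take '%Y-%m-%d %H:%M:%S'
# strings:
#
#     handler = GatherComponentLogHandler()
#     handler.init(context, target="observer", store_dir="./gather",
#                  from_option="2024-11-21 10:00:00",
#                  to_option="2024-11-21 11:00:00", thread_nums=3)
#     result = handler.handle()  # ObdiagResult; check with result.is_success()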
self.oms_log_path = kwargs.get('oms_log_path', None) self.thread_nums = kwargs.get('thread_nums', 3) - self.oms_module_id = kwargs.get('oms_module_id', None) + self.oms_component_id = kwargs.get('oms_component_id', None) self.__check_option() - if self.oms_module_id: - self.gather_log_conf_dict["oms_module_id"] = self.oms_module_id # build config dict for gather log on node self.gather_log_conf_dict = { "target": self.target, "tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT, - "zip_password": self.zip_password, "scope": self.scope, "grep": self.grep, "encrypt": self.encrypt, @@ -94,7 +98,7 @@ def init(self, context, *args, **kwargs): "to_time": self.to_time_str, "file_number_limit": self.file_number_limit, "file_size_limit": self.file_size_limit, - "oms_module_id": self.oms_module_id, + "oms_component_id": self.oms_component_id, } except Exception as e: @@ -138,12 +142,18 @@ def __check_option(self): raise Exception("can not get nodes by target: {0}, nodes's len is 0.".format(self.target)) # check scope if self.scope is None or self.scope == "" or self.scope == "all": - self.scope = "all" self.scope = self.log_scope_list[self.target] else: self.scope = self.scope.strip() if self.scope not in self.log_scope_list[self.target]: raise Exception("scope option can only be {0},the {1} just support {2}".format(self.scope, self.target, self.log_scope_list)) + # check grep + if self.grep: + if isinstance(self.grep, list): + pass + elif isinstance(self.grep, str): + self.grep = self.grep.strip() + # check since from_option and to_option from_timestamp = None to_timestamp = None @@ -171,12 +181,7 @@ def __check_option(self): else: self.from_time_str = (now_time - datetime.timedelta(minutes=30)).strftime('%Y-%m-%d %H:%M:%S') self.stdio.print('gather log from_time: {0}, to_time: {1}'.format(self.from_time_str, self.to_time_str)) - # check encrypt - if self.encrypt: - if self.encrypt.strip().upper() == "TRUE": - self.encrypt = True - self.zip_password = Util.gen_password(16) - self.stdio.verbose("zip_encrypt is True, zip_password is {0}".format(self.zip_password)) + # check redact if self.redact: if self.redact != "" and len(self.redact) != 0: @@ -208,14 +213,23 @@ def handle(self): # run on every node node_threads = [] - gather_tuples = [] + self.gather_tuples = [] tasks = [] self.stdio.start_loading("gather start") semaphore = mp.Semaphore(self.thread_nums) for node in self.nodes: new_context = self.context new_context.stdio = self.stdio.sub_io() - tasks.append(GatherLogOnNode(new_context, node, self.gather_log_conf_dict, semaphore)) + # use Process must delete ssh_client, and GatherLogOnNode will rebuild it. 
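Note (illustrative sketch, not part of the patch): the comment above is about picklability. mp.Process pickles its arguments under the spawn/forkserver start methods, and a live SSH session, like a raw socket, cannot be pickled; so the handle is stripped here and GatherLogOnNode rebuilds it with SshClient inside the worker. In miniature, with a socket standing in for the SSH handle:

import pickle
import socket

node = {"ip": "10.0.0.1", "ssh_client": socket.socket()}
try:
    pickle.dumps(node)  # what mp.Process does to its args under spawn
except TypeError as e:
    print("unpicklable:", e)  # cannot pickle 'socket' object
node.pop("ssh_client")  # strip the live handle; the child reconnects from the plain fields
assert pickle.loads(pickle.dumps(node)) == node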
+ if "ssh_client" in node or "ssher" in node: + clear_node = node + if "ssh_client" in node: + del clear_node["ssh_client"] + if "ssher" in node: + del clear_node["ssher"] + tasks.append(GatherLogOnNode(new_context, clear_node, self.gather_log_conf_dict, semaphore)) + else: + tasks.append(GatherLogOnNode(new_context, node, self.gather_log_conf_dict, semaphore)) file_queue = [] result_list = mp.Queue() for task in tasks: @@ -226,11 +240,11 @@ def handle(self): for file_thread in file_queue: file_thread.join() for _ in range(result_list.qsize()): - gather_tuples.append(result_list.get()) - self.stdio.verbose("gather_tuples: {0}".format(gather_tuples)) + self.gather_tuples.append(result_list.get()) + self.stdio.verbose("gather_tuples: {0}".format(self.gather_tuples)) self.stdio.stop_loading("succeed") # save result - summary_tuples = self.__get_overall_summary(gather_tuples) + summary_tuples = self.__get_overall_summary(self.gather_tuples) self.stdio.print(summary_tuples) with open(os.path.join(self.store_dir, "result_summary.txt"), 'a', encoding='utf-8') as fileobj: fileobj.write(summary_tuples.get_string()) @@ -238,17 +252,18 @@ def handle(self): last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(self.store_dir, "result_summary.txt")) self.stdio.print(last_info) - if self.zip_password: - self.stdio.print("zip password is {0}".format(self.zip_password)) try: if self.redact and len(self.redact) > 0: self.stdio.start_loading("gather redact start") self.stdio.verbose("redact_option is {0}".format(self.redact)) redact_dir = "{0}_redact".format(self.store_dir) self.redact_dir = redact_dir - redact = Redact(self.context, self.store_dir, redact_dir, zip_password=self.zip_password) - redact.redact_files(self.redact) + all_files = self.open_all_file() + self.stdio.verbose(all_files) + redact = Redact(self.context, self.store_dir, redact_dir) + redact.redact_files(self.redact, all_files) self.stdio.print("redact success the log save on {0}".format(self.redact_dir)) + self.__delete_all_files_in_tar() self.stdio.stop_loading("succeed") return ObdiagResult(ObdiagResult.SUCCESS_CODE, data={"store_dir": redact_dir, "redact_dir": self.redact_dir}) except Exception as e: @@ -270,20 +285,51 @@ def __get_overall_summary(self, node_summary_tuple): summary_tb = PrettyTable() summary_tb.title = "{0} Gather Ob Log Summary on {1}".format(self.target, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) self.stdio.verbose("node_summary_tuple: {0}".format(node_summary_tuple)) - if self.zip_password: - summary_tb.field_names = ["Node", "Status", "Size", "info", "zip_password"] - else: - summary_tb.field_names = ["Node", "Status", "Size", "info"] + summary_tb.field_names = ["Node", "Status", "Size", "info"] try: for tup in node_summary_tuple: - if self.zip_password: - summary_tb.add_row([tup["node"], tup["success"], tup["file_size"], tup["info"], self.zip_password]) - else: - summary_tb.add_row([tup["node"], tup["success"], tup["file_size"], tup["info"]]) + summary_tb.add_row([tup["node"], tup["success"], tup["file_size"], tup["info"]]) except Exception as e: + self.stdio.verbose(traceback.format_exc()) self.stdio.error("gather log __get_overall_summary failed: {0}".format(str(e))) return summary_tb + def open_all_file(self): + all_files = {} + if not self.gather_tuples: + raise Exception("summary_tuples is None. 
can't open all file") + for tup in self.gather_tuples: + if not tup["file_path"] or len(tup["file_path"]) == 0 or not os.path.exists(tup["file_path"]): + self.stdio.verbose("file_path is None or not exists, can't open file") + continue + try: + file_path = tup["file_path"] + self.stdio.verbose("open file {0}".format(tup["file_path"])) + # 打开 tar.gz 文件 + extract_path = os.path.dirname(file_path) + with tarfile.open(file_path, 'r:gz') as tar: + # get all files in tar + tar.extractall(path=extract_path) + extracted_files = tar.getnames() + self.stdio.verbose("extracted_files: {0}".format(extracted_files)) + extracted_files_new = [] + for extracted_file in extracted_files: + extracted_files_new.append(os.path.join(self.store_dir, extracted_file)) + all_files[file_path] = extracted_files_new + except Exception as e: + self.stdio.verbose(traceback.format_exc()) + self.stdio.error("gather open_all_filefailed: {0}".format(str(e))) + continue + self.all_files = all_files + return all_files + + def __delete_all_files_in_tar(self): + for item in os.listdir(self.store_dir): + item_path = os.path.join(self.store_dir, item) + if os.path.isdir(item_path): + shutil.rmtree(item_path) + return True + class GatherLogOnNode: def __init__(self, context, node, config, semaphore): @@ -299,29 +345,24 @@ def __init__(self, context, node, config, semaphore): self.tmp_dir = self.config.get("tmp_dir") self.scope = self.config.get("scope") - # todo log_path for oms - self.oms_module_id = self.config.get("oms_module_id") + self.oms_component_id = self.config.get("oms_component_id") if self.target == "oms": - if self.oms_module_id is None: - raise Exception("gather log on oms, but oms_module_id is None") - self.log_path = os.path.join(node.get("run_path"), self.oms_module_id, "logs") + if self.oms_component_id is None: + raise Exception("gather log on oms, but oms_component_id is None. please check your config") + if node.get("run_path") is None: + raise Exception("gather log on oms, but run_path is None. 
please check your config") + self.log_path = os.path.join(node.get("run_path"), self.oms_component_id, "logs") else: self.log_path = os.path.join(node.get("home_path"), "log") self.from_time_str = self.config.get("from_time") self.to_time_str = self.config.get("to_time") - self.grep_option = self.config.get("grep_option") + self.grep_option = self.config.get("grep") self.store_dir = self.config.get("store_dir") - self.zip_password = self.config.get("zip_password") or None # self.file_number_limit = self.config.get("file_number_limit") self.file_size_limit = self.config.get("file_size_limit") - self.gather_tuple = { - "node": "", - "success": "Fail", - "info": "", - "file_size": 0, - } + self.gather_tuple = {"node": "", "success": "Fail", "info": "", "file_size": 0, "file_path": ""} self.result_list = None def get_result(self): @@ -331,13 +372,12 @@ def handle(self, result_list=None): self.result_list = result_list self.ssh_client = SshClient(self.context, self.node) self.gather_tuple["node"] = self.ssh_client.get_name() - self.tmp_dir = os.path.join(self.tmp_dir, "obdiag_gather_{0}".format(str(uuid.uuid4()))) + self.tmp_dir = os.path.join(self.tmp_dir, "obdiag_gather_{0}".format(str(uuid.uuid4())[:6])) self.ssh_client.exec_cmd("mkdir -p {0}".format(self.tmp_dir)) - self.stdio.verbose("do it") from_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.from_time_str)) to_datetime_timestamp = TimeUtils.timestamp_to_filename_time(TimeUtils.datetime_to_timestamp(self.to_time_str)) - - tmp_log_dir = os.path.join(self.tmp_dir, "{4}_log_{0}_{1}_{2}_{3}".format(self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, str(uuid.uuid4())[:6], self.target)) + tmp_dir = "{4}_log_{0}_{1}_{2}_{3}".format(self.ssh_client.get_name(), from_datetime_timestamp, to_datetime_timestamp, str(uuid.uuid4())[:6], self.target) + tmp_log_dir = os.path.join(self.tmp_dir, tmp_dir) # mkdir tmp_log_dir self.ssh_client.exec_cmd("mkdir -p {0}".format(tmp_log_dir)) self.stdio.verbose("gather_log_on_node {0} tmp_log_dir: {1}".format(self.ssh_client.get_ip(), tmp_log_dir)) @@ -365,12 +405,17 @@ def handle(self, result_list=None): return tar_file = os.path.join(self.tmp_dir, "{0}.tar.gz".format(tmp_log_dir)) - tar_cmd = "cd {0} && tar -czf {1}.tar.gz {1}/*".format(self.tmp_dir, tmp_log_dir) + tar_cmd = "cd {0} && tar -czf {1}.tar.gz {1}/*".format(self.tmp_dir, tmp_dir) self.stdio.verbose("gather_log_on_node {0} tar_cmd: {1}".format(self.ssh_client.get_ip(), tar_cmd)) self.ssh_client.exec_cmd(tar_cmd) # download log to local store_dir tar_file_size = int(get_file_size(self.ssh_client, tar_file)) + self.stdio.verbose("gather_log_on_node {0} tar_file_size: {1}".format(self.ssh_client.get_ip(), tar_file_size)) + if tar_file_size == 0: + self.stdio.error("gather_log_on_node {0} failed: tar file size is 0".format(self.ssh_client.get_ip())) + self.gather_tuple["info"] = "tar file size is 0" + return if tar_file_size > self.file_size_limit: self.stdio.error("gather_log_on_node {0} failed: File too large over gather.file_size_limit".format(self.ssh_client.get_ip())) self.gather_tuple["info"] = "File too large over gather.file_size_limit" @@ -378,16 +423,14 @@ def handle(self, result_list=None): else: self.stdio.verbose("gather_log_on_node {0} download log to local store_dir: {1}".format(self.ssh_client.get_ip(), self.store_dir)) self.ssh_client.download(tar_file, os.path.join(self.store_dir, os.path.basename("{0}".format(tar_file)))) - # tar to zip tar_file_name = 
os.path.basename("{0}".format(tar_file)) self.stdio.verbose("tar_file_name: {0}".format(tar_file_name)) local_tar_file_path = os.path.join(self.store_dir, tar_file_name) - local_zip_store_path = os.path.join(self.store_dir, os.path.basename("{0}.zip".format(tmp_log_dir))) - self.stdio.verbose("local_tar_file_path: {0}; local_zip_store_path: {1}".format(local_tar_file_path, local_zip_store_path)) - FileUtil.tar_gz_to_zip(self.store_dir, local_tar_file_path, local_zip_store_path, self.zip_password, self.stdio) - self.gather_tuple["file_size"] = FileUtil.size_format(num=int(os.path.getsize(local_zip_store_path) or 0), output_str=True) - self.gather_tuple["info"] = "file save in {0}".format(local_zip_store_path) + self.stdio.verbose("local_tar_file_path: {0}".format(local_tar_file_path)) + self.gather_tuple["file_size"] = FileUtil.size_format(num=int(os.path.getsize(local_tar_file_path) or 0), output_str=True) + self.gather_tuple["info"] = "file save in {0}".format(local_tar_file_path) self.gather_tuple["success"] = "Success" + self.gather_tuple["file_path"] = local_tar_file_path except Exception as e: self.stdio.verbose(traceback.format_exc()) self.stdio.error("gather_log_on_node {0} failed: {1}".format(self.ssh_client.get_ip(), str(e))) @@ -404,15 +447,17 @@ def handle(self, result_list=None): def __grep_log_to_tmp(self, logs_name, tmp_log_dir): grep_cmd = "" if self.grep_option: - self.stdio.verbose("grep files, grep_option = [{0}]".format(self.grep_option)) + self.stdio.verbose("grep files, grep_option = {0}".format(self.grep_option)) for grep_option in self.grep_option: if grep_cmd == "": grep_cmd = "grep -e '{0}' ".format(grep_option) + continue grep_cmd += "| grep -e '{0}'".format(grep_option) for log_name in logs_name: source_log_name = "{0}/{1}".format(self.log_path, log_name) target_log_name = "{0}/{1}".format(tmp_log_dir, log_name) self.stdio.verbose("grep files, source_log_name = [{0}], target_log_name = [{1}]".format(source_log_name, target_log_name)) + # for oms log if log_name.endswith(".gz"): log_grep_cmd = "cp {0} {1}".format(source_log_name, target_log_name) self.stdio.verbose("grep files, run cmd = [{0}]".format(log_grep_cmd)) @@ -431,13 +476,23 @@ def __find_logs_name(self): try: logs_scope = "" for scope in self.scope: - if logs_scope == "": - logs_scope = scope - continue - logs_scope = logs_scope + "|" + scope + target_scopes = self.scope[scope]["key"] + if isinstance(target_scopes, list): + for target_scope in target_scopes: + if logs_scope == "": + logs_scope = ' -name "{0}" '.format(target_scope) + continue + logs_scope = logs_scope + ' -o -name "{0}" '.format(target_scope) + else: + if logs_scope == "": + logs_scope = ' -name "{0}" '.format(target_scopes) + continue + logs_scope = logs_scope + ' -o -name "{0}" '.format(target_scopes) + if logs_scope == "": + self.stdio.warn("gather_log_on_node {0} find logs scope is null".format(self.ssh_client.get_ip(), logs_scope)) + return [] self.stdio.verbose("gather_log_on_node {0} find logs scope: {1}".format(self.ssh_client.get_ip(), logs_scope)) - - find_cmd = "ls -1 -F {0} |grep -E '{1}'| awk -F '/' ".format(self.log_path, logs_scope) + "'{print $NF}'" + find_cmd = "cd {0} &&find . 
{1} | awk -F '/' ".format(self.log_path, logs_scope) + "'{print $NF}'" self.stdio.verbose("gather_log_on_node {0} find logs cmd: {1}".format(self.ssh_client.get_ip(), find_cmd)) logs_name = self.ssh_client.exec_cmd(find_cmd) if logs_name is not None and len(logs_name) != 0: @@ -450,9 +505,18 @@ def __find_logs_name(self): raise Exception("gather_log_on_node {0} find logs failed: {1}".format(self.ssh_client.get_ip(), str(e))) def __get_logfile_name_list(self, from_time_str, to_time_str, log_dir, log_files): - # TODO oms get all log file name list + # oms get all log file name list, the log size is so small if self.target == "oms": - return log_files + log_name_list = [] + formatted_time = datetime.datetime.now().strftime("%Y-%m-%d_%H") + for file_name in log_files.split('\n'): + if file_name == "": + self.stdio.verbose("existing file name is empty") + continue + if "log.gz" not in file_name or formatted_time in file_name: + log_name_list.append(file_name) + continue + return log_name_list self.stdio.verbose("get log file name list, from time {0}, to time {1}, log dir {2}, log files {3}".format(from_time_str, to_time_str, log_dir, log_files)) log_name_list = [] last_file_dict = {"prefix_file_name": "", "file_name": "", "file_end_time": ""} diff --git a/handler/gather/plugins/redact.py b/handler/gather/plugins/redact.py index c45a8176..1c945e01 100644 --- a/handler/gather/plugins/redact.py +++ b/handler/gather/plugins/redact.py @@ -1,19 +1,19 @@ import os import shutil -import zipfile +import tarfile from common.import_module import import_modules import multiprocessing as mp class Redact: - def __init__(self, context, input_file_dir, output_file_dir, zip_password=None): + def __init__(self, context, input_file_dir, output_file_dir): self.context = context self.stdio = context.stdio self.redacts = {} self.input_file_dir = input_file_dir self.output_file_dir = output_file_dir - self.zip_password = zip_password + self.stdio.verbose("Redact output_file_dir: {0}".format(self.output_file_dir)) self.module_dir = os.path.expanduser('~/.obdiag/gather/redact') self.inner_config = self.context.inner_config @@ -37,7 +37,10 @@ def check_redact(self, input_redacts): self.stdio.verbose(f"Redact {input_redact} found") self.redacts[input_redact] = self.all_redact[input_redact] - def redact_files(self, input_redacts): + def redact_files(self, input_redacts, files_name): + if len(files_name) == 0: + self.stdio.warn("No files to redact") + return True self.stdio.verbose("redact_files start") self.check_redact(input_redacts) # check self.redacts @@ -47,65 +50,38 @@ def redact_files(self, input_redacts): # create dir to save the files after redact if not os.path.exists(self.output_file_dir): os.makedirs(self.output_file_dir) - # use threading to redact the files - files_name = os.listdir(self.input_file_dir) - self.stdio.verbose(files_name) - # unzip the log file - for zip_file in files_name: - if ".zip" in zip_file: - self.stdio.verbose("open zip file: {0}".format(os.path.join(self.input_file_dir, zip_file))) - with zipfile.ZipFile(os.path.join(self.input_file_dir, zip_file), 'r') as zip_ref: - # Extract all files to the current directory - if self.zip_password is not None: - zip_ref.extractall(self.input_file_dir, pwd=self.zip_password.encode('utf-8')) - else: - zip_ref.extractall(self.input_file_dir) - gather_log_files = [] - for file_name in os.listdir(self.input_file_dir): - if "zip" not in file_name and "result_summary.txt" not in file_name: - log_dir = os.path.join(self.input_file_dir, file_name) - for 
log_file in os.listdir(log_dir):
-                    gather_log_files.append(os.path.join(log_dir, log_file))
-                    self.stdio.verbose("result_log_files add {0}".format(os.path.join(log_dir, log_file)))
-        if len(gather_log_files) == 0:
+        # gather all files
+        self.stdio.verbose("gather_log_files: {0}".format(files_name))
+        if len(files_name) == 0:
             self.stdio.warn("No log file found. The redact process will be skipped.")
             return False
         file_queue = []
         max_processes = int(self.inner_config.get('gather').get('redact_processing_num')) or 3
         self.stdio.verbose("max_processes: {0}".format(max_processes))
         semaphore = mp.Semaphore(max_processes)
-        for file_name in gather_log_files:
-            if "result_summary.txt" in file_name:
-                continue
-            self.stdio.verbose("inport file name: {0}".format(file_name))
-            self.stdio.verbose("output file name: {0}".format(file_name.replace(self.input_file_dir, self.output_file_dir)))
-            semaphore.acquire()
-            file_thread = mp.Process(target=self.redact_file, args=(file_name, file_name.replace(self.input_file_dir, self.output_file_dir), semaphore))
-            file_thread.start()
-            file_queue.append(file_thread)
+        for dir_name in files_name:
+            for file_name in files_name[dir_name]:
+                self.stdio.verbose("input file name: {0}".format(file_name))
+                self.stdio.verbose("output file name: {0}".format(file_name.replace(self.input_file_dir, self.output_file_dir)))
+                semaphore.acquire()
+                file_thread = mp.Process(target=self.redact_file, args=(file_name, file_name.replace(self.input_file_dir, self.output_file_dir), semaphore))
+                file_thread.start()
+                file_queue.append(file_thread)
         for file_thread in file_queue:
             file_thread.join()
-        # delete gather_log_files
-        self.stdio.verbose("redact end. delete all gather_log_files")
-        for file_name in gather_log_files:
-            self.stdio.verbose("delete file: {0}".format(file_name))
-            os.remove(file_name)
-        # zip the dir by node
+        # tar the dir by node
         subfolders = [f for f in os.listdir(self.output_file_dir) if os.path.isdir(os.path.join(self.output_file_dir, f))]
         for subfolder in subfolders:
             subfolder_path = os.path.join(self.output_file_dir, subfolder)
-            zip_filename = os.path.join(self.output_file_dir, f"{subfolder}.zip")
-            if self.zip_password is not None:
-                self.stdio.warn("the redacted log without passwd")
-            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+            tar_filename = os.path.join(self.output_file_dir, f"{subfolder}.tar.gz")
+            with tarfile.open(tar_filename, "w:gz") as tar:
                 for root, dirs, files in os.walk(subfolder_path):
                     for file in files:
                         file_path = os.path.join(root, file)
-                        zipf.write(file_path, os.path.relpath(file_path, subfolder_path))
-            self.stdio.verbose("zip the redact log with passwd: {0}".format(self.zip_password.encode('utf-8')))
+                        tar.add(file_path, os.path.relpath(file_path, subfolder_path))
             self.stdio.verbose("delete the dir: {0}".format(subfolder_path))
             shutil.rmtree(subfolder_path)
-            self.stdio.print(f"{subfolder} is zipped on {zip_filename}")
+            self.stdio.print(f"{subfolder} is tarred to {tar_filename}")
         return True

     def redact_file(self, input_file, output_file, semaphore):
diff --git a/handler/rca/plugins/gather.py b/handler/rca/plugins/gather.py
index 83d1fd4a..3c5fcdc0 100644
--- a/handler/rca/plugins/gather.py
+++ b/handler/rca/plugins/gather.py
@@ -16,7 +16,6 @@
 @desc:
 """
 import os.path
-import zipfile

 from handler.gather.gather_component_log import GatherComponentLogHandler
@@ -35,17 +34,18 @@ def __init__(self, context):

     def init_parameters(self):
         self.conf_map["filter_nodes_list"] = []
-        self.conf_map["gather_from"] = ""
-        self.conf_map["gather_to"]
= "" - self.conf_map["gather_since"] = "" + self.conf_map["gather_from"] = None + self.conf_map["gather_to"] = None + self.conf_map["gather_since"] = None self.conf_map["gather_scope"] = "" self.conf_map["store_dir"] = self.work_path self.conf_map["gather_target"] = "observer" + self.conf_map["gather_oms_component_id"] = None self.greps_key = [] def grep(self, key): if key is None or len(key) < 1 or type(key) != str: - raise Exception("The keyword cannot be empty!") + raise Exception("The keyword {0} cannot be empty!".format(key)) self.greps_key.append(key) def execute(self, save_path=""): @@ -62,9 +62,6 @@ def execute(self, save_path=""): self.work_path = save_path self.conf_map["store_dir"] = self.work_path self.stdio.verbose("Gather_log execute,the conf_map: {0}".format(self.conf_map)) - if len(self.greps_key) == 0: - self.stdio.error("The keyword cannot be empty!") - raise Exception("The keyword cannot be empty!") self.stdio.verbose("gather_grep is {0}".format(self.greps_key)) nodes_list = [] # execute on all nodes_list @@ -115,31 +112,41 @@ def execute(self, save_path=""): grep=self.greps_key, store_dir=self.work_path, ) + elif self.conf_map["gather_target"] == 'oms': + all_node = self.context.oms_config + if self.conf_map["filter_nodes_list"]: + # execute on specific nodes_list + for node in all_node: + if node not in self.conf_map["filter_nodes_list"]: + self.stdio.warn("{0} is not in the nodes list".format(node.get("ip"))) + continue + else: + nodes_list.append(node) + self.conf_map["filter_nodes_list"] = nodes_list + handler = GatherComponentLogHandler() + handler.init( + self.context, + target="oms", + nodes=nodes_list, + from_option=self.conf_map.get("gather_from"), + to_option=self.conf_map.get("gather_to"), + since=self.conf_map.get("gather_since"), + scope=self.conf_map.get("gather_scope"), + grep=self.greps_key, + store_dir=self.work_path, + oms_component_id=self.conf_map.get("gather_oms_component_id"), + ) if handler is None: self.stdio.error("rca gather handle the target cannot be empty!") raise Exception("rca gather handle the target cannot be empty!") else: handler.handle() - gather_result = handler.store_dir - zip_files = os.listdir(gather_result) result_log_files = [] - for zip_file in zip_files: - if "zip" not in zip_file: - continue - # open zip file - self.stdio.verbose("open zip file: {0}".format(os.path.join(gather_result, zip_file))) - with zipfile.ZipFile(os.path.join(gather_result, zip_file), 'r') as zip_ref: - # Extract all files to the current directory - zip_ref.extractall(gather_result) - for file_name in os.listdir(gather_result): - if "zip" not in file_name and not file_name.endswith(".txt"): - log_dir = os.path.join(gather_result, file_name) - for log_file in os.listdir(log_dir): - result_log_files.append(os.path.join(log_dir, log_file)) - self.stdio.verbose("result_log_files add {0}".format(os.path.join(log_dir, log_file))) + result_log_dir_data = handler.open_all_file() + for dir_name in result_log_dir_data: + result_log_files.extend(result_log_dir_data[dir_name]) self.reset() - return result_log_files except Exception as e: raise Exception("rca plugins Gather_log execute error: {0}".format(e)) diff --git a/handler/rca/rca_handler.py b/handler/rca/rca_handler.py index 5036416b..581197bb 100644 --- a/handler/rca/rca_handler.py +++ b/handler/rca/rca_handler.py @@ -25,6 +25,7 @@ get_obproxy_version, get_observer_version, ) +import traceback from prettytable import PrettyTable from common.ob_connector import OBConnector from common.ssh_client.ssh import 
SshClient @@ -64,6 +65,15 @@ def __init__(self, context): node["ssher"] = ssh context_obproxy_nodes.append(node) self.context.set_variable("obproxy_nodes", context_obproxy_nodes) + # build oms_nodes + oms_nodes = self.context.oms_config.get("servers") + context_oms_nodes = [] + if oms_nodes is not None: + for node in oms_nodes: + ssh = SshClient(context, node) + node["ssher"] = ssh + context_oms_nodes.append(node) + self.context.set_variable("oms_nodes", context_oms_nodes) # build ob_connector try: @@ -168,7 +178,7 @@ def handle(self): return self.__execute() else: self.stdio.error("rca_scene :{0} is not exist or not input".format(scene_name)) - raise Exception("rca_scene :{0} is not exist or not input".format(scene_name)) + return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, error_data="rca_scene :{0} is not exist or not input".format(scene_name)) # get all tasks def __execute(self): @@ -178,11 +188,13 @@ def __execute(self): self.stdio.warn("rca_scene.execute not need execute: {0}".format(e)) return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, data={"result": "rca_scene.execute not need execute"}) except Exception as e: + self.stdio.verbose(traceback.format_exc()) self.stdio.error("rca_scene.execute err: {0}".format(e)) return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="rca_scene.execute err: {0}".format(e)) try: self.rca_scene.export_result() except Exception as e: + self.stdio.verbose(traceback.format_exc()) self.stdio.error("rca_scene.export_result err: {0}".format(e)) return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="rca_scene.export_result err: {0}".format(e)) self.stdio.print( @@ -218,6 +230,7 @@ def __init__(self): self.report = None self.obproxy_nodes = None self.observer_nodes = None + self.oms_nodes = None self.context = None self.name = type(self).__name__ self.Result = None @@ -230,6 +243,7 @@ def init(self, context): self.Result.records.append(self.record) self.observer_nodes = context.get_variable("observer_nodes") self.obproxy_nodes = context.get_variable("obproxy_nodes") + self.oms_nodes = context.get_variable("oms_nodes") self.report = context.get_variable("report") self.obproxy_version = context.get_variable("obproxy_version", default="") self.observer_version = context.get_variable("observer_version", default="") diff --git a/handler/rca/scene/oms_full_trans_scene.py b/handler/rca/scene/oms_full_trans_scene.py new file mode 100644 index 00000000..ebb41077 --- /dev/null +++ b/handler/rca/scene/oms_full_trans_scene.py @@ -0,0 +1,152 @@ +# !/usr/bin/env python +# -*- coding: UTF-8 -* +# Copyright (c) 2022 OceanBase +# OceanBase Diagnostic Tool is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. 
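Note (illustrative sketch, not part of the patch): the new file below follows the RCA scene plugin contract. The loader in handler/rca/rca_list.py imports each file under handler/rca/scene/ and looks up a module-level attribute derived from the file name (the '_scene' suffix is stripped at this point; a later patch in this series simplifies the naming), so a scene is a RcaScene subclass plus one instance. A stripped-down skeleton with a hypothetical scene name my_demo:

from handler.rca.rca_handler import RcaScene

class MyDemoScene(RcaScene):
    def init(self, context):
        super().init(context)  # wires up record, nodes, gather_log, input_parameters

    def execute(self):
        self.record.add_record("checked something")
        return self.record

    def get_scene_info(self):
        return {"name": "my_demo", "info_en": "demo scene", "info_cn": "demo scene"}

# the loader fetches this module-level attribute by name
my_demo = MyDemoScene()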
+
+"""
+@time: 2024/1/2
+@file: oms_full_trans_scene.py
+@desc:
+"""
+import os.path
+
+from handler.rca.rca_exception import (
+    RCAInitException,
+    RCAExecuteException,
+)
+from handler.rca.rca_handler import RcaScene
+
+
+class OMSFullTransScene(RcaScene):
+    def __init__(self):
+        super().__init__()
+        self.component_id = None
+
+    def init(self, context):
+        super().init(context)
+        # check component id
+        self.component_id = self.input_parameters.get("component_id")
+        self.stdio.verbose("oms component_id is {0}".format(self.component_id))
+        if self.component_id is None:
+            raise RCAInitException("component_id does not exist.")
+
+    def execute(self):
+        try:
+            self.record.add_record("oms component_id is {0}".format(self.component_id))
+            # 1. check oms worker path
+            for server in self.oms_nodes:
+                ssh_client = server["ssher"]
+                try:
+                    self.stdio.verbose("node:{0} check oms worker path: {1}/{2}".format(ssh_client.get_ip(), server.get("run_path"), self.component_id))
+                    ssh_client.exec_cmd("cd {0}/{1}".format(server.get("run_path"), self.component_id))
+                except Exception as e:
+                    self.record.add_record("node:{0} run_path: {1}/{2} does not exist.".format(ssh_client.get_ip(), server.get("run_path"), self.component_id))
+                    self.record.add_suggest("the component's work path does not exist. Maybe the OMS resource is not enough.")
+                    return
+            self.record.add_record("the oms worker path exists.")
+            # 2. download oms error.details
+            error_details_paths = []
+            for server in self.oms_nodes:
+                ssh_client = server["ssher"]
+                try:
+                    self.stdio.verbose("node:{0} download oms error.details".format(ssh_client.get_ip()))
+                    # check the local error.details dir exists
+                    if not os.path.exists("{0}/error.details".format(self.store_dir)):
+                        os.makedirs("{0}/error.details".format(self.store_dir))
+                    local_error_details_path = "{0}/error.details_{1}".format(self.store_dir, ssh_client.get_name())
+                    self.stdio.verbose("local_error_details_path:{0}".format(local_error_details_path))
+                    ssh_client.download("{0}/{1}/error.details".format(server.get("run_path"), self.component_id), local_error_details_path)
+                    self.record.add_record("download oms error.details succeeded.")
+                    error_details_paths.append(local_error_details_path)
+                except Exception as e:
+                    if "No such file" in "{0}".format(e):
+                        self.record.add_record("node:{0} did not find error.details.".format(ssh_client.get_ip()))
+                    else:
+                        self.stdio.error("node:{0} download oms error.details error: {1}".format(ssh_client.get_ip(), e))
+                        self.record.add_record("node:{0} download oms error.details error: {1}".format(ssh_client.get_ip(), e))
+            if len(error_details_paths) == 0:
+                self.record.add_record("oms error.details not found")
+            self.record.add_record("download oms error.details is done.")
+            # 3. gather the oms log
+            oms_logs_name = []
+            try:
+                self.stdio.verbose("gather oms log")
+                self.gather_log.set_parameters("target", "oms")
+                self.gather_log.set_parameters("store_dir", "oms")
+                self.gather_log.set_parameters("oms_component_id", self.component_id)
+                self.record.add_record("get oms log about connector by component_id: {0}".format(self.component_id))
+                if self.input_parameters.get("since") is not None:
+                    since = self.input_parameters.get("since")
+                    self.gather_log.set_parameters("since", since)
+                self.work_path = self.store_dir
+                oms_logs_name = self.gather_log.execute()
+                self.stdio.verbose("oms_logs_name:{0}".format(oms_logs_name))
+                self.record.add_record("oms_logs save on {0}".format(self.gather_log.work_path))
+            except Exception as e:
+                self.record.add_record("gather the oms log error: {0}".format(e))
+            self.record.add_record("gather the oms log is done.")
+            # get sinkType and sourceType from conf/coordinator.json
+            self.record.add_record("get sinkType and sourceType from conf/coordinator.json")
+            for server in self.oms_nodes:
+                ssh_client = server["ssher"]
+                try:
+                    sinkType_data = ssh_client.exec_cmd('cat {0}/{1}/conf/coordinator.json|grep "sinkType"'.format(server.get("run_path"), self.component_id))
+                    sourceType_data = ssh_client.exec_cmd('cat {0}/{1}/conf/coordinator.json|grep "sourceType"'.format(server.get("run_path"), self.component_id))
+                    self.record.add_record("on node {0}, sinkType: {1}, sourceType: {2}".format(ssh_client.get_name(), sinkType_data, sourceType_data))
+                except Exception as e:
+                    self.record.add_record("get {1} sinkType and sourceType from conf/coordinator.json error: {0}".format(e, ssh_client.get_ip()))
+                    continue
+
+            # 4. check the oms full trans
+            if len(error_details_paths) > 0:
+                self.record.add_record("check of error.details started.")
+                # error.details SINK_TABLE_NOT_FOUND
+                for error_details_path in error_details_paths:
+                    try:
+                        self.stdio.verbose("check of {0} started.".format(error_details_path))
+                        with open(error_details_path, 'r', encoding='utf-8') as f:
+                            # TODO find more signatures in error.details
+                            SINK_TABLE_NOT_FOUND_tag = False
+                            SINK_TABLE_IS_NOT_EMPTY_tag = False
+                            for line in f.readlines():
+                                if "SINK_TABLE_NOT_FOUND" in line and SINK_TABLE_NOT_FOUND_tag is False:
+                                    self.record.add_record("SINK_TABLE_NOT_FOUND exists in error.details.")
+                                    self.record.add_suggest("the component_id is {0}, the sink table is not found.".format(self.component_id))
+                                    SINK_TABLE_NOT_FOUND_tag = True
+                                    continue
+                                elif "SINK_TABLE_IS_NOT_EMPTY" in line and SINK_TABLE_IS_NOT_EMPTY_tag is False:
+                                    self.record.add_record("SINK_TABLE_IS_NOT_EMPTY exists in error.details.")
+                                    self.record.add_suggest("the component_id is {0}, the sink table is not empty.".format(self.component_id))
+                                    SINK_TABLE_IS_NOT_EMPTY_tag = True
+                                    continue
+                            if SINK_TABLE_NOT_FOUND_tag is False and SINK_TABLE_IS_NOT_EMPTY_tag is False:
+                                self.record.add_record("neither SINK_TABLE_NOT_FOUND nor SINK_TABLE_IS_NOT_EMPTY exists in error.details.")
+                                self.record.add_suggest("the component_id is {0}, no known sink table error was found.".format(self.component_id))
+                    except Exception as e:
+                        raise RCAExecuteException("error.details SINK_TABLE_NOT_FOUND check error: {0}".format(e))
+            else:
+                pass
+
+        except Exception as e:
+            self.record.add_record("execute oms full trans error: {0}".format(e))
+            raise RCAExecuteException(e)
+        finally:
+            self.record.add_suggest("if you want to know more about the result, please contact the oms team with {0}".format(self.store_dir))
+        # return outside "finally" so a raised RCAExecuteException is not silently swallowed
+        return self.record
+
+    def get_scene_info(self):
+        return {
+
"name": "oms_full_trans", + "info_en": "OMS full connector error", + "info_cn": "oms全量迁移报错", + } + + +oms_full_trans = OMSFullTransScene() From 0e8304a0af4350b10a4d3dc158c0507ace7a5036 Mon Sep 17 00:00:00 2001 From: xiaodong-ji Date: Mon, 25 Nov 2024 16:58:56 +0800 Subject: [PATCH 25/32] move the script for the gather scene out (#565) * move the script for the gather scene out * black format * Delete the suffix '_scene' in the RCA scene * build rpm --- .github/workflows/build_package.yml | 1 + handler/gather/gather_scenes.py | 33 ++++++------- handler/gather/scenes/base.py | 46 +++++++++---------- handler/gather/scenes/list.py | 20 +++++++- .../{scenes => tasks/observer}/cpu_high.py | 7 ++- .../observer}/px_collect_log.py | 7 ++- .../{scenes => tasks/observer}/sql_problem.py | 7 ++- handler/rca/rca_list.py | 4 +- ...g_disk_full_scene.py => clog_disk_full.py} | 0 ...dl_disk_full_scene.py => ddl_disk_full.py} | 0 .../{ddl_failure_scene.py => ddl_failure.py} | 0 ...isconnection_scene.py => disconnection.py} | 0 ..._ddl_error_scene.py => index_ddl_error.py} | 0 ...ock_conflict_scene.py => lock_conflict.py} | 0 .../{log_error_scene.py => log_error.py} | 0 .../{major_hold_scene.py => major_hold.py} | 0 ..._scene.py => transaction_disconnection.py} | 0 ...cene.py => transaction_execute_timeout.py} | 0 ...ing_scene.py => transaction_not_ending.py} | 0 ...or_scene.py => transaction_other_error.py} | 0 ...lback_scene.py => transaction_rollback.py} | 0 ...t_scene.py => transaction_wait_timeout.py} | 0 22 files changed, 74 insertions(+), 51 deletions(-) rename handler/gather/{scenes => tasks/observer}/cpu_high.py (96%) rename handler/gather/{scenes => tasks/observer}/px_collect_log.py (98%) rename handler/gather/{scenes => tasks/observer}/sql_problem.py (97%) rename handler/rca/scene/{clog_disk_full_scene.py => clog_disk_full.py} (100%) rename handler/rca/scene/{ddl_disk_full_scene.py => ddl_disk_full.py} (100%) rename handler/rca/scene/{ddl_failure_scene.py => ddl_failure.py} (100%) rename handler/rca/scene/{disconnection_scene.py => disconnection.py} (100%) rename handler/rca/scene/{index_ddl_error_scene.py => index_ddl_error.py} (100%) rename handler/rca/scene/{lock_conflict_scene.py => lock_conflict.py} (100%) rename handler/rca/scene/{log_error_scene.py => log_error.py} (100%) rename handler/rca/scene/{major_hold_scene.py => major_hold.py} (100%) rename handler/rca/scene/{transaction_disconnection_scene.py => transaction_disconnection.py} (100%) rename handler/rca/scene/{transaction_execute_timeout_scene.py => transaction_execute_timeout.py} (100%) rename handler/rca/scene/{transaction_not_ending_scene.py => transaction_not_ending.py} (100%) rename handler/rca/scene/{transaction_other_error_scene.py => transaction_other_error.py} (100%) rename handler/rca/scene/{transaction_rollback_scene.py => transaction_rollback.py} (100%) rename handler/rca/scene/{transaction_wait_timeout_scene.py => transaction_wait_timeout.py} (100%) diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml index 118d115b..63f7104f 100644 --- a/.github/workflows/build_package.yml +++ b/.github/workflows/build_package.yml @@ -8,6 +8,7 @@ on: branches: - master - v3.0-dev + - enhancement-optimize_external_script env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true diff --git a/handler/gather/gather_scenes.py b/handler/gather/gather_scenes.py index a0ad5ee0..8d2613bc 100644 --- a/handler/gather/gather_scenes.py +++ b/handler/gather/gather_scenes.py @@ -42,7 +42,7 @@ def __init__(self, context, 
gather_pack_dir='./', tasks_base_path="~/.obdiag/gat self.gather_pack_dir = gather_pack_dir self.report_path = None self.yaml_tasks = {} - self.code_tasks = [] + self.code_tasks = {} self.env = {} self.scene = "observer.base" self.tasks_base_path = tasks_base_path @@ -88,8 +88,8 @@ def execute(self): self.stdio.verbose("execute_tasks. the number of tasks is {0} ,tasks is {1}".format(len(self.yaml_tasks.keys()), self.yaml_tasks.keys())) for key, value in zip(self.yaml_tasks.keys(), self.yaml_tasks.values()): self.__execute_yaml_task_one(key, value) - for task in self.code_tasks: - self.__execute_code_task_one(task) + for key, value in zip(self.code_tasks.keys(), self.code_tasks.values()): + self.__execute_code_task_one(key, value) except Exception as e: self.stdio.error("Internal error :{0}".format(e)) @@ -116,11 +116,11 @@ def __execute_yaml_task_one(self, task_name, task_data): self.stdio.error("__execute_yaml_task_one Exception : {0}".format(e)) # execute code task - def __execute_code_task_one(self, task_name): + def __execute_code_task_one(self, task_name, task_data): try: self.stdio.verbose("execute tasks is {0}".format(task_name)) - scene = {"name": task_name} - task = SceneBase(context=self.context, scene=scene, report_dir=self.report_path, env=self.env, mode='code', task_type=task_name) + task = task_data["module"] + task.init(self.context, task_name, self.report_path) self.stdio.verbose("{0} execute!".format(task_name)) task.execute() self.stdio.verbose("execute tasks end : {0}".format(task_name)) @@ -133,21 +133,18 @@ def __init_task_names(self): items = re.split(r'[;,]', new) scene = GatherScenesListHandler(self.context) for item in items: - yaml_task_data = scene.get_one_yaml_task(item) - is_code_task = scene.is_code_task(item) - if is_code_task: - self.code_tasks.append(item) + task_data = scene.get_one_task(item) + if task_data["task_type"] == 'py': + self.code_tasks[item] = task_data + elif task_data["task_type"] == 'yaml': + self.yaml_tasks[item] = task_data else: - if yaml_task_data: - self.yaml_tasks[item] = yaml_task_data - else: - self.stdio.error("Invalid Task :{0}. Please check the task is exist.".format(item)) - if ".yaml" in item: - self.stdio.suggest("'.yaml' in task :{0}. Maybe you can remove it. use '--scene={1}'".format(item, item.replace(".yaml", ""))) + self.stdio.error("Invalid Task :{0}. Please check the task is exist.".format(item)) + if ".yaml" in item: + self.stdio.suggest("'.yaml' in task :{0}. Maybe you can remove it. 
use '--scene={1}'".format(item, item.replace(".yaml", ""))) # hard code add gather observer.base if len(self.code_tasks) > 0: - yaml_task_base = scene.get_one_yaml_task("observer.base") - self.yaml_tasks["observer.base"] = yaml_task_base + self.yaml_tasks["observer.base"] = scene.get_one_task("observer.base") else: self.stdio.error("get task name failed") diff --git a/handler/gather/scenes/base.py b/handler/gather/scenes/base.py index 69bc7ea2..d02845e0 100644 --- a/handler/gather/scenes/base.py +++ b/handler/gather/scenes/base.py @@ -19,9 +19,6 @@ from common.scene import filter_by_version from handler.gather.step.base import Base from common.tool import StringUtils -from handler.gather.scenes.sql_problem import SQLProblemScene -from handler.gather.scenes.cpu_high import CPUHighScene -from handler.gather.scenes.px_collect_log import SQLPXCollectLogScene class SceneBase(SafeStdio): @@ -90,23 +87,26 @@ def __execute_yaml_mode(self, nodes): self.stdio.verbose("run scene excute yaml mode in node") def __execute_code_mode(self): - skip_type = self.context.get_variable("gather_skip_type", None) - if skip_type: - self.stdio.verbose("needs to be filtered out and not gather type is {0}".format(skip_type)) - if self.scene["name"] == "observer.perf_sql" or self.scene["name"] == "observer.sql_err": - scene = SQLProblemScene(self.context, self.scene["name"], self.report_dir, self.scene_variable_dict, self.env) - elif self.scene["name"] == "observer.cpu_high" and (skip_type != "ssh"): - scene = CPUHighScene(self.context, self.report_dir, self.scene_variable_dict, self.env) - elif self.scene["name"] == "observer.px_collect_log" and (skip_type != "ssh"): - scene = SQLPXCollectLogScene(self.context, self.scene["name"], self.report_dir, self.scene_variable_dict, self.env) - else: - scene_names = ["observer.perf_sql", "observer.cpu_high", "observer.px_collect_log"] - if self.scene["name"] not in scene_names: - self.stdio.error("unsupported hard code scene {0}".format(self.scene["name"])) - return - try: - self.stdio.verbose("hard code scene {0} execute start".format(self.scene["name"])) - scene.execute() - self.stdio.verbose("hard code scene {0} execute end".format(self.scene["name"])) - except Exception as e: - self.stdio.error("hard code scene execute failed, error :{0}".format(e)) + pass + + # def __execute_code_mode(self): + # skip_type = self.context.get_variable("gather_skip_type", None) + # if skip_type: + # self.stdio.verbose("needs to be filtered out and not gather type is {0}".format(skip_type)) + # if self.scene["name"] == "observer.perf_sql" or self.scene["name"] == "observer.sql_err": + # scene = SQLProblemScene(self.context, self.scene["name"], self.report_dir, self.scene_variable_dict, self.env) + # elif self.scene["name"] == "observer.cpu_high" and (skip_type != "ssh"): + # scene = CPUHighScene(self.context, self.report_dir, self.scene_variable_dict, self.env) + # elif self.scene["name"] == "observer.px_collect_log" and (skip_type != "ssh"): + # scene = SQLPXCollectLogScene(self.context, self.scene["name"], self.report_dir, self.scene_variable_dict, self.env) + # else: + # scene_names = ["observer.perf_sql", "observer.cpu_high", "observer.px_collect_log"] + # if self.scene["name"] not in scene_names: + # self.stdio.error("unsupported hard code scene {0}".format(self.scene["name"])) + # return + # try: + # self.stdio.verbose("hard code scene {0} execute start".format(self.scene["name"])) + # scene.execute() + # self.stdio.verbose("hard code scene {0} execute end".format(self.scene["name"])) + 
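Note (illustrative sketch, not part of the patch): with the hard-coded dispatch above retired, a gather scene now ships as a Python task under handler/gather/tasks/<folder>/<name>.py. get_one_task in scenes/list.py (just below) imports the module and fetches a module-level instance named after the file, and gather_scenes then calls task.init(context, task_name, report_path) followed by task.execute(). A minimal hypothetical task observer/demo_task.py:

from stdio import SafeStdio

class DemoTask(SafeStdio):
    def init(self, context, scene_name, report_path, task_variable_dict=None, env={}):
        self.context = context
        self.stdio = context.stdio
        self.report_path = report_path

    def execute(self):
        self.stdio.print("demo task executed, report dir: {0}".format(self.report_path))

# task name "observer.demo_task"; the loader fetches this attribute by the file's basename
demo_task = DemoTask()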
# except Exception as e: + # self.stdio.error("hard code scene execute failed, error :{0}".format(e)) diff --git a/handler/gather/scenes/list.py b/handler/gather/scenes/list.py index 099e13a8..99e676a3 100644 --- a/handler/gather/scenes/list.py +++ b/handler/gather/scenes/list.py @@ -22,7 +22,7 @@ from stdio import SafeStdio from common.tool import YamlUtils from handler.gather.scenes.register import hardcode_scene_list -from common.tool import Util +from common.tool import Util, DynamicLoading class GatherScenesListHandler(SafeStdio): @@ -89,7 +89,7 @@ def __get_hardcode_task(self, scene): "info_cn": scene.info_cn, } - def get_one_yaml_task(self, name): + def get_one_task(self, name): try: task_data = None current_path = self.yaml_tasks_base_path @@ -101,6 +101,22 @@ def get_one_yaml_task(self, name): if name == task_name: task_data = YamlUtils.read_yaml_data(os.path.join(root, file)) task_data["name"] = task_name + task_data["task_type"] = 'yaml' + if file.endswith('.py'): + folder_name = os.path.basename(root) + task_name = "{}.{}".format(folder_name, file.split('.')[0]) + if name == task_name: + DynamicLoading.add_lib_path(root) + task_module = DynamicLoading.import_module(file[:-3], None) + attr_name = name.split('.')[-1] + if not hasattr(task_module, attr_name): + self.stdio.error("{0} import_module failed".format(attr_name)) + return + task_data = {} + task_data["name"] = task_name + task_data["module"] = getattr(task_module, attr_name) + task_data["task_type"] = 'py' + pass return task_data except Exception as e: self.stdio.error("get one yaml task failed, error: ", e) diff --git a/handler/gather/scenes/cpu_high.py b/handler/gather/tasks/observer/cpu_high.py similarity index 96% rename from handler/gather/scenes/cpu_high.py rename to handler/gather/tasks/observer/cpu_high.py index b6eaa752..075eebf8 100644 --- a/handler/gather/scenes/cpu_high.py +++ b/handler/gather/tasks/observer/cpu_high.py @@ -24,8 +24,8 @@ from handler.gather.gather_perf import GatherPerfHandler -class CPUHighScene(SafeStdio): - def __init__(self, context, report_path, task_variable_dict=None, env={}): +class CPUHigh(SafeStdio): + def init(self, context, scene_name, report_path, task_variable_dict=None, env={}): self.context = context self.stdio = context.stdio if task_variable_dict is None: @@ -89,3 +89,6 @@ def report(self, file_path, command, data): f.write(data + '\n') except Exception as e: self.stdio.error("report sql result to file: {0} failed, error: ".format(file_path)) + + +cpu_high = CPUHigh() diff --git a/handler/gather/scenes/px_collect_log.py b/handler/gather/tasks/observer/px_collect_log.py similarity index 98% rename from handler/gather/scenes/px_collect_log.py rename to handler/gather/tasks/observer/px_collect_log.py index 1f70f28b..03b4fa75 100644 --- a/handler/gather/scenes/px_collect_log.py +++ b/handler/gather/tasks/observer/px_collect_log.py @@ -24,8 +24,8 @@ import datetime -class SQLPXCollectLogScene(object): - def __init__(self, context, scene_name, report_path, task_variable_dict=None, env=None): +class PXCollectLog(object): + def init(self, context, scene_name, report_path, task_variable_dict=None, env=None): self.context = context self.stdio = context.stdio if task_variable_dict is None: @@ -189,3 +189,6 @@ def __parse_env(self): return True except Exception as e: self.stdio.error("Parse env fail. 
Exception : {0} .".format(e)) + + +px_collect_log = PXCollectLog() diff --git a/handler/gather/scenes/sql_problem.py b/handler/gather/tasks/observer/sql_problem.py similarity index 97% rename from handler/gather/scenes/sql_problem.py rename to handler/gather/tasks/observer/sql_problem.py index 804f0b1a..0d0f0352 100644 --- a/handler/gather/scenes/sql_problem.py +++ b/handler/gather/tasks/observer/sql_problem.py @@ -23,8 +23,8 @@ from common.command import find_home_path_by_port -class SQLProblemScene(SafeStdio): - def __init__(self, context, scene_name, report_path, task_variable_dict=None, env={}): +class SQLProblem(SafeStdio): + def init(self, context, scene_name, report_path, task_variable_dict=None, env={}): self.context = context self.stdio = context.stdio if task_variable_dict is None: @@ -130,3 +130,6 @@ def __parse_env(self): else: self.stdio.error("option env not found, please run 'obdiag gather scene list' to check usage") return False + + +sql_problem = SQLProblem() diff --git a/handler/rca/rca_list.py b/handler/rca/rca_list.py index bd6c3914..515298a0 100644 --- a/handler/rca/rca_list.py +++ b/handler/rca/rca_list.py @@ -46,7 +46,7 @@ def get_all_scenes(self): return for scene_file in scenes_files: lib_path = self.work_path - module_name = os.path.basename(scene_file)[:-9] + module_name = os.path.basename(scene_file)[:-3] DynamicLoading.add_lib_path(lib_path) module = DynamicLoading.import_module(os.path.basename(scene_file)[:-3], None) if not hasattr(module, module_name): @@ -74,6 +74,6 @@ def __find_rca_files(self): for file_or_folder in os.listdir(self.work_path): full_path = os.path.join(self.work_path, file_or_folder) if os.path.isfile(full_path): - if full_path.endswith('_scene.py') and len(os.path.basename(full_path)) > 7: + if full_path.endswith('.py') and len(os.path.basename(full_path)) > 7: files.append(full_path) return files diff --git a/handler/rca/scene/clog_disk_full_scene.py b/handler/rca/scene/clog_disk_full.py similarity index 100% rename from handler/rca/scene/clog_disk_full_scene.py rename to handler/rca/scene/clog_disk_full.py diff --git a/handler/rca/scene/ddl_disk_full_scene.py b/handler/rca/scene/ddl_disk_full.py similarity index 100% rename from handler/rca/scene/ddl_disk_full_scene.py rename to handler/rca/scene/ddl_disk_full.py diff --git a/handler/rca/scene/ddl_failure_scene.py b/handler/rca/scene/ddl_failure.py similarity index 100% rename from handler/rca/scene/ddl_failure_scene.py rename to handler/rca/scene/ddl_failure.py diff --git a/handler/rca/scene/disconnection_scene.py b/handler/rca/scene/disconnection.py similarity index 100% rename from handler/rca/scene/disconnection_scene.py rename to handler/rca/scene/disconnection.py diff --git a/handler/rca/scene/index_ddl_error_scene.py b/handler/rca/scene/index_ddl_error.py similarity index 100% rename from handler/rca/scene/index_ddl_error_scene.py rename to handler/rca/scene/index_ddl_error.py diff --git a/handler/rca/scene/lock_conflict_scene.py b/handler/rca/scene/lock_conflict.py similarity index 100% rename from handler/rca/scene/lock_conflict_scene.py rename to handler/rca/scene/lock_conflict.py diff --git a/handler/rca/scene/log_error_scene.py b/handler/rca/scene/log_error.py similarity index 100% rename from handler/rca/scene/log_error_scene.py rename to handler/rca/scene/log_error.py diff --git a/handler/rca/scene/major_hold_scene.py b/handler/rca/scene/major_hold.py similarity index 100% rename from handler/rca/scene/major_hold_scene.py rename to handler/rca/scene/major_hold.py diff 
--git a/handler/rca/scene/transaction_disconnection_scene.py b/handler/rca/scene/transaction_disconnection.py similarity index 100% rename from handler/rca/scene/transaction_disconnection_scene.py rename to handler/rca/scene/transaction_disconnection.py diff --git a/handler/rca/scene/transaction_execute_timeout_scene.py b/handler/rca/scene/transaction_execute_timeout.py similarity index 100% rename from handler/rca/scene/transaction_execute_timeout_scene.py rename to handler/rca/scene/transaction_execute_timeout.py diff --git a/handler/rca/scene/transaction_not_ending_scene.py b/handler/rca/scene/transaction_not_ending.py similarity index 100% rename from handler/rca/scene/transaction_not_ending_scene.py rename to handler/rca/scene/transaction_not_ending.py diff --git a/handler/rca/scene/transaction_other_error_scene.py b/handler/rca/scene/transaction_other_error.py similarity index 100% rename from handler/rca/scene/transaction_other_error_scene.py rename to handler/rca/scene/transaction_other_error.py diff --git a/handler/rca/scene/transaction_rollback_scene.py b/handler/rca/scene/transaction_rollback.py similarity index 100% rename from handler/rca/scene/transaction_rollback_scene.py rename to handler/rca/scene/transaction_rollback.py diff --git a/handler/rca/scene/transaction_wait_timeout_scene.py b/handler/rca/scene/transaction_wait_timeout.py similarity index 100% rename from handler/rca/scene/transaction_wait_timeout_scene.py rename to handler/rca/scene/transaction_wait_timeout.py From 549df61c669954c702085aced027c5fab22a4516 Mon Sep 17 00:00:00 2001 From: xiaodong-ji Date: Wed, 27 Nov 2024 16:01:23 +0800 Subject: [PATCH 26/32] add staticmethod for tool (#580) * add staticmethod for tool * update build package yml --- .github/workflows/build_package.yml | 1 - common/tool.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml index 63f7104f..118d115b 100644 --- a/.github/workflows/build_package.yml +++ b/.github/workflows/build_package.yml @@ -8,7 +8,6 @@ on: branches: - master - v3.0-dev - - enhancement-optimize_external_script env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true diff --git a/common/tool.py b/common/tool.py index 788c518c..96e49d13 100644 --- a/common/tool.py +++ b/common/tool.py @@ -526,6 +526,7 @@ def unzip(source, ztype=None, stdio=None): stdio and getattr(stdio, 'exception', print)('failed to unzip %s' % source) return None + @staticmethod def extract_tar(tar_path, output_path, stdio=None): if not os.path.exists(output_path): os.makedirs(output_path) @@ -578,6 +579,7 @@ def unlock(obj, stdio=None): fcntl.flock(obj, fcntl.LOCK_UN) return obj + @staticmethod def size_format(num, unit="B", output_str=False, stdio=None): if num < 0: raise ValueError("num cannot be negative!") @@ -637,6 +639,7 @@ def calculate_sha256(filepath, stdio=None): except Exception as e: return "" + @staticmethod def size(size_str, unit='B', stdio=None): unit_size_dict = { "b": 1, @@ -658,10 +661,12 @@ def size(size_str, unit='B', stdio=None): raise ValueError('size cannot be negative!') return real_size / unit_size_dict[unit] + @staticmethod def write_append(filename, result, stdio=None): with io.open(filename, 'a', encoding='utf-8') as fileobj: fileobj.write(u'{}'.format(result)) + @staticmethod def tar_gz_to_zip(temp_dir, tar_gz_file, output_zip, password, stdio): extract_dir = os.path.join(temp_dir, 'extracted_files_{0}'.format(str(uuid.uuid4())[:6])) @@ -1574,6 +1579,7 @@ def print_title(name, 
stdio): def gen_password(length=8, chars=string.ascii_letters + string.digits, stdio=None): return ''.join([choice(chars) for i in range(length)]) + @staticmethod def retry(retry_count=3, retry_interval=2, stdio=None): def real_decorator(decor_method): def wrapper(*args, **kwargs): From 63b387935bb646ae3a6ee3cb53238fdfd718d94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 28 Nov 2024 14:54:23 +0800 Subject: [PATCH 27/32] obdiag update 3.0.0 version --- core.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/core.py b/core.py index 5e492397..41cdc039 100644 --- a/core.py +++ b/core.py @@ -477,13 +477,11 @@ def check(self, opts): if self.context.obproxy_config.get("servers") is not None and len(self.context.obproxy_config.get("servers")) > 0: obproxy_check_handler = CheckHandler(self.context, check_target_type="obproxy") - obproxy_check_handler.handle() - obproxy_result = obproxy_check_handler.execute() + obproxy_result = obproxy_check_handler.handle() result_data['obproxy'] = obproxy_result if self.context.cluster_config.get("servers") is not None and len(self.context.cluster_config.get("servers")) > 0: observer_check_handler = CheckHandler(self.context, check_target_type="observer") - observer_check_handler.handle() - observer_result = observer_check_handler.execute() + observer_result = observer_check_handler.handle() result_data['observer'] = observer_result if obproxy_check_handler is not None: obproxy_report_path = os.path.expanduser(obproxy_check_handler.report.get_report_path()) From a513212303b0b858d1e4239c1c967ee832c90350 Mon Sep 17 00:00:00 2001 From: xiaodong-ji Date: Thu, 28 Nov 2024 21:00:20 +0800 Subject: [PATCH 28/32] optimize use of context (#585) * add context for OBConnector * black format * recover call stdio * use context.stdio for OBConnector * balck format * remove stdio parameter for update_obcluster_nodes --- common/command.py | 10 +++++----- common/config_helper.py | 4 ++-- common/ob_connector.py | 5 +++-- core.py | 4 ++-- handler/analyzer/analyze_index_space.py | 2 +- handler/analyzer/analyze_memory.py | 2 +- handler/analyzer/analyze_parameter.py | 4 ++-- handler/analyzer/analyze_queue.py | 4 ++-- handler/analyzer/analyze_sql.py | 4 ++-- handler/analyzer/analyze_sql_review.py | 4 ++-- handler/analyzer/analyze_variable.py | 2 +- handler/checker/check_handler.py | 2 +- handler/display/display_scenes.py | 4 ++-- handler/gather/gather_ash_report.py | 8 +++++++- handler/gather/gather_parameters.py | 4 ++-- handler/gather/gather_plan_monitor.py | 4 ++-- handler/gather/gather_tabledump.py | 4 ++-- handler/gather/gather_variables.py | 2 +- handler/gather/step/sql.py | 2 +- handler/rca/rca_handler.py | 2 +- telemetry/telemetry.py | 4 ++-- 21 files changed, 44 insertions(+), 37 deletions(-) diff --git a/common/command.py b/common/command.py index 9e1ded07..f62134f8 100644 --- a/common/command.py +++ b/common/command.py @@ -243,7 +243,7 @@ def get_observer_version(context): stdio.verbose("get observer version, by sql") obcluster = context.cluster_config # by sql - observer_version = get_observer_version_by_sql(obcluster, stdio) + observer_version = get_observer_version_by_sql(context, obcluster) except Exception as e: try: stdio.verbose("get observer version, by sql fail. by ssh") @@ -325,15 +325,15 @@ def get_obproxy_version(context): # Only applicable to the community version -def get_observer_version_by_sql(ob_cluster, stdio=None): - stdio.verbose("start get_observer_version_by_sql . 
From a513212303b0b858d1e4239c1c967ee832c90350 Mon Sep 17 00:00:00 2001
From: xiaodong-ji
Date: Thu, 28 Nov 2024 21:00:20 +0800
Subject: [PATCH 28/32] optimize use of context (#585)

* add context for OBConnector
* black format
* recover call stdio
* use context.stdio for OBConnector
* black format
* remove stdio parameter for update_obcluster_nodes
---
 common/command.py                       | 10 +++++-----
 common/config_helper.py                 |  4 ++--
 common/ob_connector.py                  |  5 +++--
 core.py                                 |  4 ++--
 handler/analyzer/analyze_index_space.py |  2 +-
 handler/analyzer/analyze_memory.py      |  2 +-
 handler/analyzer/analyze_parameter.py   |  4 ++--
 handler/analyzer/analyze_queue.py       |  4 ++--
 handler/analyzer/analyze_sql.py         |  4 ++--
 handler/analyzer/analyze_sql_review.py  |  4 ++--
 handler/analyzer/analyze_variable.py    |  2 +-
 handler/checker/check_handler.py        |  2 +-
 handler/display/display_scenes.py       |  4 ++--
 handler/gather/gather_ash_report.py     |  8 +++++++-
 handler/gather/gather_parameters.py     |  4 ++--
 handler/gather/gather_plan_monitor.py   |  4 ++--
 handler/gather/gather_tabledump.py      |  4 ++--
 handler/gather/gather_variables.py      |  2 +-
 handler/gather/step/sql.py              |  2 +-
 handler/rca/rca_handler.py              |  2 +-
 telemetry/telemetry.py                  |  4 ++--
 21 files changed, 44 insertions(+), 37 deletions(-)

diff --git a/common/command.py b/common/command.py
index 9e1ded07..f62134f8 100644
--- a/common/command.py
+++ b/common/command.py
@@ -243,7 +243,7 @@ def get_observer_version(context):
         stdio.verbose("get observer version, by sql")
         obcluster = context.cluster_config
         # by sql
-        observer_version = get_observer_version_by_sql(obcluster, stdio)
+        observer_version = get_observer_version_by_sql(context, obcluster)
     except Exception as e:
         try:
             stdio.verbose("get observer version, by sql fail. by ssh")
@@ -325,15 +325,15 @@ def get_obproxy_version(context):
 
 
 # Only applicable to the community version
-def get_observer_version_by_sql(ob_cluster, stdio=None):
-    stdio.verbose("start get_observer_version_by_sql . input: {0}:{1}".format(ob_cluster.get("db_host"), ob_cluster.get("db_port")))
+def get_observer_version_by_sql(context, ob_cluster):
+    context.stdio.verbose("start get_observer_version_by_sql . input: {0}:{1}".format(ob_cluster.get("db_host"), ob_cluster.get("db_port")))
     try:
-        ob_connector = OBConnector(ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), stdio=stdio, timeout=100)
+        ob_connector = OBConnector(context=context, ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), timeout=100)
         ob_version_info = ob_connector.execute_sql("select version();")
     except Exception as e:
         raise Exception("get_observer_version_by_sql Exception. Maybe cluster'info is error: " + e.__str__())
     ob_version = ob_version_info[0]
-    stdio.verbose("get_observer_version_by_sql ob_version_info is {0}".format(ob_version))
+    context.stdio.verbose("get_observer_version_by_sql ob_version_info is {0}".format(ob_version))
     version = re.findall(r'OceanBase(_)?(.CE)?-v(.+)', ob_version[0])
     if len(version) > 0:
         return version[0][2]
diff --git a/common/config_helper.py b/common/config_helper.py
index 292afbe4..dbf2b030 100644
--- a/common/config_helper.py
+++ b/common/config_helper.py
@@ -55,7 +55,7 @@ def __init__(self, context):
 
     def get_cluster_name(self):
         ob_version = get_observer_version(self.context)
-        obConnetcor = OBConnector(ip=self.db_host, port=self.db_port, username=self.sys_tenant_user, password=self.sys_tenant_password, stdio=self.stdio, timeout=100)
+        obConnetcor = OBConnector(context=self.context, ip=self.db_host, port=self.db_port, username=self.sys_tenant_user, password=self.sys_tenant_password, timeout=100)
         if ob_version.startswith("3") or ob_version.startswith("2"):
             sql = "select cluster_name from oceanbase.v$ob_cluster"
             res = obConnetcor.execute_sql(sql)
@@ -68,7 +68,7 @@ def get_cluster_name(self):
 
     def get_host_info_list_by_cluster(self):
         ob_version = get_observer_version(self.context)
-        obConnetcor = OBConnector(ip=self.db_host, port=self.db_port, username=self.sys_tenant_user, password=self.sys_tenant_password, stdio=self.stdio, timeout=100)
+        obConnetcor = OBConnector(context=self.context, ip=self.db_host, port=self.db_port, username=self.sys_tenant_user, password=self.sys_tenant_password, timeout=100)
         sql = "select SVR_IP, SVR_PORT, ZONE, BUILD_VERSION from oceanbase.DBA_OB_SERVERS"
         if ob_version.startswith("3") or ob_version.startswith("2") or ob_version.startswith("1"):
             sql = "select SVR_IP, SVR_PORT, ZONE, BUILD_VERSION from oceanbase.__all_server"
diff --git a/common/ob_connector.py b/common/ob_connector.py
index 4383738f..02cc9d70 100644
--- a/common/ob_connector.py
+++ b/common/ob_connector.py
@@ -28,21 +28,22 @@ class OBConnector(object):
 
     def __init__(
         self,
+        context,
         ip,
         port,
         username,
        password=None,
         database=None,
-        stdio=None,
         timeout=30,
     ):
+        self.context = context
         self.ip = str(ip)
         self.port = int(port)
         self.username = str(username)
         self.password = str(password)
         self.timeout = timeout
         self.conn = None
-        self.stdio = stdio
+        self.stdio = context.stdio
         self.database = database
         self.init()
 
diff --git a/core.py b/core.py
index 41cdc039..a682f3cf 100644
--- a/core.py
+++ b/core.py
@@ -139,7 +139,7 @@ def set_context(self, handler_name, namespace, config):
             stdio=self.stdio,
             inner_config=self.inner_config_manager.config,
         )
-        telemetry.set_cluster_conn(config.get_ob_cluster_config)
+        telemetry.set_cluster_conn(self.context, config.get_ob_cluster_config)
 
     def set_context_skip_cluster_conn(self, handler_name, namespace, config):
         self.context = HandlerContext(
@@ -175,7 +175,7 @@ def update_obcluster_nodes(self, config):
             return
 
         ob_version = get_observer_version_by_sql(ob_cluster, self.stdio)
-        obConnetcor = OBConnector(ip=ob_cluster["db_host"], port=ob_cluster["db_port"], username=ob_cluster["tenant_sys"]["user"], password=ob_cluster["tenant_sys"]["password"], stdio=self.stdio)
+        obConnetcor = OBConnector(context=self.context, ip=ob_cluster["db_host"], port=ob_cluster["db_port"], username=ob_cluster["tenant_sys"]["user"], password=ob_cluster["tenant_sys"]["password"])
         sql = "select SVR_IP, SVR_PORT, ZONE, BUILD_VERSION from oceanbase.__all_server"
 
         if ob_version.startswith(("1", "2", "3")):
diff --git a/handler/analyzer/analyze_index_space.py b/handler/analyzer/analyze_index_space.py
index ba2b2419..5909e275 100644
--- a/handler/analyzer/analyze_index_space.py
+++ b/handler/analyzer/analyze_index_space.py
@@ -58,7 +58,7 @@ def init_option(self):
         ob_cluster = self.context.cluster_config
         self.stdio.verbose('cluster config: {0}'.format(StringUtils.mask_passwords(ob_cluster)))
         self.ob_cluster = ob_cluster
-        self.sys_connector = OBConnector(ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=100)
+        self.sys_connector = OBConnector(context=self.context, ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), timeout=100)
         tenant_name = Util.get_option(options, 'tenant_name')
         table_name = Util.get_option(options, 'table_name')
         index_name = Util.get_option(options, 'index_name')
diff --git a/handler/analyzer/analyze_memory.py b/handler/analyzer/analyze_memory.py
index 4ed5cc10..980c438b 100644
--- a/handler/analyzer/analyze_memory.py
+++ b/handler/analyzer/analyze_memory.py
@@ -130,7 +130,7 @@ def init_option(self):
     def get_version(self):
         observer_version = ""
         try:
-            observer_version = get_observer_version_by_sql(self.ob_cluster, self.stdio)
+            observer_version = get_observer_version_by_sql(self.context, self.ob_cluster)
         except Exception as e:
             self.stdio.exception("failed to get observer version:{0}".format(e))
         self.stdio.verbose("get observer version: {0}".format(observer_version))
diff --git a/handler/analyzer/analyze_parameter.py b/handler/analyzer/analyze_parameter.py
index 40e5c012..7f8df5d3 100644
--- a/handler/analyzer/analyze_parameter.py
+++ b/handler/analyzer/analyze_parameter.py
@@ -44,11 +44,11 @@ def __init__(self, context, analyze_type='default'):
         self.observer_nodes = self.context.cluster_config.get("servers")
         try:
             self.obconn = OBConnector(
+                context=self.context,
                 ip=self.ob_cluster.get("db_host"),
                 port=self.ob_cluster.get("db_port"),
                 username=self.ob_cluster.get("tenant_sys").get("user"),
                 password=self.ob_cluster.get("tenant_sys").get("password"),
-                stdio=self.stdio,
                 timeout=10000,
                 database="oceanbase",
             )
@@ -59,7 +59,7 @@ def __init__(self, context, analyze_type='default'):
     def get_version(self):
         observer_version = ""
         try:
-            observer_version = get_observer_version_by_sql(self.ob_cluster, self.stdio)
+            observer_version = get_observer_version_by_sql(self.context, self.ob_cluster)
         except Exception as e:
             self.stdio.warn("failed to get observer version:{0}".format(e))
         self.stdio.verbose("get observer version: {0}".format(observer_version))
diff --git a/handler/analyzer/analyze_queue.py b/handler/analyzer/analyze_queue.py
index 62cf11cb..fc26a46c 100644
--- a/handler/analyzer/analyze_queue.py
+++ b/handler/analyzer/analyze_queue.py
@@ -64,11 +64,11 @@ def __init__(self, context):
         self.scope = None
         try:
             self.obconn = OBConnector(
+                context=self.context,
                 ip=self.ob_cluster.get("db_host"),
                 port=self.ob_cluster.get("db_port"),
                 username=self.ob_cluster.get("tenant_sys").get("user"),
                 password=self.ob_cluster.get("tenant_sys").get("password"),
-                stdio=self.stdio,
                 timeout=10000,
                 database="oceanbase",
             )
@@ -159,7 +159,7 @@ def init_option(self):
     def get_version(self):
         observer_version = ""
         try:
-            observer_version = get_observer_version_by_sql(self.ob_cluster, self.stdio)
+            observer_version = get_observer_version_by_sql(self.context, self.ob_cluster)
         except Exception as e:
             self.stdio.warn("AnalyzeQueueHandler failed to get observer version:{0}".format(e))
         self.stdio.verbose("AnalyzeQueueHandler get observer version: {0}".format(observer_version))
diff --git a/handler/analyzer/analyze_sql.py b/handler/analyzer/analyze_sql.py
index ded4f21b..98cb38ef 100644
--- a/handler/analyzer/analyze_sql.py
+++ b/handler/analyzer/analyze_sql.py
@@ -120,7 +120,7 @@ def init_config(self):
         ob_cluster = self.context.cluster_config
         self.stdio.verbose('cluster config: {0}'.format(StringUtils.mask_passwords(ob_cluster)))
         self.ob_cluster = ob_cluster
-        self.sys_connector = OBConnector(ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=100)
+        self.sys_connector = OBConnector(context=self.context, ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), timeout=100)
         self.ob_cluster_name = ob_cluster.get("ob_cluster_name")
         self.stdio.print('init cluster config complete')
         return True
@@ -134,7 +134,7 @@ def init_ob_version(self):
     def init_db_connector(self):
         if self.db_user:
             self.db_connector_provided = True
-            self.db_connector = OBConnector(ip=self.ob_cluster.get("db_host"), port=self.ob_cluster.get("db_port"), username=self.db_user, password=self.db_password, stdio=self.stdio, timeout=100)
+            self.db_connector = OBConnector(context=self.context, ip=self.ob_cluster.get("db_host"), port=self.ob_cluster.get("db_port"), username=self.db_user, password=self.db_password, timeout=100)
         else:
             self.db_connector = self.sys_connector
 
diff --git a/handler/analyzer/analyze_sql_review.py b/handler/analyzer/analyze_sql_review.py
index f2c8494d..03245d38 100644
--- a/handler/analyzer/analyze_sql_review.py
+++ b/handler/analyzer/analyze_sql_review.py
@@ -58,7 +58,7 @@ def init_config(self):
         ob_cluster = self.context.cluster_config
         self.stdio.verbose('cluster config: {0}'.format(StringUtils.mask_passwords(ob_cluster)))
         self.ob_cluster = ob_cluster
-        self.sys_connector = OBConnector(ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=100)
+        self.sys_connector = OBConnector(context=self.context, ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), timeout=100)
         self.ob_cluster_name = ob_cluster.get("ob_cluster_name")
         self.stdio.print('init cluster config complete')
         return True
@@ -67,7 +67,7 @@ def init_db_connector(self):
         if self.db_user:
             self.stdio.verbose("init db connector start")
             self.db_connector_provided = True
-            self.db_connector = OBConnector(ip=self.ob_cluster.get("db_host"), port=self.ob_cluster.get("db_port"), username=self.db_user, password=self.db_password, stdio=self.stdio, timeout=100)
+            self.db_connector = OBConnector(context=self.context, ip=self.ob_cluster.get("db_host"), port=self.ob_cluster.get("db_port"), username=self.db_user, password=self.db_password, timeout=100)
             self.stdio.verbose("init db connector complete")
         else:
             self.db_connector = self.sys_connector
diff --git a/handler/analyzer/analyze_variable.py b/handler/analyzer/analyze_variable.py
index 9199e77a..ce882b82 100644
--- a/handler/analyzer/analyze_variable.py
+++ b/handler/analyzer/analyze_variable.py
@@ -42,11 +42,11 @@ def __init__(self, context, analyze_type='diff'):
         self.observer_nodes = self.context.cluster_config.get("servers")
         try:
             self.obconn = OBConnector(
+                context=self.context,
                 ip=self.ob_cluster.get("db_host"),
                 port=self.ob_cluster.get("db_port"),
                 username=self.ob_cluster.get("tenant_sys").get("user"),
                 password=self.ob_cluster.get("tenant_sys").get("password"),
-                stdio=self.stdio,
                 timeout=10000,
                 database="oceanbase",
             )
diff --git a/handler/checker/check_handler.py b/handler/checker/check_handler.py
index aced1a8e..a5fc20c9 100644
--- a/handler/checker/check_handler.py
+++ b/handler/checker/check_handler.py
@@ -261,7 +261,7 @@ def __init__(self, context, max_size, cluster):
         self.stdio.verbose("obConnectorPool init success!")
         try:
             for i in range(max_size):
-                conn = OBConnector(ip=self.cluster.get("db_host"), port=self.cluster.get("db_port"), username=self.cluster.get("tenant_sys").get("user"), password=self.cluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=10000)
+                conn = OBConnector(context=context, ip=self.cluster.get("db_host"), port=self.cluster.get("db_port"), username=self.cluster.get("tenant_sys").get("user"), password=self.cluster.get("tenant_sys").get("password"), timeout=10000)
                 self.connections.put(conn)
             self.stdio.verbose("obConnectorPool init success!")
         except Exception as e:
diff --git a/handler/display/display_scenes.py b/handler/display/display_scenes.py
index bc1c998d..e20c876e 100644
--- a/handler/display/display_scenes.py
+++ b/handler/display/display_scenes.py
@@ -57,7 +57,7 @@ def __init__(self, context, display_pack_dir='./', tasks_base_path="~/.obdiag/di
 
     def init_config(self):
         self.cluster = self.context.cluster_config
-        self.sys_connector = OBConnector(ip=self.cluster.get("db_host"), port=self.cluster.get("db_port"), username=self.cluster.get("tenant_sys").get("user"), password=self.cluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=100)
+        self.sys_connector = OBConnector(context=self.context, ip=self.cluster.get("db_host"), port=self.cluster.get("db_port"), username=self.cluster.get("tenant_sys").get("user"), password=self.cluster.get("tenant_sys").get("password"), timeout=100)
         self.obproxy_nodes = self.context.obproxy_config['servers']
         self.ob_nodes = self.context.cluster_config['servers']
         new_nodes = Util.get_nodes_list(self.context, self.ob_nodes, self.stdio)
@@ -89,7 +89,7 @@ def execute(self):
             self.stdio.error("Internal error :{0}".format(e))
 
     def __init_db_connector(self):
-        self.db_connector = OBConnector(ip=self.db_conn.get("host"), port=self.db_conn.get("port"), username=self.db_conn.get("user"), password=self.db_conn.get("password"), database=self.db_conn.get("database"), stdio=self.stdio, timeout=100)
+        self.db_connector = OBConnector(context=self.context, ip=self.db_conn.get("host"), port=self.db_conn.get("port"), username=self.db_conn.get("user"), password=self.db_conn.get("password"), database=self.db_conn.get("database"), timeout=100)
 
     def __init_db_conn(self, cli_connection_string):
         try:
diff --git a/handler/gather/gather_ash_report.py b/handler/gather/gather_ash_report.py
index 410f1d55..cac8e8a9 100644
--- a/handler/gather/gather_ash_report.py
+++ b/handler/gather/gather_ash_report.py
@@ -53,7 +53,13 @@ def __init__(self, context, gather_pack_dir='./'):
         self.observer_nodes = self.context.cluster_config.get("servers")
         try:
             self.obconn = OBConnector(
-                ip=self.cluster.get("db_host"), port=self.cluster.get("db_port"), username=self.cluster.get("tenant_sys").get("user"), password=self.cluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=10000, database="oceanbase"
+                context=self.context,
+                ip=self.cluster.get("db_host"),
+                port=self.cluster.get("db_port"),
+                username=self.cluster.get("tenant_sys").get("user"),
+                password=self.cluster.get("tenant_sys").get("password"),
+                timeout=10000,
+                database="oceanbase",
             )
         except Exception as e:
             self.stdio.error("Failed to connect to database: {0}".format(e))
diff --git a/handler/gather/gather_parameters.py b/handler/gather/gather_parameters.py
index 359ff423..439536f2 100644
--- a/handler/gather/gather_parameters.py
+++ b/handler/gather/gather_parameters.py
@@ -40,11 +40,11 @@ def __init__(self, context, gather_pack_dir='./'):
         self.observer_nodes = self.context.cluster_config.get("servers")
         try:
             self.obconn = OBConnector(
+                context=self.context,
                 ip=self.ob_cluster.get("db_host"),
                 port=self.ob_cluster.get("db_port"),
                 username=self.ob_cluster.get("tenant_sys").get("user"),
                 password=self.ob_cluster.get("tenant_sys").get("password"),
-                stdio=self.stdio,
                 timeout=10000,
                 database="oceanbase",
             )
@@ -77,7 +77,7 @@ def init_option(self):
     def get_version(self):
         observer_version = ""
         try:
-            observer_version = get_observer_version_by_sql(self.ob_cluster, self.stdio)
+            observer_version = get_observer_version_by_sql(self.context, self.ob_cluster)
         except Exception as e:
             self.stdio.warn("failed to get observer version:{0}".format(e))
         self.stdio.verbose("get observer version: {0}".format(observer_version))
diff --git a/handler/gather/gather_plan_monitor.py b/handler/gather/gather_plan_monitor.py
index 81ec817a..b652e351 100644
--- a/handler/gather/gather_plan_monitor.py
+++ b/handler/gather/gather_plan_monitor.py
@@ -63,7 +63,7 @@ def __init__(self, context, gather_pack_dir='./', is_scene=False):
     def init_config(self):
         ob_cluster = self.context.cluster_config
         self.ob_cluster = ob_cluster
-        self.sys_connector = OBConnector(ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=100)
+        self.sys_connector = OBConnector(context=self.context, ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), timeout=100)
         self.ob_cluster_name = ob_cluster.get("ob_cluster_name")
         return True
 
@@ -92,7 +92,7 @@ def init_option(self):
         return self.tenant_mode_detected()
 
     def __init_db_connector(self):
-        self.db_connector = OBConnector(ip=self.db_conn.get("host"), port=self.db_conn.get("port"), username=self.db_conn.get("user"), password=self.db_conn.get("password"), database=self.db_conn.get("database"), stdio=self.stdio, timeout=100)
+        self.db_connector = OBConnector(context=self.context, ip=self.db_conn.get("host"), port=self.db_conn.get("port"), username=self.db_conn.get("user"), password=self.db_conn.get("password"), database=self.db_conn.get("database"), timeout=100)
 
     def handle(self):
         if not self.init_config():
diff --git a/handler/gather/gather_tabledump.py b/handler/gather/gather_tabledump.py
index 0d0c2130..46a2626c 100644
--- a/handler/gather/gather_tabledump.py
+++ b/handler/gather/gather_tabledump.py
@@ -88,9 +88,9 @@ def init(self):
             else:
                 self.tenant_name = self.__extract_string(user)
             self.ob_connector = OBConnector(
-                ip=self.ob_cluster.get("db_host"), port=self.ob_cluster.get("db_port"), username=self.ob_cluster.get("tenant_sys").get("user"), password=self.ob_cluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=100
+                context=self.context, ip=self.ob_cluster.get("db_host"), port=self.ob_cluster.get("db_port"), username=self.ob_cluster.get("tenant_sys").get("user"), password=self.ob_cluster.get("tenant_sys").get("password"), timeout=100
             )
-            self.tenant_connector = OBConnector(ip=self.ob_cluster.get("db_host"), port=self.ob_cluster.get("db_port"), username=user, password=password, stdio=self.stdio, timeout=100)
+            self.tenant_connector = OBConnector(context=self.context, ip=self.ob_cluster.get("db_host"), port=self.ob_cluster.get("db_port"), username=user, password=password, timeout=100)
             self.file_name = "{0}/obdiag_tabledump_result_{1}.txt".format(self.store_dir, TimeUtils.timestamp_to_filename_time(self.gather_timestamp))
             return True
         except Exception as e:
diff --git a/handler/gather/gather_variables.py b/handler/gather/gather_variables.py
index 970e5ad2..1c620abc 100644
--- a/handler/gather/gather_variables.py
+++ b/handler/gather/gather_variables.py
@@ -39,11 +39,11 @@ def __init__(self, context, gather_pack_dir='./'):
         self.observer_nodes = self.context.cluster_config.get("servers")
         try:
             self.obconn = OBConnector(
+                context=self.context,
                 ip=self.ob_cluster.get("db_host"),
                 port=self.ob_cluster.get("db_port"),
                 username=self.ob_cluster.get("tenant_sys").get("user"),
                 password=self.ob_cluster.get("tenant_sys").get("password"),
-                stdio=self.stdio,
                 timeout=10000,
                 database="oceanbase",
             )
diff --git a/handler/gather/step/sql.py b/handler/gather/step/sql.py
index 8f6e8050..c754fca6 100644
--- a/handler/gather/step/sql.py
+++ b/handler/gather/step/sql.py
@@ -34,7 +34,7 @@ def __init__(self, context, step, ob_cluster, report_path, task_variable_dict, e
             self.sys_database = None
             self.database = None
             self.env = env
-            self.ob_connector = OBConnector(ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=10000)
+            self.ob_connector = OBConnector(context=self.context, ip=ob_cluster.get("db_host"), port=ob_cluster.get("db_port"), username=ob_cluster.get("tenant_sys").get("user"), password=ob_cluster.get("tenant_sys").get("password"), timeout=10000)
         except Exception as e:
             self.stdio.error("StepSQLHandler init fail. Please check the OBCLUSTER conf. OBCLUSTER: {0} Exception : {1} .".format(ob_cluster, e))
         self.task_variable_dict = task_variable_dict
diff --git a/handler/rca/rca_handler.py b/handler/rca/rca_handler.py
index 581197bb..2546a65c 100644
--- a/handler/rca/rca_handler.py
+++ b/handler/rca/rca_handler.py
@@ -79,11 +79,11 @@ def __init__(self, context):
         try:
             if self.ob_cluster is not None:
                 ob_connector = OBConnector(
+                    context=self.context,
                     ip=self.ob_cluster.get("db_host"),
                     port=self.ob_cluster.get("db_port"),
                     username=self.ob_cluster.get("tenant_sys").get("user"),
                     password=self.ob_cluster.get("tenant_sys").get("password"),
-                    stdio=self.stdio,
                     timeout=10000,
                 )
                 self.context.set_variable("ob_connector", ob_connector)
diff --git a/telemetry/telemetry.py b/telemetry/telemetry.py
index 5c9e2f68..680510e8 100644
--- a/telemetry/telemetry.py
+++ b/telemetry/telemetry.py
@@ -48,7 +48,7 @@ def __init__(self):
             self.version = get_obdiag_version()
         self.stdio = IO(1)
 
-    def set_cluster_conn(self, obcluster):
+    def set_cluster_conn(self, context, obcluster):
         try:
             if not self.work_tag:
                 return
@@ -60,7 +60,7 @@ def set_cluster_conn(self, obcluster):
 
             if obcluster is not None:
                 try:
-                    self.cluster_conn = OBConnector(ip=obcluster.get("db_host"), port=obcluster.get("db_port"), username=obcluster.get("tenant_sys").get("user"), password=obcluster.get("tenant_sys").get("password"), stdio=self.stdio, timeout=10000)
+                    self.cluster_conn = OBConnector(context=context, ip=obcluster.get("db_host"), port=obcluster.get("db_port"), username=obcluster.get("tenant_sys").get("user"), password=obcluster.get("tenant_sys").get("password"), timeout=10000)
                     self.threads.append(threading.Thread(None, self.get_cluster_info()))
                     # self.threads.append(threading.Thread(None, self.get_tenant_info()))
                     for thread in self.threads:
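The change repeated through patch 28 is mechanical: OBConnector now takes the handler context as its first argument and reads context.stdio itself, instead of having a separate stdio threaded through every constructor. A minimal sketch of the new construction pattern, using a stand-in context that exposes only the stdio attribute OBConnector actually reads (inside obdiag the real HandlerContext is passed instead):

    # assuming: from common.ob_connector import OBConnector
    from types import SimpleNamespace

    class PrintIO:
        # tiny stand-in for obdiag's IO object
        def verbose(self, msg):
            print("[verbose]", msg)

    context = SimpleNamespace(stdio=PrintIO())  # stand-in for HandlerContext

    conn = OBConnector(
        context=context,  # stdio is now derived from context.stdio
        ip="192.168.1.1",
        port=2881,
        username="root@sys",
        password="",
        timeout=100,
    )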
From e692585c7591f30b3178e10c242b98701707d4b9 Mon Sep 17 00:00:00 2001
From: xuyan wang <35394786+wayyoungboy@users.noreply.github.com>
Date: Mon, 2 Dec 2024 20:41:57 +0800
Subject: [PATCH 29/32] check update (#590)

* support check list
* support check list
* support check list
* support check list
* support check list
* support check list
* build test package
* build test package
* update GatherComponentLogHandler
* update tar_gz_to_zip
* delete zip on gather
* delete zip on gather
* delete zip on gather
* delete zip on gather
* delete zip on gather
* gather use find to get remote log
* rca add oms_full_trans
* rca add oms_full_trans
* update
* fix core options
* add oms config
* check list add --all
---
 core.py                             | 59 +++++++++++++------------
 example/all-components-with-oms.yml | 67 +++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 27 deletions(-)
 create mode 100644 example/all-components-with-oms.yml

diff --git a/core.py b/core.py
index a682f3cf..7cd2114a 100644
--- a/core.py
+++ b/core.py
@@ -468,32 +468,37 @@ def check(self, opts):
                 self._call_stdio('error', 'No such custum config')
                 return ObdiagResult(ObdiagResult.INPUT_ERROR_CODE, error_data='No such custum config')
         else:
-            self.stdio.print("check start ...")
-            self.update_obcluster_nodes(config)
-            self.set_context('check', 'check', config)
-            obproxy_check_handler = None
-            observer_check_handler = None
-            result_data = {}
-
-            if self.context.obproxy_config.get("servers") is not None and len(self.context.obproxy_config.get("servers")) > 0:
-                obproxy_check_handler = CheckHandler(self.context, check_target_type="obproxy")
-                obproxy_result = obproxy_check_handler.handle()
-                result_data['obproxy'] = obproxy_result
-            if self.context.cluster_config.get("servers") is not None and len(self.context.cluster_config.get("servers")) > 0:
-                observer_check_handler = CheckHandler(self.context, check_target_type="observer")
-                observer_result = observer_check_handler.handle()
-                result_data['observer'] = observer_result
-            if obproxy_check_handler is not None:
-                obproxy_report_path = os.path.expanduser(obproxy_check_handler.report.get_report_path())
-                if os.path.exists(obproxy_report_path):
-                    result_data['obproxy_report_path'] = os.path.abspath(obproxy_report_path)
-                self.stdio.print("Check obproxy finished. For more details, please run cmd '" + Fore.YELLOW + " cat {0} ".format(obproxy_check_handler.report.get_report_path()) + Style.RESET_ALL + "'")
-            if observer_check_handler is not None:
-                observer_report_path = os.path.expanduser(observer_check_handler.report.get_report_path())
-                if os.path.exists(observer_report_path):
-                    result_data['observer_report_path'] = os.path.abspath(observer_report_path)
-                self.stdio.print("Check observer finished. For more details, please run cmd'" + Fore.YELLOW + " cat {0} ".format(observer_check_handler.report.get_report_path()) + Style.RESET_ALL + "'")
-            return ObdiagResult(ObdiagResult.SUCCESS_CODE, data=result_data)
+            try:
+                self.stdio.print("check start ...")
+                self.update_obcluster_nodes(config)
+                self.set_context('check', 'check', config)
+                obproxy_check_handler = None
+                observer_check_handler = None
+                result_data = {}
+
+                if self.context.obproxy_config.get("servers") is not None and len(self.context.obproxy_config.get("servers")) > 0:
+                    obproxy_check_handler = CheckHandler(self.context, check_target_type="obproxy")
+                    obproxy_result = obproxy_check_handler.handle()
+                    result_data['obproxy'] = obproxy_result
+                if self.context.cluster_config.get("servers") is not None and len(self.context.cluster_config.get("servers")) > 0:
+                    observer_check_handler = CheckHandler(self.context, check_target_type="observer")
+                    observer_result = observer_check_handler.handle()
+                    result_data['observer'] = observer_result
+                if obproxy_check_handler is not None:
+                    obproxy_report_path = os.path.expanduser(obproxy_check_handler.report.get_report_path())
+                    if os.path.exists(obproxy_report_path):
+                        result_data['obproxy_report_path'] = os.path.abspath(obproxy_report_path)
+                    self.stdio.print("Check obproxy finished. For more details, please run cmd '" + Fore.YELLOW + " cat {0} ".format(obproxy_check_handler.report.get_report_path()) + Style.RESET_ALL + "'")
+                if observer_check_handler is not None:
+                    observer_report_path = os.path.expanduser(observer_check_handler.report.get_report_path())
+                    if os.path.exists(observer_report_path):
+                        result_data['observer_report_path'] = os.path.abspath(observer_report_path)
+                    self.stdio.print("Check observer finished. For more details, please run cmd'" + Fore.YELLOW + " cat {0} ".format(observer_check_handler.report.get_report_path()) + Style.RESET_ALL + "'")
+                return ObdiagResult(ObdiagResult.SUCCESS_CODE, data=result_data)
+            except Exception as e:
+                self.stdio.error("check Exception: {0}".format(e))
+                self.stdio.verbose(traceback.format_exc())
+                return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="check Exception: {0}".format(e))
 
     def check_list(self, opts):
         config = self.config_manager
@@ -540,7 +545,7 @@ def update(self, opts):
         self.stdio.print("update start ...")
         self.set_offline_context('update', 'update')
         handler = UpdateHandler(self.context)
-        return handler.handle()
+        return handler.execute()
 
     def config(self, opt):
         config = self.config_manager
diff --git a/example/all-components-with-oms.yml b/example/all-components-with-oms.yml
new file mode 100644
index 00000000..f4743908
--- /dev/null
+++ b/example/all-components-with-oms.yml
@@ -0,0 +1,67 @@
+ocp:
+  login:
+    url: http://xx.xx.xx.xx:xx
+    user: admin
+    password: ''
+obcluster:
+  ob_cluster_name: test
+  db_host: 192.168.1.1
+  db_port: 2881 # default 2881
+  tenant_sys:
+    user: root@sys # default root@sys
+    password: ""
+  servers:
+    nodes:
+      - ip: 192.168.1.1
+      - ip: 192.168.1.2
+      - ip: 192.168.1.3
+    global:
+      ssh_username: '' # your username
+      ssh_password: '' # password if need
+      # ssh_port: 22 # your ssh port, default 22
+      # ssh_key_file: "" # your ssh-key file path if need
+      # ssh_type: remote # ssh_type choice [remote, docker, kube] default remote
+      # container_name: xxx # container_name for ssh_type is docker
+      # The directory for oceanbase installed
+      home_path: /root/observer
+      # The directory for data storage. The default value is $home_path/store.
+      # data_dir: /root/observer/store
+      # The directory for clog, ilog, and slog. The default value is the same as the data_dir value.
+      # redo_dir: /root/observer/store
+obproxy:
+  obproxy_cluster_name: obproxy
+  servers:
+    nodes:
+      - ip: 192.168.1.4
+      - ip: 192.168.1.5
+      - ip: 192.168.1.6
+    global:
+      ssh_username: admin # your username
+      ssh_password: '' # password if need
+      # ssh_port: 22 # your ssh port, default 22
+      # ssh_key_file: "" # your ssh-key file path if need
+      # ssh_type: remote # ssh_type choice [remote, docker, kube] default remote
+      # container_name: xxx # container_name for ssh_type is docker
+      # The directory for obproxy installed
+      home_path: /root/obproxy
+oms:
+  oms_name: oms_cluster
+  servers:
+    nodes:
+      - ip: 192.168.1.4
+      - ip: 192.168.1.5
+      - ip: 192.168.1.6
+    global:
+      ssh_username: admin # your username
+      ssh_password: '' # password if need
+      # ssh_port: 22 # your ssh port, default 22
+      # ssh_key_file: "" # your ssh-key file path if need
+      # ssh_type: remote # ssh_type choice [remote, docker, kube] default remote
+      # container_name: xxx # container_name for ssh_type is docker
+      # The directory for oms log
+      log_path: /home/admin/logs
+      # The directory for oms task's run path
+      run_path: /home/admin/run
+      # The directory for oms task's store path
+      store_path: /home/admin/store
+
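Patch 29's new try/except keeps check() inside the tool's result-envelope convention: every path returns an ObdiagResult rather than letting an exception escape to the caller. A simplified sketch of that convention (the numeric codes are placeholders; the real constants live in result_type.py):

    class ObdiagResult:
        SUCCESS_CODE = 200       # placeholder value
        SERVER_ERROR_CODE = 500  # placeholder value

        def __init__(self, code, data=None, error_data=None):
            self.code = code
            self.data = data
            self.error_data = error_data

    def check():
        try:
            result_data = {"observer": "ok"}  # stand-in for the real check results
            return ObdiagResult(ObdiagResult.SUCCESS_CODE, data=result_data)
        except Exception as e:
            return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="check Exception: {0}".format(e))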
From 8efbb92648b62f578f075b28b204bacdec1387cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?=
Date: Thu, 5 Dec 2024 10:57:46 +0800
Subject: [PATCH 30/32] delete test package

---
 .github/workflows/build_package.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml
index 118d115b..de04b0e9 100644
--- a/.github/workflows/build_package.yml
+++ b/.github/workflows/build_package.yml
@@ -7,7 +7,6 @@ on:
   push:
     branches:
       - master
-      - v3.0-dev
 
 env:
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

From 1ac01066c7d47c2f5082fd56db06cf73d28b6b18 Mon Sep 17 00:00:00 2001
From: xuyan wang <35394786+wayyoungboy@users.noreply.github.com>
Date: Thu, 5 Dec 2024 20:42:19 +0800
Subject: [PATCH 31/32] fix parse_env_display when env_list is None (#602)

* support check list
* support check list
* support check list
* support check list
* support check list
* support check list
* build test package
* build test package
* update GatherComponentLogHandler
* update tar_gz_to_zip
* delete zip on gather
* delete zip on gather
* delete zip on gather
* delete zip on gather
* delete zip on gather
* gather use find to get remote log
* rca add oms_full_trans
* rca add oms_full_trans
* update
* fix core options
* add oms config
* check list add --all
* fix parse_env_display when env_list is None
---
 common/tool.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/common/tool.py b/common/tool.py
index 96e49d13..1a03f949 100644
--- a/common/tool.py
+++ b/common/tool.py
@@ -1235,6 +1235,8 @@ def parse_env(env_string, stdio=None):
     @staticmethod
     def parse_env_display(env_list):
         env_dict = {}
+        if not env_list:
+            return {}
         for env_string in env_list:
             # Split key and value
             key_value = env_string.split('=', 1)
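Patch 31's guard is the whole fix: with env_list equal to None, the old code fell straight into the for loop and raised a TypeError. A standalone sketch of the guarded function (the length check on the split result is assumed from surrounding code that the diff truncates):

    def parse_env_display(env_list):
        env_dict = {}
        if not env_list:
            return {}
        for env_string in env_list:
            # Split key and value
            key_value = env_string.split('=', 1)
            if len(key_value) == 2:
                env_dict[key_value[0].strip()] = key_value[1].strip()
        return env_dict

    print(parse_env_display(None))            # {} instead of a TypeError
    print(parse_env_display(["a=1", "b=2"]))  # {'a': '1', 'b': '2'}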
From 08108061f05238d6f4c77bed72856bed67bf4c2d Mon Sep 17 00:00:00 2001
From: xuyan wang <35394786+wayyoungboy@users.noreply.github.com>
Date: Thu, 5 Dec 2024 20:50:12 +0800
Subject: [PATCH 32/32] add gather_component_log file title (#603)

* support check list
* support check list
* support check list
* support check list
* support check list
* support check list
* build test package
* build test package
* update GatherComponentLogHandler
* update tar_gz_to_zip
* delete zip on gather
* delete zip on gather
* delete zip on gather
* delete zip on gather
* delete zip on gather
* gather use find to get remote log
* rca add oms_full_trans
* rca add oms_full_trans
* update
* fix core options
* add oms config
* check list add --all
* fix parse_env_display when env_list is None
* add gather_component_log file title
---
 handler/gather/gather_component_log.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/handler/gather/gather_component_log.py b/handler/gather/gather_component_log.py
index 091843b4..6bf18505 100644
--- a/handler/gather/gather_component_log.py
+++ b/handler/gather/gather_component_log.py
@@ -9,6 +9,12 @@
 # EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 # MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 # See the Mulan PSL v2 for more details.
+
+"""
+@time: 2024/11/8
+@file: gather_component_log.py
+@desc:
+"""
 import datetime
 import os
 import tarfile
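Patch 32 closes the series by giving gather_component_log.py the file-header docstring used across the repo. The same template applied to a hypothetical new module (file name, date, and description here are made up for illustration):

    """
    @time: 2024/12/5
    @file: my_new_handler.py
    @desc: short description of what the module does
    """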