From 126af5044b45fd078fc8a413bdb1db58690f3219 Mon Sep 17 00:00:00 2001 From: Solomon Jacobs Date: Wed, 3 Apr 2024 15:17:35 +0200 Subject: [PATCH] 16417 FIX Add Configuration Option 'checkmkAgentTimeout' CMK-16676 Closes: #26 Change-Id: I697df13efc1d6b5279396d626b0c335c51928892 --- .werks/16417 | 22 +++++++++++++++++++ .../node-collector-machine-sections-ds.yaml | 3 +++ deploy/charts/checkmk/values.yaml | 2 ++ src/checkmk_kube_agent/send_metrics.py | 6 ++--- 4 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 .werks/16417 diff --git a/.werks/16417 b/.werks/16417 new file mode 100644 index 0000000..3eb8e8b --- /dev/null +++ b/.werks/16417 @@ -0,0 +1,22 @@ +Title: Add Configuration Option 'checkmkAgentTimeout' +Class: fix +Compatible: compat +Component: node-collector +Date: 1712152033 +Knowledge: doc +Level: 1 +Version: 2.0.0-alpha.1 + +The machine-sections-collector executes a version of the 'check_mk_agent' to collect information +about the host. Sometimes this script takes more than five seconds, which causes the following +traceback. + +C+: + File "/usr/local/lib/python3.10/subprocess.py", line 1935, in _wait + raise TimeoutExpired(self.args, timeout) +subprocess.TimeoutExpired: Command '['/usr/local/bin/check_mk_agent']' timed out after 5 seconds +C-: + +If you encounter this error, you can configure a longer timeout via the new option +'nodeCollector.machineSectionsCollector.checkmkAgentTimeout' in the 'values.yaml' configuration +file. diff --git a/deploy/charts/checkmk/templates/node-collector-machine-sections-ds.yaml b/deploy/charts/checkmk/templates/node-collector-machine-sections-ds.yaml index ac9c89f..31c6471 100644 --- a/deploy/charts/checkmk/templates/node-collector-machine-sections-ds.yaml +++ b/deploy/charts/checkmk/templates/node-collector-machine-sections-ds.yaml @@ -61,6 +61,9 @@ spec: - "/usr/local/bin/checkmk-machine-sections-collector" args: - "--log-level={{ .Values.nodeCollector.logLevel }}" + {{- if .Values.nodeCollector.machineSectionsCollector.checkmkAgentTimeout }} + - "--checkmk-agent-timeout={{ .Values.nodeCollector.machineSectionsCollector.checkmkAgentTimeout }}" + {{- end }} {{- if .Values.tlsCommunication.enabled }} - "--secure-protocol" {{- if .Values.tlsCommunication.verifySsl }} diff --git a/deploy/charts/checkmk/values.yaml b/deploy/charts/checkmk/values.yaml index 9716e52..b782155 100644 --- a/deploy/charts/checkmk/values.yaml +++ b/deploy/charts/checkmk/values.yaml @@ -282,6 +282,8 @@ nodeCollector: cpu: 150m memory: 200Mi + checkmkAgentTimeout: 5 + # the machine sections collector can collect monitoring information for network interfaces of the underlying node. # this means that the '/sys' directory of the node will be mounted into the container. # the pod security policy is adjusted accordingly. diff --git a/src/checkmk_kube_agent/send_metrics.py b/src/checkmk_kube_agent/send_metrics.py index 2715f27..1a5531d 100644 --- a/src/checkmk_kube_agent/send_metrics.py +++ b/src/checkmk_kube_agent/send_metrics.py @@ -303,7 +303,7 @@ def parse_arguments(argv: Sequence[str]) -> argparse.Namespace: help="Collector log level.", ) parser.add_argument( - "--agent-timeout", + "--checkmk-agent-timeout", type=int, help="Checkmk Agent execution timeout in seconds", ) @@ -313,7 +313,7 @@ def parse_arguments(argv: Sequence[str]) -> argparse.Namespace: max_retries=10, polling_interval=60, ca_cert="/etc/ca-certificates/checkmk-ca-cert.pem", - agent_timeout=5, + checkmk_agent_timeout=5, ) return parser.parse_args(argv) @@ -324,7 +324,7 @@ def container_metrics_worker( cluster_collector_base_url: Url, headers: RequestHeaders, verify: SslVerify, - args: argparse.Namespace, # pylint: disable=unused-argument + _args: argparse.Namespace, ) -> None: # pragma: no cover """ Query cadvisor api, send metrics to cluster collector