From a7c0acd3ba5421bed234d6f8aa486d3ec42d0456 Mon Sep 17 00:00:00 2001 From: Christian Henkel <6976069+ct2034@users.noreply.github.com> Date: Wed, 17 Jul 2024 15:20:55 +0200 Subject: [PATCH] Port cpu_monitor to ROS2 (#326) (#385) * add cpu monitor to the common diagnostics * add 3 unittests * add psutil to package xml * add debug print for cpu percentages and change checking percentage to -1 * update code with review comments. Added a SetUp() method in the test and move the sleep to there to make sure this is done every test in a systemactic way --------- Co-authored-by: Richardvdketterij (cherry picked from commit f89beba1b2c8537399e0e38986d6f2356d5a1ebd) Co-authored-by: Richard <47382675+RichardvdK@users.noreply.github.com> --- diagnostic_common_diagnostics/CMakeLists.txt | 7 +- diagnostic_common_diagnostics/README.md | 32 ++++- .../cpu_monitor.py | 119 ++++++++++++++++++ diagnostic_common_diagnostics/mainpage.dox | 1 + diagnostic_common_diagnostics/package.xml | 6 +- .../test/systemtest/test_cpu_monitor.py | 116 +++++++++++++++++ 6 files changed, 271 insertions(+), 10 deletions(-) create mode 100755 diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py create mode 100644 diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py diff --git a/diagnostic_common_diagnostics/CMakeLists.txt b/diagnostic_common_diagnostics/CMakeLists.txt index 72071be16..3415a0424 100644 --- a/diagnostic_common_diagnostics/CMakeLists.txt +++ b/diagnostic_common_diagnostics/CMakeLists.txt @@ -8,6 +8,7 @@ find_package(ament_cmake_python REQUIRED) ament_python_install_package(${PROJECT_NAME}) install(PROGRAMS + ${PROJECT_NAME}/cpu_monitor.py ${PROJECT_NAME}/ntp_monitor.py ${PROJECT_NAME}/ram_monitor.py ${PROJECT_NAME}/sensors_monitor.py @@ -19,10 +20,14 @@ if(BUILD_TESTING) ament_lint_auto_find_test_dependencies() find_package(ament_cmake_pytest REQUIRED) + ament_add_pytest_test( + test_cpu_monitor + test/systemtest/test_cpu_monitor.py + TIMEOUT 10) ament_add_pytest_test( test_ntp_monitor test/systemtest/test_ntp_monitor.py TIMEOUT 10) endif() -ament_package() \ No newline at end of file +ament_package() diff --git a/diagnostic_common_diagnostics/README.md b/diagnostic_common_diagnostics/README.md index e88ccff52..3b410ee34 100644 --- a/diagnostic_common_diagnostics/README.md +++ b/diagnostic_common_diagnostics/README.md @@ -7,8 +7,31 @@ Currently only the NTP monitor is ported to ROS2. # Nodes +## cpu_monitor.py +The `cpu_monitor` module allows users to monitor the CPU usage of their system in real-time. +It publishes the usage percentage in a diagnostic message. + +* Name of the node is "cpu_monitor_" + hostname. +* Uses the following args: + * warning_percentage: If the CPU usage is > warning_percentage, a WARN status will be publised. + * window: the maximum length of the used collections.deque for queuing CPU readings. + +### Published Topics +#### /diagnostics +diagnostic_msgs/DiagnosticArray +The diagnostics information. + +### Parameters +#### warning_percentage +(default: 90) +warning percentage threshold. + +#### window +(default: 1) +Length of CPU readings queue. + ## ntp_monitor.py -Runs 'ntpdate' to check if the system clock is synchronized with the NTP server. +Runs 'ntpdate' to check if the system clock is synchronized with the NTP server. * If the offset is smaller than `offset-tolerance`, an `OK` status will be published. * If the offset is larger than the configured `offset-tolerance`, a `WARN` status will be published, * if it is bigger than `error-offset-tolerance`, an `ERROR` status will be published. @@ -20,7 +43,7 @@ diagnostic_msgs/DiagnosticArray The diagnostics information. ### Parameters -#### ntp_hostname +#### ntp_hostname (default: "pool.ntp.org") Hostname of NTP server. @@ -46,9 +69,6 @@ Disable self test. ## hd_monitor.py **To be ported** -## cpu_monitor.py -**To be ported** - ## ram_monitor.py The `ram_monitor` module allows users to monitor the RAM usage of their system in real-time. It publishes the usage percentage in a diagnostic message. @@ -89,4 +109,4 @@ The diagnostics information. Whether to ignore the fan speed. ## tf_monitor.py -**To be ported** \ No newline at end of file +**To be ported** diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py new file mode 100755 index 000000000..866629572 --- /dev/null +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2017, TNO IVS, Helmond, Netherlands +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the TNO IVS nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +# \author Rein Appeldoorn + +from collections import deque +import socket +import traceback + +from diagnostic_msgs.msg import DiagnosticStatus + +from diagnostic_updater import DiagnosticTask, Updater + +import psutil + +import rclpy +from rclpy.node import Node + + +class CpuTask(DiagnosticTask): + + def __init__(self, warning_percentage=90, window=1): + DiagnosticTask.__init__(self, 'CPU Information') + + self._warning_percentage = int(warning_percentage) + self._readings = deque(maxlen=window) + + def _get_average_reading(self): + def avg(lst): + return float(sum(lst)) / len(lst) if lst else float('nan') + + return [avg(cpu_percentages) + for cpu_percentages in zip(*self._readings)] + + def run(self, stat): + self._readings.append(psutil.cpu_percent(percpu=True)) + cpu_percentages = self._get_average_reading() + cpu_average = sum(cpu_percentages) / len(cpu_percentages) + + stat.add('CPU Load Average', f'{cpu_average:.2f}') + + warn = False + for idx, cpu_percentage in enumerate(cpu_percentages): + stat.add(f'CPU {idx} Load', f'{cpu_percentage:.2f}') + if cpu_percentage > self._warning_percentage: + warn = True + + if warn: + stat.summary(DiagnosticStatus.WARN, + f'At least one CPU exceeds {self._warning_percentage} percent') + else: + stat.summary(DiagnosticStatus.OK, + f'CPU Average {cpu_average:.2f} percent') + + return stat + + +def main(args=None): + rclpy.init(args=args) + + # Create the node + hostname = socket.gethostname() + node = Node(f'cpu_monitor_{hostname.replace("-", "_")}') + + # Declare and get parameters + node.declare_parameter('warning_percentage', 90) + node.declare_parameter('window', 1) + + warning_percentage = node.get_parameter( + 'warning_percentage').get_parameter_value().integer_value + window = node.get_parameter('window').get_parameter_value().integer_value + + # Create diagnostic updater with default updater rate of 1 hz + updater = Updater(node) + updater.setHardwareID(hostname) + updater.add(CpuTask(warning_percentage=warning_percentage, window=window)) + + rclpy.spin(node) + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + pass + except Exception: + traceback.print_exc() diff --git a/diagnostic_common_diagnostics/mainpage.dox b/diagnostic_common_diagnostics/mainpage.dox index 25ee9c863..7aa872cef 100644 --- a/diagnostic_common_diagnostics/mainpage.dox +++ b/diagnostic_common_diagnostics/mainpage.dox @@ -4,6 +4,7 @@ \b diagnostic_common_diagnostics contains a few common diagnostic nodes +- cpu_monitor publishes diagnostic messages with the CPU usage of the system. - ntp_monitor publishes diagnostic messages for how well the NTP time sync is working. - tf_monitor used to publish diagnostic messages reporting on the health of the TF tree. It is based on tfwtf. It is not ported to ROS2. diff --git a/diagnostic_common_diagnostics/package.xml b/diagnostic_common_diagnostics/package.xml index b58ff9ced..062bfdcd1 100644 --- a/diagnostic_common_diagnostics/package.xml +++ b/diagnostic_common_diagnostics/package.xml @@ -18,21 +18,21 @@ ament_cmake ament_cmake_python - rclpy diagnostic_updater lm-sensors python3-ntplib + python3-psutil ament_lint_auto ament_cmake_xmllint ament_cmake_lint_cmake - + ament_cmake_pytest diff --git a/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py new file mode 100644 index 000000000..28430c482 --- /dev/null +++ b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- +# Software License Agreement (BSD License) +# +# Copyright (c) 2023, Robert Bosch GmbH +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import time +import unittest + +from diagnostic_common_diagnostics.cpu_monitor import CpuTask + +from diagnostic_msgs.msg import DiagnosticStatus + +from diagnostic_updater import DiagnosticArray, Updater +from diagnostic_updater import DiagnosticStatusWrapper + +import rclpy +from rclpy.node import Node + + +class TestCPUMonitor(unittest.TestCase): + + @classmethod + def setUpClass(cls): + rclpy.init(args=None) + + @classmethod + def tearDownClass(cls): + if rclpy.ok(): + rclpy.shutdown() + + def setUp(self): + # In this case is recommended for accuracy that psutil.cpu_percent() + # function be called with at least 0.1 seconds between calls. + time.sleep(0.1) + + def diagnostics_callback(self, msg): + self.message_recieved = True + self.assertEqual(len(msg.status), 1) + + def test_ok(self): + warning_percentage = 100 + task = CpuTask(warning_percentage) + stat = DiagnosticStatusWrapper() + task.run(stat) + self.assertEqual(task.name, 'CPU Information') + self.assertEqual(stat.level, DiagnosticStatus.OK) + self.assertIn(str('CPU Average'), stat.message) + + # Check for at least 1 CPU Load Average and 1 CPU Load + self.assertGreaterEqual(len(stat.values), 2) + + def test_warn(self): + warning_percentage = -1 + task = CpuTask(warning_percentage) + stat = DiagnosticStatusWrapper() + task.run(stat) + print(f'Raw readings: {task._readings}') + self.assertEqual(task.name, 'CPU Information') + self.assertEqual(stat.level, DiagnosticStatus.WARN) + self.assertIn(str('At least one CPU exceeds'), stat.message) + + # Check for at least 1 CPU Load Average and 1 CPU Load + self.assertGreaterEqual(len(stat.values), 2) + + def test_updater(self): + self.message_recieved = False + + node = Node('cpu_monitor_test') + updater = Updater(node) + updater.setHardwareID('test_id') + updater.add(CpuTask()) + + node.create_subscription( + DiagnosticArray, '/diagnostics', self.diagnostics_callback, 10) + + start_time = time.time() + timeout = 5.0 # Timeout in seconds + + while not self.message_recieved: + rclpy.spin_once(node) + time.sleep(0.1) + elapsed_time = time.time() - start_time + if elapsed_time >= timeout: + self.fail('No diagnostics received') + + +if __name__ == '__main__': + unittest.main()