Skip to content

Commit

Permalink
Port cpu_monitor to ROS2 (#326) (#385)
Browse files Browse the repository at this point in the history
* add cpu monitor to the common diagnostics
* add 3 unittests
* add psutil to package xml
* add debug print for cpu percentages and change checking percentage to -1
* update code with review comments. Added a SetUp() method in the test and move the sleep to there to make sure this is done every test in a systemactic way

---------

Co-authored-by: Richardvdketterij <[email protected]>
(cherry picked from commit f89beba)

Co-authored-by: Richard <[email protected]>
  • Loading branch information
ct2034 and RichardvdK authored Jul 17, 2024
1 parent de345ff commit a7c0acd
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 10 deletions.
7 changes: 6 additions & 1 deletion diagnostic_common_diagnostics/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ find_package(ament_cmake_python REQUIRED)
ament_python_install_package(${PROJECT_NAME})

install(PROGRAMS
${PROJECT_NAME}/cpu_monitor.py
${PROJECT_NAME}/ntp_monitor.py
${PROJECT_NAME}/ram_monitor.py
${PROJECT_NAME}/sensors_monitor.py
Expand All @@ -19,10 +20,14 @@ if(BUILD_TESTING)
ament_lint_auto_find_test_dependencies()

find_package(ament_cmake_pytest REQUIRED)
ament_add_pytest_test(
test_cpu_monitor
test/systemtest/test_cpu_monitor.py
TIMEOUT 10)
ament_add_pytest_test(
test_ntp_monitor
test/systemtest/test_ntp_monitor.py
TIMEOUT 10)
endif()

ament_package()
ament_package()
32 changes: 26 additions & 6 deletions diagnostic_common_diagnostics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,31 @@ Currently only the NTP monitor is ported to ROS2.

# Nodes

## cpu_monitor.py
The `cpu_monitor` module allows users to monitor the CPU usage of their system in real-time.
It publishes the usage percentage in a diagnostic message.

* Name of the node is "cpu_monitor_" + hostname.
* Uses the following args:
* warning_percentage: If the CPU usage is > warning_percentage, a WARN status will be publised.
* window: the maximum length of the used collections.deque for queuing CPU readings.

### Published Topics
#### /diagnostics
diagnostic_msgs/DiagnosticArray
The diagnostics information.

### Parameters
#### warning_percentage
(default: 90)
warning percentage threshold.

#### window
(default: 1)
Length of CPU readings queue.

## ntp_monitor.py
Runs 'ntpdate' to check if the system clock is synchronized with the NTP server.
Runs 'ntpdate' to check if the system clock is synchronized with the NTP server.
* If the offset is smaller than `offset-tolerance`, an `OK` status will be published.
* If the offset is larger than the configured `offset-tolerance`, a `WARN` status will be published,
* if it is bigger than `error-offset-tolerance`, an `ERROR` status will be published.
Expand All @@ -20,7 +43,7 @@ diagnostic_msgs/DiagnosticArray
The diagnostics information.

### Parameters
#### ntp_hostname
#### ntp_hostname
(default: "pool.ntp.org")
Hostname of NTP server.

Expand All @@ -46,9 +69,6 @@ Disable self test.
## hd_monitor.py
**To be ported**

## cpu_monitor.py
**To be ported**

## ram_monitor.py
The `ram_monitor` module allows users to monitor the RAM usage of their system in real-time.
It publishes the usage percentage in a diagnostic message.
Expand Down Expand Up @@ -89,4 +109,4 @@ The diagnostics information.
Whether to ignore the fan speed.

## tf_monitor.py
**To be ported**
**To be ported**
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Software License Agreement (BSD License)
#
# Copyright (c) 2017, TNO IVS, Helmond, Netherlands
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of the TNO IVS nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

# \author Rein Appeldoorn

from collections import deque
import socket
import traceback

from diagnostic_msgs.msg import DiagnosticStatus

from diagnostic_updater import DiagnosticTask, Updater

import psutil

import rclpy
from rclpy.node import Node


class CpuTask(DiagnosticTask):

def __init__(self, warning_percentage=90, window=1):
DiagnosticTask.__init__(self, 'CPU Information')

self._warning_percentage = int(warning_percentage)
self._readings = deque(maxlen=window)

def _get_average_reading(self):
def avg(lst):
return float(sum(lst)) / len(lst) if lst else float('nan')

return [avg(cpu_percentages)
for cpu_percentages in zip(*self._readings)]

def run(self, stat):
self._readings.append(psutil.cpu_percent(percpu=True))
cpu_percentages = self._get_average_reading()
cpu_average = sum(cpu_percentages) / len(cpu_percentages)

stat.add('CPU Load Average', f'{cpu_average:.2f}')

warn = False
for idx, cpu_percentage in enumerate(cpu_percentages):
stat.add(f'CPU {idx} Load', f'{cpu_percentage:.2f}')
if cpu_percentage > self._warning_percentage:
warn = True

if warn:
stat.summary(DiagnosticStatus.WARN,
f'At least one CPU exceeds {self._warning_percentage} percent')
else:
stat.summary(DiagnosticStatus.OK,
f'CPU Average {cpu_average:.2f} percent')

return stat


def main(args=None):
rclpy.init(args=args)

# Create the node
hostname = socket.gethostname()
node = Node(f'cpu_monitor_{hostname.replace("-", "_")}')

# Declare and get parameters
node.declare_parameter('warning_percentage', 90)
node.declare_parameter('window', 1)

warning_percentage = node.get_parameter(
'warning_percentage').get_parameter_value().integer_value
window = node.get_parameter('window').get_parameter_value().integer_value

# Create diagnostic updater with default updater rate of 1 hz
updater = Updater(node)
updater.setHardwareID(hostname)
updater.add(CpuTask(warning_percentage=warning_percentage, window=window))

rclpy.spin(node)


if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
pass
except Exception:
traceback.print_exc()
1 change: 1 addition & 0 deletions diagnostic_common_diagnostics/mainpage.dox
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

\b diagnostic_common_diagnostics contains a few common diagnostic nodes

- cpu_monitor publishes diagnostic messages with the CPU usage of the system.
- ntp_monitor publishes diagnostic messages for how well the NTP time sync is working.
- tf_monitor used to publish diagnostic messages reporting on the health of
the TF tree. It is based on tfwtf. It is not ported to ROS2.
Expand Down
6 changes: 3 additions & 3 deletions diagnostic_common_diagnostics/package.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,21 @@
<buildtool_depend>ament_cmake</buildtool_depend>
<buildtool_depend>ament_cmake_python</buildtool_depend>

<exec_depend>rclpy</exec_depend>
<exec_depend>diagnostic_updater</exec_depend>
<exec_depend>lm-sensors</exec_depend>
<exec_depend>python3-ntplib</exec_depend>
<exec_depend>python3-psutil</exec_depend>

<test_depend>ament_lint_auto</test_depend>
<!-- Usage of ament_lint_common is locked by https://github.com/ament/ament_lint/issues/423
For now, enable the linters that do support exlusions.
Once all files are migrated to ROS2, all linters can be
Once all files are migrated to ROS2, all linters can be
enabled vi ament_lint_common as the files are cleaned up. -->
<!-- <test_depend>ament_cmake_flake8</test_depend> -->
<!-- <test_depend>ament_cmake_pep257</test_depend> -->
<test_depend>ament_cmake_xmllint</test_depend>
<test_depend>ament_cmake_lint_cmake</test_depend>

<test_depend>ament_cmake_pytest</test_depend>

<export>
Expand Down
116 changes: 116 additions & 0 deletions diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
# Software License Agreement (BSD License)
#
# Copyright (c) 2023, Robert Bosch GmbH
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of the Willow Garage nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import time
import unittest

from diagnostic_common_diagnostics.cpu_monitor import CpuTask

from diagnostic_msgs.msg import DiagnosticStatus

from diagnostic_updater import DiagnosticArray, Updater
from diagnostic_updater import DiagnosticStatusWrapper

import rclpy
from rclpy.node import Node


class TestCPUMonitor(unittest.TestCase):

@classmethod
def setUpClass(cls):
rclpy.init(args=None)

@classmethod
def tearDownClass(cls):
if rclpy.ok():
rclpy.shutdown()

def setUp(self):
# In this case is recommended for accuracy that psutil.cpu_percent()
# function be called with at least 0.1 seconds between calls.
time.sleep(0.1)

def diagnostics_callback(self, msg):
self.message_recieved = True
self.assertEqual(len(msg.status), 1)

def test_ok(self):
warning_percentage = 100
task = CpuTask(warning_percentage)
stat = DiagnosticStatusWrapper()
task.run(stat)
self.assertEqual(task.name, 'CPU Information')
self.assertEqual(stat.level, DiagnosticStatus.OK)
self.assertIn(str('CPU Average'), stat.message)

# Check for at least 1 CPU Load Average and 1 CPU Load
self.assertGreaterEqual(len(stat.values), 2)

def test_warn(self):
warning_percentage = -1
task = CpuTask(warning_percentage)
stat = DiagnosticStatusWrapper()
task.run(stat)
print(f'Raw readings: {task._readings}')
self.assertEqual(task.name, 'CPU Information')
self.assertEqual(stat.level, DiagnosticStatus.WARN)
self.assertIn(str('At least one CPU exceeds'), stat.message)

# Check for at least 1 CPU Load Average and 1 CPU Load
self.assertGreaterEqual(len(stat.values), 2)

def test_updater(self):
self.message_recieved = False

node = Node('cpu_monitor_test')
updater = Updater(node)
updater.setHardwareID('test_id')
updater.add(CpuTask())

node.create_subscription(
DiagnosticArray, '/diagnostics', self.diagnostics_callback, 10)

start_time = time.time()
timeout = 5.0 # Timeout in seconds

while not self.message_recieved:
rclpy.spin_once(node)
time.sleep(0.1)
elapsed_time = time.time() - start_time
if elapsed_time >= timeout:
self.fail('No diagnostics received')


if __name__ == '__main__':
unittest.main()

0 comments on commit a7c0acd

Please sign in to comment.