From e2d9f87a56c4758b87a75b519ead6fba88114bd4 Mon Sep 17 00:00:00 2001 From: Gregory Boudreau <45526465+gregoryboudreau@users.noreply.github.com> Date: Tue, 28 Nov 2023 17:48:21 -0600 Subject: [PATCH] Add dynamic sensor logic for fixed and psu presence/state checking in thermalctld (#401) * add modular sensor logic even for fixed devices and presence/status checking for PSUs * test changes * fixing accidental removal of name * logic correction * isolating key error * psu runtime change * fixing whitespace addition * remove powergood check from thermalctld logic --- sonic-thermalctld/scripts/thermalctld | 41 +++++++++++---------- sonic-thermalctld/tests/mock_platform.py | 3 ++ sonic-thermalctld/tests/test_thermalctld.py | 6 +-- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index 0abc0f4b0..82d64a105 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -520,10 +520,10 @@ class TemperatureUpdater(logger.Logger): self.table = swsscommon.Table(state_db, TemperatureUpdater.TEMPER_INFO_TABLE_NAME) self.phy_entity_table = swsscommon.Table(state_db, PHYSICAL_ENTITY_INFO_TABLE) self.chassis_table = None + self.all_thermals = set() self.is_chassis_system = chassis.is_modular_chassis() if self.is_chassis_system: - self.module_thermals = set() my_slot = try_get(chassis.get_my_slot, INVALID_SLOT) if my_slot != INVALID_SLOT: try: @@ -566,19 +566,23 @@ class TemperatureUpdater(logger.Logger): :return: """ self.log_debug("Start temperature updating") + available_thermals = set() for index, thermal in enumerate(self.chassis.get_all_thermals()): if self.task_stopping_event.is_set(): return + available_thermals.add((thermal, CHASSIS_INFO_KEY, index)) self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index) for psu_index, psu in enumerate(self.chassis.get_all_psus()): parent_name = 'PSU {}'.format(psu_index + 1) - for thermal_index, thermal in enumerate(psu.get_all_thermals()): - if self.task_stopping_event.is_set(): - return + if psu.get_presence(): + for thermal_index, thermal in enumerate(psu.get_all_thermals()): + if self.task_stopping_event.is_set(): + return - self._refresh_temperature_status(parent_name, thermal, thermal_index) + available_thermals.add((thermal, parent_name, thermal_index)) + self._refresh_temperature_status(parent_name, thermal, thermal_index) for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()): parent_name = 'SFP {}'.format(sfp_index + 1) @@ -586,10 +590,10 @@ class TemperatureUpdater(logger.Logger): if self.task_stopping_event.is_set(): return + available_thermals.add((thermal, parent_name, thermal_index)) self._refresh_temperature_status(parent_name, thermal, thermal_index) if self.is_chassis_system: - available_thermals = set() for module_index, module in enumerate(self.chassis.get_all_modules()): module_name = try_get(module.get_name, 'Module {}'.format(module_index + 1)) @@ -610,19 +614,18 @@ class TemperatureUpdater(logger.Logger): self._refresh_temperature_status(sfp_name, thermal, thermal_index) for psu_index, psu in enumerate(module.get_all_psus()): - psu_name = '{} PSU {}'.format(module_name, psu_index + 1) - for thermal_index, thermal in enumerate(psu.get_all_thermals()): - if self.task_stopping_event.is_set(): - return - - available_thermals.add((thermal, psu_name, thermal_index)) - self._refresh_temperature_status(psu_name, thermal, thermal_index) - - - thermals_to_remove = self.module_thermals - available_thermals - self.module_thermals = available_thermals - for thermal, parent_name, thermal_index in thermals_to_remove: - self._remove_thermal_from_db(thermal, parent_name, thermal_index) + if psu.get_presence(): + psu_name = '{} PSU {}'.format(module_name, psu_index + 1) + for thermal_index, thermal in enumerate(psu.get_all_thermals()): + if self.task_stopping_event.is_set(): + return + available_thermals.add((thermal, psu_name, thermal_index)) + self._refresh_temperature_status(psu_name, thermal, thermal_index) + + thermals_to_remove = self.all_thermals - available_thermals + self.all_thermals = available_thermals + for thermal, parent_name, thermal_index in thermals_to_remove: + self._remove_thermal_from_db(thermal, parent_name, thermal_index) self.log_debug("End temperature updating") diff --git a/sonic-thermalctld/tests/mock_platform.py b/sonic-thermalctld/tests/mock_platform.py index 660903226..038347e48 100644 --- a/sonic-thermalctld/tests/mock_platform.py +++ b/sonic-thermalctld/tests/mock_platform.py @@ -175,6 +175,9 @@ def get_serial(self): def get_status(self): return self._status + + def get_powergood_status(self): + return self._status def set_status(self, status): self._status = status diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index 6fe9ccbd1..151b72fb7 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -501,11 +501,11 @@ def test_update_module_thermals(self): chassis.set_modular_chassis(True) temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() - assert len(temperature_updater.module_thermals) == 3 - + assert len(temperature_updater.all_thermals) == 3 + chassis._module_list = [] temperature_updater.update() - assert len(temperature_updater.module_thermals) == 0 + assert len(temperature_updater.all_thermals) == 0 # Modular chassis-related tests