From 410e571351a448465e5fe307f109175cb5329a06 Mon Sep 17 00:00:00 2001
From: John Mulligan
Date: Tue, 20 Aug 2024 17:19:09 -0400
Subject: [PATCH] sambacc: avoid logging an error if cluster is being torn down

Saw this in a ceph teuthology run:
```
2024-08-20 20:39:57,289: DEBUG: Creating RADOS connection
2024-08-20 20:39:57,333: INFO: cluster meta content changed
2024-08-20 20:39:57,333: DEBUG: cluster meta: previous={'nodes': [{'pnn': 0, 'identity': 'smb.adctdb1.0.0.ceph0.kdlxgn', 'node': '192.168.76.200', 'state': 'ready'}, {'pnn': 1, 'identity': 'smb.adctdb1.1.0.ceph1.ngbqkk', 'node': '192.168.76.201', 'state': 'ready'}, {'pnn': 2, 'identity': 'smb.adctdb1.2.0.ceph2.rhmqnu', 'node': '192.168.76.202', 'state': 'ready'}], '_source': 'cephadm'} current={}
2024-08-20 20:39:57,333: ERROR: error during ctdb_monitor_nodes: max() arg is an empty sequence, count=0
Traceback (most recent call last):
  File "/usr/lib/python3.9/site-packages/sambacc/commands/ctdb.py", line 479, in catch
    yield
  File "/usr/lib/python3.9/site-packages/sambacc/commands/ctdb.py", line 360, in ctdb_monitor_nodes
    ctdb.monitor_cluster_meta_changes(
  File "/usr/lib/python3.9/site-packages/sambacc/ctdb.py", line 561, in monitor_cluster_meta_changes
    expected_nodes = _cluster_meta_to_ctdb_nodes(
  File "/usr/lib/python3.9/site-packages/sambacc/ctdb.py", line 506, in _cluster_meta_to_ctdb_nodes
    pnn_max = max(n["pnn"] for n in nodes) + 1 # pnn is zero indexed
ValueError: max() arg is an empty sequence
```

I could see from the ceph logs the smb cluster was being removed right
around this time. If we had nodes and they suddenly vanish we're likely
in the process of getting removed and we raced a tad with cephadm
removing services while the smb mgr module was removing the contents of
the .smb pool.
Signed-off-by: John Mulligan --- sambacc/ctdb.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py index cc4dd55..6bbd4bf 100644 --- a/sambacc/ctdb.py +++ b/sambacc/ctdb.py @@ -551,6 +551,13 @@ def monitor_cluster_meta_changes( if curr_meta == prev_meta: _logger.debug("cluster meta content unchanged: %r", curr_meta) continue + if len(prev_meta) > 0 and len(curr_meta) == 0: + # cluster is possibly (probably?) being destroyed. + # Return from this loop and let the command-level loop decide if + # this function needs to be restarted or not. There's a chance this + # process will be terminated very soon anyway. + _logger.warning("no current nodes available") + return _logger.info("cluster meta content changed") _logger.debug( "cluster meta: previous=%r current=%r", prev_meta, curr_meta