From 4d11b576ae7912104bc38daeb37663630651bf06 Mon Sep 17 00:00:00 2001
From: Pierre Rogier
Date: Thu, 20 Feb 2025 16:47:19 +0100
Subject: [PATCH] Ignore replica busy condition in healthcheck

---
Reviewer notes (text between the '---' line and the first 'diff --git'
line is not part of the commit message and is skipped when applying):

 * Line breaks, indentation and blank context lines were rebuilt from a
   whitespace-collapsed copy of this patch.  Blank-line positions were
   checked against the hunk line counts, but the indentation of context
   lines is inferred (NOTE(review): confirm against the target tree).
   If a hunk is rejected, retry with `git apply --ignore-whitespace`.
 * LoadInstance.create_test_user: 'sn' was a plain string holding the
   literal text "{inst.serverid}"; it is now an f-string like the other
   attributes.
 * assert_is_in_logcap: dropped the unused "as exc" binding.
 * test_healthcheck_replica_busy: docstring/comment grammar fixes only.

 .../suites/healthcheck/health_repl_test.py    | 125 +++++++++++++++---
 src/lib389/lib389/replica.py                  |   3 +
 2 files changed, 110 insertions(+), 18 deletions(-)

diff --git a/dirsrvtests/tests/suites/healthcheck/health_repl_test.py b/dirsrvtests/tests/suites/healthcheck/health_repl_test.py
index a8d94dfcb0..59971a7f29 100644
--- a/dirsrvtests/tests/suites/healthcheck/health_repl_test.py
+++ b/dirsrvtests/tests/suites/healthcheck/health_repl_test.py
@@ -9,7 +9,11 @@
 
 import pytest
 import os
-from contextlib import suppress
+import random
+import string
+import threading
+import time
+from contextlib import suppress, AbstractContextManager
 from lib389.backend import Backend, Backends
 from lib389.idm.user import UserAccounts
 from lib389.replica import Changelog, ReplicationManager, Replicas
@@ -23,11 +27,71 @@
 CMD_OUTPUT = 'No issues found.'
 JSON_OUTPUT = '[]'
 
+LOGIC_DICT = {
+    False: ("not ", "", lambda x: x),
+    True: ("", "not ", lambda x: not x)
+}
+
 ds_paths = Paths()
 log = logging.getLogger(__name__)
 
 
-def run_healthcheck_and_flush_log(topology, instance, searched_code, json, searched_code2=None):
+class LoadInstance(AbstractContextManager):
+    @staticmethod
+    def create_test_user(inst):
+        users = UserAccounts(inst, DEFAULT_SUFFIX)
+        uid = str(20000 + int(inst.serverid[8:]))
+        properties = {
+            'uid': f'testuser_{inst.serverid}',
+            'cn': f'testuser_{inst.serverid}',
+            'sn': f'user_{inst.serverid}',
+            'uidNumber': uid,
+            'gidNumber': uid,
+            'homeDirectory': f'/home/testuser_{inst.serverid}'
+        }
+        return users.ensure_state(properties=properties)
+
+    def __init__(self, inst):
+        self.inst = inst
+        self.stop = threading.Event()
+        self.thread = threading.Thread(target=self.loader)
+        self.user = LoadInstance.create_test_user(inst)
+
+    def loader(self):
+        while not self.stop.is_set():
+            value = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
+            self.user.replace('description', value)
+            #log.info(f'Modified {self.user.dn} description with {value} on {self.inst.serverid}')
+            time.sleep(0.001)
+
+    def __exit__(self, *args):
+        self.stop.set()
+        self.thread.join()
+        self.user.delete()
+
+    def __enter__(self):
+        self.thread.start()
+
+
+def assert_is_in_logcap(logcap, searched_code, isnot=False):
+    # Assert if searched_code is not in logcap
+    if searched_code is None:
+        return
+
+    # Handle positive and negative tests:
+    nomatch, match, f = LOGIC_DICT[bool(isnot)]
+    try:
+        assert f(logcap.contains(searched_code))
+        log.info(f'Searched code {searched_code} is {match}in healthcheck output')
+    except AssertionError:
+        output = []
+        logcap.emit(output)
+        output = "\n".join(output)
+        log.error(f'{searched_code} is {nomatch}in healthcheck output: {output}')
+        raise
+
+
+def run_healthcheck_and_flush_log(topology, instance, searched_code, json, searched_code2=None, isnot=False):
     args = FakeArgs()
     args.instance = instance.serverid
     args.verbose = instance.verbose
@@ -35,27 +99,15 @@ def run_healthcheck_and_flush_log(topology, instance, searched_code, json, searc
     args.list_checks = False
     args.check = ['replication', 'backends:userroot:cl_trimming']
     args.dry_run = False
+    args.json = json
 
     if json:
         log.info('Use healthcheck with --json option')
-        args.json = json
-        health_check_run(instance, topology.logcap.log, args)
-        assert topology.logcap.contains(searched_code)
-        log.info('Healthcheck returned searched code: %s' % searched_code)
-
-        if searched_code2 is not None:
-            assert topology.logcap.contains(searched_code2)
-            log.info('Healthcheck returned searched code: %s' % searched_code2)
     else:
         log.info('Use healthcheck without --json option')
-        args.json = json
-        health_check_run(instance, topology.logcap.log, args)
-        assert topology.logcap.contains(searched_code)
-        log.info('Healthcheck returned searched code: %s' % searched_code)
-
-        if searched_code2 is not None:
-            assert topology.logcap.contains(searched_code2)
-            log.info('Healthcheck returned searched code: %s' % searched_code2)
+    health_check_run(instance, topology.logcap.log, args)
+    assert_is_in_logcap(topology.logcap, searched_code, isnot=isnot)
+    assert_is_in_logcap(topology.logcap, searched_code2, isnot=isnot)
 
     log.info('Clear the log')
     topology.logcap.flush()
@@ -245,6 +297,42 @@ def test_healthcheck_non_replicated_suffixes(topology_m2):
     health_check_run(inst, topology_m2.logcap.log, args)
 
 
+@pytest.mark.xfail(ds_is_older("2.7"), reason="Not fixed")
+def test_healthcheck_replica_busy(topology_m3):
+    """Check that HealthCheck does not return DSREPLLE0003 code when a replica is busy
+
+    :id: b7c4a5aa-ef98-11ef-87f5-482ae39447e5
+    :setup: 3 MMR topology
+    :steps:
+        1. Create a 3 suppliers full-mesh topology
+        2. Generate constant modify load on S1 and S2
+        3. Wait a bit to ensure stable replication flow
+        4. Perform a modify on S3
+        5. Use HealthCheck on S3 without --json option
+        6. Use HealthCheck on S3 with --json option
+    :expectedresults:
+        1. Success
+        2. Success
+        3. Success
+        4. Success
+        5. Healthcheck should not report DSREPLLE0003 code and related details
+        6. Healthcheck should not report DSREPLLE0003 code and related details
+    """
+
+    RET_CODE = 'DSREPLLE0003'
+
+    S1 = topology_m3.ms['supplier1']
+    S2 = topology_m3.ms['supplier2']
+    S3 = topology_m3.ms['supplier3']
+    with LoadInstance(S1), LoadInstance(S2):
+        # Wait a bit to let replication start
+        time.sleep(10)
+        # Create user on S3 then remove it:
+        LoadInstance(S3).user.delete()
+        run_healthcheck_and_flush_log(topology_m3, S3, RET_CODE, json=False, isnot=True)
+        run_healthcheck_and_flush_log(topology_m3, S3, RET_CODE, json=True, isnot=True)
+
+
 @pytest.mark.xfail(ds_is_older("1.4.1"), reason="Not implemented")
 def test_healthcheck_replication_out_of_sync_broken(topology_m3):
     """Check if HealthCheck returns DSREPLLE0001 code
@@ -286,6 +374,7 @@ def test_healthcheck_replication_out_of_sync_broken(topology_m3):
     run_healthcheck_and_flush_log(topology_m3, M1, RET_CODE, json=True)
 
 
+
 if __name__ == '__main__':
     # Run isolated
     # -s for DEBUG mode
diff --git a/src/lib389/lib389/replica.py b/src/lib389/lib389/replica.py
index b9341032cb..ba568fa8bc 100644
--- a/src/lib389/lib389/replica.py
+++ b/src/lib389/lib389/replica.py
@@ -1255,6 +1255,9 @@ def _lint_agmts_status(self):
                                 report['check'] = f'replication:agmts_status'
                                 yield report
                         elif status['state'] == 'amber':
+                            if "can't acquire busy replica" in status['reason']:
+                                # Ignore replica busy condition
+                                continue
                             # Warning
                             report = copy.deepcopy(DSREPLLE0003)
                             report['detail'] = report['detail'].replace('SUFFIX', suffix)