diff --git a/src/tests/ftest/recovery/check_start_corner_case.py b/src/tests/ftest/recovery/check_start_corner_case.py index d56d81aa2d3..5bdf4973b3b 100644 --- a/src/tests/ftest/recovery/check_start_corner_case.py +++ b/src/tests/ftest/recovery/check_start_corner_case.py @@ -153,3 +153,121 @@ def test_start_back_to_back(self): # Containers were removed by the checker. container_1.skip_cleanup() container_2.skip_cleanup() + + def test_two_pools_healthy(self): + """Test dmg check start with two pool labels where at least one is a healthy pool. + + 1. Create three pools and one container. + 2. Inject container bad label into the third pool. + 3. Enable checker and set policy to --all-interactive. + 4. Call dmg check start with two different healthy pool labels. + 5. Call dmg check start with the same healthy pool label twice. + 6. Call dmg check start with healthy pool and corrupted pool. + 7. Repair with option 2 (original container label) and wait for checker to finish. + 8. Call dmg check start with healthy pool and invalid label. + 9. Disable checker and verify that the fault is actually fixed. + + Jira ID: DAOS-17858 + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=recovery,cat_recov + :avocado: tags=DMGCheckStartCornerCaseTest,test_two_pools_healthy + """ + # 1. Create three pools and one container. + self.log_step("Create three pools and one container.") + pool_1 = self.get_pool(connect=False) + pool_2 = self.get_pool(connect=False) + pool_3 = self.get_pool(connect=False) + container = self.get_container(pool=pool_3) + + # 2. Inject container bad label into one of them. + self.log_step("Inject container bad label into one of them.") + daos_command = self.get_daos_command() + daos_command.faults_container( + pool=pool_3.identifier, cont=container.identifier, + location="DAOS_CHK_CONT_BAD_LABEL") + + # 3. Enable checker and set policy to --all-interactive. 
+ self.log_step("Enable checker and set policy to --all-interactive.") + dmg_command = self.get_dmg_command() + dmg_command.check_enable() + dmg_command.check_set_policy(all_interactive=True) + + # 4. Call dmg check start with two different healthy pool labels. + self.log_step("Call dmg check start with two different healthy pool labels.") + healthy_diff = pool_1.identifier + " " + pool_2.identifier + try: + dmg_command.check_start(pool=healthy_diff) + msg = ("dmg check start with two different healthy pool labels worked as " + "expected.") + self.log.info(msg) + except CommandFailure as command_failure: + msg = (f"dmg check start with two different healthy pool labels failed! " + f"{command_failure}") + self.fail(msg) + # The checker needs to be stopped before it can be started again. + dmg_command.check_stop() + + # 5. Call dmg check start with two same healthy pool labels. + self.log_step("Call dmg check start with two same healthy pool labels.") + healthy_same = pool_1.identifier + " " + pool_1.identifier + try: + dmg_command.check_start(pool=healthy_same) + msg = ("dmg check start with two same healthy pool labels worked as " + "expected.") + self.log.info(msg) + except CommandFailure as command_failure: + msg = (f"dmg check start with two same healthy pool labels failed! " + f"{command_failure}") + self.fail(msg) + dmg_command.check_stop() + + # 6. Call dmg check start with healthy pool and corrupted pool. + self.log_step("Call dmg check start with healthy pool and corrupted pool.") + healthy_corrupted = pool_1.identifier + " " + pool_3.identifier + dmg_command.check_start(pool=healthy_corrupted) + + # 7. Repair with option 2 and wait for checker to finish. + self.log_step("Repair with option 2 and wait for checker to finish.") + # Poll check query up to 8 times, 5s apart, until an inconsistency is reported. + query_reports = None + for _ in range(8): + check_query_out = dmg_command.check_query() + # Status becomes RUNNING immediately, but it may take a while to detect the + # inconsistency. 
If detected, "reports" field is filled. + if check_query_out["response"]["status"] == "RUNNING": + query_reports = check_query_out["response"]["reports"] + if query_reports: + break + time.sleep(5) + if not query_reports: + self.fail("Checker didn't detect any inconsistency!") + fault_msg = query_reports[0]["msg"] + expected_fault = "inconsistent container label" + if expected_fault not in fault_msg: + self.fail(f"Checker didn't detect {expected_fault}! Fault msg = {fault_msg}") + # Obtain the sequence number (ID) of the reported inconsistency to repair. + seq = query_reports[0]["seq"] + # Repair with action 2, which is to use the original container label. + dmg_command.check_repair(seq_num=str(seq), action="2") + wait_for_check_complete(dmg=dmg_command) + dmg_command.check_stop() + + # 8. Call dmg check start with healthy pool and invalid label. + self.log_step("Call dmg check start with healthy pool and invalid label.") + healthy_invalid = pool_1.identifier + " TestPool0" + try: + dmg_command.check_start(pool=healthy_invalid) + self.fail("dmg check start with healthy and invalid pool labels worked!") + except CommandFailure as command_failure: + exp_msg = "unable to find pool service" + if exp_msg not in str(command_failure): + self.fail(f"{exp_msg} is not in the error message!") + + # 9. Disable checker and verify that the fault is actually fixed. 
+ self.log_step("Disable checker and verify that the fault is actually fixed.") + dmg_command.check_disable() + expected_props = {"label": container.label.value} + label_verified = container.verify_prop(expected_props=expected_props) + self.assertTrue(label_verified, "Container label isn't fixed!") diff --git a/src/tests/ftest/recovery/check_start_corner_case.yaml b/src/tests/ftest/recovery/check_start_corner_case.yaml index d4ba8437916..ed1816f3b8c 100644 --- a/src/tests/ftest/recovery/check_start_corner_case.yaml +++ b/src/tests/ftest/recovery/check_start_corner_case.yaml @@ -2,7 +2,7 @@ hosts: test_servers: 1 test_clients: 1 -timeout: 4M +timeout: 5M server_config: name: daos_server