Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions src/tests/ftest/recovery/check_start_corner_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,121 @@ def test_start_back_to_back(self):
# Containers were removed by the checker.
container_1.skip_cleanup()
container_2.skip_cleanup()

def test_two_pools_healthy(self):
    """Verify dmg check start when two pool labels are passed in one call.

    The checker is started with several pairs of pool identifiers: two distinct
    healthy pools, the same healthy pool twice, a healthy pool plus a pool whose
    container label was corrupted, and a healthy pool plus a label that is not
    one of the pools created here.

    Test steps:
    1. Create three pools and one container.
    2. Inject container bad label into one of them.
    3. Enable checker and set policy to --all-interactive.
    4. Call dmg check start with two different healthy pool labels.
    5. Call dmg check start with two same healthy pool labels.
    6. Call dmg check start with healthy pool and corrupted pool.
    7. Repair with option 2 (original container label) and wait for checker to finish.
    8. Call dmg check start with healthy pool and invalid label.
    9. Disable checker and verify that the fault is actually fixed.

    Jira ID: DAOS-17858

    :avocado: tags=all,full_regression
    :avocado: tags=hw,medium
    :avocado: tags=recovery,cat_recov
    :avocado: tags=DMGCheckStartCornerCaseTest,test_two_pools_healthy
    """
    # Step 1: pools A and B stay healthy; pool C hosts the container to be corrupted.
    self.log_step("Create three pools and one container.")
    pool_1 = self.get_pool(connect=False)
    pool_2 = self.get_pool(connect=False)
    pool_3 = self.get_pool(connect=False)
    container = self.get_container(pool=pool_3)

    # Step 2: corrupt the label of the container that lives in the third pool.
    self.log_step("Inject container bad label into one of them.")
    self.get_daos_command().faults_container(
        pool=pool_3.identifier,
        cont=container.identifier,
        location="DAOS_CHK_CONT_BAD_LABEL")

    # Step 3: switch the system into checker mode with the interactive policy.
    self.log_step("Enable checker and set policy to --all-interactive.")
    dmg_command = self.get_dmg_command()
    dmg_command.check_enable()
    dmg_command.check_set_policy(all_interactive=True)

    # Step 4: two distinct healthy pools must be accepted.
    self.log_step("Call dmg check start with two different healthy pool labels.")
    two_distinct_pools = " ".join([pool_1.identifier, pool_2.identifier])
    try:
        dmg_command.check_start(pool=two_distinct_pools)
    except CommandFailure as error:
        self.fail(
            "dmg check start with two different healthy pool labels failed! "
            f"{error}")
    else:
        self.log.info(
            "dmg check start with two different healthy pool labels worked as expected.")
    # The checker has to be stopped before it can be started again.
    dmg_command.check_stop()

    # Step 5: the same healthy pool listed twice must be accepted as well.
    self.log_step("Call dmg check start with two same healthy pool labels.")
    same_pool_twice = " ".join([pool_1.identifier, pool_1.identifier])
    try:
        dmg_command.check_start(pool=same_pool_twice)
    except CommandFailure as error:
        self.fail(
            "dmg check start with two same healthy pool labels failed! "
            f"{error}")
    else:
        self.log.info(
            "dmg check start with two same healthy pool labels worked as expected.")
    dmg_command.check_stop()

    # Step 6: mix a healthy pool with the pool that holds the corrupted container.
    self.log_step("Call dmg check start with healthy pool and corrupted pool.")
    healthy_and_corrupted = " ".join([pool_1.identifier, pool_3.identifier])
    dmg_command.check_start(pool=healthy_and_corrupted)

    # Step 7: wait for the fault report, then repair it.
    self.log_step("Repair with option 2 and wait for checker to finish.")
    # The status turns RUNNING right away, but the "reports" field is only filled once
    # the inconsistency has been detected, so poll a limited number of times.
    reports = None
    for _ in range(8):
        response = dmg_command.check_query()["response"]
        if response["status"] == "RUNNING" and response["reports"]:
            reports = response["reports"]
            break
        time.sleep(5)
    if not reports:
        self.fail("Checker didn't detect any inconsistency!")

    first_report = reports[0]
    fault_msg = first_report["msg"]
    expected_fault = "inconsistent container label"
    if expected_fault not in fault_msg:
        self.fail(f"Checker didn't detect {expected_fault}! Fault msg = {fault_msg}")

    # The report's sequence number identifies what to repair; action 2 keeps the
    # original container label.
    seq_num = str(first_report["seq"])
    dmg_command.check_repair(seq_num=seq_num, action="2")
    wait_for_check_complete(dmg=dmg_command)
    dmg_command.check_stop()

    # Step 8: a healthy pool paired with an invalid label must be rejected.
    self.log_step("Call dmg check start with healthy pool and invalid label.")
    healthy_and_invalid = pool_1.identifier + " TestPool0"
    try:
        dmg_command.check_start(pool=healthy_and_invalid)
    except CommandFailure as error:
        exp_msg = "unable to find pool service"
        if exp_msg not in str(error):
            self.fail(f"{exp_msg} is not in the error message!")
    else:
        self.fail("dmg check start with healthy and invalid pool labels worked!")

    # Step 9: leave checker mode and confirm the container label was restored.
    self.log_step("Disable checker and verify that the fault is actually fixed.")
    dmg_command.check_disable()
    label_verified = container.verify_prop(
        expected_props={"label": container.label.value})
    self.assertTrue(label_verified, "Container label isn't fixed!")
2 changes: 1 addition & 1 deletion src/tests/ftest/recovery/check_start_corner_case.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ hosts:
test_servers: 1
test_clients: 1

timeout: 4M
timeout: 5M

server_config:
name: daos_server
Expand Down
Loading