Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/chk/chk_engine.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2022-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1981,7 +1981,7 @@ chk_engine_sched(void *args)
D_GOTO(out, rc);
}

if (ins_phase > cbk->cb_phase) {
if (ins_phase != CHK_INVAL_PHASE && ins_phase > cbk->cb_phase) {
D_INFO(DF_ENGINE" on rank %u moves from phase %u to phase %u\n",
DP_ENGINE(ins), myrank, cbk->cb_phase, ins_phase);

Expand Down
10 changes: 9 additions & 1 deletion src/chk/chk_internal.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2022-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1173,6 +1173,14 @@ chk_pools_find_slowest(struct chk_instance *ins, int *done)
phase = cpr->cpr_bk.cb_phase;
}

/* All pools are done, but some check engines are still running, so the leader needs to wait. */
if (ins->ci_orphan_done && *done > 0 && !d_list_empty(&ins->ci_rank_list)) {
D_ASSERT(ins->ci_is_leader);

phase = CHK_INVAL_PHASE;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of overloading CHK_INVAL_PHASE, is it possible to add a new phase to represent this state, e.g. CHK_LEADER_WAIT_PHASE or something like that? It seems like we would never want to be in an invalid phase.

*done = 0;
}

return phase;
}

Expand Down
60 changes: 29 additions & 31 deletions src/chk/chk_leader.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2022-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -226,9 +226,10 @@ static void
chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_status,
uint32_t pool_status, bool bcast)
{
struct chk_bookmark *cbk = &ins->ci_bk;
struct chk_iv iv = { 0 };
int rc = 0;
struct chk_dead_rank *cdr;
struct chk_bookmark *cbk = &ins->ci_bk;
struct chk_iv iv = {0};
int rc = 0;

ins->ci_sched_exiting = 1;

Expand All @@ -237,8 +238,7 @@ chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu
chk_pool_stop_all(ins, pool_status, NULL);

if ((bcast && ins_status == CHK__CHECK_INST_STATUS__CIS_FAILED) ||
ins_status == CHK__CHECK_INST_STATUS__CIS_IMPLICATED ||
unlikely(ins_status == CHK__CHECK_INST_STATUS__CIS_COMPLETED && !ins->ci_orphan_done)) {
ins_status == CHK__CHECK_INST_STATUS__CIS_IMPLICATED || !ins->ci_orphan_done) {
iv.ci_gen = cbk->cb_gen;
iv.ci_phase = ins_phase != CHK_INVAL_PHASE ? ins_phase : cbk->cb_phase;
iv.ci_ins_status = ins_status;
Expand All @@ -264,6 +264,10 @@ chk_leader_exit(struct chk_instance *ins, uint32_t ins_phase, uint32_t ins_statu
DP_LEADER(ins), ins_status, DP_RC(rc));
}

while ((cdr = d_list_pop_entry(&ins->ci_dead_ranks, struct chk_dead_rank, cdr_link)) !=
NULL)
D_FREE(cdr);

ins->ci_sched_exiting = 0;
}

Expand Down Expand Up @@ -305,31 +309,24 @@ chk_leader_post_repair(struct chk_instance *ins, struct chk_pool_rec *cpr,
DP_UUID(cpr->cpr_uuid), rc);
}

/*
* If the operation failed and 'failout' is set, then do nothing here.
* chk_leader_exit will handle all the IV and bookmark related things.
*/
if (*result == 0 || !(ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_FAILOUT)) {
if (notify) {
iv.ci_gen = cbk->cb_gen;
uuid_copy(iv.ci_uuid, cpr->cpr_uuid);
iv.ci_ins_status = ins->ci_bk.cb_ins_status;
iv.ci_phase = cbk->cb_phase;
iv.ci_pool_status = cbk->cb_pool_status;

/* Synchronously notify the engines that check on the pool got failure. */
rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE,
CRT_IV_SYNC_EAGER, true);
D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO,
DF_LEADER" notify the engines that the check for pool "
DF_UUIDF" is done with status %u: rc = %d\n",
DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), iv.ci_pool_status, rc);
if (rc == 0)
cpr->cpr_notified_exit = 1;
}
if (notify) {
uuid_copy(iv.ci_uuid, cpr->cpr_uuid);
iv.ci_gen = cbk->cb_gen;
iv.ci_ins_status = ins->ci_bk.cb_ins_status;
iv.ci_phase = cbk->cb_phase;
iv.ci_pool_status = cbk->cb_pool_status;

rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE, CRT_IV_SYNC_EAGER,
true);
DL_CDEBUG(rc != 0, DLOG_WARN, DLOG_INFO, rc,
DF_LEADER " notify engines that check pool " DF_UUIDF " done, status %u",
DP_LEADER(ins), DP_UUID(cpr->cpr_uuid), iv.ci_pool_status);
if (rc == 0)
cpr->cpr_notified_exit = 1;
}

if (!(ins->ci_prop.cp_flags & CHK__CHECK_FLAG__CF_FAILOUT))
*result = 0;
}

if (update) {
rc = chk_bk_update_leader(&ins->ci_bk);
Expand Down Expand Up @@ -2284,7 +2281,8 @@ chk_leader_sched(void *args)

ins_phase = chk_pools_find_slowest(ins, &done);

if (ins_phase >= CHK__CHECK_SCAN_PHASE__CSP_POOL_MBS && !ins->ci_orphan_done &&
if (ins_phase != CHK_INVAL_PHASE &&
ins_phase >= CHK__CHECK_SCAN_PHASE__CSP_POOL_MBS && !ins->ci_orphan_done &&
!DAOS_FAIL_CHECK(DAOS_CHK_SYNC_ORPHAN_PROCESS)) {
iv.ci_gen = cbk->cb_gen;
iv.ci_phase = ins_phase;
Expand Down Expand Up @@ -2316,7 +2314,7 @@ chk_leader_sched(void *args)
D_GOTO(out, rc);
}

if (cbk->cb_phase == CHK_INVAL_PHASE || cbk->cb_phase < ins_phase) {
if (ins_phase != CHK_INVAL_PHASE && ins_phase > cbk->cb_phase) {
D_INFO(DF_LEADER" moves from phase %u to phase %u\n",
DP_LEADER(ins), cbk->cb_phase, ins_phase);

Expand Down
8 changes: 1 addition & 7 deletions src/tests/suite/daos_cr.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2023-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -3427,12 +3427,6 @@ cr_fail_sync_orphan(void **state)
rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_CHECKED, 0, NULL, NULL, NULL);
assert_rc_equal(rc, 0);

/* Check leader may be completed earlier than check engines in this case, double check. */
cr_ins_wait(0, NULL, &dci);

rc = cr_ins_verify(&dci, TCIS_COMPLETED);
assert_rc_equal(rc, 0);

cr_debug_set_params(arg, 0);

rc = cr_mode_switch(false);
Expand Down
Loading