From e5df40461819845e98b19a4dace9155e472d1e9b Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Mon, 29 Dec 2025 04:00:15 +0000 Subject: [PATCH 1/4] DAOS-18368 rebuild: fix use before check of ec_agg_boundary 1. fix a bug of using ec_agg_boundary before checking its valid 2. add some more logs for rebuild fetch getting zero iod_size, to provide some hints for layout information. Signed-off-by: Xuezhao Liu --- src/object/obj_internal.h | 2 + src/object/obj_layout.c | 34 ++++++++++++++++ src/object/srv_obj.c | 76 +++++++++++++++++++++++++++--------- src/object/srv_obj_migrate.c | 56 +++++++++++++++++++++----- 4 files changed, 140 insertions(+), 28 deletions(-) diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 598c37644ee..3f630795f56 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -1181,6 +1181,8 @@ iov_alloc_for_csum_info(d_iov_t *iov, struct dcs_csum_info *csum_info); /* obj_layout.c */ int obj_pl_grp_idx(uint32_t layout_gl_ver, uint64_t hash, uint32_t grp_nr); +void +obj_dump_grp_layout(daos_handle_t oh, uint32_t shard); int obj_pl_place(struct pl_map *map, uint16_t layout_ver, struct daos_obj_md *md, diff --git a/src/object/obj_layout.c b/src/object/obj_layout.c index 189261ad31e..337adab92c4 100644 --- a/src/object/obj_layout.c +++ b/src/object/obj_layout.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2023 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -95,3 +96,36 @@ obj_layout_diff(struct pl_map *map, daos_unit_oid_t oid, uint32_t new_ver, uint3 return rc; } + +void +obj_dump_grp_layout(daos_handle_t oh, uint32_t shard) +{ + struct dc_object *obj; + struct dc_obj_shard *obj_shard; + uint32_t grp_idx, i, nr; + + obj = obj_hdl2ptr(oh); + if (obj == NULL) { + D_INFO("invalid oh"); + return; + } + if (shard >= obj->cob_shards_nr) { + D_ERROR("bad shard %d, cob_shards_nr %d", shard, obj->cob_shards_nr); + goto out; + } + + grp_idx = shard / obj->cob_grp_size; + D_INFO(DF_OID " shard %d, grp_idx %d, grp_size %d", DP_OID(obj->cob_md.omd_id), shard, + grp_idx, obj->cob_grp_size); + for (i = grp_idx * obj->cob_grp_size, nr = 0; nr < obj->cob_grp_size; i++, nr++) { + obj_shard = &obj->cob_shards->do_shards[i]; + D_INFO("shard %d/%d/%d, tgt_id %d, rank %d, tgt_idx %d, " + "rebuilding %d, reintegrating %d, fseq %d", + i, obj_shard->do_shard_idx, obj_shard->do_shard, obj_shard->do_target_id, + obj_shard->do_target_rank, obj_shard->do_target_idx, + obj_shard->do_rebuilding, obj_shard->do_reintegrating, obj_shard->do_fseq); + } + +out: + obj_decref(obj); +} diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 6ea4bb63ab6..78989187fe9 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -701,6 +701,20 @@ obj_set_reply_sizes(crt_rpc_t *rpc, daos_iod_t *iods, int iod_nr, uint8_t *skips sizes[i] = iods[idx].iod_size; D_DEBUG(DB_IO, DF_UOID" %d:"DF_U64"\n", DP_UOID(orw->orw_oid), i, iods[idx].iod_size); + if ((orw->orw_flags & ORF_FOR_MIGRATION) && sizes[i] == 0) { + D_INFO(DF_CONT " obj " DF_UOID "rebuild fetch zero iod_size, i:%d/idx:%d, " + "iod_nr %d, orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 + " may cause DER_DATA_LOSS", + DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid), DP_UOID(orw->orw_oid), + i, idx, iods[idx].iod_nr, orw->orw_epoch, orw->orw_epoch_first); + if (iods[idx].iod_type 
== DAOS_IOD_ARRAY) { + int j; + + for (j = 0; j < min(8, iods[idx].iod_nr); j++) + D_INFO("recx[%d] - " DF_RECX, j, + DP_RECX(iods[idx].iod_recxs[j])); + } + } idx++; } @@ -1368,7 +1382,7 @@ struct ec_agg_boundary_arg { }; static int -obj_fetch_ec_agg_boundary(void *data) +obj_fetch_ec_agg_boundary_ult(void *data) { struct ec_agg_boundary_arg *arg = data; int rc; @@ -1381,6 +1395,33 @@ obj_fetch_ec_agg_boundary(void *data) return rc; } +static int +obj_fetch_ec_agg_boundary(struct obj_io_context *ioc, daos_unit_oid_t *uoid) +{ + struct ec_agg_boundary_arg arg; + int rc; + + arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool; + uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid); + rc = dss_ult_execute(obj_fetch_ec_agg_boundary_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, 0); + if (rc) { + DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.", + DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), + DP_UOID(*uoid)); + return rc; + } + if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) { + rc = -DER_FETCH_AGAIN; + DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.", + DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid)); + return rc; + } + D_DEBUG(DB_IO, DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n", + DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid), + ioc->ioc_coc->sc_ec_agg_eph_boundary); + return 0; +} + static int obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *iods, struct dcs_iod_csums *iod_csums, uint64_t *offs, uint8_t *skips, @@ -1503,29 +1544,14 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io } if ((ec_deg_fetch || (ec_recov && get_parity_list)) && ioc->ioc_coc->sc_ec_agg_eph_valid == 0) { - struct ec_agg_boundary_arg arg; - - arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool; - uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid); - rc = dss_ult_execute(obj_fetch_ec_agg_boundary, &arg, NULL, NULL, - DSS_XS_SYS, 0, 0); + rc 
= obj_fetch_ec_agg_boundary(ioc, &orw->orw_oid); if (rc) { DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.", DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(orw->orw_oid)); goto out; } - if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) { - rc = -DER_FETCH_AGAIN; - DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.", - DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), - DP_UOID(orw->orw_oid)); - goto out; - } - D_DEBUG(DB_IO, - DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n", - DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), - DP_UOID(orw->orw_oid), ioc->ioc_coc->sc_ec_agg_eph_boundary); + D_ASSERT(ioc->ioc_coc->sc_ec_agg_eph_valid); } if (get_parity_list) { D_ASSERT(!ec_deg_fetch); @@ -3030,6 +3056,20 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (orw->orw_flags & ORF_FETCH_EPOCH_EC_AGG_BOUNDARY) { uint64_t rebuild_epoch; + if (ioc.ioc_coc->sc_ec_agg_eph_valid == 0) { + rc = obj_fetch_ec_agg_boundary(&ioc, &orw->orw_oid); + if (rc) { + DL_ERROR(rc, + DF_CONT ", " DF_UOID " fetch ec_agg_boundary " + "failed.", + DP_CONT(ioc.ioc_coc->sc_pool_uuid, + ioc.ioc_coc->sc_uuid), + DP_UOID(orw->orw_oid)); + goto out; + } + D_ASSERT(ioc.ioc_coc->sc_ec_agg_eph_valid); + } + D_ASSERTF(orw->orw_epoch <= orw->orw_epoch_first, "bad orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 "\n", orw->orw_epoch, orw->orw_epoch_first); diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index d2a95aa3c79..3809780f93e 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -1156,6 +1156,28 @@ migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, return rc; } +static void +mrone_dump_info(struct migrate_one *mrone, daos_handle_t oh, daos_iod_t *iod) +{ + int i; + + if (daos_is_dkey_uint64(mrone->mo_oid.id_pub) && mrone->mo_dkey.iov_len == 8) + D_INFO(DF_RB ": " DF_UOID " int dkey " DF_U64 ", akey " DF_KEY ", iod_type %d, " + " iod_nr %d, iod_size " DF_U64, + 
DP_RB_MPT(mrone->mo_tls), DP_UOID(mrone->mo_oid), + *(uint64_t *)mrone->mo_dkey.iov_buf, DP_KEY(&iod->iod_name), iod->iod_type, + iod->iod_nr, iod->iod_size); + else + D_INFO(DF_RB ": " DF_UOID " dkey " DF_KEY ", akey " DF_KEY ", iod_type %d, " + " iod_nr %d, iod_size " DF_U64, + DP_RB_MPT(mrone->mo_tls), DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + DP_KEY(&iod->iod_name), iod->iod_type, iod->iod_nr, iod->iod_size); + if (iod->iod_type == DAOS_IOD_ARRAY) + for (i = 0; i < min(8, iod->iod_nr); i++) + D_INFO("recxs[%d] - " DF_RECX, i, DP_RECX(iod->iod_recxs[i])); + obj_dump_grp_layout(oh, mrone->mo_oid.id_shard); +} + static int migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, struct ds_cont_child *ds_cont) @@ -1224,6 +1246,8 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, daos_iod_t *iod = &mrone->mo_iods[i]; if (mrone->mo_iods[i].iod_size == 0) { + static __thread int log_nr; + /* zero size iod will cause assertion failure * in VOS, so let's check here. 
* So the object is being destroyed between @@ -1236,11 +1260,16 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, */ rc = -DER_DATA_LOSS; D_DEBUG(DB_REBUILD, - DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY - " nr %d/%d eph " DF_U64 " " DF_RC "\n", - DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, - DP_KEY(&mrone->mo_dkey), DP_KEY(&mrone->mo_iods[i].iod_name), - mrone->mo_iod_num, i, mrone->mo_epoch, DP_RC(rc)); + DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY + " nr %d/%d eph " DF_X64 " " DF_RC "\n", + DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid), + DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + DP_KEY(&mrone->mo_iods[i].iod_name), mrone->mo_iod_num, i, + mrone->mo_epoch, DP_RC(rc)); + if (log_nr <= 128) { + mrone_dump_info(mrone, oh, &mrone->mo_iods[i]); + log_nr++; + } D_GOTO(out, rc); } @@ -1407,6 +1436,8 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, for (i = 0; rc == 0 && i < iod_num; i++) { if (iods[i].iod_size == 0) { + static __thread int log_nr; + /* zero size iod will cause assertion failure * in VOS, so let's check here. * So the object is being destroyed between @@ -1418,11 +1449,16 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, * the rebuild and retry. 
*/ rc = -DER_DATA_LOSS; - D_INFO(DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY - " nr %d/%d eph " DF_U64 " " DF_RC "\n", - DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, - DP_KEY(&mrone->mo_dkey), DP_KEY(&iods[i].iod_name), iod_num, i, - mrone->mo_epoch, DP_RC(rc)); + DL_INFO(rc, + DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY + " nr %d/%d mo_epoch " DF_X64 " fetch_eph " DF_X64, + DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid), + DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + DP_KEY(&iods[i].iod_name), iod_num, i, mrone->mo_epoch, fetch_eph); + if (log_nr <= 128) { + mrone_dump_info(mrone, oh, &iods[i]); + log_nr++; + } D_GOTO(end, rc); } } From 50c3e243087138b9f6cb01833337b616569fabc1 Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Sun, 4 Jan 2026 10:32:44 +0000 Subject: [PATCH 2/4] DAOS-18368 rebuild: a few log change Signed-off-by: Xuezhao Liu --- src/object/obj_internal.h | 2 +- src/object/obj_layout.c | 2 +- src/object/srv_obj.c | 18 ++++++++++-------- src/object/srv_obj_migrate.c | 8 ++++---- src/rebuild/scan.c | 8 ++++---- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 3f630795f56..ba3191e761b 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/object/obj_layout.c b/src/object/obj_layout.c index 337adab92c4..87958b70a11 100644 --- a/src/object/obj_layout.c +++ b/src/object/obj_layout.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2023 Intel Corporation.
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 78989187fe9..b08b8981dee 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -702,17 +702,19 @@ obj_set_reply_sizes(crt_rpc_t *rpc, daos_iod_t *iods, int iod_nr, uint8_t *skips D_DEBUG(DB_IO, DF_UOID" %d:"DF_U64"\n", DP_UOID(orw->orw_oid), i, iods[idx].iod_size); if ((orw->orw_flags & ORF_FOR_MIGRATION) && sizes[i] == 0) { - D_INFO(DF_CONT " obj " DF_UOID "rebuild fetch zero iod_size, i:%d/idx:%d, " - "iod_nr %d, orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 - " may cause DER_DATA_LOSS", - DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid), DP_UOID(orw->orw_oid), - i, idx, iods[idx].iod_nr, orw->orw_epoch, orw->orw_epoch_first); + D_DEBUG(DB_REBUILD, + DF_CONT " obj " DF_UOID "rebuild fetch zero iod_size, " + "i:%d/idx:%d, iod_nr %d, orw_epoch " DF_X64 + ", orw_epoch_first " DF_X64 " may cause DER_DATA_LOSS", + DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid), + DP_UOID(orw->orw_oid), i, idx, iods[idx].iod_nr, orw->orw_epoch, + orw->orw_epoch_first); if (iods[idx].iod_type == DAOS_IOD_ARRAY) { int j; for (j = 0; j < min(8, iods[idx].iod_nr); j++) - D_INFO("recx[%d] - " DF_RECX, j, - DP_RECX(iods[idx].iod_recxs[j])); + D_DEBUG(DB_REBUILD, "recx[%d] - " DF_RECX, j, + DP_RECX(iods[idx].iod_recxs[j])); } } idx++; diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 3809780f93e..26bcf9cd1ca 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -790,7 +790,7 @@ 
migrate_fetch_update_inline(struct migrate_one *mrone, daos_handle_t oh, struct dcs_iod_csums *iod_csums = NULL; int iod_cnt = 0; int start; - char iov_buf[OBJ_ENUM_UNPACK_MAX_IODS][MAX_BUF_SIZE]; + char iov_buf[OBJ_ENUM_UNPACK_MAX_IODS][MAX_BUF_SIZE]; bool fetch = false; int i; int rc = 0; @@ -1259,13 +1259,13 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, * the rebuild and retry. */ rc = -DER_DATA_LOSS; - D_DEBUG(DB_REBUILD, + DL_INFO(rc, DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY - " nr %d/%d eph " DF_X64 " " DF_RC "\n", + " nr %d/%d eph " DF_X64, DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid), DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), DP_KEY(&mrone->mo_iods[i].iod_name), mrone->mo_iod_num, i, - mrone->mo_epoch, DP_RC(rc)); + mrone->mo_epoch); if (log_nr <= 128) { mrone_dump_info(mrone, oh, &mrone->mo_iods[i]); log_nr++; diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index e38e9d73e00..75fbc692702 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -68,9 +68,9 @@ rebuild_obj_fill_buf(daos_handle_t ih, d_iov_t *key_iov, shards[count] = obj_val->shard; arg->count++; - D_DEBUG(DB_REBUILD, "send oid/con "DF_UOID"/"DF_UUID" ephs "DF_U64 - "shard %d cnt %d tgt_id %d\n", DP_UOID(oids[count]), - DP_UUID(arg->cont_uuid), obj_val->eph, shards[count], + D_DEBUG(DB_REBUILD, + "send oid/con " DF_UOID "/" DF_UUID " ephs " DF_X64 " shard %d cnt %d tgt_id %d\n", + DP_UOID(oids[count]), DP_UUID(arg->cont_uuid), obj_val->eph, shards[count], arg->count, arg->tgt_id); rc = dbtree_iter_delete(ih, NULL); From 601df3cc8948c71b63163ea5e682381d1527f7cd Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Wed, 7 Jan 2026 07:16:34 +0000 Subject: [PATCH 3/4] DAOS-18368 rebuild: refine rebuild_leader_status_check Signed-off-by: Xuezhao Liu --- src/object/srv_obj_migrate.c | 4 +-- src/rebuild/srv.c | 47 +++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 26bcf9cd1ca..c17da3a9527 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -3070,8 +3070,8 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, /* Each object enumeration RPC will at least one OID */ if (num < minimum_nr && (enum_flags & DIOF_TO_SPEC_GROUP)) { - D_DEBUG(DB_REBUILD, DF_RB ": enumeration buffer %u empty" DF_UOID "\n", - DP_RB_MPT(tls), num, DP_UOID(arg->oid)); + D_INFO(DF_RB ": enumeration buffer %u empty" DF_UOID, DP_RB_MPT(tls), num, + DP_UOID(arg->oid)); break; } diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 88e6b53c851..28941d1df43 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -958,53 +958,63 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, char sbuf[RBLD_SBUF_LEN]; double now; char *str; - d_rank_list_t excluded = {0}; + d_rank_list_t rank_list = {0}; bool rebuild_abort = false; int i; + now = ABT_get_wtime(); ABT_rwlock_rdlock(pool->sp_lock); rc = map_ranks_init(pool->sp_map, - PO_COMP_ST_UP | PO_COMP_ST_DOWN | - PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW, - &excluded); + PO_COMP_ST_UP | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | + PO_COMP_ST_NEW, + &rank_list); if (rc != 0) { D_INFO(DF_RB ": get rank list: %d\n", DP_RB_RGT(rgt), rc); ABT_rwlock_unlock(pool->sp_lock); goto sleep; } - for (i = 0; i < excluded.rl_nr; i++) { + for (i = 0; i < rank_list.rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_dom_by_rank(pool->sp_map, excluded.rl_ranks[i]); + dom = pool_map_find_dom_by_rank(pool->sp_map, rank_list.rl_ranks[i]); D_ASSERT(dom != NULL); if (rgt->rgt_opc == RB_OP_REBUILD) { if (dom->do_comp.co_status == PO_COMP_ST_UP) { if (dom->do_comp.co_in_ver > rgt->rgt_rebuild_ver) { - D_INFO(DF_RB ": cancel rebuild co_in_ver=%u\n", - DP_RB_RGT(rgt), dom->do_comp.co_in_ver); + D_INFO(DF_RB ": cancel rebuild due to new REINT, " + "co_rank %d, co_in_ver %u\n", + DP_RB_RGT(rgt), dom->do_comp.co_rank, + dom->do_comp.co_in_ver); rebuild_abort = true; break; - } else { - continue; } } else if (dom->do_comp.co_status == PO_COMP_ST_DOWN) { if (dom->do_comp.co_fseq > rgt->rgt_rebuild_ver) { - D_INFO(DF_RB ": cancel rebuild co_fseq=%u\n", - DP_RB_RGT(rgt), dom->do_comp.co_fseq); + D_INFO(DF_RB ": cancel rebuild due to new DOWN, " + "co_rank %d, co_fseq %u\n", + DP_RB_RGT(rgt), dom->do_comp.co_rank, + dom->do_comp.co_fseq); rebuild_abort = true; break; } } } - D_INFO(DF_RB " exclude rank %d/%x.\n", DP_RB_RGT(rgt), dom->do_comp.co_rank, - 
dom->do_comp.co_status); - rebuild_leader_set_status(rgt, dom->do_comp.co_rank, - -1, SCAN_DONE | PULL_DONE); + + if (now - last_print > 20) + D_INFO(DF_RB " rank %d, status 0x%x.\n", DP_RB_RGT(rgt), + dom->do_comp.co_rank, dom->do_comp.co_status); + + /* for PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW ranks + * set the completion as no progress/completion will be reported from them. + */ + if (dom->do_comp.co_status != PO_COMP_ST_UP) + rebuild_leader_set_status(rgt, dom->do_comp.co_rank, -1, + SCAN_DONE | PULL_DONE); } ABT_rwlock_unlock(pool->sp_lock); - map_ranks_fini(&excluded); + map_ranks_fini(&rank_list); if (rebuild_abort) { rgt->rgt_abort = 1; @@ -1048,7 +1058,6 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, break; } - now = ABT_get_wtime(); /* print something at least for each 10 seconds */ if (now - last_print > 10) { last_print = now; From f4bc2727ac863015e6f3e6ebf0aebe36bea0ba9f Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Thu, 8 Jan 2026 12:30:21 +0000 Subject: [PATCH 4/4] DAOS-18368 object: refine EC agg peer update Some failures need to be retried. Signed-off-by: Xuezhao Liu --- src/object/srv_ec_aggregate.c | 73 ++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 9 deletions(-) diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 96abd078284..708f77540a8 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2020-2024 Intel Corporation. 
* (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1278,6 +1278,42 @@ agg_process_partial_stripe(struct ec_agg_entry *entry) return rc; } +static bool +agg_peer_failed(struct ec_agg_param *agg_param, struct daos_shard_loc *peer_loc) +{ + struct pool_target *targets = NULL; + uint32_t failed_tgts_cnt = 0; + int i; + int rc; + + rc = pool_map_find_failed_tgts(agg_param->ap_pool_info.api_pool->sp_map, &targets, + &failed_tgts_cnt); + if (rc) { + DL_ERROR(rc, DF_CONT " pool_map_find_failed_tgts failed.", + DP_CONT(agg_param->ap_pool_info.api_pool_uuid, + agg_param->ap_pool_info.api_cont_uuid)); + return false; + } + + if (targets == NULL || failed_tgts_cnt == 0) + return false; + + for (i = 0; i < failed_tgts_cnt; i++) { + if (targets[i].ta_comp.co_rank == peer_loc->sd_rank && + targets[i].ta_comp.co_index == peer_loc->sd_tgt_idx) { + D_DEBUG(DB_EPC, DF_CONT " peer parity tgt failed rank %d, tgt_idx %d.\n", + DP_CONT(agg_param->ap_pool_info.api_pool_uuid, + agg_param->ap_pool_info.api_cont_uuid), + peer_loc->sd_rank, peer_loc->sd_tgt_idx); + D_FREE(targets); + return true; + } + } + + D_FREE(targets); + return false; +} + int agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) { @@ -1334,6 +1370,12 @@ agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) return rc; } +static bool +agg_peer_retryable_err(int err) +{ + return err == -DER_STALE || err == -DER_TIMEDOUT || daos_crt_network_error(err); +} + /* Sends the generated parity and the stripe number to the peer * parity target. Handler writes the parity and deletes the replicas * for the stripe. 
@@ -1382,7 +1424,7 @@ agg_peer_update_ult(void *arg) obj = obj_hdl2ptr(entry->ae_obj_hdl); for (peer = 0; peer < p; peer++) { uint64_t enqueue_id = 0; - bool overloaded; + bool peer_retry; if (peer == pidx) continue; @@ -1390,7 +1432,7 @@ agg_peer_update_ult(void *arg) tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank; tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx; retry: - overloaded = false; + peer_retry = false; rc = ds_obj_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep, DAOS_OBJ_RPC_EC_AGGREGATE, &rpc); if (rc) { @@ -1470,13 +1512,20 @@ agg_peer_update_ult(void *arg) rc = ec_agg_out->ea_status; if (rc == -DER_OVERLOAD_RETRY) { enqueue_id = ec_agg_out->ea_comm_out.req_out_enqueue_id; - overloaded = true; + peer_retry = true; } D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR, "update parity[%d] to %d:%d, status = " DF_RC "\n", peer, tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc)); peer_updated += rc == 0; } + if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) && + !agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) { + DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.", + DP_UOID(entry->ae_oid), pidx, peer); + peer_retry = true; + } + next: if (bulk_hdl) crt_bulk_free(bulk_hdl); @@ -1487,7 +1536,7 @@ agg_peer_update_ult(void *arg) rpc = NULL; bulk_hdl = NULL; iod_csums = NULL; - if (overloaded) { + if (peer_retry) { dss_sleep(daos_rpc_rand_delay(max_delay) << 10); goto retry; } @@ -1665,13 +1714,13 @@ agg_process_holes_ult(void *arg) for (peer = 0; peer < p; peer++) { uint64_t enqueue_id = 0; uint32_t peer_shard; - bool overloaded; + bool peer_retry; if (pidx == peer) continue; retry: - overloaded = false; + peer_retry = false; D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE); tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank; tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx; @@ -1719,7 +1768,7 @@ agg_process_holes_ult(void *arg) rc = ec_rep_out->er_status; if (rc == -DER_OVERLOAD_RETRY) { enqueue_id = 
ec_rep_out->er_comm_out.req_out_enqueue_id; - overloaded = true; + peer_retry = true; } D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR, DF_UOID " parity[%d] er_status = " DF_RC "\n", @@ -1728,7 +1777,13 @@ agg_process_holes_ult(void *arg) } crt_req_decref(rpc); rpc = NULL; - if (overloaded) { + if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) && + !agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) { + DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.", + DP_UOID(entry->ae_oid), pidx, peer); + peer_retry = true; + } + if (peer_retry) { dss_sleep(daos_rpc_rand_delay(max_delay) << 10); goto retry; }