From 248715aca4e1add7f3714d85e0c1dbd31a5786c2 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Mon, 29 Dec 2025 11:08:10 +0800 Subject: [PATCH] DAOS-18367 vos: evict self-created object when transaction failure Currently, if a transaction failed for some reason, the cleanup logic will try to evict related vos object from cache to avoid leaving stable information in cache. Such logic works well for the system with PMEM. But under md-on-ssd mode, it may cause trouble if the cleanup logic evicts some object that is not created by current failed transaction. Because one vos modification may hold the same object multiple times, and there is CPU yield during these object hold actions. That creates race windows for other concurrent operations against the same object. This patch changes the logic: when the transaction creates some new object(s), it will record related oid(s), if such transaction failed in subsequent process, it will only evict these new created object(s). For those new created dkey or lower component under existing objects, related object cache will not be affected during transaction cleanup. On the other hand, under md-on-ssd mode, CPU may yield during backend TX start, the object that is held by current modification maybe marked as evicted in such race windows. So add logic to check whether related object is evicted or not after backend TX started, if yes, then restart current transaction. Signed-off-by: Fan Yong --- src/dtx/dtx_common.c | 189 +++++---------------------------- src/dtx/tests/dts_structs.c | 7 +- src/include/daos/lru.h | 9 ++ src/include/daos_srv/dtx_srv.h | 33 ++---- src/vos/tests/vts_dtx.c | 39 ++----- src/vos/tests/vts_io.c | 2 +- src/vos/vos_common.c | 32 ++++-- src/vos/vos_dtx.c | 135 +++++++++++++---------- src/vos/vos_internal.h | 27 ++--- src/vos/vos_io.c | 57 ++++------ src/vos/vos_obj.c | 38 ++++--- src/vos/vos_obj.h | 10 +- src/vos/vos_obj_cache.c | 22 ++-- 13 files changed, 231 insertions(+), 369 deletions(-) diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 175f440dc4e..5b342584bb6 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -883,15 +883,10 @@ dtx_handle_reinit(struct dtx_handle *dth) dth->dth_modify_shared = 0; dth->dth_active = 0; - dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; dth->dth_cos_done = 0; - - dth->dth_op_seq = 0; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - D_FREE(dth->dth_oid_array); - dth->dth_dkey_hash = 0; + dth->dth_op_seq = 0; + dth->dth_dkey_hash = 0; vos_dtx_rsrvd_fini(dth); return vos_dtx_rsrvd_init(dth); @@ -926,32 +921,29 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, dth->dth_coh = xoh; } - dth->dth_ver = pm_ver; - dth->dth_refs = 1; - dth->dth_mbs = mbs; - - dth->dth_pinned = 0; - dth->dth_cos_done = 0; - dth->dth_modify_shared = 0; - dth->dth_active = 0; - dth->dth_touched_leader_oid = 0; - dth->dth_local_tx_started = 0; - dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; - dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; - dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; - dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; + dth->dth_ver = pm_ver; + dth->dth_refs = 1; + dth->dth_mbs = mbs; + dth->dth_pinned = 0; + dth->dth_cos_done = 0; + dth->dth_modify_shared = 0; + dth->dth_active = 0; + dth->dth_local_tx_started = 0; + dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; + dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; + dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; + dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0; - dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; - dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 1 : 0; - dth->dth_aborted = 0; - dth->dth_already = 0; - dth->dth_need_validation = 0; + dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; + dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 1 : 0; + dth->dth_aborted = 0; + dth->dth_already = 0; + dth->dth_need_validation = 0; dth->dth_local = (flags & DTX_LOCAL) ? 1 : 0; - - dth->dth_dti_cos = dti_cos; - dth->dth_dti_cos_count = dti_cos_cnt; - dth->dth_ent = NULL; - dth->dth_flags = leader ? DTE_LEADER : 0; + dth->dth_dti_cos = dti_cos; + dth->dth_dti_cos_count = dti_cos_cnt; + dth->dth_ent = NULL; + dth->dth_flags = leader ? DTE_LEADER : 0; if (flags & DTX_SYNC) { dth->dth_flags |= DTE_BLOCK; @@ -960,12 +952,11 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, dth->dth_sync = 0; } - dth->dth_op_seq = 0; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - dth->dth_oid_array = NULL; - - dth->dth_dkey_hash = 0; + dth->dth_op_seq = 0; + dth->dth_local_oid_cnt = 0; + dth->dth_local_oid_cap = 0; + dth->dth_local_oid_array = NULL; + dth->dth_dkey_hash = 0; if (!(flags & DTX_LOCAL)) { if (daos_is_zero_dti(dti)) @@ -1001,83 +992,6 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, return rc; } -static int -dtx_insert_oid(struct dtx_handle *dth, daos_unit_oid_t *oid, bool touch_leader) -{ - int start = 0; - int end = dth->dth_oid_cnt - 1; - int at; - int rc = 0; - - do { - at = (start + end) / 2; - rc = daos_unit_oid_compare(dth->dth_oid_array[at], *oid); - if (rc == 0) - return 0; - - if (rc > 0) - end = at - 1; - else - start = at + 1; - } while (start <= end); - - if (dth->dth_oid_cnt == dth->dth_oid_cap) { - daos_unit_oid_t *oid_array; - - D_ALLOC_ARRAY(oid_array, dth->dth_oid_cap << 1); - if (oid_array == NULL) - return -DER_NOMEM; - - if (rc > 0) { - /* Insert before dth->dth_oid_array[at]. */ - if (at > 0) - memcpy(&oid_array[0], &dth->dth_oid_array[0], - sizeof(*oid) * at); - oid_array[at] = *oid; - memcpy(&oid_array[at + 1], &dth->dth_oid_array[at], - sizeof(*oid) * (dth->dth_oid_cnt - at)); - } else { - /* Insert after dth->dth_oid_array[at]. */ - memcpy(&oid_array[0], &dth->dth_oid_array[0], - sizeof(*oid) * (at + 1)); - oid_array[at + 1] = *oid; - if (at < dth->dth_oid_cnt - 1) - memcpy(&oid_array[at + 2], - &dth->dth_oid_array[at + 1], - sizeof(*oid) * (dth->dth_oid_cnt - 1 - at)); - } - - D_FREE(dth->dth_oid_array); - dth->dth_oid_array = oid_array; - dth->dth_oid_cap <<= 1; - - goto out; - } - - if (rc > 0) { - /* Insert before dth->dth_oid_array[at]. */ - memmove(&dth->dth_oid_array[at + 1], - &dth->dth_oid_array[at], - sizeof(*oid) * (dth->dth_oid_cnt - at)); - dth->dth_oid_array[at] = *oid; - } else { - /* Insert after dth->dth_oid_array[at]. */ - if (at < dth->dth_oid_cnt - 1) - memmove(&dth->dth_oid_array[at + 2], - &dth->dth_oid_array[at + 1], - sizeof(*oid) * (dth->dth_oid_cnt - 1 - at)); - dth->dth_oid_array[at + 1] = *oid; - } - -out: - if (touch_leader) - dth->dth_touched_leader_oid = 1; - - dth->dth_oid_cnt++; - - return 0; -} - void dtx_renew_epoch(struct dtx_epoch *epoch, struct dtx_handle *dth) { @@ -1110,51 +1024,6 @@ dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash) dth->dth_dkey_hash = dkey_hash; dth->dth_op_seq++; - rc = daos_unit_oid_compare(dth->dth_leader_oid, *oid); - if (rc == 0) { - if (dth->dth_oid_array == NULL) - dth->dth_touched_leader_oid = 1; - - if (dth->dth_touched_leader_oid) - goto out; - - rc = dtx_insert_oid(dth, oid, true); - - D_GOTO(out, rc); - } - - if (dth->dth_oid_array == NULL) { - D_ASSERT(dth->dth_oid_cnt == 0); - - /* 4 slots by default to hold rename case. */ - dth->dth_oid_cap = 4; - D_ALLOC_ARRAY(dth->dth_oid_array, dth->dth_oid_cap); - if (dth->dth_oid_array == NULL) - D_GOTO(out, rc = -DER_NOMEM); - - if (!dth->dth_touched_leader_oid) { - dth->dth_oid_array[0] = *oid; - dth->dth_oid_cnt = 1; - - D_GOTO(out, rc = 0); - } - - dth->dth_oid_cnt = 2; - - if (rc > 0) { - dth->dth_oid_array[0] = *oid; - dth->dth_oid_array[1] = dth->dth_leader_oid; - } else { - dth->dth_oid_array[0] = dth->dth_leader_oid; - dth->dth_oid_array[1] = *oid; - } - - D_GOTO(out, rc = 0); - } - - rc = dtx_insert_oid(dth, oid, false); - -out: D_DEBUG(DB_IO, "Sub init DTX "DF_DTI" for object "DF_UOID " dkey %lu, opc seq %d: "DF_RC"\n", DP_DTI(&dth->dth_xid), DP_UOID(*oid), @@ -1493,7 +1362,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_child *cont, int re dth->dth_sync ? "sync" : "async", dth->dth_dti_cos_count, dth->dth_cos_done ? dth->dth_dti_cos_count : 0, DP_RC(result)); - D_FREE(dth->dth_oid_array); D_FREE(dlh); d_tm_dec_gauge(dtx_tls_get()->dt_dtx_leader_total, 1); @@ -1617,7 +1485,6 @@ dtx_end(struct dtx_handle *dth, struct ds_cont_child *cont, int result) vos_dtx_detach(dth); out: - D_FREE(dth->dth_oid_array); D_FREE(dth); return result; diff --git a/src/dtx/tests/dts_structs.c b/src/dtx/tests/dts_structs.c index bddfdf9816c..e0a715660f2 100644 --- a/src/dtx/tests/dts_structs.c +++ b/src/dtx/tests/dts_structs.c @@ -62,7 +62,6 @@ struct_dtx_handle(void **state) SET_BITFIELD_1(dummy, dth_drop_cmt); SET_BITFIELD_1(dummy, dth_modify_shared); SET_BITFIELD_1(dummy, dth_active); - SET_BITFIELD_1(dummy, dth_touched_leader_oid); SET_BITFIELD_1(dummy, dth_local_tx_started); SET_BITFIELD_1(dummy, dth_shares_inited); SET_BITFIELD_1(dummy, dth_dist); @@ -75,7 +74,7 @@ struct_dtx_handle(void **state) SET_BITFIELD_1(dummy, dth_local); SET_BITFIELD_1(dummy, dth_epoch_owner); SET_BITFIELD_1(dummy, dth_local_complete); - SET_BITFIELD(dummy, padding1, 12); + SET_BITFIELD(dummy, padding1, 13); SET_FIELD(dummy, dth_dti_cos_count); SET_FIELD(dummy, dth_dti_cos); @@ -87,10 +86,6 @@ struct_dtx_handle(void **state) SET_FIELD(dummy, dth_op_seq); SET_FIELD(dummy, dth_deferred_used_cnt); SET_FIELD(dummy, padding2); - SET_FIELD(dummy, dth_oid_cnt); - SET_FIELD(dummy, dth_oid_cap); - SET_FIELD(dummy, padding3); - SET_FIELD(dummy, dth_oid_array); SET_FIELD(dummy, dth_local_oid_cnt); SET_FIELD(dummy, dth_local_oid_cap); SET_FIELD(dummy, padding4); diff --git a/src/include/daos/lru.h b/src/include/daos/lru.h index de6c5a373b9..bf6586b5f8d 100644 --- a/src/include/daos/lru.h +++ b/src/include/daos/lru.h @@ -137,6 +137,15 @@ daos_lru_ref_evict(struct daos_lru_cache *lcache, struct daos_llink *llink) d_hash_rec_evict_at(&lcache->dlc_htable, &llink->ll_link); } +/** + * Whether the item is evicted or not. + */ +static inline bool +daos_lru_is_evicted(struct daos_llink *llink) +{ + return llink->ll_evicted != 0; +} + /** * Evict the item from LRU before releasing the refcount on it, wait until * the caller is the last one holds refcount. diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 6143ed9b350..72ded4efa64 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -47,6 +47,7 @@ struct dtx_local_oid_record { * the most optimal way (packed). Please make sure that all necessary padding * is explicit so it could be used in the future. */ +/* clang-format off */ struct dtx_handle { union { struct dtx_entry dth_dte; @@ -92,8 +93,6 @@ struct dtx_handle { dth_modify_shared : 1, /* The DTX entry is in active table. */ dth_active : 1, - /* Leader oid is touched. */ - dth_touched_leader_oid : 1, /* Local TX is started. */ dth_local_tx_started : 1, /* The DTX share lists are inited. */ @@ -117,7 +116,7 @@ struct dtx_handle { /* Locally generate the epoch. */ dth_epoch_owner : 1, /* Flag to commit the local transaction */ - dth_local_complete : 1, padding1 : 12; + dth_local_complete : 1, padding1 : 13; /* The count the DTXs in the dth_dti_cos array. */ uint32_t dth_dti_cos_count; @@ -138,25 +137,14 @@ struct dtx_handle { uint16_t dth_deferred_used_cnt; uint16_t padding2; - union { - struct { - /** The count of objects that are modified by this DTX. */ - uint16_t dth_oid_cnt; - /** The total slots in the dth_oid_array. */ - uint16_t dth_oid_cap; - uint32_t padding3; - /** If more than one objects are modified, the IDs are reocrded here. */ - daos_unit_oid_t *dth_oid_array; - }; - struct { - /** The count of objects stored in dth_local_oid_array. */ - uint16_t dth_local_oid_cnt; - /** The total slots in the dth_local_oid_array. */ - uint16_t dth_local_oid_cap; - uint32_t padding4; - /** The record of all objects touched by the local transaction. */ - struct dtx_local_oid_record *dth_local_oid_array; - }; + struct { + /** The count of objects stored in dth_local_oid_array. */ + uint16_t dth_local_oid_cnt; + /** The total slots in the dth_local_oid_array. */ + uint16_t dth_local_oid_cap; + uint32_t padding4; + /** The record of all objects touched by the local transaction. */ + struct dtx_local_oid_record *dth_local_oid_array; }; /* Hash of the dkey to be modified if applicable. Per modification. */ @@ -179,6 +167,7 @@ struct dtx_handle { int dth_share_tbd_count; uint32_t padding5; }; +/* clang-format on */ /* Each sub transaction handle to manage each sub thandle */ struct dtx_sub_status { diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index 12cd6d72728..725bfe4b93f 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -51,40 +51,13 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, vts_init_dte(&dth->dth_dte); - dth->dth_coh = coh; - dth->dth_epoch = epoch; - dth->dth_leader_oid = *oid; - - dth->dth_pinned = 0; - dth->dth_sync = 0; - dth->dth_cos_done = 0; - dth->dth_touched_leader_oid = 0; - dth->dth_local_tx_started = 0; - dth->dth_solo = 0; - dth->dth_drop_cmt = 0; - dth->dth_modify_shared = 0; - dth->dth_active = 0; - dth->dth_dist = 0; - dth->dth_for_migration = 0; - dth->dth_ignore_uncommitted = 0; - dth->dth_prepared = 0; - dth->dth_epoch_owner = 0; - dth->dth_aborted = 0; - dth->dth_already = 0; - dth->dth_need_validation = 0; - - dth->dth_dti_cos_count = 0; - dth->dth_dti_cos = NULL; - dth->dth_ent = NULL; - dth->dth_flags = DTE_LEADER; + dth->dth_coh = coh; + dth->dth_epoch = epoch; + dth->dth_leader_oid = *oid; + dth->dth_flags = DTE_LEADER; dth->dth_modification_cnt = 1; - - dth->dth_op_seq = 1; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - dth->dth_oid_array = NULL; - - dth->dth_dkey_hash = dkey_hash; + dth->dth_op_seq = 1; + dth->dth_dkey_hash = dkey_hash; D_INIT_LIST_HEAD(&dth->dth_share_cmt_list); D_INIT_LIST_HEAD(&dth->dth_share_abt_list); diff --git a/src/vos/tests/vts_io.c b/src/vos/tests/vts_io.c index fdad31e03bb..6139ea6c37d 100644 --- a/src/vos/tests/vts_io.c +++ b/src/vos/tests/vts_io.c @@ -916,7 +916,7 @@ hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *ep assert_rc_equal(rc, 0); } - rc = vos_obj_incarnate(*obj_p, epr, bound, flags, intent, ts_set); + rc = vos_obj_incarnate(*obj_p, epr, bound, flags, intent, ts_set, NULL); if (umm != NULL) rc = umem_tx_end(umm, rc); diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index a7397a94256..dfd9230610f 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -215,12 +215,22 @@ vos_tx_publish(struct dtx_handle *dth, bool publish) } int -vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb) +vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, + struct vos_object *obj) { int rc; - if (dth == NULL) - return umem_tx_begin(umm, vos_txd_get(is_sysdb)); + if (dth == NULL) { + /* CPU may yield when umem_tx_begin, related object maybe evicted during that. */ + rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); + if (rc == 0 && unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(1), need to restart TX.\n", + DP_UOID(obj->obj_id)); + rc = umem_tx_end(umm, -DER_TX_RESTART); + } + + return rc; + } D_ASSERT(!is_sysdb); /** Note: On successful return, dth tls gets set and will be cleared by the corresponding @@ -235,6 +245,14 @@ vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb) rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); if (rc == 0) { + /* CPU may yield when umem_tx_begin, related object maybe evicted during that. */ + if (unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(2), need to restart TX.\n", + DP_UOID(obj->obj_id)); + + return umem_tx_end(umm, -DER_TX_RESTART); + } + dth->dth_local_tx_started = 1; vos_dth_set(dth, false); } @@ -251,13 +269,7 @@ vos_local_tx_abort(struct dtx_handle *dth) return; /** - * Since a local transaction spawns always a single pool an eaither one of the containers - * can be used to access the pool. - */ - record = &dth->dth_local_oid_array[0]; - - /** - * Evict all objects touched by the aborted transaction from the object cache to make sure + * Evict all objects created by the aborted transaction from the object cache to make sure * no invalid pointer stays there. Not all of the touched objects have to be evicted but * for simplicity's sake all of them are. */ diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index c64adc12bb0..e0f65fb2017 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -151,37 +151,21 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, } static void -dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, - struct dtx_handle *dth, bool evict, bool keep_df) +dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool evict, + bool keep_df) { - if (evict) { - daos_unit_oid_t *oids; - int count; - int i; + if (evict && dae->dae_oids != NULL) { + int i; - if (dth != NULL) { - if (dth->dth_oid_array != NULL) { - D_ASSERT(dth->dth_oid_cnt > 0); - - count = dth->dth_oid_cnt; - oids = dth->dth_oid_array; - } else { - count = 1; - oids = &dth->dth_leader_oid; - } - } else { - count = dae->dae_oid_cnt; - oids = dae->dae_oids; - } - - for (i = 0; i < count; i++) - vos_obj_evict_by_oid(cont, oids[i]); + for (i = 0; i < dae->dae_oid_cnt; i++) + vos_obj_evict_by_oid(cont, dae->dae_oids[i]); } if (dae->dae_oids != NULL && dae->dae_oids != &dae->dae_oid_inline && dae->dae_oids != &DAE_OID(dae)) { D_FREE(dae->dae_oids); dae->dae_oid_cnt = 0; + dae->dae_oid_cap = 0; } DAE_REC_OFF(dae) = UMOFF_NULL; @@ -254,7 +238,7 @@ dtx_act_ent_free(struct btr_instance *tins, struct btr_record *rec, D_ASSERT(dae != NULL); *(struct vos_dtx_act_ent **)args = dae; } else if (dae != NULL) { - dtx_act_ent_cleanup(tins->ti_priv, dae, NULL, true, false); + dtx_act_ent_cleanup(tins->ti_priv, dae, true, false); } return 0; @@ -879,7 +863,7 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t rc = dbtree_delete(cont->vc_dtx_active_hdl, BTR_PROBE_BYPASS, &kiov, &dae); if (rc == 0) { - dtx_act_ent_cleanup(cont, dae, NULL, false, false); + dtx_act_ent_cleanup(cont, dae, false, false); dtx_evict_lid(cont, dae); } @@ -1845,30 +1829,6 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) (dth->dth_modification_cnt > 0)) dth->dth_sync = 1; - if (dth->dth_oid_array != NULL) { - D_ASSERT(dth->dth_oid_cnt > 0); - - dae->dae_oid_cnt = dth->dth_oid_cnt; - if (dth->dth_oid_cnt == 1) { - dae->dae_oid_inline = dth->dth_oid_array[0]; - dae->dae_oids = &dae->dae_oid_inline; - } else { - size = sizeof(daos_unit_oid_t) * dth->dth_oid_cnt; - D_ALLOC_NZ(dae->dae_oids, size); - if (dae->dae_oids == NULL) { - /* Not fatal. */ - D_WARN("No DRAM to store ACT DTX OIDs " - DF_DTI"\n", DP_DTI(&DAE_XID(dae))); - dae->dae_oid_cnt = 0; - } else { - memcpy(dae->dae_oids, dth->dth_oid_array, size); - } - } - } else { - dae->dae_oids = &DAE_OID(dae); - dae->dae_oid_cnt = 1; - } - if (DAE_MBS_DSIZE(dae) <= sizeof(DAE_MBS_INLINE(dae))) { memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, DAE_MBS_DSIZE(dae)); @@ -2441,7 +2401,7 @@ vos_dtx_post_handle(struct vos_container *cont, DAE_FLAGS(daes[i]) |= DTE_PARTIAL_COMMITTED; daes[i]->dae_committing = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, false, true); + dtx_act_ent_cleanup(cont, daes[i], false, true); continue; } @@ -2467,13 +2427,13 @@ vos_dtx_post_handle(struct vos_container *cont, daes[i]->dae_aborted = 1; daes[i]->dae_aborting = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, true, false); + dtx_act_ent_cleanup(cont, daes[i], true, false); } else { D_ASSERT(daes[i]->dae_aborting == 0); daes[i]->dae_committed = 1; daes[i]->dae_committing = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, false, false); + dtx_act_ent_cleanup(cont, daes[i], false, false); } DAE_FLAGS(daes[i]) &= ~(DTE_CORRUPTED | DTE_ORPHAN | DTE_PARTIAL_COMMITTED); } @@ -3659,7 +3619,7 @@ vos_dtx_cleanup_internal(struct dtx_handle *dth) */ if (dae != NULL) { D_ASSERT(!vos_dae_is_prepare(dae)); - dtx_act_ent_cleanup(cont, dae, dth, true, false); + dtx_act_ent_cleanup(cont, dae, true, false); } } else { d_iov_set(&kiov, &dth->dth_xid, sizeof(dth->dth_xid)); @@ -3682,7 +3642,7 @@ vos_dtx_cleanup_internal(struct dtx_handle *dth) if (DAE_EPOCH(dae) != dth->dth_epoch) goto out; - dtx_act_ent_cleanup(cont, dae, dth, true, false); + dtx_act_ent_cleanup(cont, dae, true, false); rc = dbtree_delete(cont->vc_dtx_active_hdl, riov.iov_buf != NULL ? BTR_PROBE_BYPASS : BTR_PROBE_EQ, @@ -4040,7 +4000,7 @@ vos_dtx_local_begin(struct dtx_handle *dth, daos_handle_t poh) goto error; } - rc = vos_tx_begin(dth, umm, pool->vp_sysdb); + rc = vos_tx_begin(dth, umm, pool->vp_sysdb, NULL); if (rc != 0) { D_ERROR("Failed to start transaction: rc=" DF_RC "\n", DP_RC(rc)); goto error; @@ -4167,3 +4127,68 @@ vos_dtx_get_cmt_stat(daos_handle_t coh, uint64_t *cmt_cnt, struct dtx_time_stat out: return rc; } + +int +vos_dtx_record_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t oid) +{ + struct dtx_local_oid_record *oid_array; + struct dtx_local_oid_record *record; + struct vos_dtx_act_ent *dae; + daos_unit_oid_t *oids; + int rc = 0; + + if (dth == NULL) + D_GOTO(out, rc = 0); + + if (dth->dth_local) { + if (dth->dth_local_oid_cnt == dth->dth_local_oid_cap) { + D_REALLOC_ARRAY(oid_array, dth->dth_local_oid_array, dth->dth_local_oid_cap, + dth->dth_local_oid_cap << 1); + if (oid_array == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dth->dth_local_oid_array = oid_array; + dth->dth_local_oid_cap <<= 1; + } + + record = &dth->dth_local_oid_array[dth->dth_local_oid_cnt]; + record->dor_cont = cont; + vos_cont_addref(cont); + record->dor_oid = oid; + dth->dth_local_oid_cnt++; + + D_GOTO(out, rc = 0); + } + + if (daos_is_zero_dti(&dth->dth_xid)) + D_GOTO(out, rc = 0); + + dae = dth->dth_ent; + D_ASSERT(dae != NULL); + + if (dae->dae_oid_cnt == 0) { + if (daos_unit_oid_compare(oid, DAE_OID(dae)) == 0) + dae->dae_oids = &DAE_OID(dae); + else + dae->dae_oids = &dae->dae_oid_inline; + } else if (dae->dae_oid_cnt >= dae->dae_oid_cap) { + D_ALLOC_ARRAY(oids, dae->dae_oid_cnt << 1); + if (oids == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy(oids, dae->dae_oids, sizeof(*oids) * dae->dae_oid_cnt); + if (dae->dae_oids != &DAE_OID(dae) && dae->dae_oids != &dae->dae_oid_inline) + D_FREE(dae->dae_oids); + + dae->dae_oids = oids; + dae->dae_oid_cap = dae->dae_oid_cnt << 1; + } + + dae->dae_oids[dae->dae_oid_cnt++] = oid; + +out: + if (rc != 0) + D_ERROR("Failed to record oid " DF_UOID ": " DF_RC "\n", DP_UOID(oid), DP_RC(rc)); + + return rc; +} diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 18e6438ce6e..6236ffb38ed 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -470,10 +470,6 @@ struct vos_dtx_act_ent { * then 'dae_oids' points to the 'dae_oid_inline'. * * Otherwise, 'dae_oids' points to new buffer to hold more. - * - * These information is used for EC aggregation optimization. - * If server restarts, then we will lose the optimization but - * it is not fatal. */ daos_unit_oid_t *dae_oids; /* The time (hlc) when the DTX entry is created. */ @@ -485,6 +481,9 @@ struct vos_dtx_act_ent { /* Back pointer to the DTX handle. */ struct dtx_handle *dae_dth; + /* The capacity of dae_oids if it points to new allocated area. */ + uint32_t dae_oid_cap; + unsigned int dae_committable:1, dae_committing:1, dae_committed:1, @@ -855,6 +854,9 @@ vos_dtx_post_handle(struct vos_container *cont, int vos_dtx_act_reindex(struct vos_container *cont); +int +vos_dtx_record_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t oid); + enum vos_tree_class { /** the first reserved tree class */ VOS_BTR_BEGIN = DBTREE_VOS_BEGIN, @@ -1336,7 +1338,8 @@ vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, daos_handle_t coh, struct vos_object *obj); int -vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb); +vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, + struct vos_object *obj); /** Finish the transaction and publish or cancel the reservations or * return if err == 0 and it's a multi-modification transaction that @@ -1928,20 +1931,6 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v return false; } -/** - * Insert object ID and its parent container into the array of objects touched by the ongoing - * local transaction. - * - * \param[in] dth DTX handle for ongoing local transaction - * \param[in] cont VOS container - * \param[in] oid Object ID - * - * \return 0 : Success. - * -DER_NOMEM : Run out of the volatile memory. - */ -int -vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid); - static inline bool vos_pool_is_p2(struct vos_pool *pool) { diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index cebf9181aaa..abeb60d0f82 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -2552,41 +2552,16 @@ update_cancel(struct vos_io_context *ioc) true /* abort */); } -int -vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid) -{ - struct dtx_local_oid_record *oid_array = NULL; - struct dtx_local_oid_record *record = NULL; - - /** The array has to grow to accommodate the next record. */ - if (dth->dth_local_oid_cnt == dth->dth_local_oid_cap) { - D_REALLOC_ARRAY(oid_array, dth->dth_local_oid_array, dth->dth_local_oid_cap, - dth->dth_local_oid_cap << 1); - if (oid_array == NULL) - return -DER_NOMEM; - - dth->dth_local_oid_array = oid_array; - dth->dth_local_oid_cap <<= 1; - } - - record = &dth->dth_local_oid_array[dth->dth_local_oid_cnt]; - record->dor_cont = cont; - vos_cont_addref(cont); - record->dor_oid = *oid; - dth->dth_local_oid_cnt++; - - return 0; -} - int vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, daos_size_t *size, struct dtx_handle *dth) { - struct vos_dtx_act_ent **daes = NULL; - struct vos_dtx_cmt_ent **dces = NULL; - struct vos_io_context *ioc = vos_ioh2ioc(ioh); - struct umem_instance *umem; + struct vos_dtx_act_ent **daes = NULL; + struct vos_dtx_cmt_ent **dces = NULL; + struct vos_io_context *ioc = vos_ioh2ioc(ioh); + struct umem_instance *umem; bool tx_started = false; + bool created = false; uint16_t minor_epc; uint64_t flags = VOS_OBJ_CREATE | VOS_OBJ_VISIBLE; @@ -2598,6 +2573,13 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; + if (unlikely(vos_obj_is_evicted(ioc->ic_pinned_obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted during update, need to restart TX.\n", + DP_UOID(ioc->ic_oid)); + + D_GOTO(abort, err = -DER_TX_RESTART); + } + err = vos_ts_set_add(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, NULL, 0); D_ASSERT(err == 0); @@ -2606,7 +2588,10 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; - err = vos_tx_begin(dth, umem, ioc->ic_cont->vc_pool->vp_sysdb); + if (ioc->ic_pinned_obj != NULL) + D_ASSERT(ioc->ic_pinned_obj == ioc->ic_obj); + + err = vos_tx_begin(dth, umem, ioc->ic_cont->vc_pool->vp_sysdb, ioc->ic_obj); if (err != 0) goto abort; @@ -2637,8 +2622,8 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, D_FREE(daes); } - err = vos_obj_incarnate(ioc->ic_obj, &ioc->ic_epr, ioc->ic_bound, flags, - DAOS_INTENT_UPDATE, ioc->ic_ts_set); + err = vos_obj_incarnate(ioc->ic_obj, &ioc->ic_epr, ioc->ic_bound, flags, DAOS_INTENT_UPDATE, + ioc->ic_ts_set, &created); if (err != 0) goto abort; @@ -2663,10 +2648,6 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, goto abort; } - if (dtx_is_valid_handle(dth) && dth->dth_local) { - err = vos_insert_oid(dth, ioc->ic_cont, &ioc->ic_oid); - } - abort: if (err == -DER_NONEXIST || err == -DER_EXIST || err == -DER_INPROGRESS) { @@ -2727,7 +2708,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, *size = ioc->ic_io_size; D_FREE(daes); D_FREE(dces); - vos_ioc_destroy(ioc, err != 0); + vos_ioc_destroy(ioc, err != 0 && created); return err; } diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index 0015d91d916..6d265ec1cf0 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -421,6 +421,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, struct vos_ts_set *ts_set; struct vos_container *cont; struct vos_object *obj = NULL; + bool created = false; bool punch_obj = false; bool tx_started = false; uint64_t hold_flags; @@ -494,7 +495,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, if (rc != 0) goto reset; - rc = vos_tx_begin(dth, vos_cont2umm(cont), cont->vc_pool->vp_sysdb); + rc = vos_tx_begin(dth, vos_cont2umm(cont), cont->vc_pool->vp_sysdb, obj); if (rc != 0) goto reset; @@ -523,7 +524,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, } /* NB: punch always generate a new incarnation of the object */ - rc = vos_obj_incarnate(obj, &epr, bound, hold_flags, DAOS_INTENT_PUNCH, ts_set); + rc = vos_obj_incarnate(obj, &epr, bound, hold_flags, DAOS_INTENT_PUNCH, ts_set, &created); if (rc == 0) { if (dkey) { /* key punch */ rc = key_punch(obj, epr.epr_hi, bound, pm_ver, dkey, @@ -571,14 +572,9 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, vos_ts_set_update(ts_set, epr.epr_hi); } - if (rc == 0) { + if (rc == 0) vos_ts_set_wupdate(ts_set, epr.epr_hi); - if (dtx_is_valid_handle(dth) && dth->dth_local) { - rc = vos_insert_oid(dth, cont, &oid); - } - } - rc = vos_tx_end(cont, dth, NULL, NULL, tx_started, NULL, rc); if (dtx_is_valid_handle(dth)) { if (rc == 0) @@ -592,7 +588,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, } if (obj != NULL) - vos_obj_release(obj, 0, rc != 0); + vos_obj_release(obj, 0, rc != 0 && created); D_FREE(daes); D_FREE(dces); @@ -723,7 +719,7 @@ vos_obj_delete_internal(daos_handle_t coh, daos_unit_oid_t oid, bool only_delete rc = umem_tx_end(umm, rc); out: - vos_obj_release(obj, 0, rc == 0); + vos_obj_release(obj, 0, false); return rc; } @@ -801,7 +797,7 @@ vos_obj_del_key(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, out_tx: rc = umem_tx_end(umm, rc); out: - vos_obj_release(obj, 0, true); + vos_obj_release(obj, 0, false); return rc; } @@ -816,7 +812,8 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, daos_handle_t toh = DAOS_HDL_INVAL; int rc = 0; int i; - bool dirty = false; + bool dirty = false; + bool created = false; cont = vos_hdl2cont(coh); D_ASSERT(cont != NULL); @@ -842,17 +839,24 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, } } +restart: rc = vos_obj_hold(cont, oid, &epr, epoch, VOS_OBJ_VISIBLE | VOS_OBJ_CREATE, DAOS_INTENT_MARK, &obj, NULL); if (rc != 0) goto log; - rc = umem_tx_begin(umm, NULL); + rc = vos_tx_begin(NULL, umm, cont->vc_pool->vp_sysdb, obj); + if (unlikely(rc == -DER_TX_RESTART)) { + vos_obj_release(obj, 0, true); + obj = NULL; + goto restart; + } + if (rc != 0) goto log; rc = vos_obj_incarnate(obj, &epr, epoch, VOS_OBJ_VISIBLE | VOS_OBJ_CREATE, DAOS_INTENT_MARK, - NULL); + NULL, &created); if (rc != 0) goto out; @@ -906,12 +910,14 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, ", dkey (empty), akey_nr %u, epoch " DF_X64 ", pm_ver %u", DP_UOID(oid), akey_nr, epoch, pm_ver); + if (rc == -DER_ALREADY) + rc = 0; if (daos_handle_is_valid(toh)) dbtree_close(toh); if (obj != NULL) - vos_obj_release(obj, 0, true); + vos_obj_release(obj, 0, rc != 0 && created); - return rc == -DER_ALREADY ? 0 : rc; + return rc; } static int diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index f572ebb03d9..a48f78043e3 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -123,6 +124,12 @@ void vos_obj_evict(struct vos_object *obj); int vos_obj_evict_by_oid(struct vos_container *cont, daos_unit_oid_t oid); +static inline bool +vos_obj_is_evicted(struct vos_object *obj) +{ + return obj != NULL && daos_lru_is_evicted(&obj->obj_llink); +} + /** * Create an object cache. * @@ -246,6 +253,7 @@ vos_oi_delete(struct vos_container *cont, daos_unit_oid_t oid, bool only_delete_ * \param flags [IN] Object flags * \param intent [IN] The request intent. * \param ts_set [IN] Timestamp set + * \param created [OUR] Whether new object is created or not. * * \return 0 Object is successfully incarnated. * \return -DER_NONEXIST The conditions for success don't apply @@ -255,7 +263,7 @@ vos_oi_delete(struct vos_container *cont, daos_unit_oid_t oid, bool only_delete_ */ int vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t bound, - uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set); + uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set, bool *created); /** * Check if an operation will be conflicting with other ongoing operations over the diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index ba1898e0f25..9e8e9fca940 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -424,7 +424,7 @@ vos_obj_release(struct vos_object *obj, uint64_t flags, bool evict) else if (flags & VOS_OBJ_DISCARD) obj->obj_discard = 0; - obj_release(occ, obj, evict); + obj_release(occ, obj, evict || obj->obj_zombie); } /** Move local object to the lru cache */ @@ -525,11 +525,12 @@ vos_obj_check_discard(struct vos_container *cont, daos_unit_oid_t oid, uint64_t int vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t bound, - uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set) + uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set, bool *created) { - struct vos_container *cont = obj->obj_cont; - uint32_t cond_mask = 0; - int rc; + struct vos_container *cont = obj->obj_cont; + struct dtx_handle *dth = vos_dth_get(cont->vc_pool->vp_sysdb); + uint32_t cond_mask = 0; + int rc; D_ASSERT((flags & (VOS_OBJ_AGGREGATE | VOS_OBJ_DISCARD)) == 0); D_ASSERT(intent == DAOS_INTENT_PUNCH || intent == DAOS_INTENT_UPDATE || @@ -544,6 +545,14 @@ vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t return rc; } D_ASSERT(obj->obj_df); + + if (created != NULL) + *created = true; + rc = vos_dtx_record_oid(dth, cont, obj->obj_id); + if (rc != 0) { + vos_obj_evict_by_oid(cont, obj->obj_id); + return rc; + } } else { vos_ilog_ts_ignore(vos_obj2umm(obj), &obj->obj_df->vo_ilog); } @@ -553,8 +562,7 @@ vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t return -DER_UPDATE_AGAIN; /* Check the sync epoch */ - if (intent != DAOS_INTENT_MARK && epr->epr_hi <= obj->obj_sync_epoch && - vos_dth_get(obj->obj_cont->vc_pool->vp_sysdb) != NULL) { + if (intent != DAOS_INTENT_MARK && epr->epr_hi <= obj->obj_sync_epoch && dth != NULL) { /* If someone has synced the object against the * obj->obj_sync_epoch, then we do not allow to modify the * object with old epoch. Let's ask the caller to retry with