diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 175f440dc4e..5b342584bb6 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -883,15 +883,10 @@ dtx_handle_reinit(struct dtx_handle *dth) dth->dth_modify_shared = 0; dth->dth_active = 0; - dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; dth->dth_cos_done = 0; - - dth->dth_op_seq = 0; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - D_FREE(dth->dth_oid_array); - dth->dth_dkey_hash = 0; + dth->dth_op_seq = 0; + dth->dth_dkey_hash = 0; vos_dtx_rsrvd_fini(dth); return vos_dtx_rsrvd_init(dth); @@ -926,32 +921,29 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, dth->dth_coh = xoh; } - dth->dth_ver = pm_ver; - dth->dth_refs = 1; - dth->dth_mbs = mbs; - - dth->dth_pinned = 0; - dth->dth_cos_done = 0; - dth->dth_modify_shared = 0; - dth->dth_active = 0; - dth->dth_touched_leader_oid = 0; - dth->dth_local_tx_started = 0; - dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; - dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; - dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; - dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; + dth->dth_ver = pm_ver; + dth->dth_refs = 1; + dth->dth_mbs = mbs; + dth->dth_pinned = 0; + dth->dth_cos_done = 0; + dth->dth_modify_shared = 0; + dth->dth_active = 0; + dth->dth_local_tx_started = 0; + dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; + dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; + dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; + dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0; - dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; - dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 1 : 0; - dth->dth_aborted = 0; - dth->dth_already = 0; - dth->dth_need_validation = 0; + dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; + dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 
1 : 0; + dth->dth_aborted = 0; + dth->dth_already = 0; + dth->dth_need_validation = 0; dth->dth_local = (flags & DTX_LOCAL) ? 1 : 0; - - dth->dth_dti_cos = dti_cos; - dth->dth_dti_cos_count = dti_cos_cnt; - dth->dth_ent = NULL; - dth->dth_flags = leader ? DTE_LEADER : 0; + dth->dth_dti_cos = dti_cos; + dth->dth_dti_cos_count = dti_cos_cnt; + dth->dth_ent = NULL; + dth->dth_flags = leader ? DTE_LEADER : 0; if (flags & DTX_SYNC) { dth->dth_flags |= DTE_BLOCK; @@ -960,12 +952,11 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, dth->dth_sync = 0; } - dth->dth_op_seq = 0; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - dth->dth_oid_array = NULL; - - dth->dth_dkey_hash = 0; + dth->dth_op_seq = 0; + dth->dth_local_oid_cnt = 0; + dth->dth_local_oid_cap = 0; + dth->dth_local_oid_array = NULL; + dth->dth_dkey_hash = 0; if (!(flags & DTX_LOCAL)) { if (daos_is_zero_dti(dti)) @@ -1001,83 +992,6 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, return rc; } -static int -dtx_insert_oid(struct dtx_handle *dth, daos_unit_oid_t *oid, bool touch_leader) -{ - int start = 0; - int end = dth->dth_oid_cnt - 1; - int at; - int rc = 0; - - do { - at = (start + end) / 2; - rc = daos_unit_oid_compare(dth->dth_oid_array[at], *oid); - if (rc == 0) - return 0; - - if (rc > 0) - end = at - 1; - else - start = at + 1; - } while (start <= end); - - if (dth->dth_oid_cnt == dth->dth_oid_cap) { - daos_unit_oid_t *oid_array; - - D_ALLOC_ARRAY(oid_array, dth->dth_oid_cap << 1); - if (oid_array == NULL) - return -DER_NOMEM; - - if (rc > 0) { - /* Insert before dth->dth_oid_array[at]. */ - if (at > 0) - memcpy(&oid_array[0], &dth->dth_oid_array[0], - sizeof(*oid) * at); - oid_array[at] = *oid; - memcpy(&oid_array[at + 1], &dth->dth_oid_array[at], - sizeof(*oid) * (dth->dth_oid_cnt - at)); - } else { - /* Insert after dth->dth_oid_array[at]. 
*/ - memcpy(&oid_array[0], &dth->dth_oid_array[0], - sizeof(*oid) * (at + 1)); - oid_array[at + 1] = *oid; - if (at < dth->dth_oid_cnt - 1) - memcpy(&oid_array[at + 2], - &dth->dth_oid_array[at + 1], - sizeof(*oid) * (dth->dth_oid_cnt - 1 - at)); - } - - D_FREE(dth->dth_oid_array); - dth->dth_oid_array = oid_array; - dth->dth_oid_cap <<= 1; - - goto out; - } - - if (rc > 0) { - /* Insert before dth->dth_oid_array[at]. */ - memmove(&dth->dth_oid_array[at + 1], - &dth->dth_oid_array[at], - sizeof(*oid) * (dth->dth_oid_cnt - at)); - dth->dth_oid_array[at] = *oid; - } else { - /* Insert after dth->dth_oid_array[at]. */ - if (at < dth->dth_oid_cnt - 1) - memmove(&dth->dth_oid_array[at + 2], - &dth->dth_oid_array[at + 1], - sizeof(*oid) * (dth->dth_oid_cnt - 1 - at)); - dth->dth_oid_array[at + 1] = *oid; - } - -out: - if (touch_leader) - dth->dth_touched_leader_oid = 1; - - dth->dth_oid_cnt++; - - return 0; -} - void dtx_renew_epoch(struct dtx_epoch *epoch, struct dtx_handle *dth) { @@ -1110,51 +1024,6 @@ dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash) dth->dth_dkey_hash = dkey_hash; dth->dth_op_seq++; - rc = daos_unit_oid_compare(dth->dth_leader_oid, *oid); - if (rc == 0) { - if (dth->dth_oid_array == NULL) - dth->dth_touched_leader_oid = 1; - - if (dth->dth_touched_leader_oid) - goto out; - - rc = dtx_insert_oid(dth, oid, true); - - D_GOTO(out, rc); - } - - if (dth->dth_oid_array == NULL) { - D_ASSERT(dth->dth_oid_cnt == 0); - - /* 4 slots by default to hold rename case. 
*/ - dth->dth_oid_cap = 4; - D_ALLOC_ARRAY(dth->dth_oid_array, dth->dth_oid_cap); - if (dth->dth_oid_array == NULL) - D_GOTO(out, rc = -DER_NOMEM); - - if (!dth->dth_touched_leader_oid) { - dth->dth_oid_array[0] = *oid; - dth->dth_oid_cnt = 1; - - D_GOTO(out, rc = 0); - } - - dth->dth_oid_cnt = 2; - - if (rc > 0) { - dth->dth_oid_array[0] = *oid; - dth->dth_oid_array[1] = dth->dth_leader_oid; - } else { - dth->dth_oid_array[0] = dth->dth_leader_oid; - dth->dth_oid_array[1] = *oid; - } - - D_GOTO(out, rc = 0); - } - - rc = dtx_insert_oid(dth, oid, false); - -out: D_DEBUG(DB_IO, "Sub init DTX "DF_DTI" for object "DF_UOID " dkey %lu, opc seq %d: "DF_RC"\n", DP_DTI(&dth->dth_xid), DP_UOID(*oid), @@ -1493,7 +1362,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_child *cont, int re dth->dth_sync ? "sync" : "async", dth->dth_dti_cos_count, dth->dth_cos_done ? dth->dth_dti_cos_count : 0, DP_RC(result)); - D_FREE(dth->dth_oid_array); D_FREE(dlh); d_tm_dec_gauge(dtx_tls_get()->dt_dtx_leader_total, 1); @@ -1617,7 +1485,6 @@ dtx_end(struct dtx_handle *dth, struct ds_cont_child *cont, int result) vos_dtx_detach(dth); out: - D_FREE(dth->dth_oid_array); D_FREE(dth); return result; diff --git a/src/dtx/tests/dts_structs.c b/src/dtx/tests/dts_structs.c index bddfdf9816c..e0a715660f2 100644 --- a/src/dtx/tests/dts_structs.c +++ b/src/dtx/tests/dts_structs.c @@ -62,7 +62,6 @@ struct_dtx_handle(void **state) SET_BITFIELD_1(dummy, dth_drop_cmt); SET_BITFIELD_1(dummy, dth_modify_shared); SET_BITFIELD_1(dummy, dth_active); - SET_BITFIELD_1(dummy, dth_touched_leader_oid); SET_BITFIELD_1(dummy, dth_local_tx_started); SET_BITFIELD_1(dummy, dth_shares_inited); SET_BITFIELD_1(dummy, dth_dist); @@ -75,7 +74,7 @@ struct_dtx_handle(void **state) SET_BITFIELD_1(dummy, dth_local); SET_BITFIELD_1(dummy, dth_epoch_owner); SET_BITFIELD_1(dummy, dth_local_complete); - SET_BITFIELD(dummy, padding1, 12); + SET_BITFIELD(dummy, padding1, 13); SET_FIELD(dummy, dth_dti_cos_count); 
SET_FIELD(dummy, dth_dti_cos); @@ -87,10 +86,6 @@ struct_dtx_handle(void **state) SET_FIELD(dummy, dth_op_seq); SET_FIELD(dummy, dth_deferred_used_cnt); SET_FIELD(dummy, padding2); - SET_FIELD(dummy, dth_oid_cnt); - SET_FIELD(dummy, dth_oid_cap); - SET_FIELD(dummy, padding3); - SET_FIELD(dummy, dth_oid_array); SET_FIELD(dummy, dth_local_oid_cnt); SET_FIELD(dummy, dth_local_oid_cap); SET_FIELD(dummy, padding4); diff --git a/src/include/daos/lru.h b/src/include/daos/lru.h index de6c5a373b9..bf6586b5f8d 100644 --- a/src/include/daos/lru.h +++ b/src/include/daos/lru.h @@ -137,6 +137,15 @@ daos_lru_ref_evict(struct daos_lru_cache *lcache, struct daos_llink *llink) d_hash_rec_evict_at(&lcache->dlc_htable, &llink->ll_link); } +/** + * Whether the item is evicted or not. + */ +static inline bool +daos_lru_is_evicted(struct daos_llink *llink) +{ + return llink->ll_evicted != 0; +} + /** * Evict the item from LRU before releasing the refcount on it, wait until * the caller is the last one holds refcount. diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 6143ed9b350..72ded4efa64 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -47,6 +47,7 @@ struct dtx_local_oid_record { * the most optimal way (packed). Please make sure that all necessary padding * is explicit so it could be used in the future. */ +/* clang-format off */ struct dtx_handle { union { struct dtx_entry dth_dte; @@ -92,8 +93,6 @@ struct dtx_handle { dth_modify_shared : 1, /* The DTX entry is in active table. */ dth_active : 1, - /* Leader oid is touched. */ - dth_touched_leader_oid : 1, /* Local TX is started. */ dth_local_tx_started : 1, /* The DTX share lists are inited. */ @@ -117,7 +116,7 @@ struct dtx_handle { /* Locally generate the epoch. */ dth_epoch_owner : 1, /* Flag to commit the local transaction */ - dth_local_complete : 1, padding1 : 12; + dth_local_complete : 1, padding1 : 13; /* The count the DTXs in the dth_dti_cos array. 
*/ uint32_t dth_dti_cos_count; @@ -138,25 +137,14 @@ struct dtx_handle { uint16_t dth_deferred_used_cnt; uint16_t padding2; - union { - struct { - /** The count of objects that are modified by this DTX. */ - uint16_t dth_oid_cnt; - /** The total slots in the dth_oid_array. */ - uint16_t dth_oid_cap; - uint32_t padding3; - /** If more than one objects are modified, the IDs are reocrded here. */ - daos_unit_oid_t *dth_oid_array; - }; - struct { - /** The count of objects stored in dth_local_oid_array. */ - uint16_t dth_local_oid_cnt; - /** The total slots in the dth_local_oid_array. */ - uint16_t dth_local_oid_cap; - uint32_t padding4; - /** The record of all objects touched by the local transaction. */ - struct dtx_local_oid_record *dth_local_oid_array; - }; + struct { + /** The count of objects stored in dth_local_oid_array. */ + uint16_t dth_local_oid_cnt; + /** The total slots in the dth_local_oid_array. */ + uint16_t dth_local_oid_cap; + uint32_t padding4; + /** The record of all objects touched by the local transaction. */ + struct dtx_local_oid_record *dth_local_oid_array; }; /* Hash of the dkey to be modified if applicable. Per modification. 
*/ @@ -179,6 +167,7 @@ struct dtx_handle { int dth_share_tbd_count; uint32_t padding5; }; +/* clang-format on */ /* Each sub transaction handle to manage each sub thandle */ struct dtx_sub_status { diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index 12cd6d72728..725bfe4b93f 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -51,40 +51,13 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, vts_init_dte(&dth->dth_dte); - dth->dth_coh = coh; - dth->dth_epoch = epoch; - dth->dth_leader_oid = *oid; - - dth->dth_pinned = 0; - dth->dth_sync = 0; - dth->dth_cos_done = 0; - dth->dth_touched_leader_oid = 0; - dth->dth_local_tx_started = 0; - dth->dth_solo = 0; - dth->dth_drop_cmt = 0; - dth->dth_modify_shared = 0; - dth->dth_active = 0; - dth->dth_dist = 0; - dth->dth_for_migration = 0; - dth->dth_ignore_uncommitted = 0; - dth->dth_prepared = 0; - dth->dth_epoch_owner = 0; - dth->dth_aborted = 0; - dth->dth_already = 0; - dth->dth_need_validation = 0; - - dth->dth_dti_cos_count = 0; - dth->dth_dti_cos = NULL; - dth->dth_ent = NULL; - dth->dth_flags = DTE_LEADER; + dth->dth_coh = coh; + dth->dth_epoch = epoch; + dth->dth_leader_oid = *oid; + dth->dth_flags = DTE_LEADER; dth->dth_modification_cnt = 1; - - dth->dth_op_seq = 1; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - dth->dth_oid_array = NULL; - - dth->dth_dkey_hash = dkey_hash; + dth->dth_op_seq = 1; + dth->dth_dkey_hash = dkey_hash; D_INIT_LIST_HEAD(&dth->dth_share_cmt_list); D_INIT_LIST_HEAD(&dth->dth_share_abt_list); diff --git a/src/vos/tests/vts_io.c b/src/vos/tests/vts_io.c index fdad31e03bb..6139ea6c37d 100644 --- a/src/vos/tests/vts_io.c +++ b/src/vos/tests/vts_io.c @@ -916,7 +916,7 @@ hold_obj(struct vos_container *cont, daos_unit_oid_t oid, daos_epoch_range_t *ep assert_rc_equal(rc, 0); } - rc = vos_obj_incarnate(*obj_p, epr, bound, flags, intent, ts_set); + rc = vos_obj_incarnate(*obj_p, epr, bound, flags, intent, ts_set, NULL); if (umm != 
NULL) rc = umem_tx_end(umm, rc); diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index a7397a94256..dfd9230610f 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -215,12 +215,22 @@ vos_tx_publish(struct dtx_handle *dth, bool publish) } int -vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb) +vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, + struct vos_object *obj) { int rc; - if (dth == NULL) - return umem_tx_begin(umm, vos_txd_get(is_sysdb)); + if (dth == NULL) { + /* CPU may yield in umem_tx_begin(), so the object may be evicted meanwhile. */ + rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); + if (rc == 0 && unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(1), need to restart TX.\n", + DP_UOID(obj->obj_id)); + rc = umem_tx_end(umm, -DER_TX_RESTART); + } + + return rc; + } D_ASSERT(!is_sysdb); /** Note: On successful return, dth tls gets set and will be cleared by the corresponding @@ -235,6 +245,14 @@ vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb) rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); if (rc == 0) { + /* CPU may yield in umem_tx_begin(), so the object may be evicted meanwhile. */ + if (unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(2), need to restart TX.\n", + DP_UOID(obj->obj_id)); + + return umem_tx_end(umm, -DER_TX_RESTART); + } + dth->dth_local_tx_started = 1; vos_dth_set(dth, false); } @@ -251,13 +269,7 @@ vos_local_tx_abort(struct dtx_handle *dth) return; /** - * Since a local transaction spawns always a single pool an eaither one of the containers - * can be used to access the pool. - */ - record = &dth->dth_local_oid_array[0]; - - /** - * Evict all objects touched by the aborted transaction from the object cache to make sure + * Evict all objects created by the aborted transaction from the object cache to make sure * no invalid pointer stays there.
Not all of the touched objects have to be evicted but * for simplicity's sake all of them are. */ diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index c64adc12bb0..e0f65fb2017 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -151,37 +151,21 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, } static void -dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, - struct dtx_handle *dth, bool evict, bool keep_df) +dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool evict, + bool keep_df) { - if (evict) { - daos_unit_oid_t *oids; - int count; - int i; + if (evict && dae->dae_oids != NULL) { + int i; - if (dth != NULL) { - if (dth->dth_oid_array != NULL) { - D_ASSERT(dth->dth_oid_cnt > 0); - - count = dth->dth_oid_cnt; - oids = dth->dth_oid_array; - } else { - count = 1; - oids = &dth->dth_leader_oid; - } - } else { - count = dae->dae_oid_cnt; - oids = dae->dae_oids; - } - - for (i = 0; i < count; i++) - vos_obj_evict_by_oid(cont, oids[i]); + for (i = 0; i < dae->dae_oid_cnt; i++) + vos_obj_evict_by_oid(cont, dae->dae_oids[i]); } if (dae->dae_oids != NULL && dae->dae_oids != &dae->dae_oid_inline && dae->dae_oids != &DAE_OID(dae)) { D_FREE(dae->dae_oids); dae->dae_oid_cnt = 0; + dae->dae_oid_cap = 0; } DAE_REC_OFF(dae) = UMOFF_NULL; @@ -254,7 +238,7 @@ dtx_act_ent_free(struct btr_instance *tins, struct btr_record *rec, D_ASSERT(dae != NULL); *(struct vos_dtx_act_ent **)args = dae; } else if (dae != NULL) { - dtx_act_ent_cleanup(tins->ti_priv, dae, NULL, true, false); + dtx_act_ent_cleanup(tins->ti_priv, dae, true, false); } return 0; @@ -879,7 +863,7 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t rc = dbtree_delete(cont->vc_dtx_active_hdl, BTR_PROBE_BYPASS, &kiov, &dae); if (rc == 0) { - dtx_act_ent_cleanup(cont, dae, NULL, false, false); + dtx_act_ent_cleanup(cont, dae, false, false); dtx_evict_lid(cont, dae); } @@ -1845,30 +1829,6 @@ 
vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) (dth->dth_modification_cnt > 0)) dth->dth_sync = 1; - if (dth->dth_oid_array != NULL) { - D_ASSERT(dth->dth_oid_cnt > 0); - - dae->dae_oid_cnt = dth->dth_oid_cnt; - if (dth->dth_oid_cnt == 1) { - dae->dae_oid_inline = dth->dth_oid_array[0]; - dae->dae_oids = &dae->dae_oid_inline; - } else { - size = sizeof(daos_unit_oid_t) * dth->dth_oid_cnt; - D_ALLOC_NZ(dae->dae_oids, size); - if (dae->dae_oids == NULL) { - /* Not fatal. */ - D_WARN("No DRAM to store ACT DTX OIDs " - DF_DTI"\n", DP_DTI(&DAE_XID(dae))); - dae->dae_oid_cnt = 0; - } else { - memcpy(dae->dae_oids, dth->dth_oid_array, size); - } - } - } else { - dae->dae_oids = &DAE_OID(dae); - dae->dae_oid_cnt = 1; - } - if (DAE_MBS_DSIZE(dae) <= sizeof(DAE_MBS_INLINE(dae))) { memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, DAE_MBS_DSIZE(dae)); @@ -2441,7 +2401,7 @@ vos_dtx_post_handle(struct vos_container *cont, DAE_FLAGS(daes[i]) |= DTE_PARTIAL_COMMITTED; daes[i]->dae_committing = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, false, true); + dtx_act_ent_cleanup(cont, daes[i], false, true); continue; } @@ -2467,13 +2427,13 @@ vos_dtx_post_handle(struct vos_container *cont, daes[i]->dae_aborted = 1; daes[i]->dae_aborting = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, true, false); + dtx_act_ent_cleanup(cont, daes[i], true, false); } else { D_ASSERT(daes[i]->dae_aborting == 0); daes[i]->dae_committed = 1; daes[i]->dae_committing = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, false, false); + dtx_act_ent_cleanup(cont, daes[i], false, false); } DAE_FLAGS(daes[i]) &= ~(DTE_CORRUPTED | DTE_ORPHAN | DTE_PARTIAL_COMMITTED); } @@ -3659,7 +3619,7 @@ vos_dtx_cleanup_internal(struct dtx_handle *dth) */ if (dae != NULL) { D_ASSERT(!vos_dae_is_prepare(dae)); - dtx_act_ent_cleanup(cont, dae, dth, true, false); + dtx_act_ent_cleanup(cont, dae, true, false); } } else { d_iov_set(&kiov, &dth->dth_xid, sizeof(dth->dth_xid)); @@ -3682,7 +3642,7 @@ 
vos_dtx_cleanup_internal(struct dtx_handle *dth) if (DAE_EPOCH(dae) != dth->dth_epoch) goto out; - dtx_act_ent_cleanup(cont, dae, dth, true, false); + dtx_act_ent_cleanup(cont, dae, true, false); rc = dbtree_delete(cont->vc_dtx_active_hdl, riov.iov_buf != NULL ? BTR_PROBE_BYPASS : BTR_PROBE_EQ, @@ -4040,7 +4000,7 @@ vos_dtx_local_begin(struct dtx_handle *dth, daos_handle_t poh) goto error; } - rc = vos_tx_begin(dth, umm, pool->vp_sysdb); + rc = vos_tx_begin(dth, umm, pool->vp_sysdb, NULL); if (rc != 0) { D_ERROR("Failed to start transaction: rc=" DF_RC "\n", DP_RC(rc)); goto error; @@ -4167,3 +4127,68 @@ vos_dtx_get_cmt_stat(daos_handle_t coh, uint64_t *cmt_cnt, struct dtx_time_stat out: return rc; } + +int +vos_dtx_record_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t oid) +{ + struct dtx_local_oid_record *oid_array; + struct dtx_local_oid_record *record; + struct vos_dtx_act_ent *dae; + daos_unit_oid_t *oids; + int rc = 0; + + if (dth == NULL) + D_GOTO(out, rc = 0); + + if (dth->dth_local) { + if (dth->dth_local_oid_cnt == dth->dth_local_oid_cap) { + D_REALLOC_ARRAY(oid_array, dth->dth_local_oid_array, dth->dth_local_oid_cap, + dth->dth_local_oid_cap << 1); + if (oid_array == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dth->dth_local_oid_array = oid_array; + dth->dth_local_oid_cap <<= 1; + } + + record = &dth->dth_local_oid_array[dth->dth_local_oid_cnt]; + record->dor_cont = cont; + vos_cont_addref(cont); + record->dor_oid = oid; + dth->dth_local_oid_cnt++; + + D_GOTO(out, rc = 0); + } + + if (daos_is_zero_dti(&dth->dth_xid)) + D_GOTO(out, rc = 0); + + dae = dth->dth_ent; + D_ASSERT(dae != NULL); + + if (dae->dae_oid_cnt == 0) { + if (daos_unit_oid_compare(oid, DAE_OID(dae)) == 0) + dae->dae_oids = &DAE_OID(dae); + else + dae->dae_oids = &dae->dae_oid_inline; + } else if (dae->dae_oid_cnt >= dae->dae_oid_cap) { + D_ALLOC_ARRAY(oids, dae->dae_oid_cnt << 1); + if (oids == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy(oids, dae->dae_oids, 
sizeof(*oids) * dae->dae_oid_cnt); + if (dae->dae_oids != &DAE_OID(dae) && dae->dae_oids != &dae->dae_oid_inline) + D_FREE(dae->dae_oids); + + dae->dae_oids = oids; + dae->dae_oid_cap = dae->dae_oid_cnt << 1; + } + + dae->dae_oids[dae->dae_oid_cnt++] = oid; + +out: + if (rc != 0) + D_ERROR("Failed to record oid " DF_UOID ": " DF_RC "\n", DP_UOID(oid), DP_RC(rc)); + + return rc; +} diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 18e6438ce6e..6236ffb38ed 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -470,10 +470,6 @@ struct vos_dtx_act_ent { * then 'dae_oids' points to the 'dae_oid_inline'. * * Otherwise, 'dae_oids' points to new buffer to hold more. - * - * These information is used for EC aggregation optimization. - * If server restarts, then we will lose the optimization but - * it is not fatal. */ daos_unit_oid_t *dae_oids; /* The time (hlc) when the DTX entry is created. */ @@ -485,6 +481,9 @@ struct vos_dtx_act_ent { /* Back pointer to the DTX handle. */ struct dtx_handle *dae_dth; + /* The capacity of dae_oids if it points to newly allocated area.
*/ + uint32_t dae_oid_cap; + unsigned int dae_committable:1, dae_committing:1, dae_committed:1, @@ -855,6 +854,9 @@ vos_dtx_post_handle(struct vos_container *cont, int vos_dtx_act_reindex(struct vos_container *cont); +int +vos_dtx_record_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t oid); + enum vos_tree_class { /** the first reserved tree class */ VOS_BTR_BEGIN = DBTREE_VOS_BEGIN, @@ -1336,7 +1338,8 @@ vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, daos_handle_t coh, struct vos_object *obj); int -vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb); +vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, + struct vos_object *obj); /** Finish the transaction and publish or cancel the reservations or * return if err == 0 and it's a multi-modification transaction that @@ -1928,20 +1931,6 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v return false; } -/** - * Insert object ID and its parent container into the array of objects touched by the ongoing - * local transaction. - * - * \param[in] dth DTX handle for ongoing local transaction - * \param[in] cont VOS container - * \param[in] oid Object ID - * - * \return 0 : Success. - * -DER_NOMEM : Run out of the volatile memory. - */ -int -vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid); - static inline bool vos_pool_is_p2(struct vos_pool *pool) { diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index cebf9181aaa..abeb60d0f82 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -2552,41 +2552,16 @@ update_cancel(struct vos_io_context *ioc) true /* abort */); } -int -vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid) -{ - struct dtx_local_oid_record *oid_array = NULL; - struct dtx_local_oid_record *record = NULL; - - /** The array has to grow to accommodate the next record. 
*/ - if (dth->dth_local_oid_cnt == dth->dth_local_oid_cap) { - D_REALLOC_ARRAY(oid_array, dth->dth_local_oid_array, dth->dth_local_oid_cap, - dth->dth_local_oid_cap << 1); - if (oid_array == NULL) - return -DER_NOMEM; - - dth->dth_local_oid_array = oid_array; - dth->dth_local_oid_cap <<= 1; - } - - record = &dth->dth_local_oid_array[dth->dth_local_oid_cnt]; - record->dor_cont = cont; - vos_cont_addref(cont); - record->dor_oid = *oid; - dth->dth_local_oid_cnt++; - - return 0; -} - int vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, daos_size_t *size, struct dtx_handle *dth) { - struct vos_dtx_act_ent **daes = NULL; - struct vos_dtx_cmt_ent **dces = NULL; - struct vos_io_context *ioc = vos_ioh2ioc(ioh); - struct umem_instance *umem; + struct vos_dtx_act_ent **daes = NULL; + struct vos_dtx_cmt_ent **dces = NULL; + struct vos_io_context *ioc = vos_ioh2ioc(ioh); + struct umem_instance *umem; bool tx_started = false; + bool created = false; uint16_t minor_epc; uint64_t flags = VOS_OBJ_CREATE | VOS_OBJ_VISIBLE; @@ -2598,6 +2573,13 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; + if (unlikely(vos_obj_is_evicted(ioc->ic_pinned_obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted during update, need to restart TX.\n", + DP_UOID(ioc->ic_oid)); + + D_GOTO(abort, err = -DER_TX_RESTART); + } + err = vos_ts_set_add(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, NULL, 0); D_ASSERT(err == 0); @@ -2606,7 +2588,10 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; - err = vos_tx_begin(dth, umem, ioc->ic_cont->vc_pool->vp_sysdb); + if (ioc->ic_pinned_obj != NULL) + D_ASSERT(ioc->ic_pinned_obj == ioc->ic_obj); + + err = vos_tx_begin(dth, umem, ioc->ic_cont->vc_pool->vp_sysdb, ioc->ic_obj); if (err != 0) goto abort; @@ -2637,8 +2622,8 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, D_FREE(daes); } - err = 
vos_obj_incarnate(ioc->ic_obj, &ioc->ic_epr, ioc->ic_bound, flags, - DAOS_INTENT_UPDATE, ioc->ic_ts_set); + err = vos_obj_incarnate(ioc->ic_obj, &ioc->ic_epr, ioc->ic_bound, flags, DAOS_INTENT_UPDATE, + ioc->ic_ts_set, &created); if (err != 0) goto abort; @@ -2663,10 +2648,6 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, goto abort; } - if (dtx_is_valid_handle(dth) && dth->dth_local) { - err = vos_insert_oid(dth, ioc->ic_cont, &ioc->ic_oid); - } - abort: if (err == -DER_NONEXIST || err == -DER_EXIST || err == -DER_INPROGRESS) { @@ -2727,7 +2708,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, *size = ioc->ic_io_size; D_FREE(daes); D_FREE(dces); - vos_ioc_destroy(ioc, err != 0); + vos_ioc_destroy(ioc, err != 0 && created); return err; } diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index 0015d91d916..6d265ec1cf0 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -421,6 +421,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, struct vos_ts_set *ts_set; struct vos_container *cont; struct vos_object *obj = NULL; + bool created = false; bool punch_obj = false; bool tx_started = false; uint64_t hold_flags; @@ -494,7 +495,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, if (rc != 0) goto reset; - rc = vos_tx_begin(dth, vos_cont2umm(cont), cont->vc_pool->vp_sysdb); + rc = vos_tx_begin(dth, vos_cont2umm(cont), cont->vc_pool->vp_sysdb, obj); if (rc != 0) goto reset; @@ -523,7 +524,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, } /* NB: punch always generate a new incarnation of the object */ - rc = vos_obj_incarnate(obj, &epr, bound, hold_flags, DAOS_INTENT_PUNCH, ts_set); + rc = vos_obj_incarnate(obj, &epr, bound, hold_flags, DAOS_INTENT_PUNCH, ts_set, &created); if (rc == 0) { if (dkey) { /* key punch */ rc = key_punch(obj, epr.epr_hi, bound, pm_ver, dkey, @@ -571,14 +572,9 @@ vos_obj_punch(daos_handle_t 
coh, daos_unit_oid_t oid, daos_epoch_t epoch, vos_ts_set_update(ts_set, epr.epr_hi); } - if (rc == 0) { + if (rc == 0) vos_ts_set_wupdate(ts_set, epr.epr_hi); - if (dtx_is_valid_handle(dth) && dth->dth_local) { - rc = vos_insert_oid(dth, cont, &oid); - } - } - rc = vos_tx_end(cont, dth, NULL, NULL, tx_started, NULL, rc); if (dtx_is_valid_handle(dth)) { if (rc == 0) @@ -592,7 +588,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, } if (obj != NULL) - vos_obj_release(obj, 0, rc != 0); + vos_obj_release(obj, 0, rc != 0 && created); D_FREE(daes); D_FREE(dces); @@ -723,7 +719,7 @@ vos_obj_delete_internal(daos_handle_t coh, daos_unit_oid_t oid, bool only_delete rc = umem_tx_end(umm, rc); out: - vos_obj_release(obj, 0, rc == 0); + vos_obj_release(obj, 0, false); return rc; } @@ -801,7 +797,7 @@ vos_obj_del_key(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, out_tx: rc = umem_tx_end(umm, rc); out: - vos_obj_release(obj, 0, true); + vos_obj_release(obj, 0, false); return rc; } @@ -816,7 +812,8 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, daos_handle_t toh = DAOS_HDL_INVAL; int rc = 0; int i; - bool dirty = false; + bool dirty = false; + bool created = false; cont = vos_hdl2cont(coh); D_ASSERT(cont != NULL); @@ -842,17 +839,24 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, } } +restart: rc = vos_obj_hold(cont, oid, &epr, epoch, VOS_OBJ_VISIBLE | VOS_OBJ_CREATE, DAOS_INTENT_MARK, &obj, NULL); if (rc != 0) goto log; - rc = umem_tx_begin(umm, NULL); + rc = vos_tx_begin(NULL, umm, cont->vc_pool->vp_sysdb, obj); + if (unlikely(rc == -DER_TX_RESTART)) { + vos_obj_release(obj, 0, true); + obj = NULL; + goto restart; + } + if (rc != 0) goto log; rc = vos_obj_incarnate(obj, &epr, epoch, VOS_OBJ_VISIBLE | VOS_OBJ_CREATE, DAOS_INTENT_MARK, - NULL); + NULL, &created); if (rc != 0) goto out; @@ -906,12 +910,14 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, 
uint32_t pm_ver, ", dkey (empty), akey_nr %u, epoch " DF_X64 ", pm_ver %u", DP_UOID(oid), akey_nr, epoch, pm_ver); + if (rc == -DER_ALREADY) + rc = 0; if (daos_handle_is_valid(toh)) dbtree_close(toh); if (obj != NULL) - vos_obj_release(obj, 0, true); + vos_obj_release(obj, 0, rc != 0 && created); - return rc == -DER_ALREADY ? 0 : rc; + return rc; } static int diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index f572ebb03d9..a48f78043e3 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -123,6 +124,12 @@ void vos_obj_evict(struct vos_object *obj); int vos_obj_evict_by_oid(struct vos_container *cont, daos_unit_oid_t oid); +static inline bool +vos_obj_is_evicted(struct vos_object *obj) +{ + return obj != NULL && daos_lru_is_evicted(&obj->obj_llink); +} + /** * Create an object cache. * @@ -246,6 +253,7 @@ vos_oi_delete(struct vos_container *cont, daos_unit_oid_t oid, bool only_delete_ * \param flags [IN] Object flags * \param intent [IN] The request intent. * \param ts_set [IN] Timestamp set + * \param created [OUT] Whether a new object is created or not. * * \return 0 Object is successfully incarnated.
* \return -DER_NONEXIST The conditions for success don't apply @@ -255,7 +263,7 @@ vos_oi_delete(struct vos_container *cont, daos_unit_oid_t oid, bool only_delete_ */ int vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t bound, - uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set); + uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set, bool *created); /** * Check if an operation will be conflicting with other ongoing operations over the diff --git a/src/vos/vos_obj_cache.c b/src/vos/vos_obj_cache.c index ba1898e0f25..9e8e9fca940 100644 --- a/src/vos/vos_obj_cache.c +++ b/src/vos/vos_obj_cache.c @@ -424,7 +424,7 @@ vos_obj_release(struct vos_object *obj, uint64_t flags, bool evict) else if (flags & VOS_OBJ_DISCARD) obj->obj_discard = 0; - obj_release(occ, obj, evict); + obj_release(occ, obj, evict || obj->obj_zombie); } /** Move local object to the lru cache */ @@ -525,11 +525,12 @@ vos_obj_check_discard(struct vos_container *cont, daos_unit_oid_t oid, uint64_t int vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t bound, - uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set) + uint64_t flags, uint32_t intent, struct vos_ts_set *ts_set, bool *created) { - struct vos_container *cont = obj->obj_cont; - uint32_t cond_mask = 0; - int rc; + struct vos_container *cont = obj->obj_cont; + struct dtx_handle *dth = vos_dth_get(cont->vc_pool->vp_sysdb); + uint32_t cond_mask = 0; + int rc; D_ASSERT((flags & (VOS_OBJ_AGGREGATE | VOS_OBJ_DISCARD)) == 0); D_ASSERT(intent == DAOS_INTENT_PUNCH || intent == DAOS_INTENT_UPDATE || @@ -544,6 +545,14 @@ vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t return rc; } D_ASSERT(obj->obj_df); + + if (created != NULL) + *created = true; + rc = vos_dtx_record_oid(dth, cont, obj->obj_id); + if (rc != 0) { + vos_obj_evict_by_oid(cont, obj->obj_id); + return rc; + } } else { vos_ilog_ts_ignore(vos_obj2umm(obj), &obj->obj_df->vo_ilog); } 
@@ -553,8 +562,7 @@ vos_obj_incarnate(struct vos_object *obj, daos_epoch_range_t *epr, daos_epoch_t return -DER_UPDATE_AGAIN; /* Check the sync epoch */ - if (intent != DAOS_INTENT_MARK && epr->epr_hi <= obj->obj_sync_epoch && - vos_dth_get(obj->obj_cont->vc_pool->vp_sysdb) != NULL) { + if (intent != DAOS_INTENT_MARK && epr->epr_hi <= obj->obj_sync_epoch && dth != NULL) { /* If someone has synced the object against the * obj->obj_sync_epoch, then we do not allow to modify the * object with old epoch. Let's ask the caller to retry with