From 54697646efc0cbd98100c22338aabf24ae46b3c8 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Mon, 13 Oct 2025 09:49:29 -0700 Subject: [PATCH 01/12] mshv_vtl/tdx: Add member to tdx vp context for TDX timer service TD partitioning provides a timer service for L1 (VTL2) guest to set a preemption timer for L2 (VTL0) vCPUs. Add members for a new timer service to the tdx_vp_context struct for the L1 (VTL2) userspace to pass a timeout value down to the L1 (VTL2) kernel. Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h index 917eae20eaeb2..65421fc17aa80 100644 --- a/drivers/hv/mshv_vtl.h +++ b/drivers/hv/mshv_vtl.h @@ -77,6 +77,21 @@ struct tdx_l2_enter_guest_state { u8 reserved[6]; }; +#define MSHV_VTL_TDX_L2_DEADLINE_DISARMED (0ULL) + +/* + * Userspace sets this bit for the kernel to issue TDG.VP.WR(TSC_DEADLINE) + * when it changed the deadline. + * The kernel clears this bit on TDG.VP.WR(TSC_DEADLINE). + */ +#define MSHV_VTL_TDX_L2_DEADLINE_UPDATE BIT(0) + +struct tdx_l2_tsc_deadline { + __u64 deadline; + __u8 update; + __u8 pad[7]; +}; + /* * This structure must be placed in a larger structure at offset 272 (0x110). * The GPR list for TDX and fx_state for xsave have alignment requirements on the @@ -91,8 +106,9 @@ struct tdx_vp_context { __u64 entry_rcx; /* Must be on 256 byte boundary. */ struct tdx_l2_enter_guest_state l2_enter_guest_state; + struct tdx_l2_tsc_deadline l2_tsc_deadline; /* Pad space until the next 256 byte boundary. */ - __u8 pad3[96]; + __u8 pad3[80]; /* Must be 16 byte aligned. */ struct fxregs_state fx_state; __u8 pad4[16]; From c79ca81837b439867bbb5c0a7abecb25c530edb9 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Mon, 13 Oct 2025 10:06:46 -0700 Subject: [PATCH 02/12] mshv_vtl/tdx: Factor out tdg_vp_wr() wrapper Refactor __tdcall() for a dedicated wrapper for TDG.VP.WR() operation. 
This prepares for additional calls of TDG.VP.WR() cleanly while avoiding repeated open-coding. No functional change intended. Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index b6b706d4d845c..f45a49cb1a92b 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -577,11 +577,24 @@ struct vmx_vmcs_field { }; }; +#define TDG_VP_WR 10 + +static u64 tdg_vp_wr(u64 field, u64 value, u64 mask) +{ + struct tdx_module_args args = { + .rcx = 0, + .rdx = field, + .r8 = value, + .r9 = mask, + }; + + return __tdcall(TDG_VP_WR, &args); +} + static void mshv_write_tdx_apic_page(u64 apic_page_gpa) { struct tdx_extended_field_code extended_field_code; struct vmx_vmcs_field vmcs_field; - struct tdx_module_args args = {}; u64 status = 0; extended_field_code.as_u64 = 0; @@ -592,13 +605,9 @@ static void mshv_write_tdx_apic_page(u64 apic_page_gpa) vmcs_field.as_u32 = 0x00002012; extended_field_code.field_size = 3; /* TDX_FIELD_SIZE_64_BIT */ - args.rcx = 0; - args.rdx = extended_field_code.as_u64; - args.r8 = apic_page_gpa; - args.r9 = 0xFFFFFFFFFFFFFFFF; - /* Issue tdg_vp_wr to set the apic page. */ - status = __tdcall(10, &args); + status = tdg_vp_wr(extended_field_code.as_u64, apic_page_gpa, + 0xFFFFFFFFFFFFFFFF); pr_debug("set_apic_page gpa: %llx status: %llx\n", apic_page_gpa, status); if (status != 0) From 1bf07066d93ba44dd12f088905b9c1c340a83c87 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Mon, 13 Oct 2025 10:31:33 -0700 Subject: [PATCH 03/12] mshv_vtl/tdx: Set TSC deadline if userspace requests Program the TD partitioning TSC deadline timer service for L2 (VTL0) vCPUs when the L1 (VTL2) userspace requests. Then, the TDX module sets preemption timer for L2 vCPU. If the timer expires, the L2 (VTL0) vCPU exits with a VMX preemption timer exit reason. 
The mshv_vtl driver then exits to the userspace, and the userspace is notified of the exit. The TDX module does not clear TDVPS deadline on a preemption timer exit. Disarm the TSC deadline explicitly on the preemption timer exit. Otherwise the following TDG.VP.ENTER() immediately exits without executing the L2 guest. Reported-by: Dexuan Cui Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 91 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index f45a49cb1a92b..eb3bdce3c9a09 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -152,6 +152,49 @@ union hv_register_vsm_page_offsets { u64 as_uint64; } __packed; +#define MSHV_VTL_NUM_L2_VM 3 +#define TDVPS_TSC_DEADLINE_DISARMED (~0ULL) + +#define TDVPS_TSC_DEADLINE 0xA020000300000058ULL + +#define TDG_VP_ENTRY_VM_SHIFT 52 +#define TDG_VP_ENTRY_VM_MASK GENMASK_ULL(53, 52) +#define TDG_VP_ENTRY_VM_IDX(entry_rcx) \ + (((entry_rcx) & TDG_VP_ENTRY_VM_MASK) >> \ + TDG_VP_ENTRY_VM_SHIFT) + +#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) +/* index: 0: L1 VM, 1-3: L2 VM */ +static bool is_tdx_vm_idx_valid(u64 vm_idx) +{ + return vm_idx >= 1 && vm_idx <= MSHV_VTL_NUM_L2_VM; +} + +/* + * Convert SDM TSC deadline to TDX TD partitioning guest timer service. + * See SDM TSC-Deadline Mode + * SDM: tdx_vp_context.tsc_deadline follows this. + * 0: disarmed. + * -1: armed. It's far future (years). It won't fire in practical time. + * + * TDX TDVPS deadline: + * See Intel TDX Module Partitioning Architecture Specification + * L2 VM TSC Deadline Support + * 0: immediate inject timer interrupt. + * -1: disarmed. + * -2: this can also be considered as far future. 
+ */ +static u64 tsc_deadline_to_tdvps(u64 tsc_deadline) +{ + if (tsc_deadline == MSHV_VTL_TDX_L2_DEADLINE_DISARMED) + tsc_deadline = TDVPS_TSC_DEADLINE_DISARMED; + else if (tsc_deadline == ~0ULL) + tsc_deadline = ~0ULL - 1ULL; + + return tsc_deadline; +} +#endif + struct mshv_vtl_per_cpu { struct mshv_vtl_run *run; struct page *reg_page; @@ -629,6 +672,7 @@ static int mshv_vtl_alloc_context(unsigned int cpu) if (hv_isolation_type_tdx()) { #if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) struct page *tdx_apic_page; + int vm_idx; tdx_apic_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!tdx_apic_page) @@ -649,6 +693,11 @@ static int mshv_vtl_alloc_context(unsigned int cpu) /* Enable the apic page. */ mshv_write_tdx_apic_page(page_to_phys(tdx_apic_page)); + + mshv_vtl_this_run()->tdx_context.l2_tsc_deadline.deadline = + MSHV_VTL_TDX_L2_DEADLINE_DISARMED; + for (vm_idx = 1; vm_idx <= MSHV_VTL_NUM_L2_VM; vm_idx++) + tdg_vp_wr(TDVPS_TSC_DEADLINE + vm_idx, TDVPS_TSC_DEADLINE_DISARMED, ~0ULL); #endif } else if (hv_isolation_type_snp()) { #ifdef CONFIG_X86_64 @@ -873,6 +922,36 @@ static void mshv_vtl_on_user_return(struct user_return_notifier *urn) wrmsrl(MSR_TSC_AUX, per_cpu->l1_msr_tsc_aux); } +static void mshv_vtl_return_tdx_tsc_deadline(struct mshv_vtl_run *vtl_run) +{ + struct tdx_vp_context *context = &vtl_run->tdx_context; + u64 vm_idx, deadline; + + /* L2 VM index is encoded in entry_rcx for TDG.VP.ENTER(). 
*/ + vm_idx = TDG_VP_ENTRY_VM_IDX(vtl_run->tdx_context.entry_rcx); + if (!is_tdx_vm_idx_valid(vm_idx)) + return; + + if (!(context->l2_tsc_deadline.update & MSHV_VTL_TDX_L2_DEADLINE_UPDATE)) + return; + + deadline = tsc_deadline_to_tdvps(context->l2_tsc_deadline.deadline); + tdg_vp_wr(TDVPS_TSC_DEADLINE + vm_idx, deadline, ~0ULL); + + /* Tell the userspace that the kernel consumed the deadline */ + context->l2_tsc_deadline.update &= ~MSHV_VTL_TDX_L2_DEADLINE_UPDATE; +} + +static void mshv_tdx_tsc_deadline_expired(struct tdx_vp_context *context) +{ + u64 vm_idx = TDG_VP_ENTRY_VM_IDX(context->entry_rcx); + + if (!is_tdx_vm_idx_valid(vm_idx)) + return; + + tdg_vp_wr(TDVPS_TSC_DEADLINE + vm_idx, TDVPS_TSC_DEADLINE_DISARMED, ~0ULL); +} + void mshv_vtl_return_tdx(void) { struct tdx_tdg_vp_enter_exit_info *tdx_exit_info; @@ -885,6 +964,8 @@ void mshv_vtl_return_tdx(void) tdx_vp_state = &vtl_run->tdx_context.vp_state; per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + mshv_vtl_return_tdx_tsc_deadline(vtl_run); + kernel_fpu_begin_mask(0); fxrstor(&vtl_run->tdx_context.fx_state); // restore FP reg and XMM regs native_write_cr2(tdx_vp_state->cr2); @@ -1296,6 +1377,11 @@ static bool mshv_tdx_is_idle(const struct tdx_vp_context *context) (u32)context->l2_enter_guest_state.rcx == HV_X64_MSR_GUEST_IDLE; } +static bool mshv_tdx_is_preemption_timer(const struct tdx_vp_context *context) +{ + return ((u32)context->exit_info.rax) == EXIT_REASON_PREEMPTION_TIMER; +} + static void mshv_tdx_handle_hlt_idle(struct tdx_vp_context *context) { const u64 VP_WRITE = 10; @@ -1338,6 +1424,11 @@ static bool mshv_tdx_try_handle_exit(struct mshv_vtl_run *run) const bool x2apic = MSHV_VTL_OFFLOAD_FLAG_X2APIC & run->offload_flags; bool ret_to_user = true; + if (mshv_tdx_is_preemption_timer(context)) { + mshv_tdx_tsc_deadline_expired(context); + return false; + } + if (!intr_inject || mshv_tdx_next_intr_exists(context)) return false; From 5992af2aa84c0188a21f5a004ad6c1196eae8d45 Mon Sep 17 00:00:00 2001 
From: Isaku Yamahata Date: Fri, 21 Nov 2025 10:12:22 -0800 Subject: [PATCH 04/12] mshv_vtl/tdx: Skip duplicated TDG.VP.WR(TSC deadline) As the tdcall is slow, cache the previously written TSC deadline value and skip unnecessary tdg.vp.wr(TSC deadline) if the value doesn't change. This is also a preparation for hlt emulation case that requires the previously written TSC deadline value. Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index eb3bdce3c9a09..e2d8a3598e601 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -208,6 +208,7 @@ struct mshv_vtl_per_cpu { u64 l1_msr_lstar; u64 l1_msr_sfmask; u64 l1_msr_tsc_aux; + u64 l2_tsc_deadline_prev[MSHV_VTL_NUM_L2_VM]; bool msrs_are_guest; struct user_return_notifier mshv_urn; #endif @@ -657,6 +658,17 @@ static void mshv_write_tdx_apic_page(u64 apic_page_gpa) panic("write tdx apic page failed: %llx\n", status); } +static void mshv_vtl_set_tsc_deadline(u64 vm_idx, u64 deadline) +{ + struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + + if (deadline == per_cpu->l2_tsc_deadline_prev[vm_idx - 1]) + return; + + tdg_vp_wr(TDVPS_TSC_DEADLINE + vm_idx, deadline, ~0ULL); + per_cpu->l2_tsc_deadline_prev[vm_idx - 1] = deadline; +} + #endif static int mshv_vtl_alloc_context(unsigned int cpu) @@ -697,7 +709,8 @@ static int mshv_vtl_alloc_context(unsigned int cpu) mshv_vtl_this_run()->tdx_context.l2_tsc_deadline.deadline = MSHV_VTL_TDX_L2_DEADLINE_DISARMED; for (vm_idx = 1; vm_idx <= MSHV_VTL_NUM_L2_VM; vm_idx++) - tdg_vp_wr(TDVPS_TSC_DEADLINE + vm_idx, TDVPS_TSC_DEADLINE_DISARMED, ~0ULL); + mshv_vtl_set_tsc_deadline(vm_idx, + TDVPS_TSC_DEADLINE_DISARMED); #endif } else if (hv_isolation_type_snp()) { #ifdef CONFIG_X86_64 @@ -936,7 +949,7 @@ static void mshv_vtl_return_tdx_tsc_deadline(struct mshv_vtl_run *vtl_run) return; deadline = 
tsc_deadline_to_tdvps(context->l2_tsc_deadline.deadline); - tdg_vp_wr(TDVPS_TSC_DEADLINE + vm_idx, deadline, ~0ULL); + mshv_vtl_set_tsc_deadline(vm_idx, deadline); /* Tell the userspace that the kernel consumed the deadline */ context->l2_tsc_deadline.update &= ~MSHV_VTL_TDX_L2_DEADLINE_UPDATE; @@ -949,7 +962,7 @@ static void mshv_tdx_tsc_deadline_expired(struct tdx_vp_context *context) if (!is_tdx_vm_idx_valid(vm_idx)) return; - tdg_vp_wr(TDVPS_TSC_DEADLINE + vm_idx, TDVPS_TSC_DEADLINE_DISARMED, ~0ULL); + mshv_vtl_set_tsc_deadline(vm_idx, TDVPS_TSC_DEADLINE_DISARMED); } void mshv_vtl_return_tdx(void) From fc1601f6e4db4ef55392278a565de34448096edd Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 15 Oct 2025 16:29:02 -0700 Subject: [PATCH 05/12] mshv_vtl/tdx: Arm per-cpu timer to wake up from L0 HLT emulation The TDX timer service sets a preemption timer for the L2 (VTL0) vCPU. tdg.vp.enter() exits with preemption timer exit reason on timer expiry. The HLT emulation path needs extra change where the L1 (VTL2) kernel issues TDG.VP.VMCALL(HLT) because the host (L0) VMM doesn't know the L2 deadline timer value. When the L1 kernel issues TDG.VP.VMCALL(HLT), start per-CPU hrtimer to wake up from the L0 HLT emulation by L1 getting timer interrupt. Cancel the hrtimer after it returns from the L0 VMM. 
Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 166 +++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index e2d8a3598e601..b645431e39c3a 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -209,6 +209,7 @@ struct mshv_vtl_per_cpu { u64 l1_msr_sfmask; u64 l1_msr_tsc_aux; u64 l2_tsc_deadline_prev[MSHV_VTL_NUM_L2_VM]; + u64 l2_hlt_tsc_deadline; bool msrs_are_guest; struct user_return_notifier mshv_urn; #endif @@ -222,6 +223,20 @@ static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file); static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions); static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu); +#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) +static DEFINE_PER_CPU(struct hrtimer, mshv_tdx_halt_timer); +static struct hrtimer *tdx_this_halt_timer(void) +{ + return this_cpu_ptr(&mshv_tdx_halt_timer); +} +#else +static struct hrtimer *tdx_this_halt_timer(void) +{ + return NULL; +} +#endif +static void mshv_tdx_init_halt_timer(void); + noinline void mshv_vtl_return_tdx(void); struct mshv_vtl_run *mshv_vtl_this_run(void); void mshv_tdx_request_cache_flush(bool wbnoinvd); @@ -711,7 +726,9 @@ static int mshv_vtl_alloc_context(unsigned int cpu) for (vm_idx = 1; vm_idx <= MSHV_VTL_NUM_L2_VM; vm_idx++) mshv_vtl_set_tsc_deadline(vm_idx, TDVPS_TSC_DEADLINE_DISARMED); + per_cpu->l2_hlt_tsc_deadline = TDVPS_TSC_DEADLINE_DISARMED; #endif + mshv_tdx_init_halt_timer(); } else if (hv_isolation_type_snp()) { #ifdef CONFIG_X86_64 int ret; @@ -1041,6 +1058,144 @@ static bool mshv_vtl_process_intercept(void) return false; } +/* + * The purpose is to get interrupt on this vCPU to wake up from + * L0 VMM HLT emulation. + * + * The sequence is + * - local_irq_save() + * - Start the timer if necessary. 
+ * - tdx_halt(irq_disabled=false) + * The L0 VMM wakes up vCPU from HLT due to timer interrupt even with + * rflags.if=0. + * - Cancel the timer if timer was started. + * The callback isn't invoked because of rflags.if=0. + * - local_irq_restore() + */ +static enum hrtimer_restart mshv_tdx_timer_fn(struct hrtimer *timer) +{ + return HRTIMER_NORESTART; +} + +static void mshv_tdx_init_halt_timer(void) +{ + struct hrtimer *timer = tdx_this_halt_timer(); + + if (!timer) + return; + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + timer->function = mshv_tdx_timer_fn; +} + +enum TDX_HALT_TIMER { + TIMER_ARMED, + TIMER_NOTARMED, +}; + +/* + * The L1 VMM needs to tell wake up time from HLT emulation because the host + * (L0) VMM doesn't have access to TDVPS_TSC_DEADLINE with the production TDX + * module. + * Set up a timer interrupt instead. + */ +static enum TDX_HALT_TIMER mshv_tdx_setup_halt_timer(void) +{ +#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) + struct tdx_vp_context *context = &mshv_vtl_this_run()->tdx_context; +#endif + u64 now, deadline = TDVPS_TSC_DEADLINE_DISARMED; + struct hrtimer *timer = tdx_this_halt_timer(); + ktime_t time; + + if (!timer) + return TIMER_NOTARMED; + +#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) + /* Get the timeout value to wake up from HLT. */ + if (context->l2_tsc_deadline.update & MSHV_VTL_TDX_L2_DEADLINE_UPDATE) + deadline = tsc_deadline_to_tdvps(context->l2_tsc_deadline.deadline); + else { + struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + u64 vm_idx = TDG_VP_ENTRY_VM_IDX(context->entry_rcx); + + /* + * If we run L2 vCPU before entering the L0 HLT emulation, we + * may have issued tdg.vp.wr(TSC DEADLINE). 
+ */ + if (is_tdx_vm_idx_valid(vm_idx)) + deadline = per_cpu->l2_tsc_deadline_prev[vm_idx - 1]; + } +#endif + if (deadline == TDVPS_TSC_DEADLINE_DISARMED) + return TIMER_NOTARMED; + + time = 0; + now = rdtsc(); + if (deadline > now) { + /* + * ktime_t is nsec. + * 1 TSC tick = 1 / (tsc_khz * 1000) sec + * = 1000 * 1000 / tsc_khz nsec + */ + time = mul_u64_u64_div_u64(deadline - now, 1000 * 1000, tsc_khz); + if (time < 0) + time = KTIME_MAX; + } + + hrtimer_start(timer, time, HRTIMER_MODE_REL_PINNED); +#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) + this_cpu_ptr(&mshv_vtl_per_cpu)->l2_hlt_tsc_deadline = deadline; +#endif + return TIMER_ARMED; +} + +static enum TDX_HALT_TIMER mshv_tdx_halt_timer_pre(bool try_arm) +{ + if (!hv_isolation_type_tdx()) + return TIMER_NOTARMED; + + if (!try_arm) + return TIMER_NOTARMED; + + return mshv_tdx_setup_halt_timer(); +} + +static void mshv_tdx_halt_timer_post(enum TDX_HALT_TIMER armed) +{ +#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) + struct mshv_vtl_per_cpu *per_cpu; + struct tdx_vp_context *context; +#endif + struct hrtimer *timer; + + if (armed != TIMER_ARMED) + return; + + timer = tdx_this_halt_timer(); + if (!timer) + return; + + hrtimer_cancel(timer); + +#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) + per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + if (per_cpu->l2_hlt_tsc_deadline > rdtsc()) + return; + + /* + * Emulate timer expiry as if preemption timer expires with + * tdg.vp.enter(). 
+ */ + context = &mshv_vtl_this_run()->tdx_context; + context->exit_info.rax = EXIT_REASON_PREEMPTION_TIMER; + + mshv_tdx_tsc_deadline_expired(context); + + context->l2_tsc_deadline.update &= ~MSHV_VTL_TDX_L2_DEADLINE_UPDATE; +#endif +} + static bool in_idle_is_enabled; DEFINE_PER_CPU(struct task_struct *, mshv_vtl_thread); @@ -1051,9 +1206,12 @@ static void mshv_vtl_switch_to_vtl0_irqoff(void) struct hv_vtl_cpu_context *cpu_ctx = &this_run->cpu_context; u32 flags = READ_ONCE(this_run->flags); union hv_input_vtl target_vtl = READ_ONCE(this_run->target_vtl); + enum TDX_HALT_TIMER armed; trace_mshv_vtl_enter_vtl0_rcuidle(cpu_ctx); + armed = mshv_tdx_halt_timer_pre(flags & MSHV_VTL_RUN_FLAG_HALTED); + /* A VTL2 TDX kernel doesn't allocate hv_vp_assist_page at the moment */ hvp = hv_vp_assist_page ? hv_vp_assist_page[smp_processor_id()] : NULL; @@ -1076,6 +1234,8 @@ static void mshv_vtl_switch_to_vtl0_irqoff(void) hv_vtl_return(cpu_ctx, target_vtl, flags, mshv_vsm_page_offsets.vtl_return_offset); + mshv_tdx_halt_timer_post(armed); + if (!hvp) return; @@ -1105,7 +1265,13 @@ static void mshv_vtl_idle(void) } raw_local_irq_enable(); } else { + enum TDX_HALT_TIMER armed; + + armed = mshv_tdx_halt_timer_pre(true); + hv_vtl_idle(); + + mshv_tdx_halt_timer_post(armed); } } From 9b80430043b8b9736391db94a91b99418e11022a Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 20 Nov 2025 13:58:27 -0800 Subject: [PATCH 06/12] mshv_vtl/tdx: Eliminate duplicate tdg.vp.wr(TSC deadline = disarm) On timer expiry path, it unconditionally issues tdg.vp.wr(TSC deadline = disarm). The following tdg.vp.enter() execution path may overwrite tdg.vp.wr(new TSC deadline). Delete the duplicated tdg.vp.wr() call as optimization. 
Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index b645431e39c3a..8ea64c5fafb05 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -210,6 +210,7 @@ struct mshv_vtl_per_cpu { u64 l1_msr_tsc_aux; u64 l2_tsc_deadline_prev[MSHV_VTL_NUM_L2_VM]; u64 l2_hlt_tsc_deadline; + bool l2_tsc_deadline_expired[MSHV_VTL_NUM_L2_VM]; bool msrs_are_guest; struct user_return_notifier mshv_urn; #endif @@ -677,6 +678,8 @@ static void mshv_vtl_set_tsc_deadline(u64 vm_idx, u64 deadline) { struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + per_cpu->l2_tsc_deadline_expired[vm_idx - 1] = false; + if (deadline == per_cpu->l2_tsc_deadline_prev[vm_idx - 1]) return; @@ -955,6 +958,7 @@ static void mshv_vtl_on_user_return(struct user_return_notifier *urn) static void mshv_vtl_return_tdx_tsc_deadline(struct mshv_vtl_run *vtl_run) { struct tdx_vp_context *context = &vtl_run->tdx_context; + struct mshv_vtl_per_cpu *per_cpu; u64 vm_idx, deadline; /* L2 VM index is encoded in entry_rcx for TDG.VP.ENTER(). 
*/ @@ -962,8 +966,13 @@ static void mshv_vtl_return_tdx_tsc_deadline(struct mshv_vtl_run *vtl_run) if (!is_tdx_vm_idx_valid(vm_idx)) return; - if (!(context->l2_tsc_deadline.update & MSHV_VTL_TDX_L2_DEADLINE_UPDATE)) + per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + if (!(context->l2_tsc_deadline.update & MSHV_VTL_TDX_L2_DEADLINE_UPDATE)) { + if (per_cpu->l2_tsc_deadline_expired[vm_idx - 1]) + mshv_vtl_set_tsc_deadline(vm_idx, TDVPS_TSC_DEADLINE_DISARMED); + return; + } deadline = tsc_deadline_to_tdvps(context->l2_tsc_deadline.deadline); mshv_vtl_set_tsc_deadline(vm_idx, deadline); @@ -974,12 +983,13 @@ static void mshv_vtl_return_tdx_tsc_deadline(struct mshv_vtl_run *vtl_run) static void mshv_tdx_tsc_deadline_expired(struct tdx_vp_context *context) { + struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); u64 vm_idx = TDG_VP_ENTRY_VM_IDX(context->entry_rcx); if (!is_tdx_vm_idx_valid(vm_idx)) return; - mshv_vtl_set_tsc_deadline(vm_idx, TDVPS_TSC_DEADLINE_DISARMED); + per_cpu->l2_tsc_deadline_expired[vm_idx - 1] = true; } void mshv_vtl_return_tdx(void) @@ -1121,9 +1131,11 @@ static enum TDX_HALT_TIMER mshv_tdx_setup_halt_timer(void) /* * If we run L2 vCPU before entering the L0 HLT emulation, we - * may have issued tdg.vp.wr(TSC DEADLINE). + * may have issued tdg.vp.wr(TSC DEADLINE) and the timer may + * have been expired. */ - if (is_tdx_vm_idx_valid(vm_idx)) + if (is_tdx_vm_idx_valid(vm_idx) && + !per_cpu->l2_tsc_deadline_expired[vm_idx - 1]) deadline = per_cpu->l2_tsc_deadline_prev[vm_idx - 1]; } #endif From 3528fd7a17430d9e7847a3b62b77e614921a99cf Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Mon, 20 Oct 2025 10:50:29 -0700 Subject: [PATCH 07/12] drivers: hv: mshv_vtl: Advertise TDX timer service extension Add an extension for the TDX timer service, so that the userspace can query the feature before use. 
Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 17 +++++++++++++++++ include/uapi/linux/mshv.h | 1 + 2 files changed, 18 insertions(+) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index 8ea64c5fafb05..765f1aa5eaecf 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -312,6 +312,19 @@ static int mshv_tdx_set_cpumask_from_apicid(int apicid, struct cpumask *cpu_mask } #endif +static long mshv_tdx_vtl_ioctl_check_extension(u32 arg) +{ + if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST)) + return -EOPNOTSUPP; + + switch (arg) { + case MSHV_CAP_LOWER_VTL_TIMER_VIRT: + return 1; + default: + return -EOPNOTSUPP; + } +} + static long __mshv_vtl_ioctl_check_extension(u32 arg) { switch (arg) { @@ -321,6 +334,10 @@ static long __mshv_vtl_ioctl_check_extension(u32 arg) return mshv_vsm_capabilities.return_action_available; case MSHV_CAP_DR6_SHARED: return mshv_vsm_capabilities.dr6_shared; + case MSHV_CAP_LOWER_VTL_TIMER_VIRT: + if (hv_isolation_type_tdx()) + return mshv_tdx_vtl_ioctl_check_extension(arg); + break; } return -EOPNOTSUPP; diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index 6fba073f2b5ca..bf9cc25f7bdac 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -15,6 +15,7 @@ #define MSHV_CAP_REGISTER_PAGE 0x1 #define MSHV_CAP_VTL_RETURN_ACTION 0x2 #define MSHV_CAP_DR6_SHARED 0x3 +#define MSHV_CAP_LOWER_VTL_TIMER_VIRT 0x4 #define MSHV_VP_MMAP_REGISTERS_OFFSET (HV_VP_STATE_PAGE_REGISTERS * 0x1000) #define MAX_RUN_MSG_SIZE 256 From 8bfc1c194846eb9e47519a2bc15013a59a6d43c6 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 30 Oct 2025 11:41:09 -0700 Subject: [PATCH 08/12] arch/x86: Unbreak df21bf37518b ("arch/x86: Provide the CPU number in the wakeup AP callback") The commit df21bf37518b ("arch/x86: Provide the CPU number in the wakeup AP callback") changed the signature of struct apic::wakeup_secondary_cpu(), but it did not update numachip_wakeup_secondary(). 
Update it to fix the compile error. arch/x86/kernel/apic/apic_numachip.c:228:43: error: initialization of 'int (*)(u32, long unsigned int, unsigned int)' {aka 'int (*)(unsigned int, long unsigned int, unsigned int)'} from incompatible pointer type 'int (*)(u32, long unsigned int)' {aka 'int (*)(unsigned int, long unsigned int)'} [-Wincompatible-pointer-types] 228 | .wakeup_secondary_cpu = numachip_wakeup_secondary, | ^~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: df21bf37518b ("arch/x86: Provide the CPU number in the wakeup AP callback") Signed-off-by: Isaku Yamahata --- arch/x86/kernel/apic/apic_numachip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 16410f087b7af..333536b89bde3 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,7 +56,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val) numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); } -static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip) +static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu) { numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | From e52cdcdb91bcf96f2f8f1ffe8a2dfb7ff7d21205 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 15 Oct 2025 16:29:02 -0700 Subject: [PATCH 09/12] mshv_vtl/tdx: Fix arm64 build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ifdef and define to fix arm64 build. 
drivers/hv/mshv_vtl_main.c: In function ‘mshv_tdx_setup_halt_timer’: drivers/hv/mshv_vtl_main.c:1163:15: error: implicit declaration of function ‘rdtsc’ [-Werror=implicit-function-declaration] 1163 | now = rdtsc(); | ^~~~~ drivers/hv/mshv_vtl_main.c:1170:73: error: ‘tsc_khz’ undeclared (first use in this function) 1170 | time = mul_u64_u64_div_u64(deadline - now, 1000 * 1000, tsc_khz); | ^~~~~~~ drivers/hv/mshv_vtl_main.c:1170:73: note: each undeclared identifier is reported only once for each function it appears in drivers/hv/mshv_vtl_main.c: In function ‘mshv_vtl_switch_to_vtl0_irqoff’: drivers/hv/mshv_vtl_main.c:1242:49: error: ‘MSHV_VTL_RUN_FLAG_HALTED’ undeclared (first use in this function) 1242 | armed = mshv_tdx_halt_timer_pre(flags & MSHV_VTL_RUN_FLAG_HALTED); | ^~~~~~~~~~~~~~~~~~~~~~~~ cc1: some warnings being treated as errors Fixes: fc1601f6e4db ("mshv_vtl/tdx: Arm per-cpu timer to wake up from L0 HLT emulation") Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 48 ++++++++++++++------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index 765f1aa5eaecf..d6ce411b940b4 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -230,13 +230,8 @@ static struct hrtimer *tdx_this_halt_timer(void) { return this_cpu_ptr(&mshv_tdx_halt_timer); } -#else -static struct hrtimer *tdx_this_halt_timer(void) -{ - return NULL; -} -#endif static void mshv_tdx_init_halt_timer(void); +#endif noinline void mshv_vtl_return_tdx(void); struct mshv_vtl_run *mshv_vtl_this_run(void); @@ -747,8 +742,8 @@ static int mshv_vtl_alloc_context(unsigned int cpu) mshv_vtl_set_tsc_deadline(vm_idx, TDVPS_TSC_DEADLINE_DISARMED); per_cpu->l2_hlt_tsc_deadline = TDVPS_TSC_DEADLINE_DISARMED; -#endif mshv_tdx_init_halt_timer(); +#endif } else if (hv_isolation_type_snp()) { #ifdef CONFIG_X86_64 int ret; @@ -1085,6 +1080,12 @@ static bool mshv_vtl_process_intercept(void) return 
false; } +enum TDX_HALT_TIMER { + TIMER_ARMED, + TIMER_NOTARMED, +}; + +#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) /* * The purpose is to get interrupt on this vCPU to wake up from * L0 VMM HLT emulation. @@ -1108,18 +1109,10 @@ static void mshv_tdx_init_halt_timer(void) { struct hrtimer *timer = tdx_this_halt_timer(); - if (!timer) - return; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); timer->function = mshv_tdx_timer_fn; } -enum TDX_HALT_TIMER { - TIMER_ARMED, - TIMER_NOTARMED, -}; - /* * The L1 VMM needs to tell wake up time from HLT emulation because the host * (L0) VMM doesn't have access to TDVPS_TSC_DEADLINE with the production TDX @@ -1128,17 +1121,11 @@ enum TDX_HALT_TIMER { */ static enum TDX_HALT_TIMER mshv_tdx_setup_halt_timer(void) { -#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) struct tdx_vp_context *context = &mshv_vtl_this_run()->tdx_context; -#endif u64 now, deadline = TDVPS_TSC_DEADLINE_DISARMED; struct hrtimer *timer = tdx_this_halt_timer(); ktime_t time; - if (!timer) - return TIMER_NOTARMED; - -#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) /* Get the timeout value to wake up from HLT. 
*/ if (context->l2_tsc_deadline.update & MSHV_VTL_TDX_L2_DEADLINE_UPDATE) deadline = tsc_deadline_to_tdvps(context->l2_tsc_deadline.deadline); @@ -1155,7 +1142,6 @@ static enum TDX_HALT_TIMER mshv_tdx_setup_halt_timer(void) !per_cpu->l2_tsc_deadline_expired[vm_idx - 1]) deadline = per_cpu->l2_tsc_deadline_prev[vm_idx - 1]; } -#endif if (deadline == TDVPS_TSC_DEADLINE_DISARMED) return TIMER_NOTARMED; @@ -1173,9 +1159,7 @@ static enum TDX_HALT_TIMER mshv_tdx_setup_halt_timer(void) } hrtimer_start(timer, time, HRTIMER_MODE_REL_PINNED); -#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) this_cpu_ptr(&mshv_vtl_per_cpu)->l2_hlt_tsc_deadline = deadline; -#endif return TIMER_ARMED; } @@ -1192,22 +1176,17 @@ static enum TDX_HALT_TIMER mshv_tdx_halt_timer_pre(bool try_arm) static void mshv_tdx_halt_timer_post(enum TDX_HALT_TIMER armed) { -#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) struct mshv_vtl_per_cpu *per_cpu; struct tdx_vp_context *context; -#endif struct hrtimer *timer; if (armed != TIMER_ARMED) return; timer = tdx_this_halt_timer(); - if (!timer) - return; hrtimer_cancel(timer); -#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); if (per_cpu->l2_hlt_tsc_deadline > rdtsc()) return; @@ -1222,8 +1201,14 @@ static void mshv_tdx_halt_timer_post(enum TDX_HALT_TIMER armed) mshv_tdx_tsc_deadline_expired(context); context->l2_tsc_deadline.update &= ~MSHV_VTL_TDX_L2_DEADLINE_UPDATE; -#endif } +#else +static enum TDX_HALT_TIMER mshv_tdx_halt_timer_pre(bool try_arm) +{ + return TIMER_NOTARMED; +} +static void mshv_tdx_halt_timer_post(enum TDX_HALT_TIMER armed) {} +#endif static bool in_idle_is_enabled; DEFINE_PER_CPU(struct task_struct *, mshv_vtl_thread); @@ -1239,6 +1224,9 @@ static void mshv_vtl_switch_to_vtl0_irqoff(void) trace_mshv_vtl_enter_vtl0_rcuidle(cpu_ctx); +#ifndef MSHV_VTL_RUN_FLAG_HALTED +# define MSHV_VTL_RUN_FLAG_HALTED 0ULL +#endif armed = mshv_tdx_halt_timer_pre(flags & 
MSHV_VTL_RUN_FLAG_HALTED); /* A VTL2 TDX kernel doesn't allocate hv_vp_assist_page at the moment */ From 9519a2a1e329a47256223ea12c39d40ea28b6d8c Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 10 Dec 2025 14:28:49 -0800 Subject: [PATCH 10/12] mshv_vtl/tdx: Fix compile error, parameter name omitted In case of !CONFIG_INTEL_TDX_GUEST, gcc complains as "parameter name omitted". Add missing parameter names. Fixes: 151bf9813836 ("mshv_vtl/tdx: Handle some APIC functionality in kernel") Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index d6ce411b940b4..d3dd7803f32da 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -1306,8 +1306,8 @@ static void mshv_vtl_idle(void) */ #ifndef CONFIG_INTEL_TDX_GUEST static void mshv_tdx_free_apicid_to_cpuid_mapping(void) {} -static int mshv_tdx_create_apicid_to_cpuid_mapping(struct device *) { return 0; } -static bool mshv_tdx_try_handle_exit(struct mshv_vtl_run *) { return false; } +static int mshv_tdx_create_apicid_to_cpuid_mapping(struct device *dev) { return 0; } +static bool mshv_tdx_try_handle_exit(struct mshv_vtl_run *run) { return false; } #else static void mshv_tdx_free_apicid_to_cpuid_mapping(void) { From 5da04eb28faad442031b89ee2fd4e7849221afaa Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 10 Dec 2025 16:32:32 -0800 Subject: [PATCH 11/12] mshv_vtl/tdx: Move up one ifdef CONFIG_INTEL_TDX_GUEST Move up one of defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) for consistency. 
Fixes: 1bf07066d93b ("mshv_vtl/tdx: Set TSC deadline if userspace requests") Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index d3dd7803f32da..2b79b77fa9ef5 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -152,6 +152,7 @@ union hv_register_vsm_page_offsets { u64 as_uint64; } __packed; +#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) #define MSHV_VTL_NUM_L2_VM 3 #define TDVPS_TSC_DEADLINE_DISARMED (~0ULL) @@ -163,7 +164,6 @@ union hv_register_vsm_page_offsets { (((entry_rcx) & TDG_VP_ENTRY_VM_MASK) >> \ TDG_VP_ENTRY_VM_SHIFT) -#if defined(CONFIG_X86_64) && defined(CONFIG_INTEL_TDX_GUEST) /* index: 0: L1 VM, 1-3: L2 VM */ static bool is_tdx_vm_idx_valid(u64 vm_idx) { From 1047cf13c95df46bd4a2ca1547e596653b76e56f Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 11 Dec 2025 10:17:43 -0800 Subject: [PATCH 12/12] drivers: hv: mshv_vtl: 0 for MSHV_CAP_LOWER_VTL_TIMER_VIRT if unsupported Returning -EOPNOTSUPP is a fatal error. It means that the driver doesn't know the feature, not that the feature is unsupported due to a runtime platform reason. Return 0 for MSHV_CAP_LOWER_VTL_TIMER_VIRT on non-TDX platform, which is safer. Fixes: 3528fd7a1743 ("drivers: hv: mshv_vtl: Advertise TDX timer service extension") Signed-off-by: Isaku Yamahata --- drivers/hv/mshv_vtl_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index 2b79b77fa9ef5..92c5d1c37fdd7 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -332,7 +332,7 @@ static long __mshv_vtl_ioctl_check_extension(u32 arg) case MSHV_CAP_LOWER_VTL_TIMER_VIRT: if (hv_isolation_type_tdx()) return mshv_tdx_vtl_ioctl_check_extension(arg); - break; + return 0; } return -EOPNOTSUPP;