On Thu, Aug 21, 2025 at 06:30:34AM -0700, Chao Gao wrote:
The FPU support for CET virtualization has already been merged into 6.17-rc1. Building on that, this series introduces Intel CET virtualization support for KVM.
Changes in v13
- Add "arch" and "size" fields to the register ID used in
KVM_GET/SET_ONE_REG ioctls 2. Add a kselftest for KVM_GET/SET_ONE_REG ioctls 3. Advertise KVM_CAP_ONE_REG 4. Document how the emulation of SSP MSRs is flawed for 32-bit guests 5. Don't pass-thru MSR_IA32_INT_SSP_TAB and report it as unsupported for 32-bit guests 6. Refine changelog to clarify why CET MSRs are pass-thru'd. 7. Limit SHSTK to 64-bit kernels 8. Retain CET state if L1 doesn't set VM_EXIT_LOAD_CET_STATE 9. Rename a new functions for clarity
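For context on change 1, here is a minimal userspace sketch of how the new
self-describing register IDs are meant to be consumed. It is illustrative
only, not part of the series: it assumes a <linux/kvm.h> that carries the
new KVM_X86_REG_* macros from this series, and get_guest_ssp()/vcpu_fd are
hypothetical names for a helper operating on an already-open vCPU fd:

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /*
   * Read the synthetic GUEST_SSP register via KVM_GET_ONE_REG. The ID
   * encodes the arch (KVM_REG_X86), the size (KVM_REG_SIZE_U64), the
   * type (synthetic MSR) and the index (KVM_SYNTHETIC_GUEST_SSP == 0)
   * in a single __u64.
   */
  static int get_guest_ssp(int vcpu_fd, __u64 *ssp)
  {
          struct kvm_one_reg reg = {
                  .id   = KVM_X86_REG_SYNTHETIC_MSR(KVM_SYNTHETIC_GUEST_SSP),
                  .addr = (__u64)(unsigned long)ssp,
          };

          return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
  }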
Below is the diff between v12 and v13:
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a4870d9c9279..478d9b63a9db 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -411,15 +411,26 @@ struct kvm_xcrs {
         __u64 padding[16];
 };
 
-#define KVM_X86_REG_MSR         (1 << 2)
-#define KVM_X86_REG_SYNTHETIC   (1 << 3)
-
-struct kvm_x86_reg_id {
-        __u32 index;
-        __u8  type;
-        __u8  rsvd;
-        __u16 rsvd16;
-};
+#define KVM_X86_REG_TYPE_MSR            2
+#define KVM_X86_REG_TYPE_SYNTHETIC_MSR  3
+
+#define KVM_X86_REG_TYPE_SIZE(type)                                     \
+({                                                                      \
+        __u64 type_size = (__u64)type << 32;                            \
+                                                                        \
+        type_size |= type == KVM_X86_REG_TYPE_MSR ? KVM_REG_SIZE_U64 :  \
+                     type == KVM_X86_REG_TYPE_SYNTHETIC_MSR ? KVM_REG_SIZE_U64 :\
+                     0;                                                 \
+        type_size;                                                      \
+})
+
+#define KVM_X86_REG_ENCODE(type, index)                 \
+        (KVM_REG_X86 | KVM_X86_REG_TYPE_SIZE(type) | index)
+
+#define KVM_X86_REG_MSR(index)                          \
+        KVM_X86_REG_ENCODE(KVM_X86_REG_TYPE_MSR, index)
+#define KVM_X86_REG_SYNTHETIC_MSR(index)                \
+        KVM_X86_REG_ENCODE(KVM_X86_REG_TYPE_SYNTHETIC_MSR, index)
 
 /* KVM synthetic MSR index staring from 0 */
 #define KVM_SYNTHETIC_GUEST_SSP 0
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index d0c08aab3e3a..ee05b876c656 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -944,7 +944,7 @@ void kvm_set_cpu_caps(void)
                 VENDOR_F(WAITPKG),
                 F(SGX_LC),
                 F(BUS_LOCK_DETECT),
-                F(SHSTK),
+                X86_64_F(SHSTK),
         );
 
         /*
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 092c91af8f0c..d7e2fb30fc1a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -739,9 +739,6 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
         nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
                                          MSR_IA32_PL3_SSP, MSR_TYPE_RW);
 
-        nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
-                                         MSR_IA32_INT_SSP_TAB, MSR_TYPE_RW);
-
         kvm_vcpu_unmap(vcpu, &map);
 
         vmx->nested.force_msr_bitmap_recalc = false;
@@ -2542,8 +2539,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
                 }
         }
 
-static inline void cet_vmcs_fields_get(struct kvm_vcpu *vcpu, u64 *s_cet,
-                                       u64 *ssp, u64 *ssp_tbl)
+static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet,
+                                u64 *ssp, u64 *ssp_tbl)
 {
         if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
             guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
@@ -2555,8 +2552,8 @@ static inline void cet_vmcs_fields_get(struct kvm_vcpu *vcpu, u64 *s_cet,
         }
 }
 
-static inline void cet_vmcs_fields_set(struct kvm_vcpu *vcpu, u64 s_cet,
-                                       u64 ssp, u64 ssp_tbl)
+static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet,
+                                 u64 ssp, u64 ssp_tbl)
 {
         if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) ||
             guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
@@ -2685,8 +2682,8 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
                         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 
                 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
-                        cet_vmcs_fields_set(&vmx->vcpu, vmcs12->guest_s_cet,
-                                            vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);
+                        vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
+                                             vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);
 
                 set_cr4_guest_host_mask(vmx);
         }
@@ -2730,9 +2727,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
         if (!vmx->nested.nested_run_pending ||
             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
-                cet_vmcs_fields_set(vcpu, vmx->nested.pre_vmenter_s_cet,
-                                    vmx->nested.pre_vmenter_ssp,
-                                    vmx->nested.pre_vmenter_ssp_tbl);
+                vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
+                                     vmx->nested.pre_vmenter_ssp,
+                                     vmx->nested.pre_vmenter_ssp_tbl);
 
         if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -3106,7 +3103,7 @@ static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
 
 static bool is_valid_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, u64 ssp, u64 ssp_tbl)
 {
-        if (!is_cet_msr_valid(vcpu, s_cet) || !IS_ALIGNED(ssp, 4))
+        if (!kvm_is_valid_u_s_cet(vcpu, s_cet) || !IS_ALIGNED(ssp, 4))
                 return false;
 
         if (is_noncanonical_msr_address(ssp_tbl, vcpu))
@@ -3665,7 +3662,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 
         if (!vmx->nested.nested_run_pending ||
             !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
-                cet_vmcs_fields_get(vcpu, &vmx->nested.pre_vmenter_s_cet,
+                vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet,
                                     &vmx->nested.pre_vmenter_ssp,
                                     &vmx->nested.pre_vmenter_ssp_tbl);
 
@@ -4596,9 +4593,6 @@ static bool is_vmcs12_ext_field(unsigned long field)
         case GUEST_IDTR_BASE:
         case GUEST_PENDING_DBG_EXCEPTIONS:
         case GUEST_BNDCFGS:
-        case GUEST_S_CET:
-        case GUEST_SSP:
-        case GUEST_INTR_SSP_TABLE:
                 return true;
         default:
                 break;
@@ -4649,10 +4643,6 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
         vmcs12->guest_pending_dbg_exceptions =
                 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
-        cet_vmcs_fields_get(&vmx->vcpu, &vmcs12->guest_s_cet,
-                            &vmcs12->guest_ssp,
-                            &vmcs12->guest_ssp_tbl);
-
         vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
 }
 
@@ -4759,6 +4749,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
         if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
                 vmcs12->guest_ia32_efer = vcpu->arch.efer;
+
+        vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet,
+                            &vmcs12->guest_ssp,
+                            &vmcs12->guest_ssp_tbl);
 }
 
 /*
@@ -4884,9 +4878,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
                 vmcs_write64(GUEST_BNDCFGS, 0);
 
+        /*
+         * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set.
+         * otherwise CET state should be retained across VM-exit, i.e.,
+         * guest values should be propagated from vmcs12 to vmcs01.
+         */
         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE)
-                cet_vmcs_fields_set(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp,
-                                    vmcs12->host_ssp_tbl);
+                vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp,
+                                     vmcs12->host_ssp_tbl);
+        else
+                vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp,
+                                     vmcs12->guest_ssp_tbl);
 
         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
                 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 294a294f0d0d..989008f5307e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4102,7 +4102,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
 
 void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
 {
-        bool set;
+        bool intercept;
 
         if (!cpu_has_vmx_msr_bitmap())
                 return;
@@ -4150,21 +4150,20 @@ void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
                                           !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
 
         if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
-                set = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+                intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
 
-                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, set);
-                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, set);
-                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, set);
-                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, set);
-                vmx_set_intercept_for_msr(vcpu, MSR_IA32_INT_SSP_TAB, MSR_TYPE_RW, set);
+                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept);
+                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept);
+                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept);
+                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept);
         }
 
         if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) ||
             kvm_cpu_cap_has(X86_FEATURE_IBT)) {
-                set = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
-                      !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+                intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
+                            !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
 
-                vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, set);
-                vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, set);
+                vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept);
+                vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept);
         }
 
         /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f2b89190a200..9930678f5a3b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1892,16 +1892,33 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
                 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
                     !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT))
                         return KVM_MSR_RET_UNSUPPORTED;
-                if (!is_cet_msr_valid(vcpu, data))
+                if (!kvm_is_valid_u_s_cet(vcpu, data))
                         return 1;
                 break;
         case MSR_KVM_INTERNAL_GUEST_SSP:
                 if (!host_initiated)
                         return 1;
                 fallthrough;
+                /*
+                 * Note that the MSR emulation here is flawed when a vCPU
+                 * doesn't support the Intel 64 architecture. The expected
+                 * architectural behavior in this case is that the upper 32
+                 * bits do not exist and should always read '0'. However,
+                 * because the actual hardware on which the virtual CPU is
+                 * running does support Intel 64, XRSTORS/XSAVES in the
+                 * guest could observe behavior that violates the
+                 * architecture. Intercepting XRSTORS/XSAVES for this
+                 * special case isn't deemed worthwhile.
+                 */
         case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
                 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
                         return KVM_MSR_RET_UNSUPPORTED;
+                /*
+                 * MSR_IA32_INT_SSP_TAB is not present on processors that do
+                 * not support Intel 64 architecture.
+                 */
+                if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+                        return KVM_MSR_RET_UNSUPPORTED;
                 if (is_noncanonical_msr_address(data, vcpu))
                         return 1;
                 /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */
@@ -4852,6 +4869,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_IRQFD_RESAMPLE:
         case KVM_CAP_MEMORY_FAULT_INFO:
         case KVM_CAP_X86_GUEST_MODE:
+        case KVM_CAP_ONE_REG:
                 r = 1;
                 break;
         case KVM_CAP_PRE_FAULT_MEMORY:
@@ -6030,11 +6048,20 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                 }
         }
 
+struct kvm_x86_reg_id {
+        __u32 index;
+        __u8  type;
+        __u8  rsvd;
+        __u8  rsvd4:4;
+        __u8  size:4;
+        __u8  x86;
+};
+
 static int kvm_translate_synthetic_msr(struct kvm_x86_reg_id *reg)
 {
         switch (reg->index) {
         case KVM_SYNTHETIC_GUEST_SSP:
-                reg->type = KVM_X86_REG_MSR;
+                reg->type = KVM_X86_REG_TYPE_MSR;
                 reg->index = MSR_KVM_INTERNAL_GUEST_SSP;
                 break;
         default:
@@ -6170,22 +6197,28 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                         break;
 
                 r = -EINVAL;
+                if ((reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86)
+                        break;
+
                 id = (struct kvm_x86_reg_id *)&reg.id;
-                if (id->rsvd || id->rsvd16)
+                if (id->rsvd || id->rsvd4)
+                        break;
+
+                if (id->type != KVM_X86_REG_TYPE_MSR &&
+                    id->type != KVM_X86_REG_TYPE_SYNTHETIC_MSR)
                         break;
 
-                if (id->type != KVM_X86_REG_MSR &&
-                    id->type != KVM_X86_REG_SYNTHETIC)
+                if ((reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64)
                         break;
 
-                if (id->type == KVM_X86_REG_SYNTHETIC) {
+                if (id->type == KVM_X86_REG_TYPE_SYNTHETIC_MSR) {
                         r = kvm_translate_synthetic_msr(id);
                         if (r)
                                 break;
                 }
 
                 r = -EINVAL;
-                if (id->type != KVM_X86_REG_MSR)
+                if (id->type != KVM_X86_REG_TYPE_MSR)
                         break;
 
                 value = u64_to_user_ptr(reg.addr);
@@ -9862,7 +9895,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
         }
 
         if (boot_cpu_has(X86_FEATURE_SHSTK)) {
-                rdmsrl(MSR_IA32_S_CET, kvm_host.s_cet);
+                rdmsrq(MSR_IA32_S_CET, kvm_host.s_cet);
                 /*
                  * Linux doesn't yet support supervisor shadow stacks (SSS), so
                  * KVM doesn't save/restore the associated MSRs, i.e. KVM may
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d0b91e3ad9ec..d6b21ba41416 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -744,7 +744,7 @@ static inline void kvm_set_xstate_msr(struct kvm_vcpu *vcpu,
 #define CET_US_IBT_MASK_BITS    (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10))
 #define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12)
 
-static inline bool is_cet_msr_valid(struct kvm_vcpu *vcpu, u64 data)
+static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data)
 {
         if (data & CET_US_RESERVED_BITS)
                 return false;
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 6f3499507c5e..590762820a61 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -411,6 +411,30 @@ struct kvm_xcrs {
         __u64 padding[16];
 };
 
+#define KVM_X86_REG_TYPE_MSR            2
+#define KVM_X86_REG_TYPE_SYNTHETIC_MSR  3
+
+#define KVM_X86_REG_TYPE_SIZE(type)                                     \
+({                                                                      \
+        __u64 type_size = (__u64)type << 32;                            \
+                                                                        \
+        type_size |= type == KVM_X86_REG_TYPE_MSR ? KVM_REG_SIZE_U64 :  \
+                     type == KVM_X86_REG_TYPE_SYNTHETIC_MSR ? KVM_REG_SIZE_U64 :\
+                     0;                                                 \
+        type_size;                                                      \
+})
+
+#define KVM_X86_REG_ENCODE(type, index)                 \
+        (KVM_REG_X86 | KVM_X86_REG_TYPE_SIZE(type) | index)
+
+#define KVM_X86_REG_MSR(index)                          \
+        KVM_X86_REG_ENCODE(KVM_X86_REG_TYPE_MSR, index)
+#define KVM_X86_REG_SYNTHETIC_MSR(index)                \
+        KVM_X86_REG_ENCODE(KVM_X86_REG_TYPE_SYNTHETIC_MSR, index)
+
+/* KVM synthetic MSR index staring from 0 */
+#define KVM_SYNTHETIC_GUEST_SSP 0
+
 #define KVM_SYNC_X86_REGS       (1UL << 0)
 #define KVM_SYNC_X86_SREGS      (1UL << 1)
 #define KVM_SYNC_X86_EVENTS     (1UL << 2)
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index f6fe7a07a0a2..9a375d5faf1c 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -136,6 +136,7 @@ TEST_GEN_PROGS_x86 += x86/max_vcpuid_cap_test
 TEST_GEN_PROGS_x86 += x86/triple_fault_event_test
 TEST_GEN_PROGS_x86 += x86/recalc_apic_map_test
 TEST_GEN_PROGS_x86 += x86/aperfmperf_test
+TEST_GEN_PROGS_x86 += x86/get_set_one_reg
 TEST_GEN_PROGS_x86 += access_tracking_perf_test
 TEST_GEN_PROGS_x86 += coalesced_io_test
 TEST_GEN_PROGS_x86 += dirty_log_perf_test
diff --git a/tools/testing/selftests/kvm/x86/get_set_one_reg.c b/tools/testing/selftests/kvm/x86/get_set_one_reg.c
new file mode 100644
index 000000000000..8b069155ddc7
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/get_set_one_reg.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <fcntl.h>
+#include <stdint.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+int main(int argc, char *argv[])
+{
+        struct kvm_vcpu *vcpu;
+        struct kvm_vm *vm;
+        u64 data;
+        int r;
+
+        TEST_REQUIRE(kvm_has_cap(KVM_CAP_ONE_REG));
+
+        vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+        TEST_ASSERT_EQ(__vcpu_get_reg(vcpu, KVM_X86_REG_MSR(MSR_EFER), &data), 0);
+        TEST_ASSERT_EQ(__vcpu_set_reg(vcpu, KVM_X86_REG_MSR(MSR_EFER), data), 0);
+
+        if (kvm_cpu_has(X86_FEATURE_SHSTK)) {
+                r = __vcpu_get_reg(vcpu, KVM_X86_REG_SYNTHETIC_MSR(KVM_SYNTHETIC_GUEST_SSP),
+                                   &data);
+                TEST_ASSERT_EQ(r, 0);
+                r = __vcpu_set_reg(vcpu, KVM_X86_REG_SYNTHETIC_MSR(KVM_SYNTHETIC_GUEST_SSP),
+                                   data);
+                TEST_ASSERT_EQ(r, 0);
+        }
+
+        kvm_vm_free(vm);
+        return 0;
+}