The patch below does not apply to the 6.1-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to stable@vger.kernel.org.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to 'stable@vger.kernel.org' --in-reply-to '2025081215-variable-implicit-aa4c@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Tue, 10 Jun 2025 16:20:09 -0700
Subject: [PATCH] KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs
Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state into the guest, and without needing to copy+paste the FREEZE_IN_SMM logic into every patch that accesses GUEST_IA32_DEBUGCTL.
No functional change intended.
Cc: stable@vger.kernel.org
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
[sean: massage changelog, make inline, use in all prepare_vmcs02() cases]
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1b8b0642fc2d..ef20184b8b11 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2663,11 +2663,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (vmx->nested.nested_run_pending &&
 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
 		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
-					  vmx_get_supported_debugctl(vcpu, false));
+		vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
+					       vmx_get_supported_debugctl(vcpu, false));
 	} else {
 		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
-		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+		vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
 	}
 	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -3532,7 +3532,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,

 	if (!vmx->nested.nested_run_pending ||
 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
-		vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+		vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
 	if (kvm_mpx_supported() &&
 	    (!vmx->nested.nested_run_pending ||
 	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -4806,7 +4806,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);

 	kvm_set_dr(vcpu, 7, 0x400);
-	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+	vmx_guest_debugctl_write(vcpu, 0);

 	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
 				vmcs12->vm_exit_msr_load_count))
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bbf4509f32d0..0b173602821b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -653,11 +653,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
  */
 static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
 {
-	u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	u64 data = vmx_guest_debugctl_read();

 	if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
 		data &= ~DEBUGCTLMSR_LBR;
-		vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+		vmx_guest_debugctl_write(vcpu, data);
 	}
 }

@@ -730,7 +730,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)

 	if (!lbr_desc->event) {
 		vmx_disable_lbr_msrs_passthrough(vcpu);
-		if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+		if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
 			goto warn;
 		if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
 			goto warn;
@@ -752,7 +752,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)

 static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
 {
-	if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+	if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
 		intel_pmu_release_guest_lbr_event(vcpu);
 }

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6a8b78e954cd..a77d325fe78b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2149,7 +2149,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
-		msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+		msr_info->data = vmx_guest_debugctl_read();
 		break;
 	default:
 	find_uret_msr:
@@ -2283,7 +2283,8 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			    VM_EXIT_SAVE_DEBUG_CONTROLS)
 				get_vmcs12(vcpu)->guest_ia32_debugctl = data;

-		vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+		vmx_guest_debugctl_write(vcpu, data);
+
 		if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
 		    (data & DEBUGCTLMSR_LBR))
 			intel_pmu_create_guest_lbr_event(vcpu);
@@ -4798,7 +4799,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 	vmcs_write32(GUEST_SYSENTER_CS, 0);
 	vmcs_writel(GUEST_SYSENTER_ESP, 0);
 	vmcs_writel(GUEST_SYSENTER_EIP, 0);
-	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+	vmx_guest_debugctl_write(&vmx->vcpu, 0);

 	if (cpu_has_vmx_tpr_shadow()) {
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 392e66c7e5fe..c20a4185d10a 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -417,6 +417,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
 u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
 bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);

+static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+{
+	vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+}
+
+static inline u64 vmx_guest_debugctl_read(void)
+{
+	return vmcs_read64(GUEST_IA32_DEBUGCTL);
+}
+
 /*
  * Note, early Intel manuals have the write-low and read-high bitmap offsets
  * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
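The "stuffing FREEZE_IN_SMM ... based on the host setting" that the changelog alludes to is not part of the patch above. The following standalone userspace model (plain C, not KVM code) only sketches the intended semantics under that assumption: the host-owned bit is folded into the physical VMCS field on write and hidden again on read, so the guest-visible value never changes. The host_debugctl parameter and the single global standing in for the VMCS field are illustrative assumptions, not KVM names.

/*
 * Standalone model (not KVM code) of the behavior the changelog describes.
 * Bit positions match IA32_DEBUGCTL in the SDM.
 */
#include <stdio.h>
#include <stdint.h>

#define DEBUGCTLMSR_LBR            (1ULL << 0)
#define DEBUGCTLMSR_FREEZE_IN_SMM  (1ULL << 14)

static uint64_t vmcs_guest_debugctl;	/* stands in for vmcs.GUEST_IA32_DEBUGCTL */

static void guest_debugctl_write(uint64_t host_debugctl, uint64_t val)
{
	/* fold in the host bit without bleeding it into the guest-owned value */
	vmcs_guest_debugctl = val | (host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM);
}

static uint64_t guest_debugctl_read(uint64_t host_debugctl)
{
	/* hide the bit again if it was supplied by the host */
	return vmcs_guest_debugctl & ~(host_debugctl & DEBUGCTLMSR_FREEZE_IN_SMM);
}

int main(void)
{
	uint64_t host = DEBUGCTLMSR_FREEZE_IN_SMM;	/* host runs with FREEZE_IN_SMM set */

	guest_debugctl_write(host, DEBUGCTLMSR_LBR);	/* guest writes only LBR */
	printf("vmcs value: %#llx\n", (unsigned long long)vmcs_guest_debugctl);
	printf("guest sees: %#llx\n", (unsigned long long)guest_debugctl_read(host));
	return 0;
}

Compiled with any C99 compiler, this prints 0x4001 for the physical field and 0x1 for what the guest reads back, which is the separation the wrappers are meant to enable.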
From: Sean Christopherson <seanjc@google.com>
[ Upstream commit e76ae52747a82a548742107b4100e90da41a624d ]
Add helpers to print unimplemented MSR accesses and condition all such prints on report_ignored_msrs, i.e. honor userspace's request to not print unimplemented MSRs. Even though vcpu_unimpl() is ratelimited, printing can still be problematic, e.g. if a print gets stalled when host userspace is writing MSRs during live migration, an effective stall can result in very noticeable disruption in the guest.
E.g. the profile below was taken while calling KVM_SET_MSRS on the PMU counters while the PMU was disabled in KVM.
  -   99.75%     0.00%  [.] __ioctl
     - __ioctl
        - 99.74% entry_SYSCALL_64_after_hwframe
             do_syscall_64
             sys_ioctl
           - do_vfs_ioctl
              - 92.48% kvm_vcpu_ioctl
                 - kvm_arch_vcpu_ioctl
                    - 85.12% kvm_set_msr_ignored_check
                         svm_set_msr
                         kvm_set_msr_common
                         printk
                         vprintk_func
                         vprintk_default
                         vprintk_emit
                         console_unlock
                         call_console_drivers
                         univ8250_console_write
                         serial8250_console_write
                         uart_console_write
Reported-by: Aaron Lewis <aaronlewis@google.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20230124234905.3774678-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/hyperv.c  | 10 ++++------
 arch/x86/kvm/svm/svm.c |  5 ++---
 arch/x86/kvm/vmx/vmx.c |  4 +---
 arch/x86/kvm/x86.c     | 18 +++++-------------
 arch/x86/kvm/x86.h     | 12 ++++++++++++
 5 files changed, 24 insertions(+), 25 deletions(-)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 28555bbd52e8..cb0a531e13c5 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1406,8 +1406,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
 		return syndbg_set_msr(vcpu, msr, data, host);
 	default:
-		vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
-			    msr, data);
+		kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 		return 1;
 	}
 	return 0;
@@ -1528,8 +1527,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 			return 1;
 		break;
 	default:
-		vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
-			    msr, data);
+		kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 		return 1;
 	}

@@ -1581,7 +1579,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
 	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
 		return syndbg_get_msr(vcpu, msr, pdata, host);
 	default:
-		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+		kvm_pr_unimpl_rdmsr(vcpu, msr);
 		return 1;
 	}

@@ -1646,7 +1644,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
 		data = APIC_BUS_FREQUENCY;
 		break;
 	default:
-		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+		kvm_pr_unimpl_rdmsr(vcpu, msr);
 		return 1;
 	}
 	*pdata = data;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b6bbd0dc4e65..8fbe071334d4 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3035,8 +3035,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		if (!lbrv) {
-			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
-				    __func__, data);
+			kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
 			break;
 		}

@@ -3077,7 +3076,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	case MSR_VM_CR:
 		return svm_set_vm_cr(vcpu, data);
 	case MSR_VM_IGNNE:
-		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+		kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
 		break;
 	case MSR_AMD64_DE_CFG: {
 		struct kvm_msr_entry msr_entry;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fbe26b88f731..dac7883acdd2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2144,9 +2144,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)

 	invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
 	if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
-		if (report_ignored_msrs)
-			vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
-				    __func__, data);
+		kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
 		data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
 		invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a6dc8f662fa4..fcca0c030bcd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3573,7 +3573,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu)

 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
-	bool pr = false;
 	u32 msr = msr_info->index;
 	u64 data = msr_info->data;

@@ -3625,15 +3624,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (data == BIT_ULL(18)) {
 			vcpu->arch.msr_hwcr = data;
 		} else if (data != 0) {
-			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
-				    data);
+			kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 			return 1;
 		}
 		break;
 	case MSR_FAM10H_MMIO_CONF_BASE:
 		if (data != 0) {
-			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
-				    "0x%llx\n", data);
+			kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 			return 1;
 		}
 		break;
@@ -3813,16 +3810,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)

 	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
 	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
-		pr = true;
-		fallthrough;
 	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
 	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
 		if (kvm_pmu_is_valid_msr(vcpu, msr))
 			return kvm_pmu_set_msr(vcpu, msr_info);

-		if (pr || data != 0)
-			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
-				    "0x%x data 0x%llx\n", msr, data);
+		if (data)
+			kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 		break;
 	case MSR_K7_CLK_CTL:
 		/*
@@ -3849,9 +3843,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		/* Drop writes to this legacy MSR -- see rdmsr
 		 * counterpart for further detail.
 		 */
-		if (report_ignored_msrs)
-			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
-				    msr, data);
+		kvm_pr_unimpl_wrmsr(vcpu, msr, data);
 		break;
 	case MSR_AMD64_OSVW_ID_LENGTH:
 		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9de72586f406..f3554bf05201 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -331,6 +331,18 @@ extern bool report_ignored_msrs;

 extern bool eager_page_split;

+static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	if (report_ignored_msrs)
+		vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
+}
+
+static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	if (report_ignored_msrs)
+		vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
+}
+
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
 	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
From: Sean Christopherson <seanjc@google.com>
[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ]
Move VMX's logic to check DEBUGCTL values into a standalone helper so that the code can be used by nested VM-Enter to apply the same logic to the value being loaded from vmcs12.
KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested VM-Enter, as hardware may support features that KVM does not, i.e. relying on hardware to detect invalid guest state will result in false negatives. Unfortunately, that means applying KVM's funky suppression of BTF and LBR to vmcs12 so as not to break existing guests.
No functional change intended.
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/vmx/vmx.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index dac7883acdd2..164bbf7959be 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2071,6 +2071,19 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
 	return debugctl;
 }

+static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
+				  bool host_initiated)
+{
+	u64 invalid;
+
+	invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
+	if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+		kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+		invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+	}
+	return !invalid;
+}
+
 /*
  * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -2139,19 +2152,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		}
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
-	case MSR_IA32_DEBUGCTLMSR: {
-		u64 invalid;
-
-		invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
-		if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
-			kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
-			data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
-			invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
-		}
-
-		if (invalid)
+	case MSR_IA32_DEBUGCTLMSR:
+		if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
 			return 1;

+		data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
 		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
 						VM_EXIT_SAVE_DEBUG_CONTROLS)
 			get_vmcs12(vcpu)->guest_ia32_debugctl = data;
@@ -2161,7 +2167,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		    (data & DEBUGCTLMSR_LBR))
 			intel_pmu_create_guest_lbr_event(vcpu);
 		return 0;
-	}
 	case MSR_IA32_BNDCFGS:
 		if (!kvm_mpx_supported() ||
 		    (!msr_info->host_initiated &&
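To make the "funky suppression of BTF and LBR" concrete, here is a standalone userspace model (plain C, not KVM code) of the check-then-mask flow in the hunk above, under the assumption of a vCPU that supports neither LBR nor BTF: the write is treated as valid, but the unsupported bit is silently dropped rather than being reflected back to the guest.

/* Standalone model (not KVM code) of vmx_is_valid_debugctl() + masking. */
#include <stdio.h>
#include <stdint.h>

#define DEBUGCTLMSR_LBR  (1ULL << 0)
#define DEBUGCTLMSR_BTF  (1ULL << 1)

int main(void)
{
	uint64_t supported = 0;			/* assume neither LBR nor BTF is supported */
	uint64_t data = DEBUGCTLMSR_LBR;	/* guest writes DEBUGCTL.LBR */

	uint64_t invalid = data & ~supported;
	invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);	/* BTF/LBR are exempted */
	int valid = !invalid;					/* so the write is accepted... */

	data &= supported;			/* ...but the unsupported bit is dropped */
	printf("valid=%d, value programmed: %#llx\n",
	       valid, (unsigned long long)data);
	return 0;
}

With supported == 0 and data == DEBUGCTLMSR_LBR, the program prints valid=1 and a final value of 0, mirroring how vmx_set_msr() accepts the write yet never programs the bit into the VMCS.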
On Thu, Aug 14, 2025, Sasha Levin wrote:
From: Sean Christopherson <seanjc@google.com>
[ Upstream commit 8a4351ac302cd8c19729ba2636acfd0467c22ae8 ]
Move VMX's logic to check DEBUGCTL values into a standalone helper so that the code can be used by nested VM-Enter to apply the same logic to the value being loaded from vmcs12.
KVM needs to explicitly check vmcs12->guest_ia32_debugctl on nested VM-Enter, as hardware may support features that KVM does not, i.e. relying on hardware to detect invalid guest state will result in false negatives. Unfortunately, that means applying KVM's funky suppression of BTF and LBR to vmcs12 so as not to break existing guests.
No functional change intended.
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Link: https://lore.kernel.org/r/20250610232010.162191-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
Signed-off-by: Sasha Levin <sashal@kernel.org>
Acked-by: Sean Christopherson <seanjc@google.com>
From: Maxim Levitsky <mlevitsk@redhat.com>
[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ]
Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports a subset of hardware functionality, i.e. KVM can't rely on hardware to detect illegal/unsupported values. Failure to check the vmcs12 value would allow the guest to load any hardware-supported value while running L2.
Take care to exempt BTF and LBR from the validity check in order to match KVM's behavior for writes via WRMSR, but without clobbering vmcs12. Even if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR are being intercepted.
Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set *and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but that would incur non-trivial complexity and wouldn't change the fact that KVM's handling of DEBUGCTL is blatantly broken. I.e. the extra complexity is not worth carrying.
Cc: stable@vger.kernel.org
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/vmx/nested.c | 12 ++++++++++--
 arch/x86/kvm/vmx/vmx.c    |  5 ++---
 arch/x86/kvm/vmx/vmx.h    |  3 +++
 3 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8052f8b7d8e1..d5d675917662 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2532,7 +2532,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (vmx->nested.nested_run_pending &&
 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
 		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
+					  vmx_get_supported_debugctl(vcpu, false));
 	} else {
 		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
@@ -3022,7 +3023,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 		return -EINVAL;

 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
-	    CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
+	    (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
+	     CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
 		return -EINVAL;

 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
@@ -4374,6 +4376,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
 		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);

+	/*
+	 * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
+	 * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
+	 * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
+	 * vmcs02 doesn't strictly track vmcs12.
+	 */
 	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
 		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 164bbf7959be..ce4e9ebc5b2b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2056,7 +2056,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
 	return (unsigned long)data;
 }

-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
 {
 	u64 debugctl = 0;

@@ -2071,8 +2071,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
 	return debugctl;
 }

-static bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data,
-				  bool host_initiated)
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
 {
 	u64 invalid;

diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9e0bb98b116d..aa41d3e8fa06 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -445,6 +445,9 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,

 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);

+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
 /*
  * Note, early Intel manuals have the write-low and read-high bitmap offsets
  * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
On Thu, Aug 14, 2025, Sasha Levin wrote:
From: Maxim Levitsky <mlevitsk@redhat.com>
[ Upstream commit 095686e6fcb4150f0a55b1a25987fad3d8af58d6 ]
Add a consistency check for L2's guest_ia32_debugctl, as KVM only supports a subset of hardware functionality, i.e. KVM can't rely on hardware to detect illegal/unsupported values. Failure to check the vmcs12 value would allow the guest to load any hardware-supported value while running L2.
Take care to exempt BTF and LBR from the validity check in order to match KVM's behavior for writes via WRMSR, but without clobbering vmcs12. Even if VM_EXIT_SAVE_DEBUG_CONTROLS is set in vmcs12, L1 can reasonably expect that vmcs12->guest_ia32_debugctl will not be modified if writes to the MSR are being intercepted.
Arguably, KVM _should_ update vmcs12 if VM_EXIT_SAVE_DEBUG_CONTROLS is set *and* writes to MSR_IA32_DEBUGCTLMSR are not being intercepted by L1, but that would incur non-trivial complexity and wouldn't change the fact that KVM's handling of DEBUGCTL is blatantly broken. I.e. the extra complexity is not worth carrying.
Cc: stable@vger.kernel.org
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20250610232010.162191-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
Signed-off-by: Sasha Levin <sashal@kernel.org>
Acked-by: Sean Christopherson <seanjc@google.com>
Just in case you read this before my other N emails that say the exact same thing...
Needs to land after the RTM_DEBUG patch: https://lore.kernel.org/all/20250813184918.2071296-1-sashal@kernel.org
From: Maxim Levitsky <mlevitsk@redhat.com>
[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ]
Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state into the guest, and without needing to copy+paste the FREEZE_IN_SMM logic into every patch that accesses GUEST_IA32_DEBUGCTL.
No functional change intended.
Cc: stable@vger.kernel.org
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
[sean: massage changelog, make inline, use in all prepare_vmcs02() cases]
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kvm/vmx/nested.c    | 10 +++++-----
 arch/x86/kvm/vmx/pmu_intel.c |  8 ++++----
 arch/x86/kvm/vmx/vmx.c       |  8 +++++---
 arch/x86/kvm/vmx/vmx.h       | 10 ++++++++++
 4 files changed, 24 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d5d675917662..638e608799a3 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2532,11 +2532,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (vmx->nested.nested_run_pending &&
 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
 		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl &
-					  vmx_get_supported_debugctl(vcpu, false));
+		vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
+					       vmx_get_supported_debugctl(vcpu, false));
 	} else {
 		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
-		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+		vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
 	}
 	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -3404,7 +3404,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,

 	if (!vmx->nested.nested_run_pending ||
 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
-		vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+		vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
 	if (kvm_mpx_supported() &&
 	    (!vmx->nested.nested_run_pending ||
 	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -4572,7 +4572,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);

 	kvm_set_dr(vcpu, 7, 0x400);
-	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+	vmx_guest_debugctl_write(vcpu, 0);

 	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
 				vmcs12->vm_exit_msr_load_count))
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 220cdbe1e286..76d3ed8abf6a 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -672,11 +672,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
  */
 static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
 {
-	u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	u64 data = vmx_guest_debugctl_read();

 	if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
 		data &= ~DEBUGCTLMSR_LBR;
-		vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+		vmx_guest_debugctl_write(vcpu, data);
 	}
 }

@@ -746,7 +746,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)

 	if (!lbr_desc->event) {
 		vmx_disable_lbr_msrs_passthrough(vcpu);
-		if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+		if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
 			goto warn;
 		if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
 			goto warn;
@@ -769,7 +769,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)

 static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
 {
-	if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+	if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
 		intel_pmu_release_guest_lbr_event(vcpu);
 }

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ce4e9ebc5b2b..473dc350e9d6 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2031,7 +2031,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
-		msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+		msr_info->data = vmx_guest_debugctl_read();
 		break;
 	default:
 	find_uret_msr:
@@ -2161,7 +2161,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			    VM_EXIT_SAVE_DEBUG_CONTROLS)
 				get_vmcs12(vcpu)->guest_ia32_debugctl = data;

-		vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+		vmx_guest_debugctl_write(vcpu, data);
+
 		if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
 		    (data & DEBUGCTLMSR_LBR))
 			intel_pmu_create_guest_lbr_event(vcpu);
@@ -4751,7 +4752,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 	vmcs_write32(GUEST_SYSENTER_CS, 0);
 	vmcs_writel(GUEST_SYSENTER_ESP, 0);
 	vmcs_writel(GUEST_SYSENTER_EIP, 0);
-	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+	vmx_guest_debugctl_write(&vmx->vcpu, 0);

 	if (cpu_has_vmx_tpr_shadow()) {
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index aa41d3e8fa06..34f85a71c925 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -448,6 +448,16 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
 u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
 bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);

+static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
+{
+	vmcs_write64(GUEST_IA32_DEBUGCTL, val);
+}
+
+static inline u64 vmx_guest_debugctl_read(void)
+{
+	return vmcs_read64(GUEST_IA32_DEBUGCTL);
+}
+
 /*
  * Note, early Intel manuals have the write-low and read-high bitmap offsets
  * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and
On Thu, Aug 14, 2025, Sasha Levin wrote:
From: Maxim Levitsky <mlevitsk@redhat.com>
[ Upstream commit 7d0cce6cbe71af6e9c1831bff101a2b9c249c4a2 ]
Introduce vmx_guest_debugctl_{read,write}() to handle all accesses to vmcs.GUEST_IA32_DEBUGCTL. This will allow stuffing FREEZE_IN_SMM into GUEST_IA32_DEBUGCTL based on the host setting without bleeding the state into the guest, and without needing to copy+paste the FREEZE_IN_SMM logic into every patch that accesses GUEST_IA32_DEBUGCTL.
No functional change intended.
Cc: stable@vger.kernel.org
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
[sean: massage changelog, make inline, use in all prepare_vmcs02() cases]
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Link: https://lore.kernel.org/r/20250610232010.162191-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Acked-by: Sean Christopherson <seanjc@google.com>
On Thu, Aug 14, 2025, Sasha Levin wrote:
From: Sean Christopherson <seanjc@google.com>
[ Upstream commit e76ae52747a82a548742107b4100e90da41a624d ]
Add helpers to print unimplemented MSR accesses and condition all such prints on report_ignored_msrs, i.e. honor userspace's request to not print unimplemented MSRs. Even though vcpu_unimpl() is ratelimited, printing can still be problematic, e.g. if a print gets stalled when host userspace is writing MSRs during live migration, an effective stall can result in very noticeable disruption in the guest.
E.g. the profile below was taken while calling KVM_SET_MSRS on the PMU counters while the PMU was disabled in KVM.
  -   99.75%     0.00%  [.] __ioctl
     - __ioctl
        - 99.74% entry_SYSCALL_64_after_hwframe
             do_syscall_64
             sys_ioctl
           - do_vfs_ioctl
              - 92.48% kvm_vcpu_ioctl
                 - kvm_arch_vcpu_ioctl
                    - 85.12% kvm_set_msr_ignored_check
                         svm_set_msr
                         kvm_set_msr_common
                         printk
                         vprintk_func
                         vprintk_default
                         vprintk_emit
                         console_unlock
                         call_console_drivers
                         univ8250_console_write
                         serial8250_console_write
                         uart_console_write
Reported-by: Aaron Lewis <aaronlewis@google.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Link: https://lore.kernel.org/r/20230124234905.3774678-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Stable-dep-of: 7d0cce6cbe71 ("KVM: VMX: Wrap all accesses to IA32_DEBUGCTL with getter/setter APIs")
Signed-off-by: Sasha Levin <sashal@kernel.org>
Acked-by: Sean Christopherson <seanjc@google.com>