Ensure Intel PT tracing is disabled before VM-Entry in Intel PT Host/Guest mode.
Intel PT has 2 modes for tracing virtual machines. The default is System mode whereby host and guest output to the host trace buffer. The other is Host/Guest mode whereby host and guest output to their own buffers. Host/Guest mode is selected by kvm_intel module parameter pt_mode=1.
In Host/Guest mode, the following rule must be followed:
If the logical processor is operating with Intel PT enabled (if IA32_RTIT_CTL.TraceEn = 1) at the time of VM entry, the "load IA32_RTIT_CTL" VM-entry control must be 0.
However, "load IA32_RTIT_CTL" VM-entry control is always 1 in Host/Guest mode, so IA32_RTIT_CTL.TraceEn must always be 0 at VM entry, irrespective of whether guest IA32_RTIT_CTL.TraceEn is 1.
Fix by stopping host Intel PT tracing always at VM entry in Host/Guest mode. That also fixes the issue whereby the Intel PT NMI handler would set IA32_RTIT_CTL.TraceEn back to 1 after KVM has just set it to 0.
Fixes: 2ef444f1600b ("KVM: x86: Add Intel PT context switch for each vcpu") Cc: stable@vger.kernel.org Signed-off-by: Adrian Hunter adrian.hunter@intel.com --- arch/x86/events/intel/pt.c | 131 +++++++++++++++++++++++++++++++- arch/x86/events/intel/pt.h | 10 +++ arch/x86/include/asm/intel_pt.h | 4 + arch/x86/kvm/vmx/vmx.c | 23 ++---- arch/x86/kvm/vmx/vmx.h | 1 - 5 files changed, 147 insertions(+), 22 deletions(-)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index a087bc0c5498..d9469d2d6aa6 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -480,16 +480,20 @@ static u64 pt_config_filters(struct perf_event *event) */
/* avoid redundant msr writes */ - if (pt->filters.filter[range].msr_a != filter->msr_a) { + if (pt->filters.filter[range].msr_a != filter->msr_a || + pt->write_filter_msrs[range]) { wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a); pt->filters.filter[range].msr_a = filter->msr_a; }
- if (pt->filters.filter[range].msr_b != filter->msr_b) { + if (pt->filters.filter[range].msr_b != filter->msr_b || + pt->write_filter_msrs[range]) { wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b); pt->filters.filter[range].msr_b = filter->msr_b; }
+ pt->write_filter_msrs[range] = false; + rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off; }
@@ -534,6 +538,11 @@ static void pt_config(struct perf_event *event) reg |= (event->attr.config & PT_CONFIG_MASK);
event->hw.aux_config = reg; + + /* Configuration is complete, it is now OK to handle an NMI */ + barrier(); + WRITE_ONCE(pt->handle_nmi, 1); + pt_config_start(event); }
@@ -950,6 +959,7 @@ static void pt_handle_status(struct pt *pt) pt_buffer_advance(buf);
wrmsrl(MSR_IA32_RTIT_STATUS, status); + pt->status = status; }
/** @@ -1588,7 +1598,6 @@ static void pt_event_start(struct perf_event *event, int mode) goto fail_end_stop; }
- WRITE_ONCE(pt->handle_nmi, 1); hwc->state = 0;
pt_config_buffer(buf); @@ -1643,6 +1652,104 @@ static void pt_event_stop(struct perf_event *event, int mode) } }
+#define PT_VM_NO_TRANSITION 0 +#define PT_VM_ENTRY 1 +#define PT_VM_EXIT 2 + +void intel_pt_vm_entry(bool guest_trace_enable) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct perf_event *event; + + pt->restart_event = NULL; + pt->stashed_buf_sz = 0; + + WRITE_ONCE(pt->vm_transition, PT_VM_ENTRY); + barrier(); + + if (READ_ONCE(pt->handle_nmi)) { + /* Must stop handler before reading pt->handle.event */ + WRITE_ONCE(pt->handle_nmi, 0); + barrier(); + event = pt->handle.event; + if (event && !event->hw.state) { + struct pt_buffer *buf = perf_get_aux(&pt->handle); + + if (buf && buf->snapshot) + pt->stashed_buf_sz = buf->nr_pages << PAGE_SHIFT; + pt->restart_event = event; + pt_event_stop(event, PERF_EF_UPDATE); + } + } + + /* + * If guest_trace_enable, MSRs need to be saved, but the values are + * either already cached or not needed: + * MSR_IA32_RTIT_CTL event->hw.aux_config + * MSR_IA32_RTIT_STATUS pt->status + * MSR_IA32_RTIT_CR3_MATCH not used + * MSR_IA32_RTIT_OUTPUT_BASE pt->output_base + * MSR_IA32_RTIT_OUTPUT_MASK pt->output_mask + * MSR_IA32_RTIT_ADDR... pt->filters + */ +} +EXPORT_SYMBOL_GPL(intel_pt_vm_entry); + +void intel_pt_vm_exit(bool guest_trace_enable) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + u64 base = pt->output_base; + u64 mask = pt->output_mask; + + WRITE_ONCE(pt->vm_transition, PT_VM_EXIT); + barrier(); + + /* + * If guest_trace_enable, MSRs need to be restored, but that is handled + * in different ways: + * MSR_IA32_RTIT_CTL written next start + * MSR_IA32_RTIT_STATUS restored below + * MSR_IA32_RTIT_CR3_MATCH not used + * MSR_IA32_RTIT_OUTPUT_BASE written next start or restored + * further below + * MSR_IA32_RTIT_OUTPUT_MASK written next start or restored + * further below + * MSR_IA32_RTIT_ADDR... flagged to be written when + * needed + */ + if (guest_trace_enable) { + wrmsrl(MSR_IA32_RTIT_STATUS, pt->status); + /* + * Force address filter MSR writes during reconfiguration, + * refer pt_config_filters(). + */ + for (int range = 0; range < PT_FILTERS_NUM; range++) + pt->write_filter_msrs[range] = true; + } + + if (pt->restart_event) { + if (guest_trace_enable) { + /* Invalidate to force buffer reconfiguration */ + pt->output_base = ~0ULL; + pt->output_mask = 0; + } + pt_event_start(pt->restart_event, 0); + pt->restart_event = NULL; + } + + /* If tracing wasn't started, restore buffer configuration */ + if (guest_trace_enable && !READ_ONCE(pt->handle_nmi)) { + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base); + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, mask); + pt->output_base = base; + pt->output_mask = mask; + } + + barrier(); + WRITE_ONCE(pt->vm_transition, PT_VM_NO_TRANSITION); +} +EXPORT_SYMBOL_GPL(intel_pt_vm_exit); + static long pt_event_snapshot_aux(struct perf_event *event, struct perf_output_handle *handle, unsigned long size) @@ -1651,6 +1758,24 @@ static long pt_event_snapshot_aux(struct perf_event *event, struct pt_buffer *buf = perf_get_aux(&pt->handle); unsigned long from = 0, to; long ret; + int tr; + + /* + * Special handling during VM transition. At VM-Entry stage, once + * tracing is stopped, as indicated by buf == NULL, snapshot using the + * saved head position. At VM-Exit do that also until tracing is + * reconfigured as indicated by handle_nmi. + */ + tr = READ_ONCE(pt->vm_transition); + if ((tr == PT_VM_ENTRY && !buf) || (tr == PT_VM_EXIT && !READ_ONCE(pt->handle_nmi))) { + if (WARN_ON_ONCE(!pt->stashed_buf_sz)) + return 0; + to = pt->handle.head; + if (to < size) + from = pt->stashed_buf_sz; + from += to - size; + return perf_output_copy_aux(&pt->handle, handle, from, to); + }
if (WARN_ON_ONCE(!buf)) return 0; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index a1b6c04b7f68..0428019b92f4 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -121,6 +121,11 @@ struct pt_filters { * @vmx_on: 1 if VMX is ON on this cpu * @output_base: cached RTIT_OUTPUT_BASE MSR value * @output_mask: cached RTIT_OUTPUT_MASK MSR value + * @status: cached RTIT_STATUS MSR value + * @vm_transition: VM transition (snapshot_aux needs special handling) + * @write_filter_msrs: write address filter MSRs during configuration + * @stashed_buf_sz: buffer size during VM transition + * @restart_event: event to restart after VM-Exit */ struct pt { struct perf_output_handle handle; @@ -129,6 +134,11 @@ struct pt { int vmx_on; u64 output_base; u64 output_mask; + u64 status; + int vm_transition; + bool write_filter_msrs[PT_FILTERS_NUM]; + unsigned long stashed_buf_sz; + struct perf_event *restart_event; };
#endif /* __INTEL_PT_H__ */ diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index c796e9bc98b6..a673ac3a825e 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -30,11 +30,15 @@ enum pt_capabilities { void cpu_emergency_stop_pt(void); extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap); extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap); +extern void intel_pt_vm_entry(bool guest_trace_enable); +extern void intel_pt_vm_exit(bool guest_trace_enable); extern int is_intel_pt_event(struct perf_event *event); #else static inline void cpu_emergency_stop_pt(void) {} static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; } static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; } +static inline void intel_pt_vm_entry(bool guest_trace_enable) {} +static inline void intel_pt_vm_exit(bool guest_trace_enable) {} static inline int is_intel_pt_event(struct perf_event *event) { return 0; } #endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index eaf4965ac6df..9998da4e774d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1220,16 +1220,10 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) if (vmx_pt_mode_is_system()) return;
- /* - * GUEST_IA32_RTIT_CTL is already set in the VMCS. - * Save host state before VM entry. - */ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); - if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - wrmsrl(MSR_IA32_RTIT_CTL, 0); - pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); + intel_pt_vm_entry(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); + + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); - } }
static void pt_guest_exit(struct vcpu_vmx *vmx) @@ -1237,17 +1231,10 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) if (vmx_pt_mode_is_system()) return;
- if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); - pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); - }
- /* - * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, - * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. - */ - if (vmx->pt_desc.host.ctl) - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + intel_pt_vm_exit(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); }
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2325f773a20b..24ac6f6dc0ca 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -63,7 +63,6 @@ struct pt_desc { u64 ctl_bitmask; u32 num_address_ranges; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; - struct pt_ctx host; struct pt_ctx guest; };