This is a note to let you know that I've just added the patch titled
KVM: nVMX: Eliminate vmcs02 pool
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
KVM_nVMX_Eliminate_vmcs02_pool.patch
and it can be found in the queue-4.15 subdirectory.
If you or anyone else feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
Subject: KVM: nVMX: Eliminate vmcs02 pool
From: Jim Mattson <jmattson@google.com>
Date: Mon Nov 27 17:22:25 2017 -0600
From: Jim Mattson <jmattson@google.com>
commit de3a0021a60635de96aa92713c1a31a96747d72c
The potential performance advantages of a vmcs02 pool have never been
realized. To simplify the code, eliminate the pool. Instead, a single
vmcs02 is allocated per VCPU when the VCPU enters VMX operation.
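To make the new lifetime concrete, here is a minimal standalone model
(illustrative only, not the kernel code; names loosely mirror the patch):
the LRU pool keyed by vmcs12 address is replaced by a single vmcs02
embedded in the per-vCPU nested state, allocated when the vCPU enters VMX
operation and freed together with the rest of the nested state.

    /* Standalone model of the simplified vmcs02 lifetime (illustrative). */
    #include <stdio.h>
    #include <stdlib.h>

    struct nested_state {
        void *vmcs02;                  /* single per-vCPU vmcs02, no pool */
    };

    static int enter_vmx_operation(struct nested_state *n)
    {
        n->vmcs02 = calloc(1, 4096);   /* stands in for alloc_vmcs() */
        return n->vmcs02 ? 0 : -1;     /* -ENOMEM in the real code */
    }

    static void free_nested(struct nested_state *n)
    {
        free(n->vmcs02);               /* stands in for free_loaded_vmcs() */
        n->vmcs02 = NULL;
    }

    int main(void)
    {
        struct nested_state n = { 0 };

        if (enter_vmx_operation(&n))
            return 1;
        /* every nested VM-entry now reuses n.vmcs02 directly */
        free_nested(&n);
        printf("one vmcs02 per vCPU, no pool bookkeeping\n");
        return 0;
    }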
Cc: stable@vger.kernel.org # prereq for Spectre mitigation
Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
Reviewed-by: Ameya More <ameya.more@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/kvm/vmx.c | 146 ++++++++---------------------------------------------
1 file changed, 23 insertions(+), 123 deletions(-)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -185,7 +185,6 @@ module_param(ple_window_max, int, S_IRUG
extern const ulong vmx_return;
#define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
struct vmcs {
u32 revision_id;
@@ -226,7 +225,7 @@ struct shared_msr_entry {
* stored in guest memory specified by VMPTRLD, but is opaque to the guest,
* which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
* More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
* underlying hardware which will be used to run L2.
* This structure is packed to ensure that its layout is identical across
* machines (necessary for live migration).
@@ -409,13 +408,6 @@ struct __packed vmcs12 {
*/
#define VMCS12_SIZE 0x1000
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
- struct list_head list;
- gpa_t vmptr;
- struct loaded_vmcs vmcs02;
-};
-
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -440,15 +432,15 @@ struct nested_vmx {
*/
bool sync_shadow_vmcs;
- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
- struct list_head vmcs02_pool;
- int vmcs02_num;
bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+
+ struct loaded_vmcs vmcs02;
+
/*
- * Guest pages referred to in vmcs02 with host-physical pointers, so
- * we must keep them pinned while L2 runs.
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
struct page *virtual_apic_page;
@@ -6974,94 +6966,6 @@ static int handle_monitor(struct kvm_vcp
}
/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmx->nested.current_vmptr) {
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
- /* Recycle the least recently used VMCS. */
- item = list_last_entry(&vmx->nested.vmcs02_pool,
- struct vmcs02_list, list);
- item->vmptr = vmx->nested.current_vmptr;
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- /* Create a new VMCS */
- item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
- if (!item)
- return NULL;
- item->vmcs02.vmcs = alloc_vmcs();
- item->vmcs02.shadow_vmcs = NULL;
- if (!item->vmcs02.vmcs) {
- kfree(item);
- return NULL;
- }
- loaded_vmcs_init(&item->vmcs02);
- item->vmptr = vmx->nested.current_vmptr;
- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num++;
- return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmptr) {
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- return;
- }
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item, *n;
-
- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- /*
- * Something will leak if the above WARN triggers. Better than
- * a use-after-free.
- */
- if (vmx->loaded_vmcs == &item->vmcs02)
- continue;
-
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- }
-}
-
-/*
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
* set the success or error code of an emulated VMX instruction, as specified
* by Vol 2B, VMX Instruction Reference, "Conventions".
@@ -7242,6 +7146,12 @@ static int enter_vmx_operation(struct kv
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs;
+ vmx->nested.vmcs02.vmcs = alloc_vmcs();
+ vmx->nested.vmcs02.shadow_vmcs = NULL;
+ if (!vmx->nested.vmcs02.vmcs)
+ goto out_vmcs02;
+ loaded_vmcs_init(&vmx->nested.vmcs02);
+
if (cpu_has_vmx_msr_bitmap()) {
vmx->nested.msr_bitmap =
(unsigned long *)__get_free_page(GFP_KERNEL);
@@ -7264,9 +7174,6 @@ static int enter_vmx_operation(struct kv
vmx->vmcs01.shadow_vmcs = shadow_vmcs;
}
- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num = 0;
-
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7281,6 +7188,9 @@ out_cached_vmcs12:
free_page((unsigned long)vmx->nested.msr_bitmap);
out_msr_bitmap:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
return -ENOMEM;
}
@@ -7434,7 +7344,7 @@ static void free_nested(struct vcpu_vmx
vmx->vmcs01.shadow_vmcs = NULL;
}
kfree(vmx->nested.cached_vmcs12);
- /* Unpin physical memory we referred to in current vmcs02 */
+ /* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
@@ -7450,7 +7360,7 @@ static void free_nested(struct vcpu_vmx
vmx->nested.pi_desc = NULL;
}
- nested_free_all_saved_vmcss(vmx);
+ free_loaded_vmcs(&vmx->nested.vmcs02);
}
/* Emulate the VMXOFF instruction */
@@ -7493,8 +7403,6 @@ static int handle_vmclear(struct kvm_vcp
vmptr + offsetof(struct vmcs12, launch_state),
&zero, sizeof(zero));
- nested_free_vmcs02(vmx, vmptr);
-
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -8406,10 +8314,11 @@ static bool nested_vmx_exit_reflected(st
/*
* The host physical addresses of some pages of guest memory
- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
- * may write to these pages via their host physical address while
- * L2 is running, bypassing any address-translation-based dirty
- * tracking (e.g. EPT write protection).
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
*
* Mark them dirty on every exit from L2 to prevent them from
* getting out of sync with dirty tracking.
@@ -10903,20 +10812,15 @@ static int enter_vmx_non_root_mode(struc
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- struct loaded_vmcs *vmcs02;
u32 msr_entry_idx;
u32 exit_qual;
- vmcs02 = nested_get_current_vmcs02(vmx);
- if (!vmcs02)
- return -ENOMEM;
-
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
- vmx_switch_vmcs(vcpu, vmcs02);
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@@ -11534,10 +11438,6 @@ static void nested_vmx_vmexit(struct kvm
vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
- /* if no vmcs02 cache requested, remove the one we used */
- if (VMCS02_POOL_SIZE == 0)
- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
Patches currently in stable-queue which might be from jmattson@google.com are
queue-4.15/x86kvm_Update_spectre-v1_mitigation.patch
queue-4.15/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.15/KVM_nVMX_Eliminate_vmcs02_pool.patch
queue-4.15/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.15/KVM_VMX_make_MSR_bitmaps_per-VCPU.patch
queue-4.15/KVMx86_Update_the_reverse_cpuid_list_to_include_CPUID_7_EDX.patch
This is a note to let you know that I've just added the patch titled
KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
and it can be found in the queue-4.15 subdirectory.
If you or anyone else feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
Subject: KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Thu Feb 1 22:59:45 2018 +0100
From: KarimAllah Ahmed <karahmed@amazon.de>
commit d28b387fb74da95d69d2615732f50cceb38e9a4d
[ Based on a patch from Ashok Raj <ashok.raj@intel.com> ]
Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
guests that will only mitigate Spectre V2 through IBRS+IBPB and will not
be using a retpoline+IBPB based approach.
To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for
guests that do not actually use the MSR, only start saving and restoring
when a non-zero value is written to it.
No attempt is made to handle STIBP here, intentionally. Filtering STIBP
may be added in a future patch, which may require trapping all writes
if we don't want to pass it through directly to the guest.
[dwmw2: Clean up CPUID bits, save/restore manually, handle reset]
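As a rough sketch of the lazy scheme described above (a standalone model,
not the kernel implementation, which lives in vmx_set_msr() and
vmx_vcpu_run() in the diff below): the MSR stays intercepted until the
guest writes a non-zero value; after that the intercept is dropped and the
value is saved and restored manually around entry and exit.

    /* Standalone model of lazy SPEC_CTRL save/restore (illustrative). */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    static uint64_t hw_spec_ctrl;       /* stands in for the real MSR */

    struct vcpu {
        uint64_t spec_ctrl;             /* guest's last written value */
        bool intercepted;               /* is the MSR still trapped? */
    };

    static void guest_writes_spec_ctrl(struct vcpu *v, uint64_t data)
    {
        v->spec_ctrl = data;
        if (data)
            v->intercepted = false;     /* first non-zero write: pass through */
    }

    static void vcpu_run(struct vcpu *v)
    {
        if (v->spec_ctrl)
            hw_spec_ctrl = v->spec_ctrl;    /* wrmsr before entry */

        /* ... guest runs and may now write the MSR directly ... */

        if (!v->intercepted)
            v->spec_ctrl = hw_spec_ctrl;    /* rdmsr only if passed through */
        if (v->spec_ctrl)
            hw_spec_ctrl = 0;               /* the host does not use IBRS */
    }

    int main(void)
    {
        struct vcpu v = { .spec_ctrl = 0, .intercepted = true };

        vcpu_run(&v);                   /* no MSR traffic at all */
        guest_writes_spec_ctrl(&v, 1);  /* SPEC_CTRL_IBRS */
        vcpu_run(&v);                   /* now saved and restored */
        printf("spec_ctrl=%llu intercepted=%d\n",
               (unsigned long long)v.spec_ctrl, v.intercepted);
        return 0;
    }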
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
Cc: kvm@vger.kernel.org
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Asit Mallick <asit.k.mallick@intel.com>
Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ashok Raj <ashok.raj@intel.com>
Link: https://lkml.kernel.org/r/1517522386-18410-5-git-send-email-karahmed@amazon…
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/kvm/cpuid.c | 9 ++--
arch/x86/kvm/vmx.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 2
3 files changed, 110 insertions(+), 6 deletions(-)
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -367,7 +367,7 @@ static inline int __do_cpuid_ent(struct
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
- F(IBPB);
+ F(IBPB) | F(IBRS);
/* cpuid 0xC0000001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES);
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+ F(ARCH_CAPABILITIES);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -630,9 +631,11 @@ static inline int __do_cpuid_ent(struct
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
entry->edx = 0;
- /* IBPB isn't necessarily present in hardware cpuid */
+ /* IBRS and IBPB aren't necessarily present in hardware cpuid */
if (boot_cpu_has(X86_FEATURE_IBPB))
entry->ebx |= F(IBPB);
+ if (boot_cpu_has(X86_FEATURE_IBRS))
+ entry->ebx |= F(IBRS);
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
break;
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -595,6 +595,7 @@ struct vcpu_vmx {
#endif
u64 arch_capabilities;
+ u64 spec_ctrl;
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
@@ -1911,6 +1912,29 @@ static void update_exception_bitmap(stru
}
/*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+ unsigned long *msr_bitmap;
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return true;
+
+ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+
+ if (msr <= 0x1fff) {
+ return !!test_bit(msr, msr_bitmap + 0x800 / f);
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+ }
+
+ return true;
+}
+
+/*
* Check if MSR is intercepted for L01 MSR bitmap.
*/
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
@@ -3262,6 +3286,14 @@ static int vmx_get_msr(struct kvm_vcpu *
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ return 1;
+
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
+ break;
case MSR_IA32_ARCH_CAPABILITIES:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
@@ -3375,6 +3407,37 @@ static int vmx_set_msr(struct kvm_vcpu *
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ return 1;
+
+ vmx->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_merge_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging. We update the vmcs01 here for L1 as well
+ * since it will end up touching the MSR anyway now.
+ */
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_RW);
+ break;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
@@ -5700,6 +5763,7 @@ static void vmx_vcpu_reset(struct kvm_vc
u64 cr0;
vmx->rmode.vm86_active = 0;
+ vmx->spec_ctrl = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(vcpu, 0);
@@ -9371,6 +9435,15 @@ static void __noclone vmx_vcpu_run(struc
vmx_arm_hv_timer(vcpu);
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ if (vmx->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -9489,6 +9562,27 @@ static void __noclone vmx_vcpu_run(struc
#endif
);
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+
+ if (vmx->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
@@ -10113,7 +10207,7 @@ static inline bool nested_vmx_merge_msr_
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
/*
- * pred_cmd is trying to verify two things:
+ * pred_cmd & spec_ctrl are trying to verify two things:
*
* 1. L0 gave a permission to L1 to actually passthrough the MSR. This
* ensures that we do not accidentally generate an L02 MSR bitmap
@@ -10126,9 +10220,10 @@ static inline bool nested_vmx_merge_msr_
* the MSR.
*/
bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
+ bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
- !pred_cmd)
+ !pred_cmd && !spec_ctrl)
return false;
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
@@ -10162,6 +10257,12 @@ static inline bool nested_vmx_merge_msr_
}
}
+ if (spec_ctrl)
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_R | MSR_TYPE_W);
+
if (pred_cmd)
nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1009,7 +1009,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_ARCH_CAPABILITIES
+ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
};
static unsigned num_msrs_to_save;
Patches currently in stable-queue which might be from ashok.raj@intel.com are
queue-4.15/x86pti_Do_not_enable_PTI_on_CPUs_which_are_not_vulnerable_to_Meltdown.patch
queue-4.15/x86cpufeature_Blacklist_SPEC_CTRLPRED_CMD_on_early_Spectre_v2_microcodes.patch
queue-4.15/x86cpufeatures_Add_Intel_feature_bits_for_Speculation_Control.patch
queue-4.15/x86paravirt_Remove_noreplace-paravirt_cmdline_option.patch
queue-4.15/KVM_VMX_Make_indirect_call_speculation_safe.patch
queue-4.15/x86msr_Add_definitions_for_new_speculation_control_MSRs.patch
queue-4.15/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.15/x86cpufeatures_Add_CPUID_7_EDX_CPUID_leaf.patch
queue-4.15/x86cpufeatures_Add_AMD_feature_bits_for_Speculation_Control.patch
queue-4.15/KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.15/KVMx86_Add_IBPB_support.patch
queue-4.15/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.15/x86speculation_Add_basic_IBPB_(Indirect_Branch_Prediction_Barrier)_support.patch
queue-4.15/KVM_x86_Make_indirect_calls_in_emulator_speculation_safe.patch
This is a note to let you know that I've just added the patch titled
KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
and it can be found in the queue-4.15 subdirectory.
If you or anyone else feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
Subject: KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Thu Feb 1 22:59:44 2018 +0100
From: KarimAllah Ahmed <karahmed@amazon.de>
commit 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd
Intel processors use the MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO
(bit 0) and IBRS_ALL (bit 1). This is a read-only MSR. By default the
contents come directly from the hardware, but user-space can still
override them.
[dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional]
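A minimal sketch of the emulation model (illustrative only; the real code
is in vmx_get_msr()/vmx_set_msr() below): reads are gated on the guest's
CPUID bit, and only host-initiated writes may override the value that was
initially read from hardware.

    /* Standalone model of a read-only, host-overridable MSR (illustrative). */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct vcpu {
        uint64_t arch_capabilities;
        bool guest_has_arch_caps;       /* guest CPUID 7.0 EDX bit */
    };

    /* return 0 on success, 1 to inject #GP, mirroring the get/set style */
    static int get_arch_capabilities(struct vcpu *v, bool host_initiated,
                                     uint64_t *data)
    {
        if (!host_initiated && !v->guest_has_arch_caps)
            return 1;
        *data = v->arch_capabilities;
        return 0;
    }

    static int set_arch_capabilities(struct vcpu *v, bool host_initiated,
                                     uint64_t data)
    {
        if (!host_initiated)
            return 1;                   /* guest writes always fault */
        v->arch_capabilities = data;    /* user-space override */
        return 0;
    }

    int main(void)
    {
        /* default value would be rdmsr'd from hardware at vCPU setup */
        struct vcpu v = { .arch_capabilities = 0x3, .guest_has_arch_caps = true };
        uint64_t data = 0;
        int err;

        set_arch_capabilities(&v, true, 0x1);   /* host override succeeds */
        printf("guest write rejected: %d\n",
               set_arch_capabilities(&v, false, 0));
        err = get_arch_capabilities(&v, false, &data);
        printf("guest read: err=%d value=%llu\n", err,
               (unsigned long long)data);
        return 0;
    }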
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
Cc: kvm@vger.kernel.org
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Asit Mallick <asit.k.mallick@intel.com>
Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Link: https://lkml.kernel.org/r/1517522386-18410-4-git-send-email-karahmed@amazon…
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/kvm/cpuid.c | 2 +-
arch/x86/kvm/vmx.c | 15 +++++++++++++++
arch/x86/kvm/x86.c | 1 +
3 files changed, 17 insertions(+), 1 deletion(-)
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -394,7 +394,7 @@ static inline int __do_cpuid_ent(struct
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- F(AVX512_4VNNIW) | F(AVX512_4FMAPS);
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -594,6 +594,8 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
+ u64 arch_capabilities;
+
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
u32 secondary_exec_control;
@@ -3260,6 +3262,12 @@ static int vmx_get_msr(struct kvm_vcpu *
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = to_vmx(vcpu)->arch_capabilities;
+ break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -3395,6 +3403,11 @@ static int vmx_set_msr(struct kvm_vcpu *
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vmx->arch_capabilities = data;
+ break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -5657,6 +5670,8 @@ static void vmx_vcpu_setup(struct vcpu_v
++vmx->nmsrs;
}
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1009,6 +1009,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_ARCH_CAPABILITIES
};
static unsigned num_msrs_to_save;
Patches currently in stable-queue which might be from karahmed@amazon.de are
queue-4.15/x86spectre_Simplify_spectre_v2_command_line_parsing.patch
queue-4.15/x86pti_Do_not_enable_PTI_on_CPUs_which_are_not_vulnerable_to_Meltdown.patch
queue-4.15/x86speculation_Use_Indirect_Branch_Prediction_Barrier_in_context_switch.patch
queue-4.15/x86cpuid_Fix_up_virtual_IBRSIBPBSTIBP_feature_bits_on_Intel.patch
queue-4.15/x86cpufeature_Blacklist_SPEC_CTRLPRED_CMD_on_early_Spectre_v2_microcodes.patch
queue-4.15/x86cpufeatures_Add_Intel_feature_bits_for_Speculation_Control.patch
queue-4.15/x86msr_Add_definitions_for_new_speculation_control_MSRs.patch
queue-4.15/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.15/x86cpufeatures_Add_CPUID_7_EDX_CPUID_leaf.patch
queue-4.15/x86cpufeatures_Add_AMD_feature_bits_for_Speculation_Control.patch
queue-4.15/KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.15/KVMx86_Add_IBPB_support.patch
queue-4.15/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.15/x86speculation_Add_basic_IBPB_(Indirect_Branch_Prediction_Barrier)_support.patch
queue-4.15/x86speculation_Simplify_indirect_branch_prediction_barrier().patch
queue-4.15/x86retpoline_Simplify_vmexit_fill_RSB().patch
queue-4.15/x86cpufeatures_Clean_up_Spectre_v2_related_CPUID_flags.patch
queue-4.15/KVMx86_Update_the_reverse_cpuid_list_to_include_CPUID_7_EDX.patch
queue-4.15/x86retpoline_Avoid_retpolines_for_built-in___init_functions.patch
This is a note to let you know that I've just added the patch titled
KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
and it can be found in the queue-4.15 subdirectory.
If you or anyone else feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
Subject: KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Sat Feb 3 15:56:23 2018 +0100
From: KarimAllah Ahmed <karahmed@amazon.de>
commit b2ac58f90540e39324e7a29a7ad471407ae0bf48
[ Based on a patch from Paolo Bonzini <pbonzini@redhat.com> ]
... basically doing exactly what we do for VMX:
- Passthrough SPEC_CTRL to guests (if enabled in guest CPUID)
- Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest
actually used it.
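For reference, a standalone sketch of the MSR permission-map lookup that
the new msr_write_intercepted() below relies on (simplified: the real code
maps the architectural MSR ranges through svm_msrpm_offset(), and the
index used here is hypothetical): each MSR owns two adjacent bits in the
map, the odd bit controls write interception, and sixteen MSRs share each
32-bit word.

    /* Standalone model of the SVM MSR permission-map bit layout. */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define MSRPM_WORDS 2048            /* 8 KiB map viewed as u32 words */

    static bool write_intercepted(const uint32_t *msrpm, uint32_t idx)
    {
        uint32_t offset = idx / 16;                 /* 16 MSRs per u32 */
        uint32_t bit_write = 2 * (idx & 0x0f) + 1;  /* odd bit = write */

        return msrpm[offset] & (1u << bit_write);
    }

    static void set_write_intercept(uint32_t *msrpm, uint32_t idx, bool on)
    {
        uint32_t offset = idx / 16;
        uint32_t bit_write = 2 * (idx & 0x0f) + 1;

        if (on)
            msrpm[offset] |= 1u << bit_write;
        else
            msrpm[offset] &= ~(1u << bit_write);
    }

    int main(void)
    {
        static uint32_t msrpm[MSRPM_WORDS];
        uint32_t spec_ctrl_idx = 0x48;  /* hypothetical linear index */

        set_write_intercept(msrpm, spec_ctrl_idx, true);
        printf("intercepted: %d\n", write_intercepted(msrpm, spec_ctrl_idx));
        set_write_intercept(msrpm, spec_ctrl_idx, false);   /* passthrough */
        printf("intercepted: %d\n", write_intercepted(msrpm, spec_ctrl_idx));
        return 0;
    }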
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
Cc: kvm@vger.kernel.org
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Asit Mallick <asit.k.mallick@intel.com>
Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ashok Raj <ashok.raj@intel.com>
Link: https://lkml.kernel.org/r/1517669783-20732-1-git-send-email-karahmed@amazon…
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/kvm/svm.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 88 insertions(+)
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -184,6 +184,8 @@ struct vcpu_svm {
u64 gs_base;
} host;
+ u64 spec_ctrl;
+
u32 *msrpm;
ulong nmi_iret_rip;
@@ -249,6 +251,7 @@ static const struct svm_direct_access_ms
{ .index = MSR_CSTAR, .always = true },
{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = false },
{ .index = MSR_IA32_PRED_CMD, .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
@@ -882,6 +885,25 @@ static bool valid_msr_intercept(u32 inde
return false;
}
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
+{
+ u8 bit_write;
+ unsigned long tmp;
+ u32 offset;
+ u32 *msrpm;
+
+ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
+ to_svm(vcpu)->msrpm;
+
+ offset = svm_msrpm_offset(msr);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ return !!test_bit(bit_write, &tmp);
+}
+
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
@@ -1584,6 +1606,8 @@ static void svm_vcpu_reset(struct kvm_vc
u32 dummy;
u32 eax = 1;
+ svm->spec_ctrl = 0;
+
if (!init_event) {
svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
MSR_IA32_APICBASE_ENABLE;
@@ -3605,6 +3629,13 @@ static int svm_get_msr(struct kvm_vcpu *
case MSR_VM_CR:
msr_info->data = svm->nested.vm_cr_msr;
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+ return 1;
+
+ msr_info->data = svm->spec_ctrl;
+ break;
case MSR_IA32_UCODE_REV:
msr_info->data = 0x01000065;
break;
@@ -3696,6 +3727,33 @@ static int svm_set_msr(struct kvm_vcpu *
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ return 1;
+
+ svm->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_svm_vmrun_msrpm.
+ * We update the L1 MSR bit as well since it will end up
+ * touching the MSR anyway now.
+ */
+ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
@@ -4964,6 +5022,15 @@ static void svm_vcpu_run(struct kvm_vcpu
local_irq_enable();
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ if (svm->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
asm volatile (
"push %%" _ASM_BP "; \n\t"
"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
@@ -5056,6 +5123,27 @@ static void svm_vcpu_run(struct kvm_vcpu
#endif
);
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
+ if (svm->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
Patches currently in stable-queue which might be from pbonzini@redhat.com are
queue-4.15/x86pti_Do_not_enable_PTI_on_CPUs_which_are_not_vulnerable_to_Meltdown.patch
queue-4.15/x86kvm_Update_spectre-v1_mitigation.patch
queue-4.15/x86speculation_Use_Indirect_Branch_Prediction_Barrier_in_context_switch.patch
queue-4.15/KVM_VMX_introduce_alloc_loaded_vmcs.patch
queue-4.15/x86cpufeature_Blacklist_SPEC_CTRLPRED_CMD_on_early_Spectre_v2_microcodes.patch
queue-4.15/x86cpufeatures_Add_Intel_feature_bits_for_Speculation_Control.patch
queue-4.15/x86paravirt_Remove_noreplace-paravirt_cmdline_option.patch
queue-4.15/KVM_VMX_Make_indirect_call_speculation_safe.patch
queue-4.15/x86msr_Add_definitions_for_new_speculation_control_MSRs.patch
queue-4.15/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.15/x86cpufeatures_Add_CPUID_7_EDX_CPUID_leaf.patch
queue-4.15/KVM_nVMX_Eliminate_vmcs02_pool.patch
queue-4.15/x86cpufeatures_Add_AMD_feature_bits_for_Speculation_Control.patch
queue-4.15/KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.15/KVMx86_Add_IBPB_support.patch
queue-4.15/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.15/x86speculation_Add_basic_IBPB_(Indirect_Branch_Prediction_Barrier)_support.patch
queue-4.15/x86speculation_Simplify_indirect_branch_prediction_barrier().patch
queue-4.15/KVM_VMX_make_MSR_bitmaps_per-VCPU.patch
queue-4.15/KVM_x86_Make_indirect_calls_in_emulator_speculation_safe.patch
queue-4.15/x86retpoline_Simplify_vmexit_fill_RSB().patch
queue-4.15/x86cpufeatures_Clean_up_Spectre_v2_related_CPUID_flags.patch
queue-4.15/KVMx86_Update_the_reverse_cpuid_list_to_include_CPUID_7_EDX.patch
This is a note to let you know that I've just added the patch titled
Documentation: Document array_index_nospec
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
Documentation_Document_array_index_nospec.patch
and it can be found in the queue-4.15 subdirectory.
If you or anyone else feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
Subject: Documentation: Document array_index_nospec
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon Jan 29 17:02:16 2018 -0800
From: Mark Rutland <mark.rutland@arm.com>
commit f84a56f73dddaeac1dba8045b007f742f61cd2da
Document the rationale and usage of the new array_index_nospec() helper.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: linux-arch@vger.kernel.org
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: gregkh@linuxfoundation.org
Cc: kernel-hardening@lists.openwall.com
Cc: torvalds@linux-foundation.org
Cc: alan@linux.intel.com
Link: https://lkml.kernel.org/r/151727413645.33451.15878817161436755393.stgit@dwi…
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
Documentation/speculation.txt | 90 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 90 insertions(+)
--- /dev/null
+++ b/Documentation/speculation.txt
@@ -0,0 +1,90 @@
+This document explains potential effects of speculation, and how undesirable
+effects can be mitigated portably using common APIs.
+
+===========
+Speculation
+===========
+
+To improve performance and minimize average latencies, many contemporary CPUs
+employ speculative execution techniques such as branch prediction, performing
+work which may be discarded at a later stage.
+
+Typically speculative execution cannot be observed from architectural state,
+such as the contents of registers. However, in some cases it is possible to
+observe its impact on microarchitectural state, such as the presence or
+absence of data in caches. Such state may form side-channels which can be
+observed to extract secret information.
+
+For example, in the presence of branch prediction, it is possible for bounds
+checks to be ignored by code which is speculatively executed. Consider the
+following code:
+
+ int load_array(int *array, unsigned int index)
+ {
+ if (index >= MAX_ARRAY_ELEMS)
+ return 0;
+ else
+ return array[index];
+ }
+
+Which, on arm64, may be compiled to an assembly sequence such as:
+
+ CMP <index>, #MAX_ARRAY_ELEMS
+ B.LT less
+ MOV <returnval>, #0
+ RET
+ less:
+ LDR <returnval>, [<array>, <index>]
+ RET
+
+It is possible that a CPU mis-predicts the conditional branch, and
+speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This
+value will subsequently be discarded, but the speculated load may affect
+microarchitectural state which can be subsequently measured.
+
+More complex sequences involving multiple dependent memory accesses may
+result in sensitive information being leaked. Consider the following
+code, building on the prior example:
+
+ int load_dependent_arrays(int *arr1, int *arr2, int index)
+ {
+ int val1, val2,
+
+ val1 = load_array(arr1, index);
+ val2 = load_array(arr2, val1);
+
+ return val2;
+ }
+
+Under speculation, the first call to load_array() may return the value
+of an out-of-bounds address, while the second call will influence
+microarchitectural state dependent on this value. This may provide an
+arbitrary read primitive.
+
+====================================
+Mitigating speculation side-channels
+====================================
+
+The kernel provides a generic API to ensure that bounds checks are
+respected even under speculation. Architectures which are affected by
+speculation-based side-channels are expected to implement these
+primitives.
+
+The array_index_nospec() helper in <linux/nospec.h> can be used to
+prevent information from being leaked via side-channels.
+
+A call to array_index_nospec(index, size) returns a sanitized index
+value that is bounded to [0, size) even under cpu speculation
+conditions.
+
+This can be used to protect the earlier load_array() example:
+
+ int load_array(int *array, unsigned int index)
+ {
+ if (index >= MAX_ARRAY_ELEMS)
+ return 0;
+ else {
+ index = array_index_nospec(index, MAX_ARRAY_ELEMS);
+ return array[index];
+ }
+ }
Patches currently in stable-queue which might be from mark.rutland@arm.com are
queue-4.15/Documentation_Document_array_index_nospec.patch
This is a note to let you know that I've just added the patch titled
x86/uaccess: Use __uaccess_begin_nospec() and uaccess_try_nospec
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
x86uaccess_Use___uaccess_begin_nospec()_and_uaccess_try_nospec.patch
and it can be found in the queue-4.14 subdirectory.
If you or anyone else feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
Subject: x86/uaccess: Use __uaccess_begin_nospec() and uaccess_try_nospec
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon Jan 29 17:02:49 2018 -0800
From: Dan Williams <dan.j.williams@intel.com>
commit 304ec1b050310548db33063e567123fae8fd0301
Quoting Linus:
I do think that it would be a good idea to very expressly document
the fact that it's not that the user access itself is unsafe. I do
agree that things like "get_user()" want to be protected, but not
because of any direct bugs or problems with get_user() and friends,
but simply because get_user() is an excellent source of a pointer
that is obviously controlled from a potentially attacking user
space. So it's a prime candidate for then finding _subsequent_
accesses that can then be used to perturb the cache.
__uaccess_begin_nospec() covers __get_user() and copy_from_iter() where the
limit check is far away from the user pointer de-reference. In those cases
a barrier_nospec() prevents speculation with a potential pointer to
privileged memory. uaccess_try_nospec covers get_user_try.
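Schematically, the change moves a speculation barrier to the point where
user memory is about to be touched, since the permission check can sit far
away from the dereference. A standalone sketch (not kernel code;
barrier_nospec() is modeled as a plain compiler barrier here, whereas the
real x86 helper is a serializing fence):

    #include <stdio.h>

    #define barrier_nospec() __asm__ __volatile__("" ::: "memory")

    static char user_buf[16] = "hello";     /* stands in for user memory */

    static int get_user_model(char *dst, unsigned long offset)
    {
        if (offset >= sizeof(user_buf))
            return -1;                      /* the bounds/access check */

        /* ... possibly many instructions between check and access ... */

        barrier_nospec();                   /* speculation must not pass this */
        *dst = user_buf[offset];            /* the actual "user" access */
        return 0;
    }

    int main(void)
    {
        char c;

        if (!get_user_model(&c, 0))
            printf("read '%c'\n", c);
        printf("out-of-bounds rejected: %d\n", get_user_model(&c, 64));
        return 0;
    }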
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Suggested-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Kees Cook <keescook@chromium.org>
Cc: kernel-hardening@lists.openwall.com
Cc: gregkh@linuxfoundation.org
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: alan@linux.intel.com
Link: https://lkml.kernel.org/r/151727416953.33451.10508284228526170604.stgit@dwi…
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/include/asm/uaccess.h | 6 +++---
arch/x86/include/asm/uaccess_32.h | 6 +++---
arch/x86/include/asm/uaccess_64.h | 12 ++++++------
arch/x86/lib/usercopy_32.c | 4 ++--
4 files changed, 14 insertions(+), 14 deletions(-)
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -450,7 +450,7 @@ do { \
({ \
int __gu_err; \
__inttype(*(ptr)) __gu_val; \
- __uaccess_begin(); \
+ __uaccess_begin_nospec(); \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
__uaccess_end(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
@@ -557,7 +557,7 @@ struct __large_struct { unsigned long bu
* get_user_ex(...);
* } get_user_catch(err)
*/
-#define get_user_try uaccess_try
+#define get_user_try uaccess_try_nospec
#define get_user_catch(err) uaccess_catch(err)
#define get_user_ex(x, ptr) do { \
@@ -591,7 +591,7 @@ extern void __cmpxchg_wrong_size(void)
__typeof__(ptr) __uval = (uval); \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
- __uaccess_begin(); \
+ __uaccess_begin_nospec(); \
switch (size) { \
case 1: \
{ \
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -29,21 +29,21 @@ raw_copy_from_user(void *to, const void
switch (n) {
case 1:
ret = 0;
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u8 *)to, from, ret,
"b", "b", "=q", 1);
__uaccess_end();
return ret;
case 2:
ret = 0;
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u16 *)to, from, ret,
"w", "w", "=r", 2);
__uaccess_end();
return ret;
case 4:
ret = 0;
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u32 *)to, from, ret,
"l", "k", "=r", 4);
__uaccess_end();
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -55,31 +55,31 @@ raw_copy_from_user(void *dst, const void
return copy_user_generic(dst, (__force void *)src, size);
switch (size) {
case 1:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
ret, "b", "b", "=q", 1);
__uaccess_end();
return ret;
case 2:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
ret, "w", "w", "=r", 2);
__uaccess_end();
return ret;
case 4:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
ret, "l", "k", "=r", 4);
__uaccess_end();
return ret;
case 8:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 8);
__uaccess_end();
return ret;
case 10:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 10);
if (likely(!ret))
@@ -89,7 +89,7 @@ raw_copy_from_user(void *dst, const void
__uaccess_end();
return ret;
case 16:
- __uaccess_begin();
+ __uaccess_begin_nospec();
__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 16);
if (likely(!ret))
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -331,7 +331,7 @@ do { \
unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
{
- __uaccess_begin();
+ __uaccess_begin_nospec();
if (movsl_is_ok(to, from, n))
__copy_user(to, from, n);
else
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(__copy_user_ll);
unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
unsigned long n)
{
- __uaccess_begin();
+ __uaccess_begin_nospec();
#ifdef CONFIG_X86_INTEL_USERCOPY
if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
n = __copy_user_intel_nocache(to, from, n);
Patches currently in stable-queue which might be from torvalds@linux-foundation.org are
queue-4.14/objtool_Add_support_for_alternatives_at_the_end_of_a_section.patch
queue-4.14/x86pti_Do_not_enable_PTI_on_CPUs_which_are_not_vulnerable_to_Meltdown.patch
queue-4.14/x86_Introduce_barrier_nospec.patch
queue-4.14/x86speculation_Use_Indirect_Branch_Prediction_Barrier_in_context_switch.patch
queue-4.14/x86get_user_Use_pointer_masking_to_limit_speculation.patch
queue-4.14/x86_Introduce___uaccess_begin_nospec()_and_uaccess_try_nospec.patch
queue-4.14/x86cpufeature_Blacklist_SPEC_CTRLPRED_CMD_on_early_Spectre_v2_microcodes.patch
queue-4.14/x86cpufeatures_Add_Intel_feature_bits_for_Speculation_Control.patch
queue-4.14/x86paravirt_Remove_noreplace-paravirt_cmdline_option.patch
queue-4.14/KVM_VMX_Make_indirect_call_speculation_safe.patch
queue-4.14/x86msr_Add_definitions_for_new_speculation_control_MSRs.patch
queue-4.14/x86alternative_Print_unadorned_pointers.patch
queue-4.14/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/x86cpufeatures_Add_CPUID_7_EDX_CPUID_leaf.patch
queue-4.14/array_index_nospec_Sanitize_speculative_array_de-references.patch
queue-4.14/Documentation_Document_array_index_nospec.patch
queue-4.14/x86entry64_Remove_the_SYSCALL64_fast_path.patch
queue-4.14/x86retpoline_Remove_the_esprsp_thunk.patch
queue-4.14/x86bugs_Drop_one_mitigation_from_dmesg.patch
queue-4.14/x86cpufeatures_Add_AMD_feature_bits_for_Speculation_Control.patch
queue-4.14/scripts-faddr2line-fix-cross_compile-unset-error.patch
queue-4.14/KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/x86asm_Move_status_from_thread_struct_to_thread_info.patch
queue-4.14/KVMx86_Add_IBPB_support.patch
queue-4.14/x86_Implement_array_index_mask_nospec.patch
queue-4.14/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.14/nl80211_Sanitize_array_index_in_parse_txq_params.patch
queue-4.14/moduleretpoline_Warn_about_missing_retpoline_in_module.patch
queue-4.14/x86speculation_Add_basic_IBPB_(Indirect_Branch_Prediction_Barrier)_support.patch
queue-4.14/x86speculation_Simplify_indirect_branch_prediction_barrier().patch
queue-4.14/x86nospec_Fix_header_guards_names.patch
queue-4.14/KVM_x86_Make_indirect_calls_in_emulator_speculation_safe.patch
queue-4.14/x86uaccess_Use___uaccess_begin_nospec()_and_uaccess_try_nospec.patch
queue-4.14/x86entry64_Push_extra_regs_right_away.patch
queue-4.14/x86usercopy_Replace_open_coded_stacclac_with___uaccess_begin_end.patch
queue-4.14/vfs_fdtable_Prevent_bounds-check_bypass_via_speculative_execution.patch
queue-4.14/x86retpoline_Simplify_vmexit_fill_RSB().patch
queue-4.14/objtool_Warn_on_stripped_section_symbol.patch
queue-4.14/x86spectre_Report_get_user_mitigation_for_spectre_v1.patch
queue-4.14/x86cpufeatures_Clean_up_Spectre_v2_related_CPUID_flags.patch
queue-4.14/x86syscall_Sanitize_syscall_table_de-references_under_speculation.patch
queue-4.14/objtool_Improve_retpoline_alternative_handling.patch
This is a note to let you know that I've just added the patch titled
x86/usercopy: Replace open coded stac/clac with __uaccess_{begin, end}
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
x86usercopy_Replace_open_coded_stacclac_with___uaccess_begin_end.patch
and it can be found in the queue-4.14 subdirectory.
If you or anyone else feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
Subject: x86/usercopy: Replace open coded stac/clac with __uaccess_{begin, end}
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon Jan 29 17:02:44 2018 -0800
From: Dan Williams <dan.j.williams@intel.com>
commit b5c4ae4f35325d520b230bab6eb3310613b72ac1
In preparation for converting some __uaccess_begin() instances to
__uaccess_begin_nospec(), make sure all 'from user' uaccess paths are
using the _begin(), _end() helpers rather than open-coded stac() and
clac().
No functional changes.
Suggested-by: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: kernel-hardening@lists.openwall.com
Cc: gregkh@linuxfoundation.org
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: torvalds@linux-foundation.org
Cc: alan@linux.intel.com
Link: https://lkml.kernel.org/r/151727416438.33451.17309465232057176966.stgit@dwi…
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/lib/usercopy_32.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -331,12 +331,12 @@ do { \
unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
{
- stac();
+ __uaccess_begin();
if (movsl_is_ok(to, from, n))
__copy_user(to, from, n);
else
n = __copy_user_intel(to, from, n);
- clac();
+ __uaccess_end();
return n;
}
EXPORT_SYMBOL(__copy_user_ll);
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(__copy_user_ll);
unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
unsigned long n)
{
- stac();
+ __uaccess_begin();
#ifdef CONFIG_X86_INTEL_USERCOPY
if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
n = __copy_user_intel_nocache(to, from, n);
@@ -353,7 +353,7 @@ unsigned long __copy_from_user_ll_nocach
#else
__copy_user(to, from, n);
#endif
- clac();
+ __uaccess_end();
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
Patches currently in stable-queue which might be from mingo@redhat.com are
queue-4.14/x86_Introduce_barrier_nospec.patch
queue-4.14/x86_Introduce___uaccess_begin_nospec()_and_uaccess_try_nospec.patch
queue-4.14/x86usercopy_Replace_open_coded_stacclac_with___uaccess_begin_end.patch
This is a note to let you know that I've just added the patch titled
x86/syscall: Sanitize syscall table de-references under speculation
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
x86syscall_Sanitize_syscall_table_de-references_under_speculation.patch
and it can be found in the queue-4.14 subdirectory.
If you or anyone else feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
Subject: x86/syscall: Sanitize syscall table de-references under speculation
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon Jan 29 17:02:59 2018 -0800
From: Dan Williams <dan.j.williams@intel.com>
commit 2fbd7af5af8665d18bcefae3e9700be07e22b681
The syscall table base is a user-controlled function pointer in kernel
space. Use array_index_nospec() to prevent any out-of-bounds speculation.
While retpoline prevents speculating into a userspace-directed target, it
does not stop the pointer de-reference; the concern is leaking memory
relative to the syscall table base by observing instruction cache
behavior.
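A standalone sketch of the clamping being applied (illustrative; the mask
calculation is modeled on the generic C fallback for
array_index_mask_nospec(), while x86 uses a branchless CMP/SBB sequence):
the index is forced into [0, size) before it selects the function pointer,
so a mis-predicted bounds check cannot steer the table lookup out of
bounds.

    /* Standalone model of sanitizing a table index under speculation. */
    #include <stdio.h>

    #define BITS_PER_LONG (8 * sizeof(long))

    /* ~0UL if index < size, 0 otherwise, computed without a branch */
    static unsigned long array_index_mask_nospec(unsigned long index,
                                                 unsigned long size)
    {
        return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
    }

    static unsigned long array_index_nospec(unsigned long index,
                                            unsigned long size)
    {
        return index & array_index_mask_nospec(index, size);
    }

    static long sys_hello(void) { printf("hello\n"); return 0; }
    static long sys_bye(void)   { printf("bye\n");   return 0; }

    static long (*const call_table[])(void) = { sys_hello, sys_bye };
    #define NR_CALLS (sizeof(call_table) / sizeof(call_table[0]))

    static long do_call(unsigned long nr)
    {
        if (nr >= NR_CALLS)
            return -1;
        /* clamp nr so a speculative out-of-bounds nr selects entry 0 at worst */
        nr = array_index_nospec(nr, NR_CALLS);
        return call_table[nr]();
    }

    int main(void)
    {
        printf("ret=%ld\n", do_call(1));
        printf("ret=%ld\n", do_call(100));  /* rejected architecturally */
        return 0;
    }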
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: kernel-hardening@lists.openwall.com
Cc: gregkh@linuxfoundation.org
Cc: Andy Lutomirski <luto@kernel.org>
Cc: alan@linux.intel.com
Link: https://lkml.kernel.org/r/151727417984.33451.1216731042505722161.stgit@dwil…
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/entry/common.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
+#include <linux/nospec.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>
@@ -284,7 +285,8 @@ __visible void do_syscall_64(struct pt_r
* regs->orig_ax, which changes the behavior of some syscalls.
*/
if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
- regs->ax = sys_call_table[nr & __SYSCALL_MASK](
+ nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls);
+ regs->ax = sys_call_table[nr](
regs->di, regs->si, regs->dx,
regs->r10, regs->r8, regs->r9);
}
@@ -320,6 +322,7 @@ static __always_inline void do_syscall_3
}
if (likely(nr < IA32_NR_syscalls)) {
+ nr = array_index_nospec(nr, IA32_NR_syscalls);
/*
* It's possible that a 32-bit syscall implementation
* takes a 64-bit parameter but nonetheless assumes that
Patches currently in stable-queue which might be from torvalds@linux-foundation.org are
queue-4.14/objtool_Add_support_for_alternatives_at_the_end_of_a_section.patch
queue-4.14/x86pti_Do_not_enable_PTI_on_CPUs_which_are_not_vulnerable_to_Meltdown.patch
queue-4.14/x86_Introduce_barrier_nospec.patch
queue-4.14/x86speculation_Use_Indirect_Branch_Prediction_Barrier_in_context_switch.patch
queue-4.14/x86get_user_Use_pointer_masking_to_limit_speculation.patch
queue-4.14/x86_Introduce___uaccess_begin_nospec()_and_uaccess_try_nospec.patch
queue-4.14/x86cpufeature_Blacklist_SPEC_CTRLPRED_CMD_on_early_Spectre_v2_microcodes.patch
queue-4.14/x86cpufeatures_Add_Intel_feature_bits_for_Speculation_Control.patch
queue-4.14/x86paravirt_Remove_noreplace-paravirt_cmdline_option.patch
queue-4.14/KVM_VMX_Make_indirect_call_speculation_safe.patch
queue-4.14/x86msr_Add_definitions_for_new_speculation_control_MSRs.patch
queue-4.14/x86alternative_Print_unadorned_pointers.patch
queue-4.14/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/x86cpufeatures_Add_CPUID_7_EDX_CPUID_leaf.patch
queue-4.14/array_index_nospec_Sanitize_speculative_array_de-references.patch
queue-4.14/Documentation_Document_array_index_nospec.patch
queue-4.14/x86entry64_Remove_the_SYSCALL64_fast_path.patch
queue-4.14/x86retpoline_Remove_the_esprsp_thunk.patch
queue-4.14/x86bugs_Drop_one_mitigation_from_dmesg.patch
queue-4.14/x86cpufeatures_Add_AMD_feature_bits_for_Speculation_Control.patch
queue-4.14/scripts-faddr2line-fix-cross_compile-unset-error.patch
queue-4.14/KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/x86asm_Move_status_from_thread_struct_to_thread_info.patch
queue-4.14/KVMx86_Add_IBPB_support.patch
queue-4.14/x86_Implement_array_index_mask_nospec.patch
queue-4.14/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.14/nl80211_Sanitize_array_index_in_parse_txq_params.patch
queue-4.14/moduleretpoline_Warn_about_missing_retpoline_in_module.patch
queue-4.14/x86speculation_Add_basic_IBPB_(Indirect_Branch_Prediction_Barrier)_support.patch
queue-4.14/x86speculation_Simplify_indirect_branch_prediction_barrier().patch
queue-4.14/x86nospec_Fix_header_guards_names.patch
queue-4.14/KVM_x86_Make_indirect_calls_in_emulator_speculation_safe.patch
queue-4.14/x86uaccess_Use___uaccess_begin_nospec()_and_uaccess_try_nospec.patch
queue-4.14/x86entry64_Push_extra_regs_right_away.patch
queue-4.14/x86usercopy_Replace_open_coded_stacclac_with___uaccess_begin_end.patch
queue-4.14/vfs_fdtable_Prevent_bounds-check_bypass_via_speculative_execution.patch
queue-4.14/x86retpoline_Simplify_vmexit_fill_RSB().patch
queue-4.14/objtool_Warn_on_stripped_section_symbol.patch
queue-4.14/x86spectre_Report_get_user_mitigation_for_spectre_v1.patch
queue-4.14/x86cpufeatures_Clean_up_Spectre_v2_related_CPUID_flags.patch
queue-4.14/x86syscall_Sanitize_syscall_table_de-references_under_speculation.patch
queue-4.14/objtool_Improve_retpoline_alternative_handling.patch