This is a note to let you know that I've just added the patch titled
kaiser: enhanced by kernel and user PCIDs
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
kaiser-enhanced-by-kernel-and-user-pcids.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Wed, 30 Aug 2017 16:23:00 -0700
Subject: kaiser: enhanced by kernel and user PCIDs
From: Hugh Dickins <hughd(a)google.com>
Merged performance improvements to Kaiser, using distinct kernel
and user Process Context Identifiers to minimize the TLB flushing.
[This work actually all from Dave Hansen 2017-08-30:
still omitting trackswitch mods, and KAISER_REAL_SWITCH deleted.]
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/entry/entry_64.S | 10 ++++-
arch/x86/entry/entry_64_compat.S | 1
arch/x86/include/asm/cpufeatures.h | 1
arch/x86/include/asm/kaiser.h | 15 ++++++-
arch/x86/include/asm/pgtable_types.h | 26 +++++++++++++
arch/x86/include/asm/tlbflush.h | 54 +++++++++++++++++++++++-----
arch/x86/include/uapi/asm/processor-flags.h | 3 +
arch/x86/kernel/cpu/common.c | 34 +++++++++++++++++
arch/x86/kvm/x86.c | 3 +
arch/x86/mm/kaiser.c | 7 +++
arch/x86/mm/tlb.c | 46 ++++++++++++++++++++++-
11 files changed, 182 insertions(+), 18 deletions(-)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1317,7 +1317,10 @@ ENTRY(nmi)
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
- andq $(~KAISER_SHADOW_PGD_OFFSET), %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ /* Add back kernel PCID and "no flush" bit */
+ orq X86_CR3_PCID_KERN_VAR, %rax
movq %rax, %cr3
#endif
call do_nmi
@@ -1558,7 +1561,10 @@ end_repeat_nmi:
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
- andq $(~KAISER_SHADOW_PGD_OFFSET), %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ /* Add back kernel PCID and "no flush" bit */
+ orq X86_CR3_PCID_KERN_VAR, %rax
movq %rax, %cr3
#endif
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/pgtable_types.h>
#include <asm/kaiser.h>
#include <linux/linkage.h>
#include <linux/err.h>
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -189,6 +189,7 @@
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -1,5 +1,8 @@
#ifndef _ASM_X86_KAISER_H
#define _ASM_X86_KAISER_H
+
+#include <uapi/asm/processor-flags.h> /* For PCID constants */
+
/*
* This file includes the definitions for the KAISER feature.
* KAISER is a counter measure against x86_64 side channel attacks on
@@ -21,13 +24,21 @@
.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
-andq $(~KAISER_SHADOW_PGD_OFFSET), \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+orq X86_CR3_PCID_KERN_VAR, \reg
movq \reg, %cr3
.endm
.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
-orq $(KAISER_SHADOW_PGD_OFFSET), \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+/*
+ * This can obviously be one instruction by putting the
+ * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR.
+ * But, just leave it now for simplicity.
+ */
+orq X86_CR3_PCID_USER_VAR, \reg
+orq $(KAISER_SHADOW_PGD_OFFSET), \reg
movq \reg, %cr3
.endm
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -141,6 +141,32 @@
_PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL))
+#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
+#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH (0)
+#define X86_CR3_PCID_USER_FLUSH (0)
+#define X86_CR3_PCID_KERN_NOFLUSH (0)
+#define X86_CR3_PCID_USER_NOFLUSH (0)
+#endif
+
/*
* The cache modes defined here are used to translate between pure SW usage
* and the HW defined cache mode bits and/or PAT entries.
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -13,7 +13,6 @@ static inline void __invpcid(unsigned lo
unsigned long type)
{
struct { u64 d[2]; } desc = { { pcid, addr } };
-
/*
* The memory clobber is because the whole point is to invalidate
* stale TLB entries and, especially if we're flushing global
@@ -134,14 +133,25 @@ static inline void cr4_set_bits_and_upda
static inline void __native_flush_tlb(void)
{
+ if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
+ /*
+ * If current->mm == NULL then we borrow a mm which may change during a
+ * task switch and therefore we must not be preempted while we write CR3
+ * back:
+ */
+ preempt_disable();
+ native_write_cr3(native_read_cr3());
+ preempt_enable();
+ return;
+ }
/*
- * If current->mm == NULL then we borrow a mm which may change during a
- * task switch and therefore we must not be preempted while we write CR3
- * back:
- */
- preempt_disable();
- native_write_cr3(native_read_cr3());
- preempt_enable();
+ * We are no longer using globals with KAISER, so a
+ * "nonglobals" flush would work too. But, this is more
+ * conservative.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
}
static inline void __native_flush_tlb_global_irq_disabled(void)
@@ -163,6 +173,8 @@ static inline void __native_flush_tlb_gl
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
@@ -182,7 +194,31 @@ static inline void __native_flush_tlb_gl
static inline void __native_flush_tlb_single(unsigned long addr)
{
- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ /*
+ * SIMICS #GP's if you run INVPCID with type 2/3
+ * and X86_CR4_PCIDE clear. Shame!
+ *
+ * The ASIDs used below are hard-coded. But, we must not
+ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
+ * invlpg in the case we are called early.
+ */
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ return;
+ }
+ /* Flush the address out of both PCIDs. */
+ /*
+ * An optimization here might be to determine addresses
+ * that are only kernel-mapped and only flush the kernel
+ * ASID. But, userspace flushes are probably much more
+ * important performance-wise.
+ *
+ * Make sure to do only a single invpcid when KAISER is
+ * disabled and we have only a single ASID.
+ */
+ if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
+ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
}
static inline void __flush_tlb_all(void)
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -77,7 +77,8 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
/*
* Intel CPU features in CR4
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -324,11 +324,45 @@ static __always_inline void setup_smap(s
}
}
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3. It would take
+ * another register. So, we use a memory reference to these
+ * instead.
+ *
+ * This is also handy because systems that do not support
+ * PCIDs just end up or'ing a 0 into their CR3, which does
+ * no harm.
+ */
+__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0;
+__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0;
+
static void setup_pcid(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) {
cr4_set_bits(X86_CR4_PCIDE);
+ /*
+ * These variables are used by the entry/exit
+ * code to change PCIDs.
+ */
+#ifdef CONFIG_KAISER
+ X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH;
+ X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH;
+#endif
+ /*
+ * INVPCID has two "groups" of types:
+ * 1/2: Invalidate an individual address
+ * 3/4: Invalidate all contexts
+ *
+ * 1/2 take a PCID, but 3/4 do not. So, 3/4
+ * ignore the PCID argument in the descriptor.
+ * But, we have to be careful not to call 1/2
+ * with an actual non-zero PCID in them before
+ * we do the above cr4_set_bits().
+ */
+ if (cpu_has(c, X86_FEATURE_INVPCID))
+ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, u
return 1;
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
+ !is_long_mode(vcpu))
return 1;
}
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -239,6 +239,8 @@ static void __init kaiser_init_all_pgds(
} while (0)
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+extern unsigned long X86_CR3_PCID_KERN_VAR;
+extern unsigned long X86_CR3_PCID_USER_VAR;
/*
* If anything in here fails, we will likely die on one of the
* first kernel->user transitions and init will die. But, we
@@ -289,6 +291,11 @@ void __init kaiser_init(void)
kaiser_add_user_map_early(&debug_idt_table,
sizeof(gate_desc) * NR_VECTORS,
__PAGE_KERNEL);
+
+ kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
+ __PAGE_KERNEL);
+ kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE,
+ __PAGE_KERNEL);
}
/* Add a mapping to the shadow mapping, and synchronize the mappings */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -34,6 +34,46 @@ struct flush_tlb_info {
unsigned long flush_end;
};
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+ unsigned long new_mm_cr3 = __pa(pgdir);
+
+ /*
+ * KAISER, plus PCIDs needs some extra work here. But,
+ * if either of these features is not present, we need no
+ * PCIDs here and just do a normal, full TLB flush with
+ * the write_cr3()
+ */
+ if (!IS_ENABLED(CONFIG_KAISER) ||
+ !cpu_feature_enabled(X86_FEATURE_PCID))
+ goto out_set_cr3;
+ /*
+ * We reuse the same PCID for different tasks, so we must
+ * flush all the entries for the PCID out when we change
+ * tasks.
+ */
+ new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir);
+
+ /*
+ * The flush from load_cr3() may leave old TLB entries
+ * for userspace in place. We must flush that context
+ * separately. We can theoretically delay doing this
+ * until we actually load up the userspace CR3, but
+ * that's a bit tricky. We have to have the "need to
+ * flush userspace PCID" bit per-cpu and check it in the
+ * exit-to-userspace paths.
+ */
+ invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
+
+out_set_cr3:
+ /*
+ * Caution: many callers of this function expect
+ * that load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
/*
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
@@ -45,7 +85,7 @@ void leave_mm(int cpu)
BUG();
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
- load_cr3(swapper_pg_dir);
+ load_new_mm_cr3(swapper_pg_dir);
/*
* This gets called in the idle path where RCU
* functions differently. Tracing normally
@@ -120,7 +160,7 @@ void switch_mm_irqs_off(struct mm_struct
* ordering guarantee we need.
*
*/
- load_cr3(next->pgd);
+ load_new_mm_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
@@ -167,7 +207,7 @@ void switch_mm_irqs_off(struct mm_struct
* As above, load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask write.
*/
- load_cr3(next->pgd);
+ load_new_mm_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_mm_cr4(next);
load_mm_ldt(next);
Patches currently in stable-queue which might be from hughd(a)google.com are
queue-4.9/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.9/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.9/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.9/kaiser-merged-update.patch
queue-4.9/kaiser-delete-kaiser_real_switch-option.patch
queue-4.9/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.9/kaiser-fix-perf-crashes.patch
queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.9/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.9/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.9/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.9/kaiser-align-addition-to-x86-mm-makefile.patch
queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.9/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.9/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.9/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.9/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.9/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.9/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.9/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.9/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kernel-address-isolation.patch
queue-4.9/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.9/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.9/kaiser-kaiser-depends-on-smp.patch
queue-4.9/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: drop is_atomic arg to kaiser_pagetable_walk()
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Sun, 29 Oct 2017 11:36:19 -0700
Subject: kaiser: drop is_atomic arg to kaiser_pagetable_walk()
From: Hugh Dickins <hughd(a)google.com>
I have not observed a might_sleep() warning from setup_fixmap_gdt()'s
use of kaiser_add_mapping() in our tree (why not?), but like upstream
we have not provided a way for that to pass is_atomic true down to
kaiser_pagetable_walk(), and at startup it's far from a likely source
of trouble: so just delete the walk's is_atomic arg and might_sleep().
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/mm/kaiser.c | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -107,19 +107,13 @@ static inline unsigned long get_pa_from_
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
-static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
+static pte_t *kaiser_pagetable_walk(unsigned long address)
{
pmd_t *pmd;
pud_t *pud;
pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
- if (is_atomic) {
- gfp &= ~GFP_KERNEL;
- gfp |= __GFP_HIGH | __GFP_ATOMIC;
- } else
- might_sleep();
-
if (pgd_none(*pgd)) {
WARN_ONCE(1, "All shadow pgds should have been populated");
return NULL;
@@ -194,7 +188,7 @@ static int kaiser_add_user_map(const voi
ret = -EIO;
break;
}
- pte = kaiser_pagetable_walk(address, false);
+ pte = kaiser_pagetable_walk(address);
if (!pte) {
ret = -ENOMEM;
break;
Patches currently in stable-queue which might be from hughd(a)google.com are
queue-4.9/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.9/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.9/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.9/kaiser-merged-update.patch
queue-4.9/kaiser-delete-kaiser_real_switch-option.patch
queue-4.9/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.9/kaiser-fix-perf-crashes.patch
queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.9/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.9/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.9/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.9/kaiser-align-addition-to-x86-mm-makefile.patch
queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.9/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.9/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.9/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.9/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.9/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.9/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.9/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.9/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kernel-address-isolation.patch
queue-4.9/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.9/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.9/kaiser-kaiser-depends-on-smp.patch
queue-4.9/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: fix build and FIXME in alloc_ldt_struct()
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Sun, 3 Sep 2017 17:09:44 -0700
Subject: kaiser: fix build and FIXME in alloc_ldt_struct()
From: Hugh Dickins <hughd(a)google.com>
Include linux/kaiser.h instead of asm/kaiser.h to build ldt.c without
CONFIG_KAISER. kaiser_add_mapping() does already return an error code,
so fix the FIXME.
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kernel/ldt.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -16,9 +16,9 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/kaiser.h>
#include <asm/ldt.h>
-#include <asm/kaiser.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@@ -49,7 +49,7 @@ static struct ldt_struct *alloc_ldt_stru
{
struct ldt_struct *new_ldt;
int alloc_size;
- int ret = 0;
+ int ret;
if (size > LDT_ENTRIES)
return NULL;
@@ -77,10 +77,8 @@ static struct ldt_struct *alloc_ldt_stru
return NULL;
}
- // FIXME: make kaiser_add_mapping() return an error code
- // when it fails
- kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
- __PAGE_KERNEL);
+ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+ __PAGE_KERNEL);
if (ret) {
__free_ldt_struct(new_ldt);
return NULL;
Patches currently in stable-queue which might be from hughd(a)google.com are
queue-4.9/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.9/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.9/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.9/kaiser-merged-update.patch
queue-4.9/kaiser-delete-kaiser_real_switch-option.patch
queue-4.9/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.9/kaiser-fix-perf-crashes.patch
queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.9/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.9/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.9/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.9/kaiser-align-addition-to-x86-mm-makefile.patch
queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.9/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.9/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.9/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.9/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.9/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.9/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.9/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.9/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kernel-address-isolation.patch
queue-4.9/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.9/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.9/kaiser-kaiser-depends-on-smp.patch
queue-4.9/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: do not set _PAGE_NX on pgd_none
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
kaiser-do-not-set-_page_nx-on-pgd_none.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Tue, 5 Sep 2017 12:05:01 -0700
Subject: kaiser: do not set _PAGE_NX on pgd_none
From: Hugh Dickins <hughd(a)google.com>
native_pgd_clear() uses native_set_pgd(), so native_set_pgd() must
avoid setting the _PAGE_NX bit on an otherwise pgd_none() entry:
usually that just generated a warning on exit, but sometimes
more mysterious and damaging failures (our production machines
could not complete booting).
The original fix to this just avoided adding _PAGE_NX to
an empty entry; but eventually more problems surfaced with kexec,
and EFI mapping expected to be a problem too. So now instead
change native_set_pgd() to update shadow only if _PAGE_USER:
A few places (kernel/machine_kexec_64.c, platform/efi/efi_64.c for sure)
use set_pgd() to set up a temporary internal virtual address space, with
physical pages remapped at what Kaiser regards as userspace addresses:
Kaiser then assumes a shadow pgd follows, which it will try to corrupt.
This appears to be responsible for the recent kexec and kdump failures;
though it's unclear how those did not manifest as a problem before.
Ah, the shadow pgd will only be assumed to "follow" if the requested
pgd is on an even-numbered page: so I suppose it was going wrong 50%
of the time all along.
What we need is a flag to set_pgd(), to tell it we're dealing with
userspace. Er, isn't that what the pgd's _PAGE_USER bit is saying?
Add a test for that. But we cannot do the same for pgd_clear()
(which may be called to clear corrupted entries - set aside the
question of "corrupt in which pgd?" until later), so there just
rely on pgd_clear() not being called in the problematic cases -
with a WARN_ON_ONCE() which should fire half the time if it is.
But this is getting too big for an inline function: move it into
arch/x86/mm/kaiser.c (which then demands a boot/compressed mod);
and de-void and de-space native_get_shadow/normal_pgd() while here.
Also make an unnecessary change to KASLR's init_trampoline(): it was
using set_pgd() to assign a pgd-value to a global variable (not in a
pg directory page), which was rather scary given Kaiser's previous
set_pgd() implementation: not a problem now, but too scary to leave
as was, it could easily blow up if we have to change set_pgd() again.
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/boot/compressed/misc.h | 1
arch/x86/include/asm/pgtable_64.h | 51 +++++++++-----------------------------
arch/x86/mm/kaiser.c | 42 +++++++++++++++++++++++++++++++
arch/x86/mm/kaslr.c | 4 +-
4 files changed, 58 insertions(+), 40 deletions(-)
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -9,6 +9,7 @@
*/
#undef CONFIG_PARAVIRT
#undef CONFIG_PARAVIRT_SPINLOCKS
+#undef CONFIG_KAISER
#undef CONFIG_KASAN
#include <linux/linkage.h>
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -107,61 +107,36 @@ static inline void native_pud_clear(pud_
}
#ifdef CONFIG_KAISER
-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
{
- return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
+ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
}
-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
{
- return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
+ return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
}
#else
-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
{
BUILD_BUG_ON(1);
return NULL;
}
-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
{
return pgdp;
}
#endif /* CONFIG_KAISER */
-/*
- * Page table pages are page-aligned. The lower half of the top
- * level is used for userspace and the top half for the kernel.
- * This returns true for user pages that need to get copied into
- * both the user and kernel copies of the page tables, and false
- * for kernel pages that should only be in the kernel copy.
- */
-static inline bool is_userspace_pgd(void *__ptr)
-{
- unsigned long ptr = (unsigned long)__ptr;
-
- return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
-}
-
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
-#ifdef CONFIG_KAISER
- pteval_t extra_kern_pgd_flags = 0;
- /* Do we need to also populate the shadow pgd? */
- if (is_userspace_pgd(pgdp)) {
- native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
- /*
- * Even if the entry is *mapping* userspace, ensure
- * that userspace can not use it. This way, if we
- * get out to userspace running on the kernel CR3,
- * userspace will crash instead of running.
- */
- extra_kern_pgd_flags = _PAGE_NX;
- }
- pgdp->pgd = pgd.pgd;
- pgdp->pgd |= extra_kern_pgd_flags;
-#else /* CONFIG_KAISER */
- *pgdp = pgd;
-#endif
+ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
}
static inline void native_pgd_clear(pgd_t *pgd)
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -302,4 +302,46 @@ void kaiser_remove_mapping(unsigned long
unmap_pud_range_nofree(pgd, addr, end);
}
}
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ /*
+ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
+ * skip cases like kexec and EFI which make temporary low mappings.
+ */
+ if (pgd.pgd & _PAGE_USER) {
+ if (is_userspace_pgd(pgdp)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ /*
+ * Even if the entry is *mapping* userspace, ensure
+ * that userspace can not use it. This way, if we
+ * get out to userspace running on the kernel CR3,
+ * userspace will crash instead of running.
+ */
+ pgd.pgd |= _PAGE_NX;
+ }
+ } else if (!pgd.pgd) {
+ /*
+ * pgd_clear() cannot check _PAGE_USER, and is even used to
+ * clear corrupted pgd entries: so just rely on cases like
+ * kexec and EFI never to be using pgd_clear().
+ */
+ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+ is_userspace_pgd(pgdp))
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ }
+ return pgd;
+}
#endif /* CONFIG_KAISER */
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -189,6 +189,6 @@ void __meminit init_trampoline(void)
*pud_tramp = *pud;
}
- set_pgd(&trampoline_pgd_entry,
- __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
+ /* Avoid set_pgd(), in case it's complicated by CONFIG_KAISER */
+ trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
}
Patches currently in stable-queue which might be from hughd(a)google.com are
queue-4.9/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.9/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.9/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.9/kaiser-merged-update.patch
queue-4.9/kaiser-delete-kaiser_real_switch-option.patch
queue-4.9/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.9/kaiser-fix-perf-crashes.patch
queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.9/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.9/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.9/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.9/kaiser-align-addition-to-x86-mm-makefile.patch
queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.9/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.9/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.9/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.9/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.9/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.9/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.9/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.9/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kernel-address-isolation.patch
queue-4.9/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.9/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.9/kaiser-kaiser-depends-on-smp.patch
queue-4.9/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: disabled on Xen PV
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
kaiser-disabled-on-xen-pv.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Jiri Kosina <jkosina(a)suse.cz>
Date: Tue, 2 Jan 2018 14:19:49 +0100
Subject: kaiser: disabled on Xen PV
From: Jiri Kosina <jkosina(a)suse.cz>
Kaiser cannot be used on paravirtualized MMUs (namely reading and writing CR3).
This does not work with KAISER as the CR3 switch from and to user space PGD
would require mapping the whole XEN_PV machinery into both.
More importantly, enabling KAISER on Xen PV doesn't make too much sense, as PV
guests use distinct %cr3 values for kernel and user already.
Signed-off-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/mm/kaiser.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -263,6 +263,9 @@ void __init kaiser_check_boottime_disabl
char arg[5];
int ret;
+ if (boot_cpu_has(X86_FEATURE_XENPV))
+ goto silent_disable;
+
ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
if (ret > 0) {
if (!strncmp(arg, "on", 2))
@@ -290,6 +293,8 @@ enable:
disable:
pr_info("Kernel/User page tables isolation: disabled\n");
+
+silent_disable:
kaiser_enabled = 0;
setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
Patches currently in stable-queue which might be from jkosina(a)suse.cz are
queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.9/kaiser-disabled-on-xen-pv.patch
queue-4.9/kaiser-kernel-address-isolation.patch
queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
This is a note to let you know that I've just added the patch titled
kaiser: asm/tlbflush.h handle noPGE at lower level
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Sat, 4 Nov 2017 18:23:24 -0700
Subject: kaiser: asm/tlbflush.h handle noPGE at lower level
From: Hugh Dickins <hughd(a)google.com>
I found asm/tlbflush.h too twisty, and think it safer not to avoid
__native_flush_tlb_global_irq_disabled() in the kaiser_enabled case,
but instead let it handle kaiser_enabled along with cr3: it can just
use __native_flush_tlb() for that, no harm in re-disabling preemption.
(This is not the same change as Kirill and Dave have suggested for
upstream, flipping PGE in cr4: that's neat, but needs a cpu_has_pge
check; cr3 is enough for kaiser, and thought to be cheaper than cr4.)
Also delete the X86_FEATURE_INVPCID invpcid_flush_all_nonglobals()
preference from __native_flush_tlb(): unlike the invpcid_flush_all()
preference in __native_flush_tlb_global(), it's not seen in upstream
4.14, and was recently reported to be surprisingly slow.
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/include/asm/tlbflush.h | 27 +++------------------------
1 file changed, 3 insertions(+), 24 deletions(-)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -152,14 +152,6 @@ static inline void kaiser_flush_tlb_on_r
static inline void __native_flush_tlb(void)
{
- if (this_cpu_has(X86_FEATURE_INVPCID)) {
- /*
- * Note, this works with CR4.PCIDE=0 or 1.
- */
- invpcid_flush_all_nonglobals();
- return;
- }
-
/*
* If current->mm == NULL then we borrow a mm which may change during a
* task switch and therefore we must not be preempted while we write CR3
@@ -183,11 +175,8 @@ static inline void __native_flush_tlb_gl
/* restore PGE as it was before */
native_write_cr4(cr4);
} else {
- /*
- * x86_64 microcode update comes this way when CR4.PGE is not
- * enabled, and it's safer for all callers to allow this case.
- */
- native_write_cr3(native_read_cr3());
+ /* do it with cr3, letting kaiser flush user PCID */
+ __native_flush_tlb();
}
}
@@ -195,12 +184,6 @@ static inline void __native_flush_tlb_gl
{
unsigned long flags;
- if (kaiser_enabled) {
- /* Globals are not used at all */
- __native_flush_tlb();
- return;
- }
-
if (this_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
@@ -256,11 +239,7 @@ static inline void __native_flush_tlb_si
static inline void __flush_tlb_all(void)
{
- if (boot_cpu_has(X86_FEATURE_PGE))
- __flush_tlb_global();
- else
- __flush_tlb();
-
+ __flush_tlb_global();
/*
* Note: if we somehow had PCID but not PGE, then this wouldn't work --
* we'd end up flushing kernel translations for the current ASID but
Patches currently in stable-queue which might be from hughd(a)google.com are
queue-4.9/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.9/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.9/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.9/kaiser-merged-update.patch
queue-4.9/kaiser-delete-kaiser_real_switch-option.patch
queue-4.9/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.9/kaiser-fix-perf-crashes.patch
queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.9/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.9/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.9/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.9/kaiser-align-addition-to-x86-mm-makefile.patch
queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.9/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.9/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.9/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.9/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.9/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.9/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.9/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.9/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kernel-address-isolation.patch
queue-4.9/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.9/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.9/kaiser-kaiser-depends-on-smp.patch
queue-4.9/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: delete KAISER_REAL_SWITCH option
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
kaiser-delete-kaiser_real_switch-option.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Sun, 3 Sep 2017 18:30:43 -0700
Subject: kaiser: delete KAISER_REAL_SWITCH option
From: Hugh Dickins <hughd(a)google.com>
We fail to see what CONFIG_KAISER_REAL_SWITCH is for: it seems to be
left over from early development, and now just obscures tricky parts
of the code. Delete it before adding PCIDs, or nokaiser boot option.
(Or if there is some good reason to keep the option, then it needs
a help text - and a "depends on KAISER", so that all those without
KAISER are not asked the question. But we'd much rather delete it.)
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/entry/entry_64.S | 4 ----
arch/x86/include/asm/kaiser.h | 4 ----
security/Kconfig | 4 ----
3 files changed, 12 deletions(-)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1317,9 +1317,7 @@ ENTRY(nmi)
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
-#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~KAISER_SHADOW_PGD_OFFSET), %rax
-#endif
movq %rax, %cr3
#endif
call do_nmi
@@ -1560,9 +1558,7 @@ end_repeat_nmi:
/* %rax is saved above, so OK to clobber here */
movq %cr3, %rax
pushq %rax
-#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~KAISER_SHADOW_PGD_OFFSET), %rax
-#endif
movq %rax, %cr3
#endif
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -21,17 +21,13 @@
.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
-#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~KAISER_SHADOW_PGD_OFFSET), \reg
-#endif
movq \reg, %cr3
.endm
.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
-#ifdef CONFIG_KAISER_REAL_SWITCH
orq $(KAISER_SHADOW_PGD_OFFSET), \reg
-#endif
movq \reg, %cr3
.endm
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -41,10 +41,6 @@ config KAISER
If you are unsure how to answer this question, answer Y.
-config KAISER_REAL_SWITCH
- bool "KAISER: actually switch page tables"
- default y
-
config SECURITYFS
bool "Enable the securityfs filesystem"
help
Patches currently in stable-queue which might be from hughd(a)google.com are
queue-4.9/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.9/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.9/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.9/kaiser-merged-update.patch
queue-4.9/kaiser-delete-kaiser_real_switch-option.patch
queue-4.9/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.9/kaiser-fix-perf-crashes.patch
queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.9/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.9/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.9/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.9/kaiser-align-addition-to-x86-mm-makefile.patch
queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.9/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.9/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.9/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.9/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.9/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.9/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.9/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.9/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kernel-address-isolation.patch
queue-4.9/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.9/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.9/kaiser-kaiser-depends-on-smp.patch
queue-4.9/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: align addition to x86/mm/Makefile
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
kaiser-align-addition-to-x86-mm-makefile.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Sun, 3 Sep 2017 19:51:10 -0700
Subject: kaiser: align addition to x86/mm/Makefile
From: Hugh Dickins <hughd(a)google.com>
Use tab not space so they line up properly, kaslr.o also.
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/mm/Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulatio
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
-obj-$(CONFIG_KAISER) += kaiser.o
+obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_KAISER) += kaiser.o
Patches currently in stable-queue which might be from hughd(a)google.com are
queue-4.9/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.9/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.9/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.9/kaiser-merged-update.patch
queue-4.9/kaiser-delete-kaiser_real_switch-option.patch
queue-4.9/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.9/kaiser-fix-perf-crashes.patch
queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.9/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.9/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.9/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.9/kaiser-align-addition-to-x86-mm-makefile.patch
queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.9/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.9/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.9/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.9/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.9/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.9/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.9/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.9/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kernel-address-isolation.patch
queue-4.9/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.9/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.9/kaiser-kaiser-depends-on-smp.patch
queue-4.9/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: add "nokaiser" boot option, using ALTERNATIVE
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
kaiser-add-nokaiser-boot-option-using-alternative.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 20:37:21 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Sun, 24 Sep 2017 16:59:49 -0700
Subject: kaiser: add "nokaiser" boot option, using ALTERNATIVE
From: Hugh Dickins <hughd(a)google.com>
Added "nokaiser" boot option: an early param like "noinvpcid".
Most places now check int kaiser_enabled (#defined 0 when not
CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S
and entry_64_compat.S are using the ALTERNATIVE technique, which
patches in the preferred instructions at runtime. That technique
is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated.
Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that,
but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when
nokaiser like when !CONFIG_KAISER, but not setting either when kaiser -
neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL
won't get set in some obscure corner, or that something won't add PGE into CR4.
By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled,
all page table setup which uses pte_pfn() masks it out of the ptes.
It's slightly shameful that the same declaration versus definition of
kaiser_enabled appears in not one, not two, but in three header files
(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way,
than with #including any of those in any of the others; and did not
feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes
them all, so we shall hear about it if they get out of synch.
Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER
from kaiser.c; removed the unused native_get_normal_pgd(); removed
the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some
comments. But more interestingly, set CR4.PSE in secondary_startup_64:
the manual is clear that it does not matter whether it's 0 or 1 when
4-level-pts are enabled, but I was distracted to find cr4 different on
BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask().
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
Documentation/kernel-parameters.txt | 2 +
arch/x86/entry/entry_64.S | 15 ++++++-----
arch/x86/include/asm/cpufeatures.h | 3 ++
arch/x86/include/asm/kaiser.h | 27 +++++++++++++++------
arch/x86/include/asm/pgtable.h | 20 +++++++++++----
arch/x86/include/asm/pgtable_64.h | 13 +++-------
arch/x86/include/asm/pgtable_types.h | 4 ---
arch/x86/include/asm/tlbflush.h | 39 +++++++++++++++++++------------
arch/x86/kernel/cpu/common.c | 28 +++++++++++++++++++++-
arch/x86/kernel/espfix_64.c | 3 +-
arch/x86/kernel/head_64.S | 4 +--
arch/x86/mm/init.c | 2 -
arch/x86/mm/init_64.c | 10 +++++++
arch/x86/mm/kaiser.c | 26 +++++++++++++++++---
arch/x86/mm/pgtable.c | 8 +-----
arch/x86/mm/tlb.c | 4 ---
tools/arch/x86/include/asm/cpufeatures.h | 3 ++
17 files changed, 146 insertions(+), 65 deletions(-)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes
nojitter [IA-64] Disables jitter checking for ITC timers.
+ nokaiser [X86-64] Disable KAISER isolation of kernel from user.
+
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1079,7 +1079,7 @@ ENTRY(paranoid_entry)
* unconditionally, but we need to find out whether the reverse
* should be done on return (conveyed to paranoid_exit in %ebx).
*/
- movq %cr3, %rax
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
testl $KAISER_SHADOW_PGD_OFFSET, %eax
jz 2f
orl $2, %ebx
@@ -1111,6 +1111,7 @@ ENTRY(paranoid_exit)
TRACE_IRQS_OFF_DEBUG
TRACE_IRQS_IRETQ_DEBUG
#ifdef CONFIG_KAISER
+ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
testl $2, %ebx /* SWITCH_USER_CR3 needed? */
jz paranoid_exit_no_switch
SWITCH_USER_CR3
@@ -1341,13 +1342,14 @@ ENTRY(nmi)
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
- movq %cr3, %rax
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
orq x86_cr3_pcid_noflush, %rax
pushq %rax
/* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
movq %rax, %cr3
+2:
#endif
call do_nmi
@@ -1357,8 +1359,7 @@ ENTRY(nmi)
* kernel code that needs user CR3, but do we ever return
* to "user mode" where we need the kernel CR3?
*/
- popq %rax
- mov %rax, %cr3
+ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
#endif
/*
@@ -1585,13 +1586,14 @@ end_repeat_nmi:
#ifdef CONFIG_KAISER
/* Unconditionally use kernel CR3 for do_nmi() */
/* %rax is saved above, so OK to clobber here */
- movq %cr3, %rax
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
orq x86_cr3_pcid_noflush, %rax
pushq %rax
/* mask off "user" bit of pgd address and 12 PCID bits: */
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
movq %rax, %cr3
+2:
#endif
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
@@ -1603,8 +1605,7 @@ end_repeat_nmi:
* kernel code that needs user CR3, like just just before
* a sysret.
*/
- popq %rax
- mov %rax, %cr3
+ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
#endif
testl %ebx, %ebx /* swapgs needed? */
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -198,6 +198,9 @@
#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
+
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -46,28 +46,33 @@ movq \reg, %cr3
.endm
.macro SWITCH_KERNEL_CR3
-pushq %rax
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax
popq %rax
+8:
.endm
.macro SWITCH_USER_CR3
-pushq %rax
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
_SWITCH_TO_USER_CR3 %rax %al
popq %rax
+8:
.endm
.macro SWITCH_KERNEL_CR3_NO_STACK
-movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+ALTERNATIVE "jmp 8f", \
+ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
+ X86_FEATURE_KAISER
_SWITCH_TO_KERNEL_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+8:
.endm
#else /* CONFIG_KAISER */
-.macro SWITCH_KERNEL_CR3 reg
+.macro SWITCH_KERNEL_CR3
.endm
-.macro SWITCH_USER_CR3 reg regb
+.macro SWITCH_USER_CR3
.endm
.macro SWITCH_KERNEL_CR3_NO_STACK
.endm
@@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_p
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif /* CONFIG_KAISER */
+
+/*
+ * Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
+ * so as to build with tests on kaiser_enabled instead of #ifdefs.
+ */
+
/**
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range
@@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsign
*/
extern void kaiser_init(void);
-#endif /* CONFIG_KAISER */
-
#endif /* __ASSEMBLY */
#endif /* _ASM_X86_KAISER_H */
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -18,6 +18,12 @@
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
+#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif
+
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_checkwx(void);
@@ -697,7 +703,7 @@ static inline int pgd_bad(pgd_t pgd)
* page table by accident; it will fault on the first
* instruction it tries to run. See native_set_pgd().
*/
- if (IS_ENABLED(CONFIG_KAISER))
+ if (kaiser_enabled)
ignore_flags |= _PAGE_NX;
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
@@ -913,12 +919,14 @@ static inline void pmdp_set_wrprotect(st
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
- memcpy(dst, src, count * sizeof(pgd_t));
+ memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
- /* Clone the shadow pgd part as well */
- memcpy(native_get_shadow_pgd(dst),
- native_get_shadow_pgd(src),
- count * sizeof(pgd_t));
+ if (kaiser_enabled) {
+ /* Clone the shadow pgd part as well */
+ memcpy(native_get_shadow_pgd(dst),
+ native_get_shadow_pgd(src),
+ count * sizeof(pgd_t));
+ }
#endif
}
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
{
+#ifdef CONFIG_DEBUG_VM
+ /* linux/mmdebug.h may not have been included at this point */
+ BUG_ON(!kaiser_enabled);
+#endif
return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
}
-
-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
-{
- return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
-}
#else
static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
@@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_p
BUILD_BUG_ON(1);
return NULL;
}
-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
-{
- return pgdp;
-}
#endif /* CONFIG_KAISER */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -45,11 +45,7 @@
#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#ifdef CONFIG_KAISER
-#define _PAGE_GLOBAL (_AT(pteval_t, 0))
-#else
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -137,9 +137,11 @@ static inline void cr4_set_bits_and_upda
* to avoid the need for asm/kaiser.h in unexpected places.
*/
#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
extern void kaiser_setup_pcid(void);
extern void kaiser_flush_tlb_on_return_to_user(void);
#else
+#define kaiser_enabled 0
static inline void kaiser_setup_pcid(void)
{
}
@@ -164,7 +166,7 @@ static inline void __native_flush_tlb(vo
* back:
*/
preempt_disable();
- if (this_cpu_has(X86_FEATURE_PCID))
+ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
kaiser_flush_tlb_on_return_to_user();
native_write_cr3(native_read_cr3());
preempt_enable();
@@ -175,20 +177,30 @@ static inline void __native_flush_tlb_gl
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* clear PGE */
- native_write_cr4(cr4 & ~X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ if (cr4 & X86_CR4_PGE) {
+ /* clear PGE and flush TLB of all entries */
+ native_write_cr4(cr4 & ~X86_CR4_PGE);
+ /* restore PGE as it was before */
+ native_write_cr4(cr4);
+ } else {
+ /*
+ * x86_64 microcode update comes this way when CR4.PGE is not
+ * enabled, and it's safer for all callers to allow this case.
+ */
+ native_write_cr3(native_read_cr3());
+ }
}
static inline void __native_flush_tlb_global(void)
{
-#ifdef CONFIG_KAISER
- /* Globals are not used at all */
- __native_flush_tlb();
-#else
unsigned long flags;
+ if (kaiser_enabled) {
+ /* Globals are not used at all */
+ __native_flush_tlb();
+ return;
+ }
+
if (this_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
@@ -208,7 +220,6 @@ static inline void __native_flush_tlb_gl
raw_local_irq_save(flags);
__native_flush_tlb_global_irq_disabled();
raw_local_irq_restore(flags);
-#endif
}
static inline void __native_flush_tlb_single(unsigned long addr)
@@ -223,7 +234,7 @@ static inline void __native_flush_tlb_si
*/
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
- if (this_cpu_has(X86_FEATURE_PCID))
+ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
kaiser_flush_tlb_on_return_to_user();
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
return;
@@ -238,9 +249,9 @@ static inline void __native_flush_tlb_si
* Make sure to do only a single invpcid when KAISER is
* disabled and we have only a single ASID.
*/
- if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
- invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
- invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+ if (kaiser_enabled)
+ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
}
static inline void __flush_tlb_all(void)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -179,6 +179,20 @@ static int __init x86_pcid_setup(char *s
return 1;
}
__setup("nopcid", x86_pcid_setup);
+
+static int __init x86_nokaiser_setup(char *s)
+{
+ /* nokaiser doesn't accept parameters */
+ if (s)
+ return -EINVAL;
+#ifdef CONFIG_KAISER
+ kaiser_enabled = 0;
+ setup_clear_cpu_cap(X86_FEATURE_KAISER);
+ pr_info("nokaiser: KAISER feature disabled\n");
+#endif
+ return 0;
+}
+early_param("nokaiser", x86_nokaiser_setup);
#endif
static int __init x86_noinvpcid_setup(char *s)
@@ -327,7 +341,7 @@ static __always_inline void setup_smap(s
static void setup_pcid(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_PCID)) {
- if (cpu_has(c, X86_FEATURE_PGE)) {
+ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
cr4_set_bits(X86_CR4_PCIDE);
/*
* INVPCID has two "groups" of types:
@@ -799,6 +813,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
init_scattered_cpuid_features(c);
+#ifdef CONFIG_KAISER
+ if (kaiser_enabled)
+ set_cpu_cap(c, X86_FEATURE_KAISER);
+#endif
}
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -1537,6 +1555,14 @@ void cpu_init(void)
* try to read it.
*/
cr4_init_shadow();
+ if (!kaiser_enabled) {
+ /*
+ * secondary_startup_64() deferred setting PGE in cr4:
+ * probe_page_size_mask() sets it on the boot cpu,
+ * but it needs to be set on each secondary cpu.
+ */
+ cr4_set_bits(X86_CR4_PGE);
+ }
/*
* Load microcode on this cpu if a valid microcode is available.
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -132,9 +132,10 @@ void __init init_espfix_bsp(void)
* area to ensure it is mapped into the shadow user page
* tables.
*/
- if (IS_ENABLED(CONFIG_KAISER))
+ if (kaiser_enabled) {
set_pgd(native_get_shadow_pgd(pgd_p),
__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
+ }
/* Randomize the locations */
init_espfix_random();
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
movq $(init_level4_pgt - __START_KERNEL_map), %rax
1:
- /* Enable PAE mode and PGE */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
+ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
movq %rcx, %cr4
/* Setup early boot stage 4 level pagetables. */
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -177,7 +177,7 @@ static void __init probe_page_size_mask(
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
- if (boot_cpu_has(X86_FEATURE_PGE)) {
+ if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
} else
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -324,6 +324,16 @@ void __init cleanup_highmap(void)
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0));
+ else if (kaiser_enabled) {
+ /*
+ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
+ * clear that now. This is not important, so long as
+ * CR4.PGE remains clear, but it removes an anomaly.
+ * Physical mapping setup below avoids _PAGE_GLOBAL
+ * by use of massage_pgprot() inside pfn_pte() etc.
+ */
+ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
+ }
}
}
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -16,7 +16,9 @@
#include <asm/pgalloc.h>
#include <asm/desc.h>
-#ifdef CONFIG_KAISER
+int kaiser_enabled __read_mostly = 1;
+EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
+
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
@@ -167,8 +169,8 @@ static pte_t *kaiser_pagetable_walk(unsi
return pte_offset_kernel(pmd, address);
}
-int kaiser_add_user_map(const void *__start_addr, unsigned long size,
- unsigned long flags)
+static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+ unsigned long flags)
{
int ret = 0;
pte_t *pte;
@@ -177,6 +179,15 @@ int kaiser_add_user_map(const void *__st
unsigned long end_addr = PAGE_ALIGN(start_addr + size);
unsigned long target_address;
+ /*
+ * It is convenient for callers to pass in __PAGE_KERNEL etc,
+ * and there is no actual harm from setting _PAGE_GLOBAL, so
+ * long as CR4.PGE is not set. But it is nonetheless troubling
+ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
+ * requires that not to be #defined to 0): so mask it off here.
+ */
+ flags &= ~_PAGE_GLOBAL;
+
for (; address < end_addr; address += PAGE_SIZE) {
target_address = get_pa_from_mapping(address);
if (target_address == -1) {
@@ -263,6 +274,8 @@ void __init kaiser_init(void)
{
int cpu;
+ if (!kaiser_enabled)
+ return;
kaiser_init_all_pgds();
for_each_possible_cpu(cpu) {
@@ -311,6 +324,8 @@ void __init kaiser_init(void)
/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
+ if (!kaiser_enabled)
+ return 0;
return kaiser_add_user_map((const void *)addr, size, flags);
}
@@ -322,6 +337,8 @@ void kaiser_remove_mapping(unsigned long
unsigned long addr, next;
pgd_t *pgd;
+ if (!kaiser_enabled)
+ return;
pgd = native_get_shadow_pgd(pgd_offset_k(start));
for (addr = start; addr < end; pgd++, addr = next) {
next = pgd_addr_end(addr, end);
@@ -343,6 +360,8 @@ static inline bool is_userspace_pgd(pgd_
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
+ if (!kaiser_enabled)
+ return pgd;
/*
* Do we need to also populate the shadow pgd? Check _PAGE_USER to
* skip cases like kexec and EFI which make temporary low mappings.
@@ -399,4 +418,3 @@ void kaiser_flush_tlb_on_return_to_user(
X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
-#endif /* CONFIG_KAISER */
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -345,16 +345,12 @@ static inline void _pgd_free(pgd_t *pgd)
}
#else
-#ifdef CONFIG_KAISER
/*
- * Instead of one pmd, we aquire two pmds. Being order-1, it is
+ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
* both 8k in size and 8k-aligned. That lets us just flip bit 12
* in a pointer to swap between the two 4k halves.
*/
-#define PGD_ALLOCATION_ORDER 1
-#else
-#define PGD_ALLOCATION_ORDER 0
-#endif
+#define PGD_ALLOCATION_ORDER kaiser_enabled
static inline pgd_t *_pgd_alloc(void)
{
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -39,8 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir
{
unsigned long new_mm_cr3 = __pa(pgdir);
-#ifdef CONFIG_KAISER
- if (this_cpu_has(X86_FEATURE_PCID)) {
+ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) {
/*
* We reuse the same PCID for different tasks, so we must
* flush all the entries for the PCID out when we change tasks.
@@ -57,7 +56,6 @@ static void load_new_mm_cr3(pgd_t *pgdir
new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
kaiser_flush_tlb_on_return_to_user();
}
-#endif /* CONFIG_KAISER */
/*
* Caution: many callers of this function expect
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -197,6 +197,9 @@
#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
+
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
Patches currently in stable-queue which might be from hughd(a)google.com are
queue-4.9/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.9/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.9/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.9/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.9/kaiser-merged-update.patch
queue-4.9/kaiser-delete-kaiser_real_switch-option.patch
queue-4.9/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.9/kaiser-fix-perf-crashes.patch
queue-4.9/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.9/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.9/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.9/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.9/kaiser-align-addition-to-x86-mm-makefile.patch
queue-4.9/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.9/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.9/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.9/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.9/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.9/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.9/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.9/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.9/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.9/kaiser-kernel-address-isolation.patch
queue-4.9/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.9/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.9/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.9/kaiser-kaiser-depends-on-smp.patch
queue-4.9/kaiser-pcid-0-for-kernel-and-128-for-user.patch