This is a note to let you know that I've just added the patch titled
kaiser: tidied up asm/kaiser.h somewhat
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-tidied-up-asm-kaiser.h-somewhat.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Hugh Dickins <hughd@google.com>
Date: Sun, 3 Sep 2017 19:18:07 -0700
Subject: kaiser: tidied up asm/kaiser.h somewhat
From: Hugh Dickins <hughd@google.com>
Mainly deleting a surfeit of blank lines, and reflowing header comment.
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/include/asm/kaiser.h | 32 +++++++++++++-------------------
1 file changed, 13 insertions(+), 19 deletions(-)
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -1,15 +1,17 @@
#ifndef _ASM_X86_KAISER_H
#define _ASM_X86_KAISER_H
-
-/* This file includes the definitions for the KAISER feature.
- * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory.
- * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped,
- * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled,
- * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled.
- * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory.
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on
+ * the kernel virtual memory. It has a shadow pgd for every process: the
+ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
+ * user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode,
+ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
+ * and the user may not attack the whole kernel memory.
*
- * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions
- * of the user space, or the stacks.
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user
+ * mode, such as the entry/exit functions of the user space, or the stacks.
*/
#ifdef __ASSEMBLY__
#ifdef CONFIG_KAISER
@@ -48,13 +50,10 @@ _SWITCH_TO_KERNEL_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm
-
.macro SWITCH_USER_CR3_NO_STACK
-
movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
_SWITCH_TO_USER_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
-
.endm
#else /* CONFIG_KAISER */
@@ -72,7 +71,6 @@ movq PER_CPU_VAR(unsafe_stack_register_b
#else /* __ASSEMBLY__ */
-
#ifdef CONFIG_KAISER
/*
* Upon kernel/user mode switch, it may happen that the address
@@ -80,7 +78,6 @@ movq PER_CPU_VAR(unsafe_stack_register_b
* stored. To change the address space, another register is
* needed. A register therefore has to be stored/restored.
*/
-
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/**
@@ -95,7 +92,6 @@ DECLARE_PER_CPU_USER_MAPPED(unsigned lon
*/
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
-
/**
* kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
* @addr: the start address of the range
@@ -104,12 +100,12 @@ extern int kaiser_add_mapping(unsigned l
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
/**
- * kaiser_initialize_mapping - Initalize the shadow mapping
+ * kaiser_init - Initialize the shadow mapping
*
* Most parts of the shadow mapping can be mapped upon boot
* time. Only per-process things like the thread stacks
* or a new LDT have to be mapped at runtime. These boot-
- * time mappings are permanent and nevertunmapped.
+ * time mappings are permanent and never unmapped.
*/
extern void kaiser_init(void);
@@ -117,6 +113,4 @@ extern void kaiser_init(void);
#endif /* __ASSEMBLY */
-
-
#endif /* _ASM_X86_KAISER_H */
Patches currently in stable-queue which might be from hughd@google.com are
queue-4.4/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.4/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.4/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.4/kaiser-_pgd_alloc-without-__gfp_repeat-to-avoid-stalls.patch
queue-4.4/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.4/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.4/kaiser-merged-update.patch
queue-4.4/kaiser-delete-kaiser_real_switch-option.patch
queue-4.4/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.4/kaiser-fix-perf-crashes.patch
queue-4.4/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.4/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.4/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.4/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.4/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.4/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.4/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.4/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.4/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.4/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.4/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.4/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.4/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.4/kaiser-kernel-address-isolation.patch
queue-4.4/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.4/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.4/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.4/kaiser-kaiser-depends-on-smp.patch
queue-4.4/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-stack-map-page_size-at-thread_size-page_size.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Hugh Dickins <hughd@google.com>
Date: Sun, 3 Sep 2017 18:57:03 -0700
Subject: kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
From: Hugh Dickins <hughd@google.com>
Kaiser only needs to map one page of the stack; and
kernel/fork.c did not build on powerpc (no __PAGE_KERNEL).
It's all cleaner if linux/kaiser.h provides kaiser_map_thread_stack()
and kaiser_unmap_thread_stack() wrappers around asm/kaiser.h's
kaiser_add_mapping() and kaiser_remove_mapping(). And use
linux/kaiser.h in init/main.c to avoid the #ifdefs there.
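For reference, a rough userspace sketch (not part of the patch) of what the
new wrappers reduce to under CONFIG_KAISER; kaiser_add_mapping() and
kaiser_remove_mapping() are stubbed out here, and the THREAD_SIZE, PAGE_SIZE
and __PAGE_KERNEL values are illustrative stand-ins, not the real ones:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define THREAD_SIZE	(4 * PAGE_SIZE)		/* example value only */
#define __PAGE_KERNEL	0UL			/* placeholder flags */

/* Stand-ins for the asm/kaiser.h primitives the wrappers call. */
static int kaiser_add_mapping(unsigned long addr, unsigned long size,
			      unsigned long flags)
{
	printf("map   %#lx + %#lx (flags %#lx)\n", addr, size, flags);
	return 0;
}

static void kaiser_remove_mapping(unsigned long addr, unsigned long size)
{
	printf("unmap %#lx + %#lx\n", addr, size);
}

/*
 * Map/unmap only the top page of the stack, the one on which the kernel
 * is entered from user context.
 */
static int kaiser_map_thread_stack(void *stack)
{
	return kaiser_add_mapping((unsigned long)stack +
			THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
}

static void kaiser_unmap_thread_stack(void *stack)
{
	kaiser_remove_mapping((unsigned long)stack +
			THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
}

int main(void)
{
	static char stack[THREAD_SIZE];	/* hypothetical thread stack */

	kaiser_map_thread_stack(stack);
	kaiser_unmap_thread_stack(stack);
	return 0;
}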
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
include/linux/kaiser.h | 40 +++++++++++++++++++++++++++++++++-------
init/main.c | 6 +-----
kernel/fork.c | 7 ++-----
3 files changed, 36 insertions(+), 17 deletions(-)
--- a/include/linux/kaiser.h
+++ b/include/linux/kaiser.h
@@ -1,26 +1,52 @@
-#ifndef _INCLUDE_KAISER_H
-#define _INCLUDE_KAISER_H
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
#ifdef CONFIG_KAISER
#include <asm/kaiser.h>
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+ /*
+ * Map that page of kernel stack on which we enter from user context.
+ */
+ return kaiser_add_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+ /*
+ * Note: may be called even when kaiser_map_thread_stack() failed.
+ */
+ kaiser_remove_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
#else
/*
* These stubs are used whenever CONFIG_KAISER is off, which
- * includes architectures that support KAISER, but have it
- * disabled.
+ * includes architectures that support KAISER, but have it disabled.
*/
static inline void kaiser_init(void)
{
}
-static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
+static inline int kaiser_add_mapping(unsigned long addr,
+ unsigned long size, unsigned long flags)
+{
+ return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+ unsigned long size)
{
}
-static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+static inline int kaiser_map_thread_stack(void *stack)
{
return 0;
}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
#endif /* !CONFIG_KAISER */
-#endif /* _INCLUDE_KAISER_H */
+#endif /* _LINUX_KAISER_H */
--- a/init/main.c
+++ b/init/main.c
@@ -81,15 +81,13 @@
#include <linux/integrity.h>
#include <linux/proc_ns.h>
#include <linux/io.h>
+#include <linux/kaiser.h>
#include <asm/io.h>
#include <asm/bugs.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
-#ifdef CONFIG_KAISER
-#include <asm/kaiser.h>
-#endif
static int kernel_init(void *);
@@ -495,9 +493,7 @@ static void __init mm_init(void)
pgtable_init();
vmalloc_init();
ioremap_huge_init();
-#ifdef CONFIG_KAISER
kaiser_init();
-#endif
}
asmlinkage __visible void __init start_kernel(void)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -168,12 +168,9 @@ static struct thread_info *alloc_thread_
return page ? page_address(page) : NULL;
}
-extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size);
static inline void free_thread_info(struct thread_info *ti)
{
-#ifdef CONFIG_KAISER
- kaiser_remove_mapping((unsigned long)ti, THREAD_SIZE);
-#endif
+ kaiser_unmap_thread_stack(ti);
free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
@@ -358,7 +355,7 @@ static struct task_struct *dup_task_stru
tsk->stack = ti;
- err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+ err = kaiser_map_thread_stack(tsk->stack);
if (err)
goto free_ti;
#ifdef CONFIG_SECCOMP
Patches currently in stable-queue which might be from hughd@google.com are
queue-4.4/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.4/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.4/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.4/kaiser-_pgd_alloc-without-__gfp_repeat-to-avoid-stalls.patch
queue-4.4/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.4/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.4/kaiser-merged-update.patch
queue-4.4/kaiser-delete-kaiser_real_switch-option.patch
queue-4.4/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.4/kaiser-fix-perf-crashes.patch
queue-4.4/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.4/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.4/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.4/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.4/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.4/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.4/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.4/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.4/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.4/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.4/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.4/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.4/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.4/kaiser-kernel-address-isolation.patch
queue-4.4/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.4/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.4/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.4/kaiser-kaiser-depends-on-smp.patch
queue-4.4/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: PCID 0 for kernel and 128 for user
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-pcid-0-for-kernel-and-128-for-user.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Hugh Dickins <hughd@google.com>
Date: Fri, 8 Sep 2017 19:26:30 -0700
Subject: kaiser: PCID 0 for kernel and 128 for user
From: Hugh Dickins <hughd@google.com>
Why was 4 chosen for kernel PCID and 6 for user PCID?
No good reason in a backport where PCIDs are only used for Kaiser.
If we continue with those, then we shall need to add Andy Lutomirski's
4.13 commit 6c690ee1039b ("x86/mm: Split read_cr3() into read_cr3_pa()
and __read_cr3()"), which deals with the problem of read_cr3() callers
finding stray bits in the cr3 that they expected to be page-aligned;
and for hibernation, his 4.14 commit f34902c5c6c0 ("x86/hibernate/64:
Mask off CR3's PCID bits in the saved CR3").
But if 0 is used for kernel PCID, then there's no need to add in those
commits - whenever the kernel looks, it sees 0 in the lower bits; and
0 for kernel seems an obvious choice.
And I naughtily propose 128 for user PCID. Because there's a place
in _SWITCH_TO_USER_CR3 where it takes note of the need for TLB FLUSH,
but needs to reset that to NOFLUSH for the next occasion. Currently
it does so with a "movb $(0x80)" into the high byte of the per-cpu
quadword, but that will cause a machine without PCID support to crash.
Now, if %al just happened to have 0x80 in it at that point, on a
machine with PCID support, but 0 on a machine without PCID support...
(That will go badly wrong once the pgd can be at a physical address
above 2^56, but even with 5-level paging, physical goes up to 2^52.)
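A rough userspace illustration (not part of the patch) of the byte aliasing
this relies on: X86_CR3_PCID_NOFLUSH is bit 63 of CR3, so in the little-endian
per-cpu quadword it lives in byte 7, and with 0x80 as the user PCID the low
byte of the assembled CR3 value is exactly what byte 7 needs for next time
(0x80 with PCID enabled, 0x00 without):

#include <stdio.h>
#include <stdint.h>

#define X86_CR3_PCID_NOFLUSH	(1ULL << 63)	/* "no flush" bit in CR3 */

static void demo(uint64_t asid_user)
{
	/*
	 * Per-cpu X86_CR3_PCID_USER_VAR, armed to FLUSH on the next switch
	 * (NOFLUSH clear); the shadow-pgd offset is omitted here, it does
	 * not affect either byte of interest.
	 */
	uint64_t user_var = asid_user;
	uint8_t *bytes = (uint8_t *)&user_var;	/* little-endian view */

	/*
	 * _SWITCH_TO_USER_CR3 ORs user_var into \reg, so \regb (%al) holds
	 * the low byte; storing it at offset +7 sets bit 63 (NOFLUSH) for
	 * next time when PCID is in use, and is a harmless 0x00 otherwise.
	 */
	uint8_t regb = bytes[0];
	bytes[7] = regb;			/* movb \regb, VAR+7 */

	printf("user PCID %#04x -> NOFLUSH %s for next time\n",
	       (unsigned int)asid_user,
	       (user_var & X86_CR3_PCID_NOFLUSH) ? "set" : "clear");
}

int main(void)
{
	demo(0x80);	/* PCID enabled: user ASID 128 */
	demo(0x00);	/* PCID disabled: ASID 0 */
	return 0;
}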
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/include/asm/kaiser.h | 19 ++++++++++++-------
arch/x86/include/asm/pgtable_types.h | 7 ++++---
arch/x86/mm/tlb.c | 3 +++
3 files changed, 19 insertions(+), 10 deletions(-)
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -29,14 +29,19 @@ orq X86_CR3_PCID_KERN_VAR, \reg
movq \reg, %cr3
.endm
-.macro _SWITCH_TO_USER_CR3 reg
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
movq %cr3, \reg
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg
js 9f
-// FLUSH this time, reset to NOFLUSH for next time
-// But if nopcid? Consider using 0x80 for user pcid?
-movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7)
+/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */
+movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7)
9:
movq \reg, %cr3
.endm
@@ -49,7 +54,7 @@ popq %rax
.macro SWITCH_USER_CR3
pushq %rax
-_SWITCH_TO_USER_CR3 %rax
+_SWITCH_TO_USER_CR3 %rax %al
popq %rax
.endm
@@ -61,7 +66,7 @@ movq PER_CPU_VAR(unsafe_stack_register_b
.macro SWITCH_USER_CR3_NO_STACK
movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
-_SWITCH_TO_USER_CR3 %rax
+_SWITCH_TO_USER_CR3 %rax %al
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm
@@ -69,7 +74,7 @@ movq PER_CPU_VAR(unsafe_stack_register_b
.macro SWITCH_KERNEL_CR3 reg
.endm
-.macro SWITCH_USER_CR3 reg
+.macro SWITCH_USER_CR3 reg regb
.endm
.macro SWITCH_USER_CR3_NO_STACK
.endm
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -111,16 +111,17 @@
/* Mask for all the PCID-related bits in CR3: */
#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
+
#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
-#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL))
-#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL))
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
#else
-#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
/*
* PCIDs are unsupported on 32-bit and none of these bits can be
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -50,6 +50,9 @@ static void load_new_mm_cr3(pgd_t *pgdir
* invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
* do it here, but can only be used if X86_FEATURE_INVPCID is
* available - and many machines support pcid without invpcid.
+ *
+ * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
+ * but keep that line in there in case something changes.
*/
new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
kaiser_flush_tlb_on_return_to_user();
Patches currently in stable-queue which might be from hughd@google.com are
queue-4.4/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.4/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.4/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.4/kaiser-_pgd_alloc-without-__gfp_repeat-to-avoid-stalls.patch
queue-4.4/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.4/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.4/kaiser-merged-update.patch
queue-4.4/kaiser-delete-kaiser_real_switch-option.patch
queue-4.4/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.4/kaiser-fix-perf-crashes.patch
queue-4.4/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.4/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.4/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.4/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.4/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.4/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.4/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.4/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.4/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.4/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.4/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.4/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.4/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.4/kaiser-kernel-address-isolation.patch
queue-4.4/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.4/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.4/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.4/kaiser-kaiser-depends-on-smp.patch
queue-4.4/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: paranoid_entry pass cr3 need to paranoid_exit
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Hugh Dickins <hughd@google.com>
Date: Tue, 26 Sep 2017 18:43:07 -0700
Subject: kaiser: paranoid_entry pass cr3 need to paranoid_exit
From: Hugh Dickins <hughd@google.com>
Neel Natu points out that paranoid_entry() was wrong to assume that
an entry that did not need swapgs would not need SWITCH_KERNEL_CR3:
paranoid_entry (used for debug breakpoint, int3, double fault or MCE;
though I think it's only the MCE case that is cause for concern here)
can break in at an awkward time, between cr3 switch and swapgs, but
its handling always needs kernel gs and kernel cr3.
Easy to fix in itself, but paranoid_entry() also needs to convey to
paranoid_exit() (and my reading of macro idtentry says paranoid_entry
and paranoid_exit are always paired) how to restore the prior state.
The swapgs state is already conveyed by %ebx (0 or 1), so extend that
also to convey when SWITCH_USER_CR3 will be needed (2 or 3).
(Yes, I'd much prefer that 0 meant no swapgs, whereas it's the other
way round: and a convention shared with error_entry() and error_exit(),
which I don't want to touch. Perhaps I should have inverted the bit
for switch cr3 too, but did not.)
paranoid_exit() would be straightforward, except for TRACE_IRQS: it
did TRACE_IRQS_IRETQ when doing swapgs, but TRACE_IRQS_IRETQ_DEBUG
when not: which is it supposed to use when SWITCH_USER_CR3 is split
apart from that? As best as I can determine, commit 5963e317b1e9
("ftrace/x86: Do not change stacks in DEBUG when calling lockdep")
missed the swapgs case, and should have used TRACE_IRQS_IRETQ_DEBUG
there too (the discrepancy has nothing to do with the liberal use
of _NO_STACK and _UNSAFE_STACK hereabouts: TRACE_IRQS_OFF_DEBUG has
just been used in all cases); discrepancy lovingly preserved across
several paranoid_exit() cleanups, but I'm now removing it.
Neel further indicates that to use SWITCH_USER_CR3_NO_STACK there in
paranoid_exit() is now not only unnecessary but unsafe: might corrupt
syscall entry's unsafe_stack_register_backup of %rax. Just use
SWITCH_USER_CR3: and delete SWITCH_USER_CR3_NO_STACK altogether,
before we make the mistake of using it again.
hughd adds: this commit fixes an issue in the Kaiser-without-PCIDs
part of the series, and ought to be moved earlier, if you decided
to make a release of Kaiser-without-PCIDs.
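To make the new convention easy to read back, here is an illustrative decoder
(not kernel code) of the %ebx value that paranoid_entry hands to
paranoid_exit: bit 0 keeps its old "swapgs not needed" meaning, and bit 1 now
means "SWITCH_USER_CR3 needed":

#include <stdio.h>

static void decode(int ebx)
{
	printf("ebx=%d: %s swapgs, %s SWITCH_USER_CR3\n", ebx,
	       (ebx & 1) ? "skip" : "needs",
	       (ebx & 2) ? "needs" : "skip");
}

int main(void)
{
	decode(0);	/* needs swapgs, not SWITCH_USER_CR3 */
	decode(1);	/* needs neither */
	decode(2);	/* needs both */
	decode(3);	/* needs SWITCH_USER_CR3, not swapgs */
	return 0;
}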
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/entry/entry_64.S | 46 ++++++++++++++++++++++++++++++++----------
arch/x86/include/asm/kaiser.h | 8 -------
2 files changed, 36 insertions(+), 18 deletions(-)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1025,7 +1025,11 @@ idtentry machine_check has_error_cod
/*
* Save all registers in pt_regs, and switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ *
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
*/
ENTRY(paranoid_entry)
cld
@@ -1037,9 +1041,26 @@ ENTRY(paranoid_entry)
testl %edx, %edx
js 1f /* negative -> in kernel */
SWAPGS
- SWITCH_KERNEL_CR3
xorl %ebx, %ebx
-1: ret
+1:
+#ifdef CONFIG_KAISER
+ /*
+ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+ movq %cr3, %rax
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ orq x86_cr3_pcid_noflush, %rax
+ movq %rax, %cr3
+2:
+#endif
+ ret
END(paranoid_entry)
/*
@@ -1052,20 +1073,25 @@ END(paranoid_entry)
* be complicated. Fortunately, we there's no good reason
* to try to handle preemption here.
*
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
*/
ENTRY(paranoid_exit)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
- testl %ebx, %ebx /* swapgs needed? */
+ TRACE_IRQS_IRETQ_DEBUG
+#ifdef CONFIG_KAISER
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+ testl $1, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
- TRACE_IRQS_IRETQ
- SWITCH_USER_CR3_NO_STACK
SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
- TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -63,20 +63,12 @@ _SWITCH_TO_KERNEL_CR3 %rax
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
.endm
-.macro SWITCH_USER_CR3_NO_STACK
-movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
-_SWITCH_TO_USER_CR3 %rax %al
-movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
-.endm
-
#else /* CONFIG_KAISER */
.macro SWITCH_KERNEL_CR3 reg
.endm
.macro SWITCH_USER_CR3 reg regb
.endm
-.macro SWITCH_USER_CR3_NO_STACK
-.endm
.macro SWITCH_KERNEL_CR3_NO_STACK
.endm
Patches currently in stable-queue which might be from hughd@google.com are
queue-4.4/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.4/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.4/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.4/kaiser-_pgd_alloc-without-__gfp_repeat-to-avoid-stalls.patch
queue-4.4/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.4/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.4/kaiser-merged-update.patch
queue-4.4/kaiser-delete-kaiser_real_switch-option.patch
queue-4.4/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.4/kaiser-fix-perf-crashes.patch
queue-4.4/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.4/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.4/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.4/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.4/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.4/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.4/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.4/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.4/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.4/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.4/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.4/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.4/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.4/kaiser-kernel-address-isolation.patch
queue-4.4/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.4/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.4/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.4/kaiser-kaiser-depends-on-smp.patch
queue-4.4/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: merged update
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-merged-update.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 30 Aug 2017 16:23:00 -0700
Subject: kaiser: merged update
From: Dave Hansen <dave.hansen@linux.intel.com>
Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging).
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/entry/entry_64.S | 106 ++++++++++-
arch/x86/include/asm/kaiser.h | 43 ++--
arch/x86/include/asm/pgtable.h | 18 +
arch/x86/include/asm/pgtable_64.h | 48 ++++-
arch/x86/include/asm/pgtable_types.h | 6
arch/x86/kernel/espfix_64.c | 13 -
arch/x86/kernel/head_64.S | 19 +-
arch/x86/kernel/ldt.c | 27 ++
arch/x86/kernel/tracepoint.c | 2
arch/x86/mm/kaiser.c | 318 +++++++++++++++++++++++++----------
arch/x86/mm/pageattr.c | 63 +++++-
arch/x86/mm/pgtable.c | 40 +---
include/linux/kaiser.h | 26 ++
kernel/fork.c | 9
security/Kconfig | 5
15 files changed, 553 insertions(+), 190 deletions(-)
create mode 100644 include/linux/kaiser.h
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath:
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
RESTORE_C_REGS_EXCEPT_RCX_R11
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
/*
@@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call)
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
opportunistic_sysret_failed:
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
@@ -1059,6 +1080,13 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+ /*
+ * error_entry() always returns with a kernel gsbase and
+ * CR3. We must also have a kernel CR3/gsbase before
+ * calling TRACE_IRQS_*. Just unconditionally switch to
+ * the kernel CR3 here.
+ */
+ SWITCH_KERNEL_CR3
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1069,7 +1097,6 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
- SWITCH_KERNEL_CR3
.Lerror_entry_from_usermode_after_swapgs:
/*
@@ -1122,7 +1149,7 @@ ENTRY(error_entry)
* Switch to kernel gsbase:
*/
SWAPGS
- SWITCH_KERNEL_CR3
+
/*
* Pretend that the exception came from user mode: set up pt_regs
* as if we faulted immediately after IRET and clear EBX so that
@@ -1222,7 +1249,10 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
- SWITCH_KERNEL_CR3_NO_STACK
+ /*
+ * percpu variables are mapped with user CR3, so no need
+ * to switch CR3 here.
+ */
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1256,14 +1286,33 @@ ENTRY(nmi)
movq %rsp, %rdi
movq $-1, %rsi
+#ifdef CONFIG_KAISER
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+ movq %cr3, %rax
+ pushq %rax
+#ifdef CONFIG_KAISER_REAL_SWITCH
+ andq $(~0x1000), %rax
+#endif
+ movq %rax, %cr3
+#endif
call do_nmi
+ /*
+ * Unconditionally restore CR3. I know we return to
+ * kernel code that needs user CR3, but do we ever return
+ * to "user mode" where we need the kernel CR3?
+ */
+#ifdef CONFIG_KAISER
+ popq %rax
+ mov %rax, %cr3
+#endif
/*
* Return back to user mode. We must *not* do the normal exit
- * work, because we don't want to enable interrupts. Fortunately,
- * do_nmi doesn't modify pt_regs.
+ * work, because we don't want to enable interrupts. Do not
+ * switch to user CR3: we might be going back to kernel code
+ * that had a user CR3 set.
*/
- SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
@@ -1459,23 +1508,54 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK
/*
- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
- * as we should not be calling schedule in NMI context.
- * Even with normal interrupts enabled. An NMI should not be
- * setting NEED_RESCHED or anything that normal interrupts and
- * exceptions might do.
+ * Use the same approach as paranoid_entry to handle SWAPGS, but
+ * without CR3 handling since we do that differently in NMIs. No
+ * need to use paranoid_exit as we should not be calling schedule
+ * in NMI context. Even with normal interrupts enabled. An NMI
+ * should not be setting NEED_RESCHED or anything that normal
+ * interrupts and exceptions might do.
*/
- call paranoid_entry
+ cld
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
+ movl $1, %ebx
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx, %ebx
+1:
+#ifdef CONFIG_KAISER
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+ movq %cr3, %rax
+ pushq %rax
+#ifdef CONFIG_KAISER_REAL_SWITCH
+ andq $(~0x1000), %rax
+#endif
+ movq %rax, %cr3
+#endif
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
+ addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
movq $-1, %rsi
call do_nmi
+ /*
+ * Unconditionally restore CR3. We might be returning to
+ * kernel code that needs user CR3, like just just before
+ * a sysret.
+ */
+#ifdef CONFIG_KAISER
+ popq %rax
+ mov %rax, %cr3
+#endif
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
- SWITCH_USER_CR3_NO_STACK
+ /* We fixed up CR3 above, so no need to switch it here */
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -16,13 +16,17 @@
.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
+#ifdef CONFIG_KAISER_REAL_SWITCH
andq $(~0x1000), \reg
+#endif
movq \reg, %cr3
.endm
.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
+#ifdef CONFIG_KAISER_REAL_SWITCH
orq $(0x1000), \reg
+#endif
movq \reg, %cr3
.endm
@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_b
.endm
#endif /* CONFIG_KAISER */
+
#else /* __ASSEMBLY__ */
#ifdef CONFIG_KAISER
-// Upon kernel/user mode switch, it may happen that
-// the address space has to be switched before the registers have been stored.
-// To change the address space, another register is needed.
-// A register therefore has to be stored/restored.
-//
-DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored. To change the address space, another register is
+ * needed. A register therefore has to be stored/restored.
+*/
-#endif /* CONFIG_KAISER */
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/**
- * shadowmem_add_mapping - map a virtual memory part to the shadow mapping
+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range
* @size: the size of the range
* @flags: The mapping flags of the pages
*
- * the mapping is done on a global scope, so no bigger synchronization has to be done.
- * the pages have to be manually unmapped again when they are not needed any longer.
+ * The mapping is done on a global scope, so no bigger
+ * synchronization has to be done. the pages have to be
+ * manually unmapped again when they are not needed any longer.
*/
-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
/**
- * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
* @addr: the start address of the range
* @size: the size of the range
*/
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
/**
- * shadowmem_initialize_mapping - Initalize the shadow mapping
+ * kaiser_initialize_mapping - Initalize the shadow mapping
*
- * most parts of the shadow mapping can be mapped upon boot time.
- * only the thread stacks have to be mapped on runtime.
- * the mapped regions are not unmapped at all.
+ * Most parts of the shadow mapping can be mapped upon boot
+ * time. Only per-process things like the thread stacks
+ * or a new LDT have to be mapped at runtime. These boot-
+ * time mappings are permanent and nevertunmapped.
*/
extern void kaiser_init(void);
-#endif
+#endif /* CONFIG_KAISER */
+
+#endif /* __ASSEMBLY */
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *p
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ pgdval_t ignore_flags = _PAGE_USER;
+ /*
+ * We set NX on KAISER pgds that map userspace memory so
+ * that userspace can not meaningfully use the kernel
+ * page table by accident; it will fault on the first
+ * instruction it tries to run. See native_set_pgd().
+ */
+ if (IS_ENABLED(CONFIG_KAISER))
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t
{
memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_KAISER
- // clone the shadow pgd part as well
- memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+ /* Clone the shadow pgd part as well */
+ memcpy(native_get_shadow_pgd(dst),
+ native_get_shadow_pgd(src),
+ count * sizeof(pgd_t));
#endif
}
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_
}
#ifdef CONFIG_KAISER
-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
}
-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+{
return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
}
+#else
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+{
+ BUILD_BUG_ON(1);
+ return NULL;
+}
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+{
+ return pgdp;
+}
#endif /* CONFIG_KAISER */
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(void *__ptr)
+{
+ unsigned long ptr = (unsigned long)__ptr;
+
+ return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
+}
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_KAISER
- // We know that a pgd is page aligned.
- // Therefore the lower indices have to be mapped to user space.
- // These pages are mapped to the shadow mapping.
- if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+ pteval_t extra_kern_pgd_flags = 0;
+ /* Do we need to also populate the shadow pgd? */
+ if (is_userspace_pgd(pgdp)) {
native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ /*
+ * Even if the entry is *mapping* userspace, ensure
+ * that userspace can not use it. This way, if we
+ * get out to userspace running on the kernel CR3,
+ * userspace will crash instead of running.
+ */
+ extra_kern_pgd_flags = _PAGE_NX;
}
-
- pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+ pgdp->pgd = pgd.pgd;
+ pgdp->pgd |= extra_kern_pgd_flags;
#else /* CONFIG_KAISER */
*pgdp = pgd;
#endif
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -42,7 +42,7 @@
#ifdef CONFIG_KAISER
#define _PAGE_GLOBAL (_AT(pteval_t, 0))
#else
-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#endif
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
@@ -93,11 +93,7 @@
#define _PAGE_NX (_AT(pteval_t, 0))
#endif
-#ifdef CONFIG_KAISER
-#define _PAGE_PROTNONE (_AT(pteval_t, 0))
-#else
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-#endif
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
-#ifdef CONFIG_KAISER
- // add the esp stack pud to the shadow mapping here.
- // This can be done directly, because the fixup stack has its own pud
- set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
-#endif
+ /*
+ * Just copy the top-level PGD that is mapping the espfix
+ * area to ensure it is mapped into the shadow user page
+ * tables.
+ */
+ if (IS_ENABLED(CONFIG_KAISER))
+ set_pgd(native_get_shadow_pgd(pgd_p),
+ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
/* Randomize the locations */
init_espfix_random();
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -442,11 +442,24 @@ early_idt_ripmsg:
GLOBAL(name)
#ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \
.balign 2 * PAGE_SIZE; \
GLOBAL(name)
#else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL 0
#endif
/* Automate the creation of 1 to 1 mapping pmd entries */
@@ -461,6 +474,7 @@ GLOBAL(name)
NEXT_PGD_PAGE(early_level4_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts)
#ifndef CONFIG_XEN
NEXT_PGD_PAGE(init_level4_pgt)
- .fill 2*512,8,0
+ .fill 512,8,0
+ .fill KAISER_USER_PGD_FILL,8,0
#else
NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt)
.org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
#include <linux/uaccess.h>
#include <asm/ldt.h>
+#include <asm/kaiser.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
set_ldt(pc->ldt->entries, pc->ldt->size);
}
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
+}
+
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size)
{
struct ldt_struct *new_ldt;
int alloc_size;
+ int ret = 0;
if (size > LDT_ENTRIES)
return NULL;
@@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_stru
return NULL;
}
+ // FIXME: make kaiser_add_mapping() return an error code
+ // when it fails
+ kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+ __PAGE_KERNEL);
+ if (ret) {
+ __free_ldt_struct(new_ldt);
+ return NULL;
+ }
new_ldt->size = size;
return new_ldt;
}
@@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_s
if (likely(!ldt))
return;
+ kaiser_remove_mapping((unsigned long)ldt->entries,
+ ldt->size * LDT_ENTRY_SIZE);
paravirt_free_ldt(ldt->entries, ldt->size);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(ldt->entries);
- else
- free_page((unsigned long)ldt->entries);
- kfree(ldt);
+ __free_ldt_struct(ldt);
}
/*
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -9,10 +9,12 @@
#include <linux/atomic.h>
atomic_t trace_idt_ctr = ATOMIC_INIT(0);
+__aligned(PAGE_SIZE)
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
(unsigned long) trace_idt_table };
/* No need to be aligned, but done to keep all IDTs defined the same way. */
+__aligned(PAGE_SIZE)
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
static int trace_irq_vector_refcount;
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -1,160 +1,306 @@
-
-
+#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
+#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
-
#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#include <asm/kaiser.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#ifdef CONFIG_KAISER
__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes. No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte(). We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
-/**
- * Get the real ppn from a address in kernel mapping.
- * @param address The virtual adrress
- * @return the physical address
+/*
+ * Returns -1 on error.
*/
-static inline unsigned long get_pa_from_mapping (unsigned long address)
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- pgd = pgd_offset_k(address);
- BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+ pgd = pgd_offset_k(vaddr);
+ /*
+ * We made all the kernel PGDs present in kaiser_init().
+ * We expect them to stay that way.
+ */
+ BUG_ON(pgd_none(*pgd));
+ /*
+ * PGDs are either 512GB or 128TB on all x86_64
+ * configurations. We don't handle these.
+ */
+ BUG_ON(pgd_large(*pgd));
+
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
- pud = pud_offset(pgd, address);
- BUG_ON(pud_none(*pud));
+ if (pud_large(*pud))
+ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
- if (pud_large(*pud)) {
- return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ WARN_ON_ONCE(1);
+ return -1;
}
- pmd = pmd_offset(pud, address);
- BUG_ON(pmd_none(*pmd));
+ if (pmd_large(*pmd))
+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
- if (pmd_large(*pmd)) {
- return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_none(*pte)) {
+ WARN_ON_ONCE(1);
+ return -1;
}
- pte = pte_offset_kernel(pmd, address);
- BUG_ON(pte_none(*pte));
-
- return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
-void _kaiser_copy (unsigned long start_addr, unsigned long size,
- unsigned long flags)
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
- pte_t *pte;
- unsigned long address;
- unsigned long end_addr = start_addr + size;
- unsigned long target_address;
+ pud_t *pud;
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
- for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
- address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
- target_address = get_pa_from_mapping(address);
+ might_sleep();
+ if (is_atomic) {
+ gfp &= ~GFP_KERNEL;
+ gfp |= __GFP_HIGH | __GFP_ATOMIC;
+ }
- pgd = native_get_shadow_pgd(pgd_offset_k(address));
+ if (pgd_none(*pgd)) {
+ WARN_ONCE(1, "All shadow pgds should have been populated");
+ return NULL;
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
- BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
- BUG_ON(pgd_large(*pgd));
+ pud = pud_offset(pgd, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pud_none(*pud))
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ else
+ free_page(new_pmd_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
- pud = pud_offset(pgd, address);
- if (pud_none(*pud)) {
- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
- }
- BUG_ON(pud_large(*pud));
+ pmd = pmd_offset(pud, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pmd_none(*pmd))
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ else
+ free_page(new_pte_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd)) {
- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
- }
- BUG_ON(pmd_large(*pmd));
+ return pte_offset_kernel(pmd, address);
+}
- pte = pte_offset_kernel(pmd, address);
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+ unsigned long flags)
+{
+ int ret = 0;
+ pte_t *pte;
+ unsigned long start_addr = (unsigned long )__start_addr;
+ unsigned long address = start_addr & PAGE_MASK;
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
+ for (;address < end_addr; address += PAGE_SIZE) {
+ target_address = get_pa_from_mapping(address);
+ if (target_address == -1) {
+ ret = -EIO;
+ break;
+ }
+ pte = kaiser_pagetable_walk(address, false);
if (pte_none(*pte)) {
set_pte(pte, __pte(flags | target_address));
} else {
- BUG_ON(__pa(pte_page(*pte)) != target_address);
+ pte_t tmp;
+ set_pte(&tmp, __pte(flags | target_address));
+ WARN_ON_ONCE(!pte_same(*pte, tmp));
}
}
+ return ret;
}
-// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
-static inline void __init _kaiser_init(void)
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+ unsigned long size = end - start;
+
+ return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated. This ensures that all processes that get
+ * forked have the same entries. This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
{
pgd_t *pgd;
int i = 0;
pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
- set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+ pgd_t new_pgd;
+ pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
+ if (!pud) {
+ WARN_ON(1);
+ break;
+ }
+ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
+ /*
+ * Make sure not to stomp on some other pgd entry.
+ */
+ if (!pgd_none(pgd[i])) {
+ WARN_ON(1);
+ continue;
+ }
+ set_pgd(pgd + i, new_pgd);
}
}
+#define kaiser_add_user_map_early(start, size, flags) do { \
+ int __ret = kaiser_add_user_map(start, size, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
+ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
-spinlock_t shadow_table_lock;
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die. But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it. If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
void __init kaiser_init(void)
{
int cpu;
- spin_lock_init(&shadow_table_lock);
-
- spin_lock(&shadow_table_lock);
- _kaiser_init();
+ kaiser_init_all_pgds();
for_each_possible_cpu(cpu) {
- // map the per cpu user variables
- _kaiser_copy(
- (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
- (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
- __PAGE_KERNEL);
- }
-
- // map the entry/exit text section, which is responsible to switch between user- and kernel mode
- _kaiser_copy(
- (unsigned long) __entry_text_start,
- (unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
- __PAGE_KERNEL_RX);
-
- // the fixed map address of the idt_table
- _kaiser_copy(
- (unsigned long) idt_descr.address,
- sizeof(gate_desc) * NR_VECTORS,
- __PAGE_KERNEL_RO);
+ void *percpu_vaddr = __per_cpu_user_mapped_start +
+ per_cpu_offset(cpu);
+ unsigned long percpu_sz = __per_cpu_user_mapped_end -
+ __per_cpu_user_mapped_start;
+ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+ __PAGE_KERNEL);
+ }
- spin_unlock(&shadow_table_lock);
+ /*
+ * Map the entry/exit text section, which is needed at
+ * switches from user to and from kernel.
+ */
+ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+ __PAGE_KERNEL_RX);
+
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+ __irqentry_text_end,
+ __PAGE_KERNEL_RX);
+#endif
+ kaiser_add_user_map_early((void *)idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
+#ifdef CONFIG_TRACING
+ kaiser_add_user_map_early(&trace_idt_descr,
+ sizeof(trace_idt_descr),
+ __PAGE_KERNEL);
+ kaiser_add_user_map_early(&trace_idt_table,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL);
+#endif
+ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
+ __PAGE_KERNEL);
+ kaiser_add_user_map_early(&debug_idt_table,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL);
}
+extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end);
// add a mapping to the shadow-mapping, and synchronize the mappings
-void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
- spin_lock(&shadow_table_lock);
- _kaiser_copy(addr, size, flags);
- spin_unlock(&shadow_table_lock);
+ return kaiser_add_user_map((const void *)addr, size, flags);
}
-extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
- pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
- spin_lock(&shadow_table_lock);
- do {
- unmap_pud_range(pgd, start, start + size);
- } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
- spin_unlock(&shadow_table_lock);
+ unsigned long end = start + size;
+ unsigned long addr;
+
+ for (addr = start; addr < end; addr += PGDIR_SIZE) {
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
+ /*
+ * unmap_p4d_range() handles > P4D_SIZE unmaps,
+ * so no need to trim 'end'.
+ */
+ unmap_pud_range_nofree(pgd, addr, end);
+ }
}
#endif /* CONFIG_KAISER */
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
+#define CPA_FREE_PAGETABLES 8
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_d
return 0;
}
-static bool try_to_free_pte_page(pte_t *pte)
+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
{
int i;
+ if (!(cpa->flags & CPA_FREE_PAGETABLES))
+ return false;
+
for (i = 0; i < PTRS_PER_PTE; i++)
if (!pte_none(pte[i]))
return false;
@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *
return true;
}
-static bool try_to_free_pmd_page(pmd_t *pmd)
+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
{
int i;
+ if (!(cpa->flags & CPA_FREE_PAGETABLES))
+ return false;
+
for (i = 0; i < PTRS_PER_PMD; i++)
if (!pmd_none(pmd[i]))
return false;
@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *
return true;
}
-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
+ unsigned long start,
+ unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd,
pte++;
}
- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
pmd_clear(pmd);
return true;
}
return false;
}
-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end)
{
- if (unmap_pte_range(pmd, start, end))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (unmap_pte_range(cpa, pmd, start, end))
+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
+ unsigned long start, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, start);
@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud,
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
- __unmap_pmd_range(pud, pmd, start, pre_end);
+ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
start = pre_end;
pmd++;
@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud,
if (pmd_large(*pmd))
pmd_clear(pmd);
else
- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+ __unmap_pmd_range(cpa, pud, pmd,
+ start, start + PMD_SIZE);
start += PMD_SIZE;
pmd++;
@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud,
* 4K leftovers?
*/
if (start < end)
- return __unmap_pmd_range(pud, pmd, start, end);
+ return __unmap_pmd_range(cpa, pud, pmd, start, end);
/*
* Try again to free the PMD page if haven't succeeded above.
*/
if (!pud_none(*pud))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
-void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
+ unsigned long start,
+ unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
@@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
- unmap_pmd_range(pud, start, pre_end);
+ unmap_pmd_range(cpa, pud, start, pre_end);
start = pre_end;
pud++;
@@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne
if (pud_large(*pud))
pud_clear(pud);
else
- unmap_pmd_range(pud, start, start + PUD_SIZE);
+ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
start += PUD_SIZE;
pud++;
@@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne
* 2M leftovers?
*/
if (start < end)
- unmap_pmd_range(pud, start, end);
+ unmap_pmd_range(cpa, pud, start, end);
/*
* No need to try to free the PUD page because we'll free it in
@@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigne
*/
}
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ struct cpa_data cpa = {
+ .flags = CPA_FREE_PAGETABLES,
+ };
+
+ __unmap_pud_range(&cpa, pgd, start, end);
+}
+
+void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ struct cpa_data cpa = {
+ .flags = 0,
+ };
+
+ __unmap_pud_range(&cpa, pgd, start, end);
+}
+
static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
{
pgd_t *pgd_entry = root + pgd_index(addr);
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
-static inline pgd_t *_pgd_alloc(void)
-{
-#ifdef CONFIG_KAISER
- // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
- // block. Therefore, we have to allocate at least 3 pages. However, the
- // __get_free_pages returns us 4 pages. Hence, we store the base pointer at
- // the beginning of the page of our 8kb-aligned memory block in order to
- // correctly free it afterwars.
- unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
-
- if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
- {
- *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
- return (pgd_t *) pages;
- }
- else
- {
- *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
- return (pgd_t *) (pages + PAGE_SIZE);
- }
+#ifdef CONFIG_KAISER
+/*
+ * Instead of one pgd, we acquire two pgds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
#else
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#define PGD_ALLOCATION_ORDER 0
#endif
+
+static inline pgd_t *_pgd_alloc(void)
+{
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
-#ifdef CONFIG_KAISER
- unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
- free_pages(pages, get_order(4*PAGE_SIZE));
-#else
- free_page((unsigned long)pgd);
-#endif
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,26 @@
+#ifndef _INCLUDE_KAISER_H
+#define _INCLUDE_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it
+ * disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+ return 0;
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _INCLUDE_KAISER_H */
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -58,6 +58,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
+#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@@ -335,7 +336,6 @@ void set_task_stack_end_magic(struct tas
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
@@ -357,9 +357,10 @@ static struct task_struct *dup_task_stru
goto free_ti;
tsk->stack = ti;
-#ifdef CONFIG_KAISER
- kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
-#endif
+
+ err = kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+ if (err)
+ goto free_ti;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -32,12 +32,17 @@ config SECURITY
If you are unsure how to answer this question, answer N.
config KAISER
bool "Remove the kernel mapping in user mode"
+ default y
depends on X86_64
depends on !PARAVIRT
help
This enforces a strict kernel and user space isolation in order to close
hardware side channels on kernel address information.
+config KAISER_REAL_SWITCH
+ bool "KAISER: actually switch page tables"
+ default y
+
config SECURITYFS
bool "Enable the securityfs filesystem"
help
Patches currently in stable-queue which might be from dave.hansen(a)linux.intel.com are
queue-4.4/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.4/kaiser-merged-update.patch
queue-4.4/kaiser-enhanced-by-kernel-and-user-pcids.patch
This is a note to let you know that I've just added the patch titled
kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Sat, 9 Sep 2017 17:31:18 -0700
Subject: kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
From: Hugh Dickins <hughd(a)google.com>
There's a 0x1000 in various places, which looks better with a name.
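As a side note for readers: the 0x1000 being named is simply PAGE_SIZE, the
distance from the kernel pgd to its shadow within the 8k-aligned, order-1 pair,
so switching page tables amounts to flipping bit 12 of CR3. A stand-alone
user-space sketch of that arithmetic; only the constant name comes from this
patch, the sample address and everything else is illustrative, not kernel code:

#include <stdio.h>
#include <stdint.h>

#define KAISER_SHADOW_PGD_OFFSET 0x1000UL

int main(void)
{
	/* pretend this is CR3 pointing at an 8k-aligned kernel pgd */
	uint64_t kernel_cr3 = 0x1234000UL;

	/* what _SWITCH_TO_USER_CR3 does: set bit 12, CR3 now points at the shadow pgd */
	uint64_t user_cr3 = kernel_cr3 | KAISER_SHADOW_PGD_OFFSET;

	/* what _SWITCH_TO_KERNEL_CR3 does: clear bit 12 again */
	uint64_t back_cr3 = user_cr3 & ~KAISER_SHADOW_PGD_OFFSET;

	printf("kernel %#llx user %#llx back %#llx\n",
	       (unsigned long long)kernel_cr3,
	       (unsigned long long)user_cr3,
	       (unsigned long long)back_cr3);
	return 0;
}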
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/entry/entry_64.S | 4 ++--
arch/x86/include/asm/kaiser.h | 7 +++++--
2 files changed, 7 insertions(+), 4 deletions(-)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1292,7 +1292,7 @@ ENTRY(nmi)
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
- andq $(~0x1000), %rax
+ andq $(~KAISER_SHADOW_PGD_OFFSET), %rax
#endif
movq %rax, %cr3
#endif
@@ -1535,7 +1535,7 @@ end_repeat_nmi:
movq %cr3, %rax
pushq %rax
#ifdef CONFIG_KAISER_REAL_SWITCH
- andq $(~0x1000), %rax
+ andq $(~KAISER_SHADOW_PGD_OFFSET), %rax
#endif
movq %rax, %cr3
#endif
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -13,13 +13,16 @@
* A minimalistic kernel mapping holds the parts needed to be mapped in user
* mode, such as the entry/exit functions of the user space, or the stacks.
*/
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
#ifdef __ASSEMBLY__
#ifdef CONFIG_KAISER
.macro _SWITCH_TO_KERNEL_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
-andq $(~0x1000), \reg
+andq $(~KAISER_SHADOW_PGD_OFFSET), \reg
#endif
movq \reg, %cr3
.endm
@@ -27,7 +30,7 @@ movq \reg, %cr3
.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
#ifdef CONFIG_KAISER_REAL_SWITCH
-orq $(0x1000), \reg
+orq $(KAISER_SHADOW_PGD_OFFSET), \reg
#endif
movq \reg, %cr3
.endm
Patches currently in stable-queue which might be from hughd(a)google.com are
queue-4.4/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch
queue-4.4/kaiser-add-nokaiser-boot-option-using-alternative.patch
queue-4.4/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch
queue-4.4/kaiser-_pgd_alloc-without-__gfp_repeat-to-avoid-stalls.patch
queue-4.4/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
queue-4.4/x86-paravirt-dont-patch-flush_tlb_single.patch
queue-4.4/kaiser-merged-update.patch
queue-4.4/kaiser-delete-kaiser_real_switch-option.patch
queue-4.4/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
queue-4.4/kaiser-fix-perf-crashes.patch
queue-4.4/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch
queue-4.4/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
queue-4.4/kaiser-enhanced-by-kernel-and-user-pcids.patch
queue-4.4/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch
queue-4.4/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch
queue-4.4/kaiser-stack-map-page_size-at-thread_size-page_size.patch
queue-4.4/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch
queue-4.4/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch
queue-4.4/kaiser-do-not-set-_page_nx-on-pgd_none.patch
queue-4.4/kaiser-tidied-up-asm-kaiser.h-somewhat.patch
queue-4.4/kaiser-cleanups-while-trying-for-gold-link.patch
queue-4.4/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch
queue-4.4/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch
queue-4.4/kaiser-kernel-address-isolation.patch
queue-4.4/kaiser-enomem-if-kaiser_pagetable_walk-null.patch
queue-4.4/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch
queue-4.4/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch
queue-4.4/kaiser-kaiser-depends-on-smp.patch
queue-4.4/kaiser-pcid-0-for-kernel-and-128-for-user.patch
This is a note to let you know that I've just added the patch titled
kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Thu, 17 Aug 2017 15:00:37 -0700
Subject: kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
From: Hugh Dickins <hughd(a)google.com>
We have many machines (Westmere, Sandybridge, Ivybridge) supporting
PCID but not INVPCID: on these load_new_mm_cr3() simply crashed.
Flushing user context inside load_new_mm_cr3() without the use of
invpcid is difficult: momentarily switch from kernel to user context
and back to do so? I'm not sure whether that can be safely done at
all, and would risk polluting user context with kernel internals,
and kernel context with stale user externals.
Instead, follow the hint in the comment that was there: change
X86_CR3_PCID_USER_VAR to be a per-cpu variable, then load_new_mm_cr3()
can leave a note in it, for SWITCH_USER_CR3 on return to userspace to
flush user context TLB, instead of default X86_CR3_PCID_USER_NOFLUSH.
Which works well enough that there's no need to do it this way only
when invpcid is unsupported: it's a good alternative to invpcid here.
But there are a couple of inlines in asm/tlbflush.h that need to do the
same trick, so it's best to localize all this per-cpu business in
mm/kaiser.c: moving that part of the initialization from setup_pcid()
to kaiser_setup_pcid(); with kaiser_flush_tlb_on_return_to_user() the
function for noting an X86_CR3_PCID_USER_FLUSH. And let's keep a
KAISER_SHADOW_PGD_OFFSET in there, to avoid the extra OR on exit.
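A rough user-space rendering of that leave-a-note-for-the-exit-path idea, with
a thread-local variable standing in for the per-cpu one; the names and bit
values below are illustrative stand-ins, not the real kernel definitions:

#include <stdio.h>

#define NOFLUSH_BIT	(1ULL << 63)	/* stand-in for an X86_CR3_PCID_*_NOFLUSH bit */
#define SHADOW_OFFSET	0x1000ULL	/* stand-in for KAISER_SHADOW_PGD_OFFSET */

/* stand-in for the per-cpu X86_CR3_PCID_USER_VAR */
static _Thread_local unsigned long long user_cr3_bits = NOFLUSH_BIT | SHADOW_OFFSET;

/* what load_new_mm_cr3() does when user context must be flushed later */
static void flush_tlb_on_return_to_user(void)
{
	user_cr3_bits = SHADOW_OFFSET;	/* NOFLUSH cleared: next switch flushes */
}

/* what SWITCH_USER_CR3 does on the way back out to userspace */
static unsigned long long switch_user_cr3(unsigned long long kernel_cr3)
{
	unsigned long long cr3 = kernel_cr3 | user_cr3_bits;

	if (!(user_cr3_bits & NOFLUSH_BIT))	/* that was a flushing switch ... */
		user_cr3_bits = NOFLUSH_BIT | SHADOW_OFFSET;	/* ... so reset the note */
	return cr3;
}

int main(void)
{
	printf("%#llx\n", switch_user_cr3(0x1234000ULL));	/* no-flush transition */
	flush_tlb_on_return_to_user();
	printf("%#llx\n", switch_user_cr3(0x1234000ULL));	/* one flushing transition */
	printf("%#llx\n", switch_user_cr3(0x1234000ULL));	/* back to no-flush */
	return 0;
}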
I did try to make the feature tests in asm/tlbflush.h more consistent
with each other: there seem to be far too many ways of performing such
tests, and I don't have a good grasp of their differences. At first
I converted them all to be static_cpu_has(): but that proved to be a
mistake, as the comment in __native_flush_tlb_single() hints; so then
I reversed and made them all this_cpu_has(). Probably all gratuitous
change, but that's the way it's working at present.
I am slightly bothered by the way non-per-cpu X86_CR3_PCID_KERN_VAR
gets re-initialized by each cpu (before and after these changes):
no problem when (as usual) all cpus on a machine have the same
features, but in principle incorrect. However, my experiment
to per-cpu-ify that one did not end well...
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/include/asm/kaiser.h | 18 +++++++-----
arch/x86/include/asm/tlbflush.h | 58 +++++++++++++++++++++++++++-------------
arch/x86/kernel/cpu/common.c | 22 ---------------
arch/x86/mm/kaiser.c | 50 ++++++++++++++++++++++++++++++----
arch/x86/mm/tlb.c | 46 ++++++++++++-------------------
5 files changed, 114 insertions(+), 80 deletions(-)
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -32,13 +32,12 @@ movq \reg, %cr3
.macro _SWITCH_TO_USER_CR3 reg
movq %cr3, \reg
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
-/*
- * This can obviously be one instruction by putting the
- * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR.
- * But, just leave it now for simplicity.
- */
-orq X86_CR3_PCID_USER_VAR, \reg
-orq $(KAISER_SHADOW_PGD_OFFSET), \reg
+orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg
+js 9f
+// FLUSH this time, reset to NOFLUSH for next time
+// But if nopcid? Consider using 0x80 for user pcid?
+movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7)
+9:
movq \reg, %cr3
.endm
@@ -90,6 +89,11 @@ movq PER_CPU_VAR(unsafe_stack_register_b
*/
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+extern unsigned long X86_CR3_PCID_KERN_VAR;
+DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
/**
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
* @addr: the start address of the range
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -12,6 +12,7 @@ static inline void __invpcid(unsigned lo
unsigned long type)
{
struct { u64 d[2]; } desc = { { pcid, addr } };
+
/*
* The memory clobber is because the whole point is to invalidate
* stale TLB entries and, especially if we're flushing global
@@ -130,27 +131,42 @@ static inline void cr4_set_bits_and_upda
cr4_set_bits(mask);
}
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
static inline void __native_flush_tlb(void)
{
- if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
- /*
- * If current->mm == NULL then we borrow a mm which may change during a
- * task switch and therefore we must not be preempted while we write CR3
- * back:
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Note, this works with CR4.PCIDE=0 or 1.
*/
- preempt_disable();
- native_write_cr3(native_read_cr3());
- preempt_enable();
+ invpcid_flush_all_nonglobals();
return;
}
+
/*
- * We are no longer using globals with KAISER, so a
- * "nonglobals" flush would work too. But, this is more
- * conservative.
- *
- * Note, this works with CR4.PCIDE=0 or 1.
+ * If current->mm == NULL then we borrow a mm which may change during a
+ * task switch and therefore we must not be preempted while we write CR3
+ * back:
*/
- invpcid_flush_all();
+ preempt_disable();
+ if (this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
+ native_write_cr3(native_read_cr3());
+ preempt_enable();
}
static inline void __native_flush_tlb_global_irq_disabled(void)
@@ -166,9 +182,13 @@ static inline void __native_flush_tlb_gl
static inline void __native_flush_tlb_global(void)
{
+#ifdef CONFIG_KAISER
+ /* Globals are not used at all */
+ __native_flush_tlb();
+#else
unsigned long flags;
- if (static_cpu_has(X86_FEATURE_INVPCID)) {
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
@@ -185,10 +205,9 @@ static inline void __native_flush_tlb_gl
* be called from deep inside debugging code.)
*/
raw_local_irq_save(flags);
-
__native_flush_tlb_global_irq_disabled();
-
raw_local_irq_restore(flags);
+#endif
}
static inline void __native_flush_tlb_single(unsigned long addr)
@@ -199,9 +218,12 @@ static inline void __native_flush_tlb_si
*
* The ASIDs used below are hard-coded. But, we must not
* call invpcid(type=1/2) before CR4.PCIDE=1. Just call
- * invpcid in the case we are called early.
+ * invlpg in the case we are called early.
*/
+
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+ if (this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
return;
}
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -321,33 +321,12 @@ static __always_inline void setup_smap(s
}
}
-/*
- * These can have bit 63 set, so we can not just use a plain "or"
- * instruction to get their value or'd into CR3. It would take
- * another register. So, we use a memory reference to these
- * instead.
- *
- * This is also handy because systems that do not support
- * PCIDs just end up or'ing a 0 into their CR3, which does
- * no harm.
- */
-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0;
-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0;
-
static void setup_pcid(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) {
cr4_set_bits(X86_CR4_PCIDE);
/*
- * These variables are used by the entry/exit
- * code to change PCIDs.
- */
-#ifdef CONFIG_KAISER
- X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH;
- X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH;
-#endif
- /*
* INVPCID has two "groups" of types:
* 1/2: Invalidate an individual address
* 3/4: Invalidate all contexts
@@ -372,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x8
clear_cpu_cap(c, X86_FEATURE_PCID);
}
}
+ kaiser_setup_pcid();
}
/*
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -12,12 +12,26 @@
#include <linux/ftrace.h>
#include <asm/kaiser.h>
+#include <asm/tlbflush.h> /* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
+
#ifdef CONFIG_KAISER
+__visible
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3. It would take
+ * another register. So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR;
+DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR);
-__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
* At runtime, the only things we map are some things for CPU
* hotplug, and stacks for new processes. No two CPUs will ever
@@ -239,9 +253,6 @@ static void __init kaiser_init_all_pgds(
WARN_ON(__ret); \
} while (0)
-extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
-extern unsigned long X86_CR3_PCID_KERN_VAR;
-extern unsigned long X86_CR3_PCID_USER_VAR;
/*
* If anything in here fails, we will likely die on one of the
* first kernel->user transitions and init will die. But, we
@@ -295,8 +306,6 @@ void __init kaiser_init(void)
kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
__PAGE_KERNEL);
- kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE,
- __PAGE_KERNEL);
}
/* Add a mapping to the shadow mapping, and synchronize the mappings */
@@ -361,4 +370,33 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp,
}
return pgd;
}
+
+void kaiser_setup_pcid(void)
+{
+ unsigned long kern_cr3 = 0;
+ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+ if (this_cpu_has(X86_FEATURE_PCID)) {
+ kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH;
+ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+ }
+ /*
+ * These variables are used by the entry/exit
+ * code to change PCID and pgd and TLB flushing.
+ */
+ X86_CR3_PCID_KERN_VAR = kern_cr3;
+ this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
+ * if cpu does not, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+ this_cpu_write(X86_CR3_PCID_USER_VAR,
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
#endif /* CONFIG_KAISER */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,13 +6,14 @@
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/cpu.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
-#include <linux/debugfs.h>
+#include <asm/kaiser.h>
/*
* TLB flushing, formerly SMP-only
@@ -38,34 +39,23 @@ static void load_new_mm_cr3(pgd_t *pgdir
{
unsigned long new_mm_cr3 = __pa(pgdir);
- /*
- * KAISER, plus PCIDs needs some extra work here. But,
- * if either of features is not present, we need no
- * PCIDs here and just do a normal, full TLB flush with
- * the write_cr3()
- */
- if (!IS_ENABLED(CONFIG_KAISER) ||
- !cpu_feature_enabled(X86_FEATURE_PCID))
- goto out_set_cr3;
- /*
- * We reuse the same PCID for different tasks, so we must
- * flush all the entires for the PCID out when we change
- * tasks.
- */
- new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir);
-
- /*
- * The flush from load_cr3() may leave old TLB entries
- * for userspace in place. We must flush that context
- * separately. We can theoretically delay doing this
- * until we actually load up the userspace CR3, but
- * that's a bit tricky. We have to have the "need to
- * flush userspace PCID" bit per-cpu and check it in the
- * exit-to-userspace paths.
- */
- invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
+#ifdef CONFIG_KAISER
+ if (this_cpu_has(X86_FEATURE_PCID)) {
+ /*
+ * We reuse the same PCID for different tasks, so we must
+ * flush all the entries for the PCID out when we change tasks.
+ * Flush KERN below, flush USER when returning to userspace in
+ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+ *
+ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+ * do it here, but can only be used if X86_FEATURE_INVPCID is
+ * available - and many machines support pcid without invpcid.
+ */
+ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+ kaiser_flush_tlb_on_return_to_user();
+ }
+#endif /* CONFIG_KAISER */
-out_set_cr3:
/*
* Caution: many callers of this function expect
* that load_cr3() is serializing and orders TLB
This is a note to let you know that I've just added the patch titled
kaiser: kaiser_remove_mapping() move along the pgd
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-kaiser_remove_mapping-move-along-the-pgd.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Mon, 2 Oct 2017 10:57:24 -0700
Subject: kaiser: kaiser_remove_mapping() move along the pgd
From: Hugh Dickins <hughd(a)google.com>
When removing the bogus comment from kaiser_remove_mapping(),
I really ought to have checked the extent of its bogosity: as
Neel points out, there is nothing to stop unmap_pud_range_nofree()
from continuing beyond the end of a pud (and starting in the wrong
position on the next).
Fix kaiser_remove_mapping() to constrain the extent and advance pgd
pointer correctly: use pgd_addr_end() macro as used throughout base
mm (but don't assume page-rounded start and size in this case).
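For illustration, pgd_addr_end(addr, end) is just the next PGDIR boundary above
addr, clamped to end; a stand-alone sketch of the resulting loop shape, using a
deliberately tiny fake PGDIR size so the steps are easy to print. None of these
names are the real kernel macros:

#include <stdio.h>

#define FAKE_PGDIR_SIZE	0x1000UL
#define FAKE_PGDIR_MASK	(~(FAKE_PGDIR_SIZE - 1))

/* same shape as pgd_addr_end(): next boundary, clamped to end, wrap-safe */
static unsigned long fake_pgd_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + FAKE_PGDIR_SIZE) & FAKE_PGDIR_MASK;

	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	unsigned long start = 0x17f0, end = 0x3a00;
	unsigned long addr, next;

	/* one "pgd entry" handled per iteration, never running past its extent */
	for (addr = start; addr < end; addr = next) {
		next = fake_pgd_addr_end(addr, end);
		printf("unmap [%#lx, %#lx)\n", addr, next);
	}
	return 0;
}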
But this bug was very unlikely to trigger in this backport: since
any buddy allocation is contained within a single pud extent, and
we are not using vmapped stacks (and are only mapping one page of
stack anyway): the only way to hit this bug here would be when
freeing a large modified ldt.
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/mm/kaiser.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -297,11 +297,13 @@ void kaiser_remove_mapping(unsigned long
extern void unmap_pud_range_nofree(pgd_t *pgd,
unsigned long start, unsigned long end);
unsigned long end = start + size;
- unsigned long addr;
+ unsigned long addr, next;
+ pgd_t *pgd;
- for (addr = start; addr < end; addr += PGDIR_SIZE) {
- pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
- unmap_pud_range_nofree(pgd, addr, end);
+ pgd = native_get_shadow_pgd(pgd_offset_k(start));
+ for (addr = start; addr < end; pgd++, addr = next) {
+ next = pgd_addr_end(addr, end);
+ unmap_pud_range_nofree(pgd, addr, next);
}
}
This is a note to let you know that I've just added the patch titled
kaiser: kaiser_flush_tlb_on_return_to_user() check PCID
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Sat, 4 Nov 2017 18:43:06 -0700
Subject: kaiser: kaiser_flush_tlb_on_return_to_user() check PCID
From: Hugh Dickins <hughd(a)google.com>
Let kaiser_flush_tlb_on_return_to_user() do the X86_FEATURE_PCID
check, instead of each caller doing it inline first: nobody needs
to optimize for the noPCID case, it's clearer this way, and better
suits later changes. Replace those no-op X86_CR3_PCID_KERN_FLUSH lines
by a BUILD_BUG_ON() in load_new_mm_cr3(), in case something changes.
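Side note: that BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH) is only a compile-time
assertion that the constant is still zero, which is what makes dropping the
no-op OR safe. A stand-alone C11 approximation of the same guard; the macro
body and the value of the constant here are illustrative, not the kernel's:

#include <stdio.h>

#define X86_CR3_PCID_KERN_FLUSH	0UL	/* illustrative: the flush bit pattern is currently 0 */

/* crude stand-in for the kernel's BUILD_BUG_ON(): refuse to compile if cond is true */
#define BUILD_BUG_ON(cond)	_Static_assert(!(cond), "BUILD_BUG_ON: " #cond)

int main(void)
{
	/* if someone later gives the constant real bits, the build breaks right here */
	BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);

	printf("kern flush bits: %#lx\n", X86_CR3_PCID_KERN_FLUSH);
	return 0;
}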
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/include/asm/tlbflush.h | 4 ++--
arch/x86/mm/kaiser.c | 6 +++---
arch/x86/mm/tlb.c | 8 ++++----
3 files changed, 9 insertions(+), 9 deletions(-)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -157,7 +157,7 @@ static inline void __native_flush_tlb(vo
* back:
*/
preempt_disable();
- if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+ if (kaiser_enabled)
kaiser_flush_tlb_on_return_to_user();
native_write_cr3(native_read_cr3());
preempt_enable();
@@ -216,7 +216,7 @@ static inline void __native_flush_tlb_si
*/
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
- if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+ if (kaiser_enabled)
kaiser_flush_tlb_on_return_to_user();
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
return;
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -436,12 +436,12 @@ void kaiser_setup_pcid(void)
/*
* Make a note that this cpu will need to flush USER tlb on return to user.
- * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
- * if cpu does not, then the NOFLUSH bit will never have been set.
+ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
*/
void kaiser_flush_tlb_on_return_to_user(void)
{
- this_cpu_write(x86_cr3_pcid_user,
+ if (this_cpu_has(X86_FEATURE_PCID))
+ this_cpu_write(x86_cr3_pcid_user,
X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -39,7 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir
{
unsigned long new_mm_cr3 = __pa(pgdir);
- if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) {
+ if (kaiser_enabled) {
/*
* We reuse the same PCID for different tasks, so we must
* flush all the entries for the PCID out when we change tasks.
@@ -50,10 +50,10 @@ static void load_new_mm_cr3(pgd_t *pgdir
* do it here, but can only be used if X86_FEATURE_INVPCID is
* available - and many machines support pcid without invpcid.
*
- * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
- * but keep that line in there in case something changes.
+ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
+ * would be needed in the write_cr3() below - if PCIDs enabled.
*/
- new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
kaiser_flush_tlb_on_return_to_user();
}
This is a note to let you know that I've just added the patch titled
kaiser: KAISER depends on SMP
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
kaiser-kaiser-depends-on-smp.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Wed Jan 3 18:58:12 CET 2018
From: Hugh Dickins <hughd(a)google.com>
Date: Wed, 13 Sep 2017 14:03:10 -0700
Subject: kaiser: KAISER depends on SMP
From: Hugh Dickins <hughd(a)google.com>
It is absurd that KAISER should depend on SMP, but apparently nobody
has tried a UP build before: which breaks on implicit declaration of
function 'per_cpu_offset' in arch/x86/mm/kaiser.c.
Now, you would expect that to be trivially fixed up; but looking at
the System.map when that block is #ifdef'ed out of kaiser_init(),
I see that in a UP build __per_cpu_user_mapped_end is precisely at
__per_cpu_user_mapped_start, and the items carefully gathered into
that section for user-mapping on SMP, dispersed elsewhere on UP.
So, some other kind of section assignment will be needed on UP,
but implementing that is not a priority: just make KAISER depend
on SMP for now.
Also inserted a blank line before the option, tidied up the
brief Kconfig help message, and added an "If unsure, Y".
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
security/Kconfig | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -30,14 +30,16 @@ config SECURITY
model will be used.
If you are unsure how to answer this question, answer N.
+
config KAISER
bool "Remove the kernel mapping in user mode"
default y
- depends on X86_64
- depends on !PARAVIRT
+ depends on X86_64 && SMP && !PARAVIRT
help
- This enforces a strict kernel and user space isolation in order to close
- hardware side channels on kernel address information.
+ This enforces a strict kernel and user space isolation, in order
+ to close hardware side channels on kernel address information.
+
+ If you are unsure how to answer this question, answer Y.
config KAISER_REAL_SWITCH
bool "KAISER: actually switch page tables"