Kernel TLB invalidation IPIs are a common source of interference on NOHZ_FULL CPUs. Given that NOHZ_FULL CPUs executing in userspace do not access any kernel addresses, these invalidations do not need to happen immediately and can be deferred until the next user->kernel transition.
Rather than make __flush_tlb_all() noinstr, add a minimal noinstr variant that doesn't try to leverage INVPCID.
FIXME: not fully noinstr compliant. XXX: same issue as with instruction patching: when do we access data that should be invalidated?
Signed-off-by: Valentin Schneider <vschneid@redhat.com> --- arch/x86/include/asm/context_tracking_work.h | 4 ++++ arch/x86/include/asm/tlbflush.h | 1 + arch/x86/mm/tlb.c | 17 +++++++++++++++++ include/linux/context_tracking_state.h | 4 ++++ include/linux/context_tracking_work.h | 2 ++ 5 files changed, 28 insertions(+)
diff --git a/arch/x86/include/asm/context_tracking_work.h b/arch/x86/include/asm/context_tracking_work.h index 2c66687ce00e2..9d4f021b5a45b 100644 --- a/arch/x86/include/asm/context_tracking_work.h +++ b/arch/x86/include/asm/context_tracking_work.h @@ -3,6 +3,7 @@ #define _ASM_X86_CONTEXT_TRACKING_WORK_H
#include <asm/sync_core.h> +#include <asm/tlbflush.h>
static __always_inline void arch_context_tracking_work(int work) { @@ -10,6 +11,9 @@ static __always_inline void arch_context_tracking_work(int work) case CONTEXT_WORK_SYNC: sync_core(); break; + case CONTEXT_WORK_TLBI: + __flush_tlb_all_noinstr(); + break; } }
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 80450e1d5385a..323b971987af7 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -17,6 +17,7 @@ DECLARE_PER_CPU(u64, tlbstate_untag_mask);
void __flush_tlb_all(void); +void noinstr __flush_tlb_all_noinstr(void);
#define TLB_FLUSH_ALL -1UL #define TLB_GENERATION_INVALID 0 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 267acf27480af..631df9189ded4 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1237,6 +1237,23 @@ void __flush_tlb_all(void) } EXPORT_SYMBOL_GPL(__flush_tlb_all);
+void noinstr __flush_tlb_all_noinstr(void) +{ + /* + * This is for invocation in early entry code that cannot be + * instrumented. A RMW to CR4 works for most cases, but relies on + * being able to flip either of the PGE or PCIDE bits. Flipping CR4.PCIDE + * would require also resetting CR3.PCID, so just try with CR4.PGE, else + * do the CR3 write. + * + * TODO: paravirt + */ + if (cpu_feature_enabled(X86_FEATURE_PGE)) + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + else + flush_tlb_local(); +} + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { struct flush_tlb_info *info; diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 292a0b7c06948..3571c62cbb9cd 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -62,6 +62,10 @@ enum ctx_state { #define RCU_DYNTICKS_END (CT_STATE_SIZE - 1) #define RCU_DYNTICKS_IDX BIT(RCU_DYNTICKS_START)
+/* + * When CONFIG_CONTEXT_TRACKING_WORK=n, _END is 1 behind _START, which makes + * the CONTEXT_WORK size computation below 0, which is what we want! + */ #define CONTEXT_WORK_START (CONTEXT_STATE_END + 1) #define CONTEXT_WORK_END (RCU_DYNTICKS_START - 1)
diff --git a/include/linux/context_tracking_work.h b/include/linux/context_tracking_work.h index 13fc97b395030..47d5ced39a43a 100644 --- a/include/linux/context_tracking_work.h +++ b/include/linux/context_tracking_work.h @@ -6,11 +6,13 @@
enum { CONTEXT_WORK_SYNC_OFFSET, + CONTEXT_WORK_TLBI_OFFSET, CONTEXT_WORK_MAX_OFFSET };
enum ct_work { CONTEXT_WORK_SYNC = BIT(CONTEXT_WORK_SYNC_OFFSET), + CONTEXT_WORK_TLBI = BIT(CONTEXT_WORK_TLBI_OFFSET), CONTEXT_WORK_MAX = BIT(CONTEXT_WORK_MAX_OFFSET) };