From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
[ Upstream commit 1ab33c03145d0f6c345823fc2da935d9a1a9e9fc ]
__put_unaligned_be24() and friends use implicit casts to convert larger-sized data to bytes, which trips sparse truncation warnings when the argument is a constant:
  CC [M]  drivers/input/touchscreen/hynitron_cstxxx.o
  CHECK   drivers/input/touchscreen/hynitron_cstxxx.c
drivers/input/touchscreen/hynitron_cstxxx.c: note: in included file (through arch/x86/include/generated/asm/unaligned.h):
include/asm-generic/unaligned.h:119:16: warning: cast truncates bits from constant value (aa01a0 becomes a0)
include/asm-generic/unaligned.h:120:20: warning: cast truncates bits from constant value (aa01 becomes 1)
include/asm-generic/unaligned.h:119:16: warning: cast truncates bits from constant value (ab00d0 becomes d0)
include/asm-generic/unaligned.h:120:20: warning: cast truncates bits from constant value (ab00 becomes 0)
To avoid this, let's mask off the upper bits explicitly; the resulting code should be exactly the same, but it will keep sparse happy.
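For reference, a minimal caller of this kind (hypothetical, not taken from the hynitron driver) shows why sparse complains: with a constant argument, every implicit u32-to-u8 conversion in the old helpers is a visible truncation, even though the emitted code is correct:

    u8 buf[3];

    /* sparse: "cast truncates bits from constant value" for each byte */
    put_unaligned_be24(0xaa01a0, buf);    /* stores 0xaa 0x01 0xa0 */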
Reported-by: kernel test robot <lkp@intel.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Closes: https://lore.kernel.org/oe-kbuild-all/202401070147.gqwVulOn-lkp@intel.com/
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 include/asm-generic/unaligned.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h
index 699650f81970..a84c64e5f11e 100644
--- a/include/asm-generic/unaligned.h
+++ b/include/asm-generic/unaligned.h
@@ -104,9 +104,9 @@ static inline u32 get_unaligned_le24(const void *p)
 
 static inline void __put_unaligned_be24(const u32 val, u8 *p)
 {
-	*p++ = val >> 16;
-	*p++ = val >> 8;
-	*p++ = val;
+	*p++ = (val >> 16) & 0xff;
+	*p++ = (val >> 8) & 0xff;
+	*p++ = val & 0xff;
 }
 
 static inline void put_unaligned_be24(const u32 val, void *p)
@@ -116,9 +116,9 @@ static inline void put_unaligned_be24(const u32 val, void *p)
 
 static inline void __put_unaligned_le24(const u32 val, u8 *p)
 {
-	*p++ = val;
-	*p++ = val >> 8;
-	*p++ = val >> 16;
+	*p++ = val & 0xff;
+	*p++ = (val >> 8) & 0xff;
+	*p++ = (val >> 16) & 0xff;
 }
 
 static inline void put_unaligned_le24(const u32 val, void *p)
@@ -128,12 +128,12 @@ static inline void put_unaligned_le24(const u32 val, void *p)
 
 static inline void __put_unaligned_be48(const u64 val, u8 *p)
 {
-	*p++ = val >> 40;
-	*p++ = val >> 32;
-	*p++ = val >> 24;
-	*p++ = val >> 16;
-	*p++ = val >> 8;
-	*p++ = val;
+	*p++ = (val >> 40) & 0xff;
+	*p++ = (val >> 32) & 0xff;
+	*p++ = (val >> 24) & 0xff;
+	*p++ = (val >> 16) & 0xff;
+	*p++ = (val >> 8) & 0xff;
+	*p++ = val & 0xff;
 }
 
 static inline void put_unaligned_be48(const u64 val, void *p)
From: Dmitry Antipov <dmantipov@yandex.ru>
[ Upstream commit 1bfc466b13cf6652ba227c282c27a30ffede69a5 ]
When compiling with gcc version 14.0.0 20231220 (experimental) and W=1, I've noticed the following warning:
kernel/watch_queue.c: In function 'watch_queue_set_size':
kernel/watch_queue.c:273:32: warning: 'kcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args]
  273 |         pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
      |                         ^~~~~~
Since the 'n' and 'size' arguments of 'kcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result, so this is not a bug. But it's still worth fixing.
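For reference, kcalloc() takes the element count first and the element size second, so the intended call shape is (sketch only, mirroring the change below):

    /* void *kcalloc(size_t n, size_t size, gfp_t flags); */
    pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);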
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Link: https://lore.kernel.org/r/20231221090139.12579-1-dmantipov@yandex.ru
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/watch_queue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 778b4056700f..03b90d7d2175 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -270,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
 		goto error;
 
 	ret = -ENOMEM;
-	pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
 		goto error;
From: Kunwu Chan <chentao@kylinos.cn>
[ Upstream commit f46c8a75263f97bda13c739ba1c90aced0d3b071 ]
kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Ensure the allocation was successful by checking the pointer validity.
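The check being added follows the usual kasprintf() pattern; an illustrative sketch (error handling here is made up, not the exact code in init-common.c):

    char *name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);

    if (!name)
            return;        /* or propagate -ENOMEM / panic, as the caller requires */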
Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231204023223.2447523-1-chentao@kylinos.cn
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/mm/init-common.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 119ef491f797..d3a7726ecf51 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -126,7 +126,7 @@ void pgtable_cache_add(unsigned int shift)
 	 * as to leave enough 0 bits in the address to contain it. */
 	unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
 				     HUGEPD_SHIFT_MASK + 1);
-	struct kmem_cache *new;
+	struct kmem_cache *new = NULL;
 
 	/* It would be nice if this was a BUILD_BUG_ON(), but at the
 	 * moment, gcc doesn't seem to recognize is_power_of_2 as a
@@ -139,7 +139,8 @@ void pgtable_cache_add(unsigned int shift)
 
 	align = max_t(unsigned long, align, minalign);
 	name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
-	new = kmem_cache_create(name, table_size, align, 0, ctor(shift));
+	if (name)
+		new = kmem_cache_create(name, table_size, align, 0, ctor(shift));
 	if (!new)
 		panic("Could not allocate pgtable cache for order %d", shift);
From: Huang Shijie <shijie@os.amperecomputing.com>
[ Upstream commit 75b5e0bf90bffaca4b1f19114065dc59f5cc161f ]
In the current code, init_irq_stacks() will call cpu_to_node(). cpu_to_node() depends on the percpu "numa_node" variable, which is initialized in:

	arch_call_rest_init()
	  --> rest_init()
	  --> kernel_init()
	  --> kernel_init_freeable()
	  --> smp_prepare_cpus()

But init_irq_stacks() is called from init_IRQ(), which runs before arch_call_rest_init().

So in init_irq_stacks(), cpu_to_node() does not work: it always returns 0. On a NUMA system, this makes the node 1 CPUs access IRQ stacks that live on node 0.

This patch fixes it by:
 1) exporting early_cpu_to_node() and using it in init_irq_stacks();
 2) changing init_irq_stacks() into an __init function.
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20231124031513.81548-1-shijie@os.amperecomputing.c...
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/arm64/kernel/irq.c    | 5 +++--
 drivers/base/arch_numa.c   | 2 +-
 include/asm-generic/numa.h | 2 ++
 3 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index 6ad5c6ef5329..9f253d8efe90 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <asm/daifflags.h>
 #include <asm/exception.h>
+#include <asm/numa.h>
 #include <asm/softirq_stack.h>
 #include <asm/stacktrace.h>
 #include <asm/vmap_stack.h>
@@ -51,13 +52,13 @@ static void init_irq_scs(void)
 }
 
 #ifdef CONFIG_VMAP_STACK
-static void init_irq_stacks(void)
+static void __init init_irq_stacks(void)
 {
 	int cpu;
 	unsigned long *p;
 
 	for_each_possible_cpu(cpu) {
-		p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu));
+		p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, early_cpu_to_node(cpu));
 		per_cpu(irq_stack_ptr, cpu) = p;
 	}
 }
diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index eaa31e567d1e..5b59d133b6af 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -144,7 +144,7 @@ void __init early_map_cpu_to_node(unsigned int cpu, int nid)
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
 
-static int __init early_cpu_to_node(int cpu)
+int __init early_cpu_to_node(int cpu)
 {
 	return cpu_to_node_map[cpu];
 }
diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h
index 1a3ad6d29833..c32e0cf23c90 100644
--- a/include/asm-generic/numa.h
+++ b/include/asm-generic/numa.h
@@ -35,6 +35,7 @@ int __init numa_add_memblk(int nodeid, u64 start, u64 end);
 void __init numa_set_distance(int from, int to, int distance);
 void __init numa_free_distance(void);
 void __init early_map_cpu_to_node(unsigned int cpu, int nid);
+int __init early_cpu_to_node(int cpu);
 void numa_store_cpu_info(unsigned int cpu);
 void numa_add_cpu(unsigned int cpu);
 void numa_remove_cpu(unsigned int cpu);
@@ -46,6 +47,7 @@ static inline void numa_add_cpu(unsigned int cpu) { }
 static inline void numa_remove_cpu(unsigned int cpu) { }
 static inline void arch_numa_init(void) { }
 static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { }
+static inline int early_cpu_to_node(int cpu) { return 0; }
#endif /* CONFIG_NUMA */
From: Mark Rutland <mark.rutland@arm.com>
[ Upstream commit ca6f537e459e2da4b331fe8928d1a0b0f9301f42 ]
The SW_INCR event is somewhat unusual, and depends on the specific HW counter that it is programmed into. When programmed into PMEVCNTR<n>, SW_INCR will count any writes to PMSWINC_EL0 with bit n set, ignoring writes to SW_INCR with bit n clear.
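As a rough sketch of that mechanic (kernel-side illustration using the generic write_sysreg() helper, not code from this driver): incrementing event counter n means writing a mask with bit n set to PMSWINC_EL0, and only counters programmed with SW_INCR react to it:

    /* bump PMEVCNTR<n>, but only if it is counting SW_INCR */
    write_sysreg(BIT(n), pmswinc_el0);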
Event rotation means that there's no fixed relationship between perf_events and HW counters, so this isn't all that useful.
Further, we program PMUSERENR.{SW,EN}=={0,0}, which causes EL0 writes to PMSWINC_EL0 to be trapped and handled as UNDEFINED, resulting in a SIGILL to userspace.
Given that, it's not a good idea to expose SW_INCR in sysfs. Hide it as we did for CHAIN back in commit:
4ba2578fa7b55701 ("arm64: perf: don't expose CHAIN event in sysfs")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20231204115847.2993026-1-mark.rutland@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/perf/arm_pmuv3.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index d681638ec6b8..0e8f54168cb6 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -169,7 +169,11 @@ armv8pmu_events_sysfs_show(struct device *dev,
 	PMU_EVENT_ATTR_ID(name, armv8pmu_events_sysfs_show, config)
 
 static struct attribute *armv8_pmuv3_event_attrs[] = {
-	ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR),
+	/*
+	 * Don't expose the sw_incr event in /sys. It's not usable as writes to
+	 * PMSWINC_EL0 will trap as PMUSERENR.{SW,EN}=={0,0} and event rotation
+	 * means we don't have a fixed event<->counter relationship regardless.
+	 */
 	ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL),
 	ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL),
 	ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL),
From: Michael Ellerman <mpe@ellerman.id.au>
[ Upstream commit f8d3555355653848082c351fa90775214fb8a4fa ]
With CONFIG_GENERIC_BUG=n the build fails with:
arch/powerpc/kernel/traps.c:1442:5: error: no previous prototype for ‘is_valid_bugaddr’ [-Werror=missing-prototypes]
 1442 | int is_valid_bugaddr(unsigned long addr)
      |     ^~~~~~~~~~~~~~~~
The prototype is only defined, and the function is only needed, when CONFIG_GENERIC_BUG=y, so move the implementation under that.
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231130114433.3053544-2-mpe@ellerman.id.au
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/kernel/traps.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index fe3f720c9cd6..2de7f6dcd32b 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1439,10 +1439,12 @@ static int emulate_instruction(struct pt_regs *regs)
 	return -EINVAL;
 }
 
+#ifdef CONFIG_GENERIC_BUG
 int is_valid_bugaddr(unsigned long addr)
 {
 	return is_kernel_addr(addr);
 }
+#endif
 
 #ifdef CONFIG_MATH_EMULATION
 static int emulate_math(struct pt_regs *regs)
From: Michael Ellerman <mpe@ellerman.id.au>
[ Upstream commit d8c3f243d4db24675b653f0568bb65dae34e6455 ]
With NUMA=n and FA_DUMP=y or PRESERVE_FA_DUMP=y the build fails with:
arch/powerpc/kernel/fadump.c:1739:22: error: no previous prototype for ‘arch_reserved_kernel_pages’ [-Werror=missing-prototypes]
 1739 | unsigned long __init arch_reserved_kernel_pages(void)
      |                      ^~~~~~~~~~~~~~~~~~~~~~~~~~
The prototype for arch_reserved_kernel_pages() is in include/linux/mm.h, but it's guarded by __HAVE_ARCH_RESERVED_KERNEL_PAGES. The powerpc headers define __HAVE_ARCH_RESERVED_KERNEL_PAGES in asm/mmzone.h, which is not included into the generic headers when NUMA=n.
Move the definition of __HAVE_ARCH_RESERVED_KERNEL_PAGES into asm/mmu.h which is included regardless of NUMA=n.
Additionally the ifdef around __HAVE_ARCH_RESERVED_KERNEL_PAGES needs to also check for CONFIG_PRESERVE_FA_DUMP.
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231130114433.3053544-1-mpe@ellerman.id.au
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/include/asm/mmu.h    | 4 ++++
 arch/powerpc/include/asm/mmzone.h | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 52cc25864a1b..d8b7e246a32f 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -412,5 +412,9 @@ extern void *abatron_pteptrs[2];
 #include <asm/nohash/mmu.h>
 #endif
 
+#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
+#define __HAVE_ARCH_RESERVED_KERNEL_PAGES
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_MMU_H_ */
diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h
index 4c6c6dbd182f..3764d3585d30 100644
--- a/arch/powerpc/include/asm/mmzone.h
+++ b/arch/powerpc/include/asm/mmzone.h
@@ -42,9 +42,6 @@ u64 memory_hotplug_max(void);
 #else
 #define memory_hotplug_max() memblock_end_of_DRAM()
 #endif /* CONFIG_NUMA */
-#ifdef CONFIG_FA_DUMP
-#define __HAVE_ARCH_RESERVED_KERNEL_PAGES
-#endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 extern int create_section_mapping(unsigned long start, unsigned long end,
From: Michael Ellerman <mpe@ellerman.id.au>
[ Upstream commit ede66cd22441820cbd399936bf84fdc4294bc7fa ]
With CONFIG_NUMA=n the build fails with:
arch/powerpc/mm/book3s64/pgtable.c:275:15: error: no previous prototype for ‘create_section_mapping’ [-Werror=missing-prototypes]
  275 | int __meminit create_section_mapping(unsigned long start, unsigned long end,
      |               ^~~~~~~~~~~~~~~~~~~~~~
That happens because the prototype for create_section_mapping() is in asm/mmzone.h, but asm/mmzone.h is only included by linux/mmzone.h when CONFIG_NUMA=y.
In fact the prototype is only needed by arch/powerpc/mm code, so move it into arch/powerpc/mm/mmu_decl.h, which also fixes the build error.
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231129131919.2528517-5-mpe@ellerman.id.au
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/include/asm/mmzone.h | 5 -----
 arch/powerpc/mm/mmu_decl.h        | 5 +++++
 2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h
index 3764d3585d30..da827d2d0866 100644
--- a/arch/powerpc/include/asm/mmzone.h
+++ b/arch/powerpc/include/asm/mmzone.h
@@ -43,10 +43,5 @@ u64 memory_hotplug_max(void);
 #define memory_hotplug_max() memblock_end_of_DRAM()
 #endif /* CONFIG_NUMA */
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-extern int create_section_mapping(unsigned long start, unsigned long end,
-				  int nid, pgprot_t prot);
-#endif
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_MMZONE_H_ */
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7f9ff0640124..72341b9fb552 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -181,3 +181,8 @@ static inline bool debug_pagealloc_enabled_or_kfence(void)
 {
 	return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled();
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int create_section_mapping(unsigned long start, unsigned long end,
+			   int nid, pgprot_t prot);
+#endif
From: Jun'ichi Nomura <junichi.nomura@nec.com>
[ Upstream commit 78a509fba9c9b1fcb77f95b7c6be30da3d24823a ]
When there are two racing NMIs on x86, the first NMI invokes NMI handler and the 2nd NMI is latched until IRET is executed.
If panic on NMI and panic kexec are enabled, the first NMI triggers panic and starts booting the next kernel via kexec. Note that the 2nd NMI is still latched. During the early boot of the next kernel, once an IRET is executed as a result of a page fault, the 2nd NMI is unlatched and invokes the NMI handler.

However, the NMI handler is not set up at that early stage of boot, which results in a boot failure.
Avoid such problems by setting up a NOP handler for early NMIs.
[ mingo: Refined the changelog. ]
Signed-off-by: Jun'ichi Nomura <junichi.nomura@nec.com>
Signed-off-by: Derek Barbosa <debarbos@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/boot/compressed/ident_map_64.c    | 5 +++++
 arch/x86/boot/compressed/idt_64.c          | 1 +
 arch/x86/boot/compressed/idt_handlers_64.S | 1 +
 arch/x86/boot/compressed/misc.h            | 1 +
 4 files changed, 8 insertions(+)
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index 08f93b0401bb..aead80ec70a0 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -385,3 +385,8 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
 	 */
 	kernel_add_identity_map(address, end);
 }
+
+void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code)
+{
+	/* Empty handler to ignore NMI during early boot */
+}
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 3cdf94b41456..d100284bbef4 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -61,6 +61,7 @@ void load_stage2_idt(void)
 	boot_idt_desc.address = (unsigned long)boot_idt;
 
 	set_idt_entry(X86_TRAP_PF, boot_page_fault);
+	set_idt_entry(X86_TRAP_NMI, boot_nmi_trap);
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 	/*
diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S
index 22890e199f5b..4d03c8562f63 100644
--- a/arch/x86/boot/compressed/idt_handlers_64.S
+++ b/arch/x86/boot/compressed/idt_handlers_64.S
@@ -70,6 +70,7 @@ SYM_FUNC_END(\name)
 	.code64
 
 EXCEPTION_HANDLER	boot_page_fault do_boot_page_fault error_code=1
+EXCEPTION_HANDLER	boot_nmi_trap do_boot_nmi_trap error_code=0
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 EXCEPTION_HANDLER	boot_stage1_vc do_vc_no_ghcb error_code=1
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index cc70d3fb9049..aae1a2db4251 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -197,6 +197,7 @@ static inline void cleanup_exception_handling(void) { }
 
 /* IDT Entry Points */
 void boot_page_fault(void);
+void boot_nmi_trap(void);
 void boot_stage1_vc(void);
 void boot_stage2_vc(void);
From: Stephen Rothwell <sfr@canb.auug.org.au>
[ Upstream commit 0d555b57ee660d8a871781c0eebf006e855e918d ]
The linux-next build of powerpc64 allnoconfig fails with:
arch/powerpc/mm/book3s64/pgtable.c:557:5: error: no previous prototype for 'pmd_move_must_withdraw'
  557 | int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
      |     ^~~~~~~~~~~~~~~~~~~~~~
Caused by commit:
c6345dfa6e3e ("Makefile.extrawarn: turn on missing-prototypes globally")
Fix it by moving the function definition under CONFIG_TRANSPARENT_HUGEPAGE like the prototype. The function is only called when CONFIG_TRANSPARENT_HUGEPAGE=y.
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
[mpe: Flesh out change log from linux-next patch]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231127132809.45c2b398@canb.auug.org.au
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/mm/book3s64/pgtable.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 8f8a62d3ff4d..926bec775f41 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -542,6 +542,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
 	set_pte_at(vma->vm_mm, addr, ptep, pte);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * For hash translation mode, we use the deposited table to store hash slot
  * information and they are stored at PTRS_PER_PMD offset from related pmd
@@ -563,6 +564,7 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
 
 	return true;
 }
+#endif
 
 /*
  * Does the CPU support tlbie?
From: Naveen N Rao <naveen@kernel.org>
[ Upstream commit 8f9abaa6d7de0a70fc68acaedce290c1f96e2e59 ]
Some of the fp/vmx code in sstep.c assumes a certain maximum size for the instructions being emulated. The size of those operations, however, is determined separately in analyse_instr().
Add a check to validate the assumption on the maximum size of the operations, so as to prevent any unintended kernel stack corruption.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Build-tested-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20231123071705.397625-1-naveen@kernel.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/lib/sstep.c | 10 ++++++++++
 1 file changed, 10 insertions(+)
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index a4ab8625061a..6af97dc0f6d5 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -586,6 +586,8 @@ static int do_fp_load(struct instruction_op *op, unsigned long ea,
 	} u;
 
 	nb = GETSIZE(op->type);
+	if (nb > sizeof(u))
+		return -EINVAL;
 	if (!address_ok(regs, ea, nb))
 		return -EFAULT;
 	rn = op->reg;
@@ -636,6 +638,8 @@ static int do_fp_store(struct instruction_op *op, unsigned long ea,
 	} u;
 
 	nb = GETSIZE(op->type);
+	if (nb > sizeof(u))
+		return -EINVAL;
 	if (!address_ok(regs, ea, nb))
 		return -EFAULT;
 	rn = op->reg;
@@ -680,6 +684,9 @@ static nokprobe_inline int do_vec_load(int rn, unsigned long ea,
 		u8 b[sizeof(__vector128)];
 	} u = {};
 
+	if (size > sizeof(u))
+		return -EINVAL;
+
 	if (!address_ok(regs, ea & ~0xfUL, 16))
 		return -EFAULT;
 	/* align to multiple of size */
@@ -707,6 +714,9 @@ static nokprobe_inline int do_vec_store(int rn, unsigned long ea,
 		u8 b[sizeof(__vector128)];
 	} u;
 
+	if (size > sizeof(u))
+		return -EINVAL;
+
 	if (!address_ok(regs, ea & ~0xfUL, 16))
 		return -EFAULT;
 	/* align to multiple of size */
From: Miklos Szeredi <mszeredi@redhat.com>
[ Upstream commit 98d2b43081972abeb5bb5a087bc3e3197531c46e ]
If a mount is released then its mnt_id can immediately be reused. This is bad news for user interfaces that want to uniquely identify a mount.
Implementing a unique mount ID is trivial (use a 64bit counter). Unfortunately, userspace assumes a 32bit size and would overflow after the counter reaches 2^32.

Introduce a new 64bit ID alongside the old one. Initialize the counter to 2^32; this guarantees that the old and new IDs are never mixed up.
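A hedged userspace sketch of how the new ID is meant to be consumed (assumes a libc that exposes statx(); on kernels without this patch the new flag is simply absent from stx_mask and the old ID is returned):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <sys/stat.h>

    #ifndef STATX_MNT_ID_UNIQUE
    #define STATX_MNT_ID_UNIQUE 0x00004000U /* matches the new UAPI bit */
    #endif

    int main(void)
    {
            struct statx stx;

            /* Ask for the 64bit unique ID; old kernels only set STATX_MNT_ID. */
            if (statx(AT_FDCWD, "/", 0, STATX_MNT_ID_UNIQUE, &stx) != 0)
                    return 1;

            printf("%s mount id: %llu\n",
                   (stx.stx_mask & STATX_MNT_ID_UNIQUE) ? "unique" : "old",
                   (unsigned long long)stx.stx_mnt_id);
            return 0;
    }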
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Link: https://lore.kernel.org/r/20231025140205.3586473-2-mszeredi@redhat.com
Reviewed-by: Ian Kent <raven@themaw.net>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/mount.h                | 3 ++-
 fs/namespace.c            | 4 ++++
 fs/stat.c                 | 9 +++++++--
 include/uapi/linux/stat.h | 1 +
 4 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/fs/mount.h b/fs/mount.h
index 130c07c2f8d2..a14f762b3f29 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -72,7 +72,8 @@ struct mount {
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
 #endif
-	int mnt_id;			/* mount identifier */
+	int mnt_id;			/* mount identifier, reused */
+	u64 mnt_id_unique;		/* mount ID unique until reboot */
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..e02bc5f41c7b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -68,6 +68,9 @@ static u64 event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
 
+/* Don't allow confusion with old 32bit mount ID */
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
+
 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
@@ -131,6 +134,7 @@ static int mnt_alloc_id(struct mount *mnt)
 	if (res < 0)
 		return res;
 	mnt->mnt_id = res;
+	mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
 	return 0;
 }
 
diff --git a/fs/stat.c b/fs/stat.c
index 5375be5f97cc..e2a124a2d5f3 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -247,8 +247,13 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
 
 	error = vfs_getattr(&path, stat, request_mask, flags);
 
-	stat->mnt_id = real_mount(path.mnt)->mnt_id;
-	stat->result_mask |= STATX_MNT_ID;
+	if (request_mask & STATX_MNT_ID_UNIQUE) {
+		stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
+		stat->result_mask |= STATX_MNT_ID_UNIQUE;
+	} else {
+		stat->mnt_id = real_mount(path.mnt)->mnt_id;
+		stat->result_mask |= STATX_MNT_ID;
+	}
 
 	if (path.mnt->mnt_root == path.dentry)
 		stat->attributes |= STATX_ATTR_MOUNT_ROOT;
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 7cab2c65d3d7..2f2ee82d5517 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -154,6 +154,7 @@ struct statx {
 #define STATX_BTIME		0x00000800U	/* Want/got stx_btime */
 #define STATX_MNT_ID		0x00001000U	/* Got stx_mnt_id */
 #define STATX_DIOALIGN		0x00002000U	/* Want/got direct I/O alignment info */
+#define STATX_MNT_ID_UNIQUE	0x00004000U	/* Want/got extended stx_mount_id */
 
 #define STATX__RESERVED		0x80000000U	/* Reserved for future struct statx expansion */
From: "Borislav Petkov (AMD)" bp@alien8.de
[ Upstream commit 04c3024560d3a14acd18d0a51a1d0a89d29b7eb5 ]
AMD does not have the requirement for a synchronization barrier when accessing a certain group of MSRs. Do not incur that unnecessary penalty there.

There will be a CPUID bit which explicitly states that an MFENCE is not needed. Once that bit is added to the APM, this will be extended with it.
While at it, move to processor.h to avoid include hell. Untangling that file properly is a matter for another day.
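For orientation, the call pattern this change optimizes looks roughly like the following (simplified sketch of the x2APIC IPI send path, not the verbatim kernel code); the fence only needs to precede writes to the few non-serializing MSRs, i.e. the x2APIC ones and IA32_TSC_DEADLINE:

    weak_wrmsr_fence();     /* now patched out on AMD/Hygon via alternative() */
    native_x2apic_icr_write(cfg, apicid);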
Some notes on the performance aspect of why this is relevant, courtesy of Kishon VijayAbraham <Kishon.VijayAbraham@amd.com>:
On an AMD Zen4 system with 96 cores, a modified ipi-bench[1] on a VM shows that the x2AVIC IPI rate is 3% to 4% lower than the AVIC IPI rate. The ipi-bench is modified so that the IPIs are sent between two vCPUs in the same CCX. This also requires pinning the vCPUs to physical cores to prevent any latencies. This simulates the use case of pinning vCPUs to the threads of a single CCX to avoid interrupt IPI latency.
In order to avoid run-to-run variance (for both x2AVIC and AVIC), the below configurations are done:
1) Disable Power States in BIOS (to prevent the system from going to lower power state)
2) Run the system at fixed frequency 2500MHz (to prevent the system from increasing the frequency when the load is more)
With the above configuration:
*) Performance measured using ipi-bench for AVIC: Average Latency: 1124.98ns [Time to send IPI from one vCPU to another vCPU]
Cumulative throughput: 42.6759M/s [Total number of IPIs sent in a second from 48 vCPUs simultaneously]
*) Performance measured using ipi-bench for x2AVIC: Average Latency: 1172.42ns [Time to send IPI from one vCPU to another vCPU]
Cumulative throughput: 40.9432M/s [Total number of IPIs sent in a second from 48 vCPUs simultaneously]
From the above, x2AVIC latency is ~4% higher than AVIC. However, the expectation is for x2AVIC performance to be better than or equivalent to AVIC. Upon analyzing the perf captures, it is observed that significant time is spent in weak_wrmsr_fence() invoked by x2apic_send_IPI().

With the fix to skip weak_wrmsr_fence():
*) Performance measured using ipi-bench for x2AVIC: Average Latency: 1117.44ns [Time to send IPI from one vCPU to another vCPU]
Cumulative throughput: 42.9608M/s [Total number of IPIs sent in a second from 48 vCPUs simultaneously]
Comparing the performance of x2AVIC with and without the fix, it can be seen the performance improves by ~4%.
Performance captured using an unmodified ipi-bench using the 'mesh-ipi' option with and without weak_wrmsr_fence() on a Zen4 system also showed significant performance improvement without weak_wrmsr_fence(). The 'mesh-ipi' option ignores CCX or CCD and just picks random vCPU.
Average throughput (10 iterations) with weak_wrmsr_fence(), Cumulative throughput: 4933374 IPI/s
Average throughput (10 iterations) without weak_wrmsr_fence(), Cumulative throughput: 6355156 IPI/s
[1] https://github.com/bytedance/kvm-utils/tree/master/microbenchmark/ipi-bench
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20230622095212.20940-1-bp@alien8.de
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/include/asm/barrier.h     | 18 ------------------
 arch/x86/include/asm/cpufeatures.h |  2 +-
 arch/x86/include/asm/processor.h   | 18 ++++++++++++++++++
 arch/x86/kernel/cpu/amd.c          |  3 +++
 arch/x86/kernel/cpu/common.c       |  7 +++++++
 arch/x86/kernel/cpu/hygon.c        |  3 +++
 6 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 35389b2af88e..0216f63a366b 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -81,22 +81,4 @@ do {									\
 
 #include <asm-generic/barrier.h>
 
-/*
- * Make previous memory operations globally visible before
- * a WRMSR.
- *
- * MFENCE makes writes visible, but only affects load/store
- * instructions. WRMSR is unfortunately not a load/store
- * instruction and is unaffected by MFENCE. The LFENCE ensures
- * that the WRMSR is not reordered.
- *
- * Most WRMSRs are full serializing instructions themselves and
- * do not require this barrier. This is only required for the
- * IA32_TSC_DEADLINE and X2APIC MSRs.
- */
-static inline void weak_wrmsr_fence(void)
-{
-	asm volatile("mfence; lfence" : : : "memory");
-}
-
 #endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 58cb9495e40f..0091f1008314 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -308,10 +308,10 @@
 #define X86_FEATURE_SMBA		(11*32+21) /* "" Slow Memory Bandwidth Allocation */
 #define X86_FEATURE_BMEC		(11*32+22) /* "" Bandwidth Monitoring Event Configuration */
 #define X86_FEATURE_USER_SHSTK		(11*32+23) /* Shadow stack support for user mode applications */
-
 #define X86_FEATURE_SRSO		(11*32+24) /* "" AMD BTB untrain RETs */
 #define X86_FEATURE_SRSO_ALIAS		(11*32+25) /* "" AMD BTB untrain RETs through aliasing */
 #define X86_FEATURE_IBPB_ON_VMEXIT	(11*32+26) /* "" Issue an IBPB only on VMEXIT */
+#define X86_FEATURE_APIC_MSRS_FENCE	(11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX_VNNI		(12*32+ 4) /* AVX VNNI instructions */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a3669a7774ed..191f1d8f0506 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -734,4 +734,22 @@ bool arch_is_platform_page(u64 paddr);
 
 extern bool gds_ucode_mitigated(void);
 
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions. WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE. The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier. This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+	alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE));
+}
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 6e4f23f314ac..bb3efc825bf4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1157,6 +1157,9 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
 	    cpu_has_amd_erratum(c, amd_erratum_1485))
 		msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+	/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+	clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e5ffc8b0e46..d98d023ae497 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1858,6 +1858,13 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
 #endif
 
+
+	/*
+	 * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+	 * Hygon will clear it in ->c_init() below.
+	 */
+	set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
 	/*
 	 * Vendor-specific initialization. In this section we
 	 * canonicalize the feature flags, meaning if there are
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index a7b3ef4c4de9..6e738759779e 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -348,6 +348,9 @@ static void init_hygon(struct cpuinfo_x86 *c)
 	set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
 
 	check_null_seg_clears_base(c);
+
+	/* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+	clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
 }
 
 static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
From: Zhiquan Li <zhiquan1.li@intel.com>
[ Upstream commit 9f3b130048bfa2e44a8cfb1b616f826d9d5d8188 ]
Memory errors don't happen very often, especially fatal ones. However, in large-scale scenarios such as data centers, that probability increases with the number of machines present.
When a fatal machine check happens, mce_panic() is called based on the severity grading of that error. The page containing the error is not marked as poison.
However, when kexec is enabled, tools like makedumpfile understand when pages are marked as poison and do not touch them so as not to cause a fatal machine check exception again while dumping the previous kernel's memory.
Therefore, mark the page containing the error as poisoned so that the kexec'ed kernel can avoid accessing the page.
[ bp: Rewrite commit message and comment. ]
Co-developed-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Link: https://lore.kernel.org/r/20231014051754.3759099-1-zhiquan1.li@intel.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/kernel/cpu/mce/core.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 6f35f724cc14..20ab11aec60b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -44,6 +44,7 @@
 #include <linux/sync_core.h>
 #include <linux/task_work.h>
 #include <linux/hardirq.h>
+#include <linux/kexec.h>
 
 #include <asm/intel-family.h>
 #include <asm/processor.h>
@@ -233,6 +234,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
 	struct llist_node *pending;
 	struct mce_evt_llist *l;
 	int apei_err = 0;
+	struct page *p;
 
 	/*
 	 * Allow instrumentation around external facilities usage. Not that it
@@ -286,6 +288,20 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
 	if (!fake_panic) {
 		if (panic_timeout == 0)
 			panic_timeout = mca_cfg.panic_timeout;
+
+		/*
+		 * Kdump skips the poisoned page in order to avoid
+		 * touching the error bits again. Poison the page even
+		 * if the error is fatal and the machine is about to
+		 * panic.
+		 */
+		if (kexec_crash_loaded()) {
+			if (final && (final->status & MCI_STATUS_ADDRV)) {
+				p = pfn_to_online_page(final->addr >> PAGE_SHIFT);
+				if (p)
+					SetPageHWPoison(p);
+			}
+		}
 		panic(msg);
 	} else
 		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);