arch_bpf_trampoline_size() provides JIT size of the BPF trampoline
before the buffer for JIT'ing it is allocated. The total number of
instructions emitted for BPF trampoline JIT code depends on where
the final image is located. So, the size arrived at with the dummy
pass in arch_bpf_trampoline_size() can vary from the actual size
needed in arch_prepare_bpf_trampoline(). When the instructions
accounted in arch_bpf_trampoline_size() is less than the number of
instructions emitted during the actual JIT compile of the trampoline,
the below warning is produced:
WARNING: CPU: 8 PID: 204190 at arch/powerpc/net/bpf_jit_comp.c:981 __arch_prepare_bpf_trampoline.isra.0+0xd2c/0xdcc
which is:
/* Make sure the trampoline generation logic doesn't overflow */
if (image && WARN_ON_ONCE(&image[ctx->idx] >
(u32 *)rw_image_end - BPF_INSN_SAFETY)) {
So, during the dummy pass, instead of providing some arbitrary image
location, account for maximum possible instructions if and when there
is a dependency with image location for JIT'ing.
Fixes: d243b62b7bd3 ("powerpc64/bpf: Add support for bpf trampolines")
Reported-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com>
Closes: https://lore.kernel.org/all/6168bfc8-659f-4b5a-a6fb-90a916dde3b3@linux.ibm.…
Cc: stable(a)vger.kernel.org # v6.13+
Acked-by: Naveen N Rao (AMD) <naveen(a)kernel.org>
Tested-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com>
Signed-off-by: Hari Bathini <hbathini(a)linux.ibm.com>
---
Changes since v2:
- Address review comments from Naveen:
- Remove additional padding for 'case BPF_LD | BPF_IMM | BPF_DW:'
in arch/powerpc/net/bpf_jit_comp*.c
- Merge the if sequence in bpf_jit_emit_func_call_rel() with the
other conditionals
- Naveen, carried your Acked-by tag as the additional changes are
minimal and in line with your suggestion. Feel free to update
if you look at it differently.
- Venkat, carried your Tested-by tag from v2 as the changes in
v3 should not alter the test result. Feel free to update if
you look at it differently.
Changes since v1:
- Pass NULL for image during intial pass and account for max. possible
instruction during this pass as Naveen suggested.
arch/powerpc/net/bpf_jit.h | 20 ++++++++++++++++---
arch/powerpc/net/bpf_jit_comp.c | 33 ++++++++++---------------------
arch/powerpc/net/bpf_jit_comp32.c | 6 ------
arch/powerpc/net/bpf_jit_comp64.c | 15 +++++++-------
4 files changed, 35 insertions(+), 39 deletions(-)
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 6beacaec63d3..4c26912c2e3c 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -51,8 +51,16 @@
EMIT(PPC_INST_BRANCH_COND | (((cond) & 0x3ff) << 16) | (offset & 0xfffc)); \
} while (0)
-/* Sign-extended 32-bit immediate load */
+/*
+ * Sign-extended 32-bit immediate load
+ *
+ * If this is a dummy pass (!image), account for
+ * maximum possible instructions.
+ */
#define PPC_LI32(d, i) do { \
+ if (!image) \
+ ctx->idx += 2; \
+ else { \
if ((int)(uintptr_t)(i) >= -32768 && \
(int)(uintptr_t)(i) < 32768) \
EMIT(PPC_RAW_LI(d, i)); \
@@ -60,10 +68,15 @@
EMIT(PPC_RAW_LIS(d, IMM_H(i))); \
if (IMM_L(i)) \
EMIT(PPC_RAW_ORI(d, d, IMM_L(i))); \
- } } while(0)
+ } \
+ } } while (0)
#ifdef CONFIG_PPC64
+/* If dummy pass (!image), account for maximum possible instructions */
#define PPC_LI64(d, i) do { \
+ if (!image) \
+ ctx->idx += 5; \
+ else { \
if ((long)(i) >= -2147483648 && \
(long)(i) < 2147483648) \
PPC_LI32(d, i); \
@@ -84,7 +97,8 @@
if ((uintptr_t)(i) & 0x000000000000ffffULL) \
EMIT(PPC_RAW_ORI(d, d, (uintptr_t)(i) & \
0xffff)); \
- } } while (0)
+ } \
+ } } while (0)
#define PPC_LI_ADDR PPC_LI64
#ifndef CONFIG_PPC_KERNEL_PCREL
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 2991bb171a9b..c0684733e9d6 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -504,10 +504,11 @@ static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ct
EMIT(PPC_RAW_ADDI(_R3, _R1, regs_off));
if (!p->jited)
PPC_LI_ADDR(_R4, (unsigned long)p->insnsi);
- if (!create_branch(&branch_insn, (u32 *)&ro_image[ctx->idx], (unsigned long)p->bpf_func,
- BRANCH_SET_LINK)) {
- if (image)
- image[ctx->idx] = ppc_inst_val(branch_insn);
+ /* Account for max possible instructions during dummy pass for size calculation */
+ if (image && !create_branch(&branch_insn, (u32 *)&ro_image[ctx->idx],
+ (unsigned long)p->bpf_func,
+ BRANCH_SET_LINK)) {
+ image[ctx->idx] = ppc_inst_val(branch_insn);
ctx->idx++;
} else {
EMIT(PPC_RAW_LL(_R12, _R25, offsetof(struct bpf_prog, bpf_func)));
@@ -889,7 +890,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
bpf_trampoline_restore_tail_call_cnt(image, ctx, func_frame_offset, r4_off);
/* Reserve space to patch branch instruction to skip fexit progs */
- im->ip_after_call = &((u32 *)ro_image)[ctx->idx];
+ if (ro_image) /* image is NULL for dummy pass */
+ im->ip_after_call = &((u32 *)ro_image)[ctx->idx];
EMIT(PPC_RAW_NOP());
}
@@ -912,7 +914,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- im->ip_epilogue = &((u32 *)ro_image)[ctx->idx];
+ if (ro_image) /* image is NULL for dummy pass */
+ im->ip_epilogue = &((u32 *)ro_image)[ctx->idx];
PPC_LI_ADDR(_R3, im);
ret = bpf_jit_emit_func_call_rel(image, ro_image, ctx,
(unsigned long)__bpf_tramp_exit);
@@ -973,25 +976,9 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
struct bpf_tramp_links *tlinks, void *func_addr)
{
struct bpf_tramp_image im;
- void *image;
int ret;
- /*
- * Allocate a temporary buffer for __arch_prepare_bpf_trampoline().
- * This will NOT cause fragmentation in direct map, as we do not
- * call set_memory_*() on this buffer.
- *
- * We cannot use kvmalloc here, because we need image to be in
- * module memory range.
- */
- image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!image)
- return -ENOMEM;
-
- ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image,
- m, flags, tlinks, func_addr);
- bpf_jit_free_exec(image);
-
+ ret = __arch_prepare_bpf_trampoline(&im, NULL, NULL, NULL, m, flags, tlinks, func_addr);
return ret;
}
diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
index c4db278dae36..0aace304dfe1 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -313,7 +313,6 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
u64 func_addr;
u32 true_cond;
u32 tmp_idx;
- int j;
if (i && (BPF_CLASS(code) == BPF_ALU64 || BPF_CLASS(code) == BPF_ALU) &&
(BPF_CLASS(prevcode) == BPF_ALU64 || BPF_CLASS(prevcode) == BPF_ALU) &&
@@ -1099,13 +1098,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
* 16 byte instruction that uses two 'struct bpf_insn'
*/
case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */
- tmp_idx = ctx->idx;
PPC_LI32(dst_reg_h, (u32)insn[i + 1].imm);
PPC_LI32(dst_reg, (u32)insn[i].imm);
- /* padding to allow full 4 instructions for later patching */
- if (!image)
- for (j = ctx->idx - tmp_idx; j < 4; j++)
- EMIT(PPC_RAW_NOP());
/* Adjust for two bpf instructions */
addrs[++i] = ctx->idx * 4;
break;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 233703b06d7c..5daa77aee7f7 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -227,7 +227,14 @@ int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *
#ifdef CONFIG_PPC_KERNEL_PCREL
reladdr = func_addr - local_paca->kernelbase;
- if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) {
+ /*
+ * If fimage is NULL (the initial pass to find image size),
+ * account for the maximum no. of instructions possible.
+ */
+ if (!fimage) {
+ ctx->idx += 7;
+ return 0;
+ } else if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) {
EMIT(PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, kernelbase)));
/* Align for subsequent prefix instruction */
if (!IS_ALIGNED((unsigned long)fimage + CTX_NIA(ctx), 8))
@@ -412,7 +419,6 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
u64 imm64;
u32 true_cond;
u32 tmp_idx;
- int j;
/*
* addrs[] maps a BPF bytecode address into a real offset from
@@ -1046,12 +1052,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */
imm64 = ((u64)(u32) insn[i].imm) |
(((u64)(u32) insn[i+1].imm) << 32);
- tmp_idx = ctx->idx;
PPC_LI64(dst_reg, imm64);
- /* padding to allow full 5 instructions for later patching */
- if (!image)
- for (j = ctx->idx - tmp_idx; j < 5; j++)
- EMIT(PPC_RAW_NOP());
/* Adjust for two bpf instructions */
addrs[++i] = ctx->idx * 4;
break;
--
2.49.0
The patch titled
Subject: mm: fix folio_pte_batch() on XEN PV
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-fix-folio_pte_batch-on-xen-pv.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Petr Van��k <arkamar(a)atlas.cz>
Subject: mm: fix folio_pte_batch() on XEN PV
Date: Fri, 2 May 2025 23:50:19 +0200
On XEN PV, folio_pte_batch() can incorrectly batch beyond the end of a
folio due to a corner case in pte_advance_pfn(). Specifically, when the
PFN following the folio maps to an invalidated MFN,
expected_pte = pte_advance_pfn(expected_pte, nr);
produces a pte_none(). If the actual next PTE in memory is also
pte_none(), the pte_same() succeeds,
if (!pte_same(pte, expected_pte))
break;
the loop is not broken, and batching continues into unrelated memory.
For example, with a 4-page folio, the PTE layout might look like this:
[ 53.465673] [ T2552] folio_pte_batch: printing PTE values at addr=0x7f1ac9dc5000
[ 53.465674] [ T2552] PTE[453] = 000000010085c125
[ 53.465679] [ T2552] PTE[454] = 000000010085d125
[ 53.465682] [ T2552] PTE[455] = 000000010085e125
[ 53.465684] [ T2552] PTE[456] = 000000010085f125
[ 53.465686] [ T2552] PTE[457] = 0000000000000000 <-- not present
[ 53.465689] [ T2552] PTE[458] = 0000000101da7125
pte_advance_pfn(PTE[456]) returns a pte_none() due to invalid PFN->MFN
mapping. The next actual PTE (PTE[457]) is also pte_none(), so the loop
continues and includes PTE[457] in the batch, resulting in 5 batched
entries for a 4-page folio. This triggers the following warning:
[ 53.465751] [ T2552] page: refcount:85 mapcount:20 mapping:ffff88813ff4f6a8 index:0x110 pfn:0x10085c
[ 53.465754] [ T2552] head: order:2 mapcount:80 entire_mapcount:0 nr_pages_mapped:4 pincount:0
[ 53.465756] [ T2552] memcg:ffff888003573000
[ 53.465758] [ T2552] aops:0xffffffff8226fd20 ino:82467c dentry name(?):"libc.so.6"
[ 53.465761] [ T2552] flags: 0x2000000000416c(referenced|uptodate|lru|active|private|head|node=0|zone=2)
[ 53.465764] [ T2552] raw: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8
[ 53.465767] [ T2552] raw: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000
[ 53.465768] [ T2552] head: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8
[ 53.465770] [ T2552] head: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000
[ 53.465772] [ T2552] head: 0020000000000202 ffffea0004021701 000000040000004f 00000000ffffffff
[ 53.465774] [ T2552] head: 0000000300000003 8000000300000002 0000000000000013 0000000000000004
[ 53.465775] [ T2552] page dumped because: VM_WARN_ON_FOLIO((_Generic((page + nr_pages - 1), const struct page *: (const struct folio *)_compound_head(page + nr_pages - 1), struct page *: (struct folio *)_compound_head(page + nr_pages - 1))) != folio)
Original code works as expected everywhere, except on XEN PV, where
pte_advance_pfn() can yield a pte_none() after balloon inflation due to
MFNs invalidation. In XEN, pte_advance_pfn() ends up calling
__pte()->xen_make_pte()->pte_pfn_to_mfn(), which returns pte_none() when
mfn == INVALID_P2M_ENTRY.
The pte_pfn_to_mfn() documents that nastiness:
If there's no mfn for the pfn, then just create an
empty non-present pte. Unfortunately this loses
information about the original pfn, so
pte_mfn_to_pfn is asymmetric.
While such hacks should certainly be removed, we can do better in
folio_pte_batch() and simply check ahead of time how many PTEs we can
possibly batch in our folio.
This way, we can not only fix the issue but cleanup the code: removing the
pte_pfn() check inside the loop body and avoiding end_ptr comparison +
arithmetic.
Link: https://lkml.kernel.org/r/20250502215019.822-2-arkamar@atlas.cz
Fixes: f8d937761d65 ("mm/memory: optimize fork() with PTE-mapped THP")
Co-developed-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: Petr Van��k <arkamar(a)atlas.cz>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/internal.h | 27 +++++++++++----------------
1 file changed, 11 insertions(+), 16 deletions(-)
--- a/mm/internal.h~mm-fix-folio_pte_batch-on-xen-pv
+++ a/mm/internal.h
@@ -248,11 +248,9 @@ static inline int folio_pte_batch(struct
pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
bool *any_writable, bool *any_young, bool *any_dirty)
{
- unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
- const pte_t *end_ptep = start_ptep + max_nr;
pte_t expected_pte, *ptep;
bool writable, young, dirty;
- int nr;
+ int nr, cur_nr;
if (any_writable)
*any_writable = false;
@@ -265,11 +263,15 @@ static inline int folio_pte_batch(struct
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
+
nr = pte_batch_hint(start_ptep, pte);
expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
ptep = start_ptep + nr;
- while (ptep < end_ptep) {
+ while (nr < max_nr) {
pte = ptep_get(ptep);
if (any_writable)
writable = !!pte_write(pte);
@@ -282,14 +284,6 @@ static inline int folio_pte_batch(struct
if (!pte_same(pte, expected_pte))
break;
- /*
- * Stop immediately once we reached the end of the folio. In
- * corner cases the next PFN might fall into a different
- * folio.
- */
- if (pte_pfn(pte) >= folio_end_pfn)
- break;
-
if (any_writable)
*any_writable |= writable;
if (any_young)
@@ -297,12 +291,13 @@ static inline int folio_pte_batch(struct
if (any_dirty)
*any_dirty |= dirty;
- nr = pte_batch_hint(ptep, pte);
- expected_pte = pte_advance_pfn(expected_pte, nr);
- ptep += nr;
+ cur_nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, cur_nr);
+ ptep += cur_nr;
+ nr += cur_nr;
}
- return min(ptep - start_ptep, max_nr);
+ return min(nr, max_nr);
}
/**
_
Patches currently in -mm which might be from arkamar(a)atlas.cz are
mm-fix-folio_pte_batch-on-xen-pv.patch
create_init_idmap() could be called before .bss section initialization
which is done in early_map_kernel() since data/test_prot could be set
wrong for PTE_MAYBE_NG macro.
PTE_MAYBE_NG macro is set according to value of "arm64_use_ng_mappings".
and this variable is located in .bss section.
# llvm-objdump-21 --syms vmlinux-gcc | grep arm64_use_ng_mappings
ffff800082f242a8 g O .bss 0000000000000001 arm64_use_ng_mappings
If .bss section doesn't initialized, "arm64_use_ng_mappings" would be set
with garbage value ant then the text_prot or data_prot could be set
with garbgae value.
Here is what i saw with kernel compiled via llvm-21
// create_init_idmap()
ffff80008255c058: d10103ff sub sp, sp, #0x40
ffff80008255c05c: a9017bfd stp x29, x30, [sp, #0x10]
ffff80008255c060: a90257f6 stp x22, x21, [sp, #0x20]
ffff80008255c064: a9034ff4 stp x20, x19, [sp, #0x30]
ffff80008255c068: 910043fd add x29, sp, #0x10
ffff80008255c06c: 90003fc8 adrp x8, 0xffff800082d54000
ffff80008255c070: d280e06a mov x10, #0x703 // =1795
ffff80008255c074: 91400409 add x9, x0, #0x1, lsl #12 // =0x1000
ffff80008255c078: 394a4108 ldrb w8, [x8, #0x290] ------------- (1)
ffff80008255c07c: f2e00d0a movk x10, #0x68, lsl #48
ffff80008255c080: f90007e9 str x9, [sp, #0x8]
ffff80008255c084: aa0103f3 mov x19, x1
ffff80008255c088: aa0003f4 mov x20, x0
ffff80008255c08c: 14000000 b 0xffff80008255c08c <__pi_create_init_idmap+0x34>
ffff80008255c090: aa082d56 orr x22, x10, x8, lsl #11 -------- (2)
Note (1) is load the arm64_use_ng_mappings value in w8.
and (2) is set the text or data prot with the w8 value to set PTE_NG bit.
If .bss section doesn't initialized, x8 can include garbage value
-- In case of some platform, x8 loaded with 0xcf -- it could generate
wrong mapping. (i.e) text_prot is expected with
PAGE_KERNEL_ROX(0x0040000000000F83) but
with garbage x8 -- 0xcf, it sets with (0x0040000000067F83)
and This makes boot failure with translation fault.
This error cannot happen according to code generated by compiler.
here is the case of gcc:
ffff80008260a940 <__pi_create_init_idmap>:
ffff80008260a940: d100c3ff sub sp, sp, #0x30
ffff80008260a944: aa0003ed mov x13, x0
ffff80008260a948: 91400400 add x0, x0, #0x1, lsl #12 // =0x1000
ffff80008260a94c: a9017bfd stp x29, x30, [sp, #0x10]
ffff80008260a950: 910043fd add x29, sp, #0x10
ffff80008260a954: f90017e0 str x0, [sp, #0x28]
ffff80008260a958: d00048c0 adrp x0, 0xffff800082f24000 <reset_devices>
ffff80008260a95c: 394aa000 ldrb w0, [x0, #0x2a8]
ffff80008260a960: 37000640 tbnz w0, #0x0, 0xffff80008260aa28 <__pi_create_init_idmap+0xe8> ---(3)
ffff80008260a964: d280f060 mov x0, #0x783 // =1923
ffff80008260a968: d280e062 mov x2, #0x703 // =1795
ffff80008260a96c: f2e00800 movk x0, #0x40, lsl #48
ffff80008260a970: f2e00d02 movk x2, #0x68, lsl #48
ffff80008260a974: aa2103e4 mvn x4, x1
ffff80008260a978: 8a210049 bic x9, x2, x1
...
ffff80008260aa28: d281f060 mov x0, #0xf83 // =3971
ffff80008260aa2c: d281e062 mov x2, #0xf03 // =3843
ffff80008260aa30: f2e00800 movk x0, #0x40, lsl #48
In case of gcc, according to value of arm64_use_ng_mappings (annoated as(3)),
it branches to each prot settup code.
However this is also problem since it branches according to garbage
value too -- idmapping with wrong pgprot.
To resolve this, annotate arm64_use_ng_mappings as ro_after_init.
Fixes: 84b04d3e6bdb ("arm64: kernel: Create initial ID map from C code")
Cc: <stable(a)vger.kernel.org> # 6.9.x
Signed-off-by: Yeoreum Yun <yeoreum.yun(a)arm.com>
---
There is another way to solve this problem by setting
test/data_prot with _PAGE_DEFAULT which doesn't include PTE_MAYBE_NG
with constanst check in create_init_idmap() to be free from
arm64_use_ng_mappings. but i think it would be better to change
arm64_use_ng_mappings as ro_after_init because it doesn't change after
init phase and solve this problem too.
---
arch/arm64/kernel/cpufeature.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d2104a1e7843..967ffb1cbd52 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -114,7 +114,7 @@ static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NC
DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS);
-bool arm64_use_ng_mappings = false;
+bool arm64_use_ng_mappings __ro_after_init = false;
EXPORT_SYMBOL(arm64_use_ng_mappings);
DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors;
--
LEVI:{C3F47F37-75D8-414A-A8BA-3980EC8A46D7}
The ftrace __mcount_loc buildtime sort does not work properly when the host is
32-bit and the target is 64-bit. sorttable parses the start and stop addresses
by calling strtoul on the buffer holding the hexadecimal string. Since the
target is 64-bit but unsigned long on 32-bit machines is 32 bits, strtoul,
and by extension the start and stop addresses, can max out to 2^32 - 1.
This patch adds a new macro, parse_addr, that corresponds to a strtoul
or strtoull call based on whether you are operating on a 32-bit ELF or
a 64-bit ELF. This way, the correct width is guaranteed whether or not
the host is 32-bit. This should cleanly apply on all of the 6.x stable
kernels.
Manually verified that the __mcount_loc section is sorted by parsing the
ELF and verified tests corresponding to CONFIG_FTRACE_SORT_STARTUP_TEST
for kernels built on a 32-bit and a 64-bit host.
Signed-off-by: Sahil Gupta <s.gupta(a)arista.com>
---
scripts/sorttable.h | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/scripts/sorttable.h b/scripts/sorttable.h
index 7bd0184380d3..9ed7acca9f30 100644
--- a/scripts/sorttable.h
+++ b/scripts/sorttable.h
@@ -40,6 +40,7 @@
#undef uint_t
#undef _r
#undef _w
+#undef parse_addr
#ifdef SORTTABLE_64
# define extable_ent_size 16
@@ -65,6 +66,7 @@
# define uint_t uint64_t
# define _r r8
# define _w w8
+# define parse_addr(buf) strtoull(buf, NULL, 16)
#else
# define extable_ent_size 8
# define compare_extable compare_extable_32
@@ -89,6 +91,7 @@
# define uint_t uint32_t
# define _r r
# define _w w
+# define parse_addr(buf) strtoul(buf, NULL, 16)
#endif
#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED)
@@ -246,13 +249,13 @@ static void get_mcount_loc(uint_t *_start, uint_t *_stop)
len = strlen(start_buff);
start_buff[len - 1] = '\0';
}
- *_start = strtoul(start_buff, NULL, 16);
+ *_start = parse_addr(start_buff);
while (fgets(stop_buff, sizeof(stop_buff), file_stop) != NULL) {
len = strlen(stop_buff);
stop_buff[len - 1] = '\0';
}
- *_stop = strtoul(stop_buff, NULL, 16);
+ *_stop = parse_addr(stop_buff);
pclose(file_start);
pclose(file_stop);
--
2.47.0
When a CL/CSD job times out, we check if the GPU has made any progress
since the last timeout. If so, instead of resetting the hardware, we skip
the reset and let the timer get rearmed. This gives long-running jobs a
chance to complete.
However, when `timedout_job()` is called, the job in question is removed
from the pending list, which means it won't be automatically freed through
`free_job()`. Consequently, when we skip the reset and keep the job
running, the job won't be freed when it finally completes.
This situation leads to a memory leak, as exposed in [1] and [2].
Similarly to commit 704d3d60fec4 ("drm/etnaviv: don't block scheduler when
GPU is still active"), this patch ensures the job is put back on the
pending list when extending the timeout.
Cc: stable(a)vger.kernel.org # 6.0
Reported-by: Daivik Bhatia <dtgs1208(a)gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12227 [1]
Closes: https://github.com/raspberrypi/linux/issues/6817 [2]
Signed-off-by: Maíra Canal <mcanal(a)igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral(a)igalia.com>
---
Hi,
While we typically strive to avoid exposing the scheduler's internals
within the drivers, I'm proposing this fix as an interim solution. I'm aware
that a comprehensive fix will need some adjustments on the DRM sched side,
and I plan to address that soon.
However, it would be hard to justify the backport of such patches to the
stable branches and this issue is affecting users in the moment.
Therefore, I'd like to push this patch to drm-misc-fixes in order to
address this leak as soon as possible, while working in a more generic
solution.
Best Regards,
- Maíra
v1 -> v2: https://lore.kernel.org/dri-devel/20250427202907.94415-2-mcanal@igalia.com/
- Protect the pending list by using its spinlock.
- Add the URL of another downstream issue related to this patch.
drivers/gpu/drm/v3d/v3d_sched.c | 28 +++++++++++++++++++++-------
1 file changed, 21 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index b3be08b0ca91..c612363181df 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -734,11 +734,16 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
return DRM_GPU_SCHED_STAT_NOMINAL;
}
-/* If the current address or return address have changed, then the GPU
- * has probably made progress and we should delay the reset. This
- * could fail if the GPU got in an infinite loop in the CL, but that
- * is pretty unlikely outside of an i-g-t testcase.
- */
+static void
+v3d_sched_skip_reset(struct drm_sched_job *sched_job)
+{
+ struct drm_gpu_scheduler *sched = sched_job->sched;
+
+ spin_lock(&sched->job_list_lock);
+ list_add(&sched_job->list, &sched->pending_list);
+ spin_unlock(&sched->job_list_lock);
+}
+
static enum drm_gpu_sched_stat
v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
u32 *timedout_ctca, u32 *timedout_ctra)
@@ -748,9 +753,16 @@ v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q));
u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));
+ /* If the current address or return address have changed, then the GPU
+ * has probably made progress and we should delay the reset. This
+ * could fail if the GPU got in an infinite loop in the CL, but that
+ * is pretty unlikely outside of an i-g-t testcase.
+ */
if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
*timedout_ctca = ctca;
*timedout_ctra = ctra;
+
+ v3d_sched_skip_reset(sched_job);
return DRM_GPU_SCHED_STAT_NOMINAL;
}
@@ -790,11 +802,13 @@ v3d_csd_job_timedout(struct drm_sched_job *sched_job)
struct v3d_dev *v3d = job->base.v3d;
u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4(v3d->ver));
- /* If we've made progress, skip reset and let the timer get
- * rearmed.
+ /* If we've made progress, skip reset, add the job to the pending
+ * list, and let the timer get rearmed.
*/
if (job->timedout_batches != batches) {
job->timedout_batches = batches;
+
+ v3d_sched_skip_reset(sched_job);
return DRM_GPU_SCHED_STAT_NOMINAL;
}
--
2.49.0
From: Ashish Kalra <ashish.kalra(a)amd.com>
When the shared pages are being made private during kdump preparation
there are additional checks to handle shared GHCB pages.
These additional checks include handling the case of GHCB page being
contained within a huge page.
While handling the case of GHCB page contained within a huge page
any shared page just below the GHCB page gets skipped from being
transitioned back to private during kdump preparation.
This subsequently causes a 0x404 #VC exception when this skipped
shared page is accessed later while dumping guest memory during
vmcore generation via kdump.
Split the initial check for skipping the GHCB page into the page
being skipped fully containing the GHCB and GHCB being contained
within a huge page. Also ensure that the skipped huge page
containing the GHCB page is transitioned back to private later
when changing GHCBs to private at end of kdump preparation.
Reviewed-by: Tom Lendacky <thomas.lendacky(a)amd.com>
Cc: stable(a)vger.kernel.org
Fixes: 3074152e56c9 ("x86/sev: Convert shared memory back to private on kexec")
Signed-off-by: Ashish Kalra <ashish.kalra(a)amd.com>
---
arch/x86/coco/sev/core.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index d35fec7b164a..1f53383bd1fa 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -1019,7 +1019,13 @@ static void unshare_all_memory(void)
data = per_cpu(runtime_data, cpu);
ghcb = (unsigned long)&data->ghcb_page;
- if (addr <= ghcb && ghcb <= addr + size) {
+ /* Handle the case of a huge page containing the GHCB page */
+ if (level == PG_LEVEL_4K && addr == ghcb) {
+ skipped_addr = true;
+ break;
+ }
+ if (level > PG_LEVEL_4K && addr <= ghcb &&
+ ghcb < addr + size) {
skipped_addr = true;
break;
}
@@ -1131,8 +1137,8 @@ static void shutdown_all_aps(void)
void snp_kexec_finish(void)
{
struct sev_es_runtime_data *data;
+ unsigned long size, mask;
unsigned int level, cpu;
- unsigned long size;
struct ghcb *ghcb;
pte_t *pte;
@@ -1160,6 +1166,10 @@ void snp_kexec_finish(void)
ghcb = &data->ghcb_page;
pte = lookup_address((unsigned long)ghcb, &level);
size = page_level_size(level);
+ mask = page_level_mask(level);
+ /* Handle the case of a huge page containing the GHCB page */
+ if (level > PG_LEVEL_4K)
+ ghcb = (struct ghcb *)((unsigned long)ghcb & mask);
set_pte_enc(pte, level, (void *)ghcb);
snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE));
}
--
2.34.1
The following commit has been merged into the irq/urgent branch of tip:
Commit-ID: 38a05c0b87833f5b188ae43b428b1f792df2b384
Gitweb: https://git.kernel.org/tip/38a05c0b87833f5b188ae43b428b1f792df2b384
Author: Stephan Gerhold <stephan.gerhold(a)linaro.org>
AuthorDate: Fri, 02 May 2025 13:22:28 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Fri, 02 May 2025 21:07:02 +02:00
irqchip/qcom-mpm: Prevent crash when trying to handle non-wake GPIOs
On Qualcomm chipsets not all GPIOs are wakeup capable. Those GPIOs do not
have a corresponding MPM pin and should not be handled inside the MPM
driver. The IRQ domain hierarchy is always applied, so it's required to
explicitly disconnect the hierarchy for those. The pinctrl-msm driver marks
these with GPIO_NO_WAKE_IRQ. qcom-pdc has a check for this, but
irq-qcom-mpm is currently missing the check. This is causing crashes when
setting up interrupts for non-wake GPIOs:
root@rb1:~# gpiomon -c gpiochip1 10
irq: IRQ159: trimming hierarchy from :soc@0:interrupt-controller@f200000-1
Unable to handle kernel paging request at virtual address ffff8000a1dc3820
Hardware name: Qualcomm Technologies, Inc. Robotics RB1 (DT)
pc : mpm_set_type+0x80/0xcc
lr : mpm_set_type+0x5c/0xcc
Call trace:
mpm_set_type+0x80/0xcc (P)
qcom_mpm_set_type+0x64/0x158
irq_chip_set_type_parent+0x20/0x38
msm_gpio_irq_set_type+0x50/0x530
__irq_set_trigger+0x60/0x184
__setup_irq+0x304/0x6bc
request_threaded_irq+0xc8/0x19c
edge_detector_setup+0x260/0x364
linereq_create+0x420/0x5a8
gpio_ioctl+0x2d4/0x6c0
Fix this by copying the check for GPIO_NO_WAKE_IRQ from qcom-pdc.c, so that
MPM is removed entirely from the hierarchy for non-wake GPIOs.
Fixes: a6199bb514d8 ("irqchip: Add Qualcomm MPM controller driver")
Reported-by: Alexey Klimov <alexey.klimov(a)linaro.org>
Signed-off-by: Stephan Gerhold <stephan.gerhold(a)linaro.org>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Alexey Klimov <alexey.klimov(a)linaro.org>
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20250502-irq-qcom-mpm-fix-no-wake-v1-1-8a1eafcd…
---
drivers/irqchip/irq-qcom-mpm.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/irqchip/irq-qcom-mpm.c b/drivers/irqchip/irq-qcom-mpm.c
index 7942d8e..f772deb 100644
--- a/drivers/irqchip/irq-qcom-mpm.c
+++ b/drivers/irqchip/irq-qcom-mpm.c
@@ -227,6 +227,9 @@ static int qcom_mpm_alloc(struct irq_domain *domain, unsigned int virq,
if (ret)
return ret;
+ if (pin == GPIO_NO_WAKE_IRQ)
+ return irq_domain_disconnect_hierarchy(domain, virq);
+
ret = irq_domain_set_hwirq_and_chip(domain, virq, pin,
&qcom_mpm_chip, priv);
if (ret)