The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025062041-uplifted-cahoots-6c42@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Tue, 27 May 2025 23:23:53 +0200
Subject: [PATCH] mm/hugetlb: unshare page tables during VMA split, not before
Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split(). This happens before the VMA lock and rmap locks are
taken - which is too early; it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.
Fix it by explicitly calling into the hugetlb unshare logic from
__split_vma() in the same place where THP splitting also happens. At that
point, both the VMA and the rmap(s) are write-locked.
An annoying detail is that we can now call into the helper
hugetlb_unshare_pmds() from two different locking contexts:
1. from hugetlb_split(), holding:
- mmap lock (exclusively)
- VMA lock
- file rmap lock (exclusively)
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
call us with only the mmap lock held (in shared mode), but currently
only runs while holding mmap lock (exclusively) and VMA lock
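For orientation, a rough sketch of the two call paths after this change
(illustrative only, not verbatim kernel code):

  1. __split_vma()
       vma_prepare()                      // VMA and file rmap write-locked
       hugetlb_split(vma, addr)
         hugetlb_unshare_pmds(vma, floor, ceil, false /* take_locks */);

  2. hugetlb_unshare_all_pmds(vma)
       hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
                            ALIGN_DOWN(vma->vm_end, PUD_SIZE),
                            true /* take_locks */);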
Backporting note:
This commit fixes a racy protection that was introduced in commit
b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
commit claimed to fix an issue introduced in 5.13, but it should actually
also go all the way back.
[jannh(a)google.com: v2]
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1…
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1…
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58…
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh(a)google.com>
Cc: Liam Howlett <liam.howlett(a)oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org> [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0598f36931de..42f374e828a2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -279,6 +279,7 @@ bool is_hugetlb_entry_migration(pte_t pte);
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void fixup_hugetlb_reservations(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -476,6 +477,8 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
{
}
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
#endif /* !CONFIG_HUGETLB_PAGE */
#ifndef pgd_write
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f0b1d53079f9..7ba020d489d4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -121,7 +121,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
@@ -5426,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7885,9 +7899,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
spin_unlock_irq(&hugetlb_lock);
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7911,8 +7932,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7922,8 +7947,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7938,7 +7965,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
}
/*
diff --git a/mm/vma.c b/mm/vma.c
index 1c6595f282e5..7ebc9eb608f4 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -539,7 +539,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 441feb21aa5a..4505b1c31be1 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -932,6 +932,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
(void)next;
}
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
static inline void vma_iter_free(struct vma_iterator *vmi)
{
mas_destroy(&vmi->mas);
The i2c_dw_xfer_init() function requires msgs and msg_write_idx from the
dev context to be initialized.
amd_i2c_dw_xfer_quirk() inits msgs and msgs_num, but not msg_write_idx.
This could allow an out-of-bounds access of the msgs array.
Initialize msg_write_idx before calling i2c_dw_xfer_init().
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Fixes: 17631e8ca2d3 ("i2c: designware: Add driver support for AMD NAVI GPU")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
---
drivers/i2c/busses/i2c-designware-master.c | 1 +
1 file changed, 1 insertion(+)
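For reference, a simplified sketch of where the stale index bites (not the
exact driver code; i2c_dw_xfer_init() programs the target address from the
message array):

  /* dev->msgs points at the quirk's msgs[] of length dev->msgs_num */
  ic_tar = dev->msgs[dev->msg_write_idx].addr;  /* out of bounds if
                                                   msg_write_idx is stale */

Resetting dev->msg_write_idx to 0 before i2c_dw_xfer_init() keeps the index
inside the array the quirk just installed.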
diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c
index c5394229b77f..40aa5114bf8c 100644
--- a/drivers/i2c/busses/i2c-designware-master.c
+++ b/drivers/i2c/busses/i2c-designware-master.c
@@ -363,6 +363,7 @@ static int amd_i2c_dw_xfer_quirk(struct i2c_adapter *adap, struct i2c_msg *msgs,
dev->msgs = msgs;
dev->msgs_num = num_msgs;
+ dev->msg_write_idx = 0;
i2c_dw_xfer_init(dev);
/* Initiate messages read/write transaction */
--
2.49.0
Hi all,
The first patch is a prerequisite, needed to avoid NULL pointer
dereferences in error paths. Two fixes then follow.
Signed-off-by: Kurt Borja <kuurtb(a)gmail.com>
---
Changes in v3:
- Add Cc stable tags to all patches
- Rebase on top of the 'fixes' branch
- Link to v2: https://lore.kernel.org/r/20250628-lmi-fix-v2-0-c530e1c959d7@gmail.com
Changes in v2:
[PATCH 02]
- Remove kobject_del() and commit message remark. It turns out it's
optional to call this (my bad)
- Leave only one fixes tag. The other two are not necessary.
- Link to v1: https://lore.kernel.org/r/20250628-lmi-fix-v1-0-c6eec9aa3ca7@gmail.com
---
Kurt Borja (3):
platform/x86: think-lmi: Create ksets consecutively
platform/x86: think-lmi: Fix kobject cleanup
platform/x86: think-lmi: Fix sysfs group cleanup
drivers/platform/x86/think-lmi.c | 90 ++++++++++++++--------------------------
1 file changed, 30 insertions(+), 60 deletions(-)
---
base-commit: 173bbec6693f3f3f00dac144f3aa0cd62fb60d33
change-id: 20250628-lmi-fix-98143b10d9fd
--
~ Kurt
Commit 88c02b3f79a6 ("s390/sha3: Support sha3 performance enhancements")
added the field s390_sha_ctx::first_message_part and made it be used by
s390_sha_update_blocks(). At the time, s390_sha_update_blocks() was
used by all the s390 SHA-1, SHA-2, and SHA-3 algorithms. However, only
the initialization functions for SHA-3 were updated, leaving SHA-1 and
SHA-2 using first_message_part uninitialized.
This could cause e.g. CPACF_KIMD_SHA_512 | CPACF_KIMD_NIP to be used
instead of just CPACF_KIMD_SHA_512. It's unclear why this didn't cause a
problem earlier; this bug was found only when UBSAN detected the
uninitialized boolean. Perhaps the CPU ignores CPACF_KIMD_NIP for SHA-1
and SHA-2. Regardless, let's fix this. For now just initialize to
false, i.e. don't try to "optimize" the SHA state initialization.
Note: in 6.16, we need to patch SHA-1, SHA-384, and SHA-512. In 6.15
and earlier, we'll also need to patch SHA-224 and SHA-256, as they
hadn't yet been librarified (which incidentally fixed this bug).
Fixes: 88c02b3f79a6 ("s390/sha3: Support sha3 performance enhancements")
Cc: stable(a)vger.kernel.org
Reported-by: Ingo Franzki <ifranzki(a)linux.ibm.com>
Closes: https://lore.kernel.org/r/12740696-595c-4604-873e-aefe8b405fbf@linux.ibm.com
Signed-off-by: Eric Biggers <ebiggers(a)kernel.org>
---
This is targeting 6.16. I'd prefer to take this through
libcrypto-fixes, since the librarification work is also touching this
area. But let me know if there's a preference for the crypto tree or
the s390 tree instead.
arch/s390/crypto/sha1_s390.c | 1 +
arch/s390/crypto/sha512_s390.c | 2 ++
2 files changed, 3 insertions(+)
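For context, a simplified sketch of how the flag is consumed in the update
path (not the exact s390 code):

  unsigned int fc = sctx->func;          /* e.g. CPACF_KIMD_SHA_512 */

  if (sctx->first_message_part)          /* uninitialized before this fix */
          fc |= CPACF_KIMD_NIP;          /* "new message part" indication */

  cpacf_kimd(fc, &sctx->state, data, len);

With first_message_part left uninitialized, a garbage true value adds
CPACF_KIMD_NIP for SHA-1/SHA-2; the uninitialized boolean is what UBSAN
flagged.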
diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c
index d229cbd2ba229..73672e76a88f9 100644
--- a/arch/s390/crypto/sha1_s390.c
+++ b/arch/s390/crypto/sha1_s390.c
@@ -36,10 +36,11 @@ static int s390_sha1_init(struct shash_desc *desc)
sctx->state[2] = SHA1_H2;
sctx->state[3] = SHA1_H3;
sctx->state[4] = SHA1_H4;
sctx->count = 0;
sctx->func = CPACF_KIMD_SHA_1;
+ sctx->first_message_part = false;
return 0;
}
static int s390_sha1_export(struct shash_desc *desc, void *out)
diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c
index 33711a29618c3..e9e112025ff22 100644
--- a/arch/s390/crypto/sha512_s390.c
+++ b/arch/s390/crypto/sha512_s390.c
@@ -30,10 +30,11 @@ static int sha512_init(struct shash_desc *desc)
ctx->sha512.state[6] = SHA512_H6;
ctx->sha512.state[7] = SHA512_H7;
ctx->count = 0;
ctx->sha512.count_hi = 0;
ctx->func = CPACF_KIMD_SHA_512;
+ ctx->first_message_part = false;
return 0;
}
static int sha512_export(struct shash_desc *desc, void *out)
@@ -95,10 +96,11 @@ static int sha384_init(struct shash_desc *desc)
ctx->sha512.state[6] = SHA384_H6;
ctx->sha512.state[7] = SHA384_H7;
ctx->count = 0;
ctx->sha512.count_hi = 0;
ctx->func = CPACF_KIMD_SHA_512;
+ ctx->first_message_part = false;
return 0;
}
static struct shash_alg sha384_alg = {
base-commit: e540341508ce2f6e27810106253d5de194b66750
--
2.50.0
From: Philipp Stanner <pstanner(a)redhat.com>
commit 313ebe47d75558511aa1237b6e35c663b5c0ec6f upstream.
Currently, user array duplications are sometimes done without an
overflow check. Sometimes the checks are done manually; sometimes the
array size is calculated with array_size() and sometimes by calculating
n * size directly in code.
Introduce wrappers for arrays for memdup_user() and vmemdup_user() to
provide a standardized and safe way for duplicating user arrays.
This is intended both for new code and for replacing usage of (v)memdup_user()
in existing code that uses, e.g., n * size to calculate array sizes.
Suggested-by: David Airlie <airlied(a)redhat.com>
Signed-off-by: Philipp Stanner <pstanner(a)redhat.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
Reviewed-by: Zack Rusin <zackr(a)vmware.com>
Signed-off-by: Dave Airlie <airlied(a)redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230920123612.16914-3-pstann…
[ fp: fix small conflict in header includes; take linux/errno.h as well ]
Signed-off-by: Fedor Pchelkin <pchelkin(a)ispras.ru>
---
s390 and x86_64 allmodconfig builds passed.
include/linux/string.h | 41 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
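As a usage illustration (hypothetical caller, not part of this patch), the
conversion in existing code would look roughly like:

  /* before: the multiplication can overflow size_t unnoticed */
  ptr = memdup_user(uptr, num_items * sizeof(struct foo));

  /* after: overflow is rejected with ERR_PTR(-EOVERFLOW) */
  ptr = memdup_array_user(uptr, num_items, sizeof(struct foo));
  if (IS_ERR(ptr))
          return PTR_ERR(ptr);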
diff --git a/include/linux/string.h b/include/linux/string.h
index 0cef345a6e87..cd8c007ad3be 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -5,6 +5,9 @@
#include <linux/compiler.h> /* for inline */
#include <linux/types.h> /* for size_t */
#include <linux/stddef.h> /* for NULL */
+#include <linux/err.h> /* for ERR_PTR() */
+#include <linux/errno.h> /* for E2BIG */
+#include <linux/overflow.h> /* for check_mul_overflow() */
#include <stdarg.h>
#include <uapi/linux/string.h>
@@ -13,6 +16,44 @@ extern void *memdup_user(const void __user *, size_t);
extern void *vmemdup_user(const void __user *, size_t);
extern void *memdup_user_nul(const void __user *, size_t);
+/**
+ * memdup_array_user - duplicate array from user space
+ * @src: source address in user space
+ * @n: number of array members to copy
+ * @size: size of one array member
+ *
+ * Return: an ERR_PTR() on failure. Result is physically
+ * contiguous, to be freed by kfree().
+ */
+static inline void *memdup_array_user(const void __user *src, size_t n, size_t size)
+{
+ size_t nbytes;
+
+ if (check_mul_overflow(n, size, &nbytes))
+ return ERR_PTR(-EOVERFLOW);
+
+ return memdup_user(src, nbytes);
+}
+
+/**
+ * vmemdup_array_user - duplicate array from user space
+ * @src: source address in user space
+ * @n: number of array members to copy
+ * @size: size of one array member
+ *
+ * Return: an ERR_PTR() on failure. Result may be not
+ * physically contiguous. Use kvfree() to free.
+ */
+static inline void *vmemdup_array_user(const void __user *src, size_t n, size_t size)
+{
+ size_t nbytes;
+
+ if (check_mul_overflow(n, size, &nbytes))
+ return ERR_PTR(-EOVERFLOW);
+
+ return vmemdup_user(src, nbytes);
+}
+
/*
* Include machine specific inline routines
*/
--
2.50.0
When calling buf_to_xdp, the len argument is the length of the frame data,
excluding the virtio header's length (vi->hdr_len). We check that len against
xsk_pool_get_rx_frame_size() + vi->hdr_len
to ensure the provided len is not larger than the allocated chunk size. The
additional vi->hdr_len is there because in virtnet_add_recvbuf_xsk we use part
of XDP_PACKET_HEADROOM for the virtio header and ask the vhost to start
placing data at
hard_start + XDP_PACKET_HEADROOM - vi->hdr_len
not
hard_start + XDP_PACKET_HEADROOM
But the first buffer contains the virtio header, so the maximum frame length
in the first buffer can only be
xsk_pool_get_rx_frame_size()
not
xsk_pool_get_rx_frame_size() + vi->hdr_len
as the current check assumes.
This commit adds an argument to buf_to_xdp to differentiate between the first
buffer and the following ones, so the maximum frame length is calculated
correctly.
Cc: stable(a)vger.kernel.org
Reviewed-by: Xuan Zhuo <xuanzhuo(a)linux.alibaba.com>
Fixes: a4e7ba702701 ("virtio_net: xsk: rx: support recv small mode")
Signed-off-by: Bui Quang Minh <minhquangbui99(a)gmail.com>
---
drivers/net/virtio_net.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
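A rough layout sketch of the two cases (illustrative only, offsets not to
scale):

  first buffer:
    hard_start
    |<- XDP_PACKET_HEADROOM - hdr_len ->|<- virtio hdr ->|<- frame data ->|
                                            max frame data =
                                            xsk_pool_get_rx_frame_size()

  following buffers:
    hard_start
    |<- XDP_PACKET_HEADROOM - hdr_len ->|<- frame data ------------------>|
                                            max frame data =
                                            xsk_pool_get_rx_frame_size() + vi->hdr_len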
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index e53ba600605a..1eb237cd5d0b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1127,15 +1127,29 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
}
}
+/* Note that @len is the length of received data without virtio header */
static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi,
- struct receive_queue *rq, void *buf, u32 len)
+ struct receive_queue *rq, void *buf,
+ u32 len, bool first_buf)
{
struct xdp_buff *xdp;
u32 bufsize;
xdp = (struct xdp_buff *)buf;
- bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len;
+ /* In virtnet_add_recvbuf_xsk, we use part of XDP_PACKET_HEADROOM for
+ * virtio header and ask the vhost to fill data from
+ * hard_start + XDP_PACKET_HEADROOM - vi->hdr_len
+ * The first buffer has virtio header so the remaining region for frame
+ * data is
+ * xsk_pool_get_rx_frame_size()
+ * While other buffers than the first one do not have virtio header, so
+ * the maximum frame data's length can be
+ * xsk_pool_get_rx_frame_size() + vi->hdr_len
+ */
+ bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool);
+ if (!first_buf)
+ bufsize += vi->hdr_len;
if (unlikely(len > bufsize)) {
pr_debug("%s: rx error: len %u exceeds truesize %u\n",
@@ -1260,7 +1274,7 @@ static int xsk_append_merge_buffer(struct virtnet_info *vi,
u64_stats_add(&stats->bytes, len);
- xdp = buf_to_xdp(vi, rq, buf, len);
+ xdp = buf_to_xdp(vi, rq, buf, len, false);
if (!xdp)
goto err;
@@ -1358,7 +1372,7 @@ static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queu
u64_stats_add(&stats->bytes, len);
- xdp = buf_to_xdp(vi, rq, buf, len);
+ xdp = buf_to_xdp(vi, rq, buf, len, true);
if (!xdp)
return;
--
2.43.0
When large folios are enabled, the blocks-per-folio calculation in
ext4_da_writepages_trans_blocks() can overflow the journal transaction
limits, causing the writeback path to fail with errors like:
JBD2: kworker/u8:0 wants too many credits credits:416 rsv_credits:21 max:334
This occurs with small block sizes (1KB) and large folios (32MB), where
the calculation results in 32768 blocks per folio. The transaction credit
calculation then requests more credits than the journal can handle, leading
to the following warning and writeback failure:
WARNING: CPU: 1 PID: 43 at fs/jbd2/transaction.c:334 start_this_handle+0x4c0/0x4e0
EXT4-fs (loop0): ext4_do_writepages: jbd2_start: 9223372036854775807 pages, ino 14; err -28
Call trace leading to the issue:
ext4_do_writepages()
ext4_da_writepages_trans_blocks()
bpp = ext4_journal_blocks_per_folio() // Returns 32768 for 32MB folio with 1KB blocks
ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp)
// With bpp=32768, lblocks=34815, pextents=32768
// Returns credits=415, but with overhead becomes 416 > max 334
ext4_journal_start_with_reserve()
jbd2_journal_start_reserved()
start_this_handle()
// Fails with warning when credits:416 > max:334
The issue was introduced by commit d6bf294773a47 ("ext4/jbd2: convert
jbd2_journal_blocks_per_page() to support large folio"), which added
support for large folios but didn't account for the journal credit limits.
Fix this by capping the blocks-per-folio value at 8192 in the writeback
path. This is the value we'd get with 32MB folios and 4KB blocks, or 8MB
folios with 1KB blocks, which is reasonable and safe for typical journal
configurations.
Fixes: d6bf294773a4 ("ext4/jbd2: convert jbd2_journal_blocks_per_page() to support large folio")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/ext4/inode.c | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+)
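Illustrative arithmetic only, using the numbers from the commit message:

  folio_size = 32 MB, block_size = 1 KB
  bpp        = 32 MB / 1 KB = 32768 blocks per folio
  credits    = ext4_meta_trans_blocks(inode, 2048 + 32768 - 1, 32768)
            ~= 415, which becomes 416 with overhead  >  journal max of 334

  with the cap:
  bpp        = min(32768, MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK) = 8192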
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be9a4cba35fd5..860e59a176c97 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2070,6 +2070,14 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
*/
#define MAX_WRITEPAGES_EXTENT_LEN 2048
+/*
+ * Maximum blocks per folio to avoid JBD2 credit overflow.
+ * This is the value we'd get with 32MB folios and 4KB blocks,
+ * or 8MB folios with 1KB blocks, which is reasonable and safe
+ * for typical journal configurations.
+ */
+#define MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK 8192
+
/*
* mpage_add_bh_to_extent - try to add bh to extent of blocks to map
*
@@ -2481,6 +2489,18 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
int bpp = ext4_journal_blocks_per_folio(inode);
+ /*
+ * With large folios, blocks per folio can get excessively large,
+ * especially with small block sizes. For example, with 32MB folios
+ * (order 11) and 1KB blocks, we get 32768 blocks per folio. This
+ * leads to credit requests that overflow the journal's transaction
+ * limit.
+ *
+ * Limit the value to avoid excessive credit requests.
+ */
+ if (bpp > MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK)
+ bpp = MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK;
+
return ext4_meta_trans_blocks(inode,
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
@@ -2559,6 +2579,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
handle_t *handle = NULL;
int bpp = ext4_journal_blocks_per_folio(mpd->inode);
+ /*
+ * With large folios, blocks per folio can get excessively large,
+ * especially with small block sizes. Cap it to avoid credit overflow.
+ */
+ if (bpp > MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK)
+ bpp = MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK;
+
if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
@@ -6179,6 +6206,13 @@ int ext4_writepage_trans_blocks(struct inode *inode)
int bpp = ext4_journal_blocks_per_folio(inode);
int ret;
+ /*
+ * With large folios, blocks per folio can get excessively large,
+ * especially with small block sizes. Cap it to avoid credit overflow.
+ */
+ if (bpp > MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK)
+ bpp = MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK;
+
ret = ext4_meta_trans_blocks(inode, bpp, bpp);
/* Account for data blocks for journalled mode */
--
2.39.5
Since the commit 96336ec70264 ("PCI: Perform reset_resource() and build
fail list in sync") the failed list is always built and returned to let
the caller decide what to do with the failures. The caller may want to
retry resource fitting and assignment and before that can happen, the
resources should be restored to their original state (a reset
effectively clears the struct resource), which requires returning them
on the failed list so that the original state remains stored in the
associated struct pci_dev_resource.
Resource resizing is different from the ordinary resource fitting and
assignment in that it only considers part of the resources. This means
failures for other resource types are not relevant at all and should be
ignored. As resize doesn't unassign such unrelated resources, a resource of
another type ending up on the failed list implies its assignment must have
failed before the resize too. The check in pci_reassign_bridge_resources()
that decides whether the whole assignment succeeded, however, is based on
list emptiness, which causes false negatives when the failed list contains
resources of an unrelated type.
If the failed list is not empty, call pci_required_resource_failed()
and extend it to be able to filter on specific resource types too (if
provided).
Calling pci_required_resource_failed() at this point is slightly
problematic because the resource itself is reset when the failed list
is constructed in __assign_resources_sorted(). As a result,
pci_resource_is_optional() does not have access to the original
resource flags. This could be worked around by restoring and
re-resetting the resource around the call to pci_resource_is_optional();
however, this shouldn't cause issues, as resource resizing is meant for
64-bit prefetchable resources according to Christian König (see the Link,
which unfortunately doesn't point directly to Christian's reply because lore
didn't store that email at all).
Fixes: 96336ec70264 ("PCI: Perform reset_resource() and build fail list in sync")
Link: https://lore.kernel.org/all/c5d1b5d8-8669-5572-75a7-0b480f581ac1@linux.inte…
Reported-by: D Scott Phillips <scott(a)os.amperecomputing.com>
Tested-by: D Scott Phillips <scott(a)os.amperecomputing.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Reviewed-by: D Scott Phillips <scott(a)os.amperecomputing.com>
Cc: Christian König <christian.koenig(a)amd.com>
Cc: <stable(a)vger.kernel.org>
---
drivers/pci/setup-bus.c | 26 ++++++++++++++++++--------
1 file changed, 18 insertions(+), 8 deletions(-)
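For illustration (example values, not taken from the patch): resizing a
64-bit prefetchable window passes

  type = IORESOURCE_MEM | IORESOURCE_PREFETCH | IORESOURCE_MEM_64;

so a failed-list entry left over from an unassigned I/O BAR, with

  (fail_res->flags & PCI_RES_TYPE_MASK) == IORESOURCE_IO,

fails the type comparison in pci_required_resource_failed() and is skipped,
and the resize no longer returns -ENOSPC because of it.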
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 24863d8d0053..dbbd80d78d3d 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -28,6 +28,10 @@
#include <linux/acpi.h>
#include "pci.h"
+#define PCI_RES_TYPE_MASK \
+ (IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH |\
+ IORESOURCE_MEM_64)
+
unsigned int pci_flags;
EXPORT_SYMBOL_GPL(pci_flags);
@@ -384,13 +388,19 @@ static bool pci_need_to_release(unsigned long mask, struct resource *res)
}
/* Return: @true if assignment of a required resource failed. */
-static bool pci_required_resource_failed(struct list_head *fail_head)
+static bool pci_required_resource_failed(struct list_head *fail_head,
+ unsigned long type)
{
struct pci_dev_resource *fail_res;
+ type &= PCI_RES_TYPE_MASK;
+
list_for_each_entry(fail_res, fail_head, list) {
int idx = pci_resource_num(fail_res->dev, fail_res->res);
+ if (type && (fail_res->flags & PCI_RES_TYPE_MASK) != type)
+ continue;
+
if (!pci_resource_is_optional(fail_res->dev, idx))
return true;
}
@@ -504,7 +514,7 @@ static void __assign_resources_sorted(struct list_head *head,
}
/* Without realloc_head and only optional fails, nothing more to do. */
- if (!pci_required_resource_failed(&local_fail_head) &&
+ if (!pci_required_resource_failed(&local_fail_head, 0) &&
list_empty(realloc_head)) {
list_for_each_entry(save_res, &save_head, list) {
struct resource *res = save_res->res;
@@ -1708,10 +1718,6 @@ static void __pci_bridge_assign_resources(const struct pci_dev *bridge,
}
}
-#define PCI_RES_TYPE_MASK \
- (IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH |\
- IORESOURCE_MEM_64)
-
static void pci_bridge_release_resources(struct pci_bus *bus,
unsigned long type)
{
@@ -2449,8 +2455,12 @@ int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type)
free_list(&added);
if (!list_empty(&failed)) {
- ret = -ENOSPC;
- goto cleanup;
+ if (pci_required_resource_failed(&failed, type)) {
+ ret = -ENOSPC;
+ goto cleanup;
+ }
+ /* Only resources with unrelated types failed (again) */
+ free_list(&failed);
}
list_for_each_entry(dev_res, &saved, list) {
--
2.39.5