When calling buf_to_xdp, the len argument is the length of the frame
data, excluding the virtio header's length (vi->hdr_len). We check that
len against
    xsk_pool_get_rx_frame_size() + vi->hdr_len
to ensure the provided len is not larger than the allocated chunk size.
The additional vi->hdr_len is there because, in virtnet_add_recvbuf_xsk,
we use part of XDP_PACKET_HEADROOM for the virtio header and ask the
vhost to start placing data at
    hard_start + XDP_PACKET_HEADROOM - vi->hdr_len
not at
    hard_start + XDP_PACKET_HEADROOM
However, the first buffer carries the virtio header, so the maximum
frame length in the first buffer can only be
    xsk_pool_get_rx_frame_size()
not
    xsk_pool_get_rx_frame_size() + vi->hdr_len
as the current check assumes.
This commit adds an additional argument to buf_to_xdp to differentiate
between the first buffer and the subsequent ones so that the maximum
frame length is calculated correctly.
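For illustration, a minimal standalone sketch of the corrected bounds
follows; the frame size and header length values are assumptions chosen
only for this example, not taken from the code:
  #include <assert.h>
  #include <stdio.h>

  int main(void)
  {
      /* Assumed values: frame_size stands in for
       * xsk_pool_get_rx_frame_size(), hdr_len for vi->hdr_len. */
      unsigned int frame_size = 3840;
      unsigned int hdr_len = 12;

      /* First buffer: the virtio header sits in front of the frame
       * data, so the data itself can be at most frame_size bytes. */
      unsigned int max_first = frame_size;

      /* Other buffers: no virtio header, data starts hdr_len bytes
       * earlier, so up to frame_size + hdr_len bytes of data fit. */
      unsigned int max_other = frame_size + hdr_len;

      assert(max_other == max_first + hdr_len);
      printf("first buffer max %u, other buffers max %u\n",
             max_first, max_other);
      return 0;
  }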
Cc: stable@vger.kernel.org
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Fixes: a4e7ba702701 ("virtio_net: xsk: rx: support recv small mode")
Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
---
drivers/net/virtio_net.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index e53ba600605a..1eb237cd5d0b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1127,15 +1127,29 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
}
}
+/* Note that @len is the length of received data without virtio header */
static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi,
- struct receive_queue *rq, void *buf, u32 len)
+ struct receive_queue *rq, void *buf,
+ u32 len, bool first_buf)
{
struct xdp_buff *xdp;
u32 bufsize;
xdp = (struct xdp_buff *)buf;
- bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len;
+ /* In virtnet_add_recvbuf_xsk, we use part of XDP_PACKET_HEADROOM for
+ * virtio header and ask the vhost to fill data from
+ * hard_start + XDP_PACKET_HEADROOM - vi->hdr_len
+ * The first buffer has the virtio header, so the remaining region for
+ * frame data is
+ * xsk_pool_get_rx_frame_size()
+ * Buffers other than the first one do not have the virtio header, so
+ * the maximum frame data length can be
+ * xsk_pool_get_rx_frame_size() + vi->hdr_len
+ */
+ bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool);
+ if (!first_buf)
+ bufsize += vi->hdr_len;
if (unlikely(len > bufsize)) {
pr_debug("%s: rx error: len %u exceeds truesize %u\n",
@@ -1260,7 +1274,7 @@ static int xsk_append_merge_buffer(struct virtnet_info *vi,
u64_stats_add(&stats->bytes, len);
- xdp = buf_to_xdp(vi, rq, buf, len);
+ xdp = buf_to_xdp(vi, rq, buf, len, false);
if (!xdp)
goto err;
@@ -1358,7 +1372,7 @@ static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queu
u64_stats_add(&stats->bytes, len);
- xdp = buf_to_xdp(vi, rq, buf, len);
+ xdp = buf_to_xdp(vi, rq, buf, len, true);
if (!xdp)
return;
--
2.43.0
When large folios are enabled, the blocks-per-folio calculation in
ext4_da_writepages_trans_blocks() can exceed the journal's transaction
credit limit, causing the writeback path to fail with errors like:
JBD2: kworker/u8:0 wants too many credits credits:416 rsv_credits:21 max:334
This occurs with small block sizes (1KB) and large folios (32MB), where
the calculation results in 32768 blocks per folio. The transaction credit
calculation then requests more credits than the journal can handle, leading
to the following warning and writeback failure:
WARNING: CPU: 1 PID: 43 at fs/jbd2/transaction.c:334 start_this_handle+0x4c0/0x4e0
EXT4-fs (loop0): ext4_do_writepages: jbd2_start: 9223372036854775807 pages, ino 14; err -28
Call trace leading to the issue:
ext4_do_writepages()
ext4_da_writepages_trans_blocks()
bpp = ext4_journal_blocks_per_folio() // Returns 32768 for 32MB folio with 1KB blocks
ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp)
// With bpp=32768, lblocks=34815, pextents=32768
// Returns credits=415, but with overhead becomes 416 > max 334
ext4_journal_start_with_reserve()
jbd2_journal_start_reserved()
start_this_handle()
// Fails with warning when credits:416 > max:334
The issue was introduced by commit d6bf294773a47 ("ext4/jbd2: convert
jbd2_journal_blocks_per_page() to support large folio"), which added
support for large folios but didn't account for the journal credit limits.
Fix this by capping the blocks-per-folio value at 8192 in the writeback
path. This is the value we'd get with 32MB folios and 4KB blocks, or 8MB
folios with 1KB blocks, which is reasonable and safe for typical journal
configurations.
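For a rough standalone illustration of the arithmetic (the folio and
block sizes mirror the report above; this is plain userspace C, not
ext4 code):
  #include <stdio.h>

  #define MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK 8192

  int main(void)
  {
      unsigned long folio_size = 32UL << 20;        /* 32MB folio */
      unsigned long block_size = 1024;              /* 1KB blocks */
      unsigned long bpp = folio_size / block_size;  /* 32768 */

      /* Uncapped, bpp drives the credit request past the journal's
       * limit (416 > 334 in the report above); cap it for writeback. */
      if (bpp > MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK)
          bpp = MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK;

      printf("blocks per folio used for credit estimate: %lu\n", bpp);
      return 0;
  }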
Fixes: d6bf294773a4 ("ext4/jbd2: convert jbd2_journal_blocks_per_page() to support large folio")
Cc: stable@vger.kernel.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
fs/ext4/inode.c | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be9a4cba35fd5..860e59a176c97 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2070,6 +2070,14 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
*/
#define MAX_WRITEPAGES_EXTENT_LEN 2048
+/*
+ * Maximum blocks per folio to avoid JBD2 credit overflow.
+ * This is the value we'd get with 32MB folios and 4KB blocks,
+ * or 8MB folios with 1KB blocks, which is reasonable and safe
+ * for typical journal configurations.
+ */
+#define MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK 8192
+
/*
* mpage_add_bh_to_extent - try to add bh to extent of blocks to map
*
@@ -2481,6 +2489,18 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
int bpp = ext4_journal_blocks_per_folio(inode);
+ /*
+ * With large folios, blocks per folio can get excessively large,
+ * especially with small block sizes. For example, with 32MB folios
+ * (order 11) and 1KB blocks, we get 32768 blocks per folio. This
+ * leads to credit requests that overflow the journal's transaction
+ * limit.
+ *
+ * Limit the value to avoid excessive credit requests.
+ */
+ if (bpp > MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK)
+ bpp = MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK;
+
return ext4_meta_trans_blocks(inode,
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
@@ -2559,6 +2579,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
handle_t *handle = NULL;
int bpp = ext4_journal_blocks_per_folio(mpd->inode);
+ /*
+ * With large folios, blocks per folio can get excessively large,
+ * especially with small block sizes. Cap it to avoid credit overflow.
+ */
+ if (bpp > MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK)
+ bpp = MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK;
+
if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
@@ -6179,6 +6206,13 @@ int ext4_writepage_trans_blocks(struct inode *inode)
int bpp = ext4_journal_blocks_per_folio(inode);
int ret;
+ /*
+ * With large folios, blocks per folio can get excessively large,
+ * especially with small block sizes. Cap it to avoid credit overflow.
+ */
+ if (bpp > MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK)
+ bpp = MAX_BLOCKS_PER_FOLIO_FOR_WRITEBACK;
+
ret = ext4_meta_trans_blocks(inode, bpp, bpp);
/* Account for data blocks for journalled mode */
--
2.39.5
Since commit 96336ec70264 ("PCI: Perform reset_resource() and build
fail list in sync"), the failed list is always built and returned to let
the caller decide what to do with the failures. The caller may want to
retry resource fitting and assignment, and before that can happen, the
resources should be restored to their original state (a reset
effectively clears the struct resource), which requires returning them
on the failed list so that the original state remains stored in the
associated struct pci_dev_resource.
Resource resizing is different from the ordinary resource fitting and
assignment in that it only considers part of the resources. This means
failures for other resource types are not relevant at all and should be
ignored. As resize doesn't unassign such unrelated resources, an
unrelated resource ending up on the failed list implies its assignment
must have failed before the resize too. The check in
pci_reassign_bridge_resources() that decides whether the whole
assignment succeeded, however, is based on list emptiness, which causes
false negatives when the failed list contains resources of an unrelated
type.
If the failed list is not empty, call pci_required_resource_failed()
and extend it to be able to filter on specific resource types too (if
provided).
Calling pci_required_resource_failed() at this point is slightly
problematic because the resource itself is reset when the failed list
is constructed in __assign_resources_sorted(). As a result,
pci_resource_is_optional() does not have access to the original
resource flags. This could be worked around by restoring and
re-resetting the resource around the call to pci_resource_is_optional();
however, it shouldn't cause issues as resource resizing is meant for
64-bit prefetchable resources according to Christian König (see the
Link which unfortunately doesn't point directly to Christian's reply
because lore didn't store that email at all).
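For illustration, the filtering idea can be sketched standalone as
below; the flag values, struct and function names are simplified
stand-ins, not the kernel's IORESOURCE_* encodings or the actual
pci_required_resource_failed() signature:
  #include <stdbool.h>
  #include <stdio.h>

  #define RES_IO        0x1
  #define RES_MEM       0x2
  #define RES_PREFETCH  0x4
  #define RES_MEM_64    0x8
  #define RES_TYPE_MASK (RES_IO | RES_MEM | RES_PREFETCH | RES_MEM_64)

  struct fail_entry {
      unsigned long flags;
      bool optional;
  };

  /* Report a failure only for required resources of the requested type
   * (type == 0 means "any type", as in the non-resize caller). */
  static bool required_failed(const struct fail_entry *fail, int n,
                              unsigned long type)
  {
      type &= RES_TYPE_MASK;
      for (int i = 0; i < n; i++) {
          if (type && (fail[i].flags & RES_TYPE_MASK) != type)
              continue;   /* unrelated type: ignore for resize */
          if (!fail[i].optional)
              return true;
      }
      return false;
  }

  int main(void)
  {
      struct fail_entry failed[] = {
          { RES_IO, false },                              /* unrelated */
          { RES_MEM | RES_PREFETCH | RES_MEM_64, true },  /* optional */
      };

      /* Resizing 64-bit prefetchable windows: prints 0, no hard failure. */
      printf("%d\n", required_failed(failed, 2,
                     RES_MEM | RES_PREFETCH | RES_MEM_64));
      return 0;
  }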
Fixes: 96336ec70264 ("PCI: Perform reset_resource() and build fail list in sync")
Link: https://lore.kernel.org/all/c5d1b5d8-8669-5572-75a7-0b480f581ac1@linux.inte…
Reported-by: D Scott Phillips <scott@os.amperecomputing.com>
Tested-by: D Scott Phillips <scott@os.amperecomputing.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: D Scott Phillips <scott@os.amperecomputing.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: <stable@vger.kernel.org>
---
drivers/pci/setup-bus.c | 26 ++++++++++++++++++--------
1 file changed, 18 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 24863d8d0053..dbbd80d78d3d 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -28,6 +28,10 @@
#include <linux/acpi.h>
#include "pci.h"
+#define PCI_RES_TYPE_MASK \
+ (IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH |\
+ IORESOURCE_MEM_64)
+
unsigned int pci_flags;
EXPORT_SYMBOL_GPL(pci_flags);
@@ -384,13 +388,19 @@ static bool pci_need_to_release(unsigned long mask, struct resource *res)
}
/* Return: @true if assignment of a required resource failed. */
-static bool pci_required_resource_failed(struct list_head *fail_head)
+static bool pci_required_resource_failed(struct list_head *fail_head,
+ unsigned long type)
{
struct pci_dev_resource *fail_res;
+ type &= PCI_RES_TYPE_MASK;
+
list_for_each_entry(fail_res, fail_head, list) {
int idx = pci_resource_num(fail_res->dev, fail_res->res);
+ if (type && (fail_res->flags & PCI_RES_TYPE_MASK) != type)
+ continue;
+
if (!pci_resource_is_optional(fail_res->dev, idx))
return true;
}
@@ -504,7 +514,7 @@ static void __assign_resources_sorted(struct list_head *head,
}
/* Without realloc_head and only optional fails, nothing more to do. */
- if (!pci_required_resource_failed(&local_fail_head) &&
+ if (!pci_required_resource_failed(&local_fail_head, 0) &&
list_empty(realloc_head)) {
list_for_each_entry(save_res, &save_head, list) {
struct resource *res = save_res->res;
@@ -1708,10 +1718,6 @@ static void __pci_bridge_assign_resources(const struct pci_dev *bridge,
}
}
-#define PCI_RES_TYPE_MASK \
- (IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH |\
- IORESOURCE_MEM_64)
-
static void pci_bridge_release_resources(struct pci_bus *bus,
unsigned long type)
{
@@ -2449,8 +2455,12 @@ int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type)
free_list(&added);
if (!list_empty(&failed)) {
- ret = -ENOSPC;
- goto cleanup;
+ if (pci_required_resource_failed(&failed, type)) {
+ ret = -ENOSPC;
+ goto cleanup;
+ }
+ /* Only resources with unrelated types failed (again) */
+ free_list(&failed);
}
list_for_each_entry(dev_res, &saved, list) {
--
2.39.5
pdev_sort_resources() uses the pdev_resources_assignable() helper to
decide whether a device's resources cannot be assigned.
pbus_size_mem(), on the other hand, does not do the same check. This
can lead to a situation where a resource ends up on the realloc_head
list but is not on the head list, which in turn prevents removing the
resource from the realloc_head list in __assign_resources_sorted().
A non-empty realloc_head is unacceptable because it triggers an
internal sanity check, as shown in this log with a device that has class
0 (PCI_CLASS_NOT_DEFINED):
pci 0001:01:00.0: [144d:a5a5] type 00 class 0x000000 PCIe Endpoint
pci 0001:01:00.0: BAR 0 [mem 0x00000000-0x000fffff 64bit]
pci 0001:01:00.0: ROM [mem 0x00000000-0x0000ffff pref]
pci 0001:01:00.0: enabling Extended Tags
pci 0001:01:00.0: PME# supported from D0 D3hot D3cold
pci 0001:01:00.0: 15.752 Gb/s available PCIe bandwidth, limited by 8.0 GT/s PCIe x2 link at 0001:00:00.0 (capable of 31.506 Gb/s with 16.0 GT/s PCIe x2 link)
pcieport 0001:00:00.0: bridge window [mem 0x00100000-0x001fffff] to [bus 01-ff] add_size 100000 add_align 100000
pcieport 0001:00:00.0: bridge window [mem 0x40000000-0x401fffff]: assigned
------------[ cut here ]------------
kernel BUG at drivers/pci/setup-bus.c:2532!
Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
...
Call trace:
pci_assign_unassigned_bus_resources+0x110/0x114 (P)
pci_rescan_bus+0x28/0x48
Use pdev_resources_assignable() also within pbus_size_mem() to skip
processing of non-assignable resources, which removes the disparity
between the resources pdev_sort_resources() and pbus_size_mem()
consider. As non-assignable resources are no longer processed, they are
not added to the realloc_head list, so the sanity check no longer
triggers.
This disparity problem is very old but only now became apparent after
commit 2499f5348431 ("PCI: Rework optional resource handling"), which
made ROM resources optional when calculating bridge window sizes and
thus required adding the resource to the realloc_head list.
Previously, bridge windows were just sized larger than necessary.
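For illustration, a toy standalone model of the invariant behind the
sanity check follows; all names and the assignability rule here are
illustrative stand-ins, not the kernel's:
  #include <assert.h>
  #include <stdbool.h>
  #include <stdio.h>

  /* Invariant: every resource queued for optional (realloc) growth must
   * also be on the required assignment list. */
  struct res {
      bool assignable;   /* outcome of the assignability check */
      bool optional;     /* e.g. a ROM BAR treated as optional */
  };

  int main(void)
  {
      struct res rom = { .assignable = false, .optional = true };
      bool on_head = false, on_realloc = false;

      /* Sorting path: skips non-assignable resources. */
      if (rom.assignable)
          on_head = true;

      /* Sizing path with the same check applied (this patch): the
       * non-assignable ROM is no longer queued for realloc growth. */
      if (rom.assignable && rom.optional)
          on_realloc = true;

      /* Conceptual form of the sanity check at the end of assignment. */
      assert(!(on_realloc && !on_head));
      printf("invariant holds\n");
      return 0;
  }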
Fixes: 2499f5348431 ("PCI: Rework optional resource handling")
Reported-by: Tudor Ambarus <tudor.ambarus@linaro.org>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: <stable@vger.kernel.org>
---
drivers/pci/setup-bus.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index f90d49cd07da..24863d8d0053 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1191,6 +1191,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
resource_size_t r_size;
if (r->parent || (r->flags & IORESOURCE_PCI_FIXED) ||
+ !pdev_resources_assignable(dev) ||
((r->flags & mask) != type &&
(r->flags & mask) != type2 &&
(r->flags & mask) != type3))
--
2.39.5
When using relaxed tail alignment for the bridge window,
pbus_size_mem() also tries to minimize min_align, which can under
certain scenarios end up increasing min_align beyond the value found by
calculate_mem_align().
Ensure min_align is not increased by the relaxed tail alignment.
Eventually, it would be better to add a calculate_relaxed_head_align(),
similar to calculate_mem_align(), that finds out what alignment can be
used for the head without introducing any gaps into the bridge window,
giving flexibility on the head address too. But that looks like a
relatively complex algorithm, so it requires much more testing than
fixing the immediate problem causing the regression.
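For illustration, a minimal standalone sketch of the corrected clamping
follows; the alignment values are made up for this example:
  #include <stdio.h>

  static unsigned long long max_u(unsigned long long a, unsigned long long b)
  { return a > b ? a : b; }

  static unsigned long long min_u(unsigned long long a, unsigned long long b)
  { return a < b ? a : b; }

  int main(void)
  {
      /* Assumed: calculate_mem_align() found 1MB, the relaxed rule
       * suggests 16MB, and the window itself needs 4MB alignment. */
      unsigned long long min_align = 1ULL << 20;
      unsigned long long win_align = 1ULL << 22;
      unsigned long long relaxed_align = max_u(1ULL << 24, win_align);

      /* Before: min_align was overwritten with relaxed_align and could
       * grow to 16MB. After: relaxed tail sizing may only lower it. */
      min_align = min_u(min_align, relaxed_align);

      printf("min_align = 0x%llx\n", min_align);  /* stays 0x100000 */
      return 0;
  }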
Fixes: 67f9085596ee ("PCI: Allow relaxed bridge window tail sizing for optional resources")
Reported-by: Rio <rio@r26.me>
Tested-by: Rio <rio@r26.me>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: <stable@vger.kernel.org>
---
drivers/pci/setup-bus.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 07c3d021a47e..f90d49cd07da 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1169,6 +1169,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
resource_size_t children_add_size = 0;
resource_size_t children_add_align = 0;
resource_size_t add_align = 0;
+ resource_size_t relaxed_align;
if (!b_res)
return -ENOSPC;
@@ -1246,8 +1247,9 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
if (bus->self && size0 &&
!pbus_upstream_space_available(bus, mask | IORESOURCE_PREFETCH, type,
size0, min_align)) {
- min_align = 1ULL << (max_order + __ffs(SZ_1M));
- min_align = max(min_align, win_align);
+ relaxed_align = 1ULL << (max_order + __ffs(SZ_1M));
+ relaxed_align = max(relaxed_align, win_align);
+ min_align = min(min_align, relaxed_align);
size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), win_align);
pci_info(bus->self, "bridge window %pR to %pR requires relaxed alignment rules\n",
b_res, &bus->busn_res);
@@ -1261,8 +1263,9 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
if (bus->self && size1 &&
!pbus_upstream_space_available(bus, mask | IORESOURCE_PREFETCH, type,
size1, add_align)) {
- min_align = 1ULL << (max_order + __ffs(SZ_1M));
- min_align = max(min_align, win_align);
+ relaxed_align = 1ULL << (max_order + __ffs(SZ_1M));
+ relaxed_align = max(relaxed_align, win_align);
+ min_align = min(min_align, relaxed_align);
size1 = calculate_memsize(size, min_size, add_size, children_add_size,
resource_size(b_res), win_align);
pci_info(bus->self,
--
2.39.5