From: Lad Prabhakar <prabhakar.mahadev-lad.rj(a)bp.renesas.com>
Ensure TX descriptor type fields are written in a safe order so the DMA
engine does not begin processing a chain before all descriptors are
fully initialised.
For multi-descriptor transmissions the driver writes DT_FEND into the
last descriptor and DT_FSTART into the first. The DMA engine starts
processing when it sees DT_FSTART. If the compiler or CPU reorders the
writes and publishes DT_FSTART before DT_FEND, the DMA can start early
and process an incomplete chain, leading to corrupted transmissions or
DMA errors.
Fix this by writing DT_FEND before the dma_wmb() barrier, executing
dma_wmb() immediately before writing DT_FSTART (or DT_FSINGLE in the
single-descriptor case), and adding a wmb() after the type updates to
ensure CPU-side ordering before ringing the hardware doorbell.
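As a minimal sketch of the intended publication order (illustrative
only, assuming pointers first and last into the descriptor ring rather
than the driver's actual desc/desc-- walk):

	last->die_dt = DT_FEND;    /* complete the tail of the chain   */
	dma_wmb();                 /* descriptor writes visible to DMA */
	first->die_dt = DT_FSTART; /* DMA may now consume the chain    */
	wmb();                     /* order vs. the doorbell MMIO      */
	ravb_modify(ndev, TCCR, TCCR_TSRQ0 << q, TCCR_TSRQ0 << q);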
On an RZ/G2L platform running an RT kernel, this reordering hazard was
observed as TX stalls and timeouts:
[ 372.968431] NETDEV WATCHDOG: end0 (ravb): transmit queue 0 timed out
[ 372.968494] WARNING: CPU: 0 PID: 10 at net/sched/sch_generic.c:467 dev_watchdog+0x4a4/0x4ac
[ 373.969291] ravb 11c20000.ethernet end0: transmit timed out, status 00000000, resetting...
This change enforces the required ordering and prevents the DMA engine
from observing DT_FSTART before the rest of the descriptor chain is
valid.
Fixes: 2f45d1902acf ("ravb: minimize TX data copying")
Cc: stable(a)vger.kernel.org
Co-developed-by: Fabrizio Castro <fabrizio.castro.jz(a)renesas.com>
Signed-off-by: Fabrizio Castro <fabrizio.castro.jz(a)renesas.com>
Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj(a)bp.renesas.com>
---
drivers/net/ethernet/renesas/ravb_main.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index a200e205825a..2a995fa9bfff 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -2211,15 +2211,19 @@ static netdev_tx_t ravb_start_xmit(struct sk_buff *skb, struct net_device *ndev)
skb_tx_timestamp(skb);
}
- /* Descriptor type must be set after all the above writes */
- dma_wmb();
+
+ /* For multi-descriptors set DT_FEND before calling dma_wmb() */
if (num_tx_desc > 1) {
desc->die_dt = DT_FEND;
desc--;
- desc->die_dt = DT_FSTART;
- } else {
- desc->die_dt = DT_FSINGLE;
}
+
+ /* Descriptor type must be set after all the above writes */
+ dma_wmb();
+ desc->die_dt = (num_tx_desc > 1) ? DT_FSTART : DT_FSINGLE;
+
+ /* Ensure data is written to RAM before initiating DMA transfer */
+ wmb();
ravb_modify(ndev, TCCR, TCCR_TSRQ0 << q, TCCR_TSRQ0 << q);
priv->cur_tx[q] += num_tx_desc;
--
2.43.0
commit 3fcbf1c77d08 ("arch_topology: Fix cache attributes detection
in the CPU hotplug path")
adds a call to detect_cache_attributes() to populate the cacheinfo
before updating the siblings mask. detect_cache_attributes() allocates
memory and can take the PPTT mutex (on ACPI platforms). On PREEMPT_RT
kernels, on secondary CPUs, this triggers a:
'BUG: sleeping function called from invalid context'
as the code is executed with preemption and interrupts disabled:
| BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
| in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 0, name: swapper/111
| preempt_count: 1, expected: 0
| RCU nest depth: 1, expected: 1
| 3 locks held by swapper/111/0:
| #0: (&pcp->lock){+.+.}-{3:3}, at: get_page_from_freelist+0x218/0x12c8
| #1: (rcu_read_lock){....}-{1:3}, at: rt_spin_trylock+0x48/0xf0
| #2: (&zone->lock){+.+.}-{3:3}, at: rmqueue_bulk+0x64/0xa80
| irq event stamp: 0
| hardirqs last enabled at (0): 0x0
| hardirqs last disabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last enabled at (0): copy_process+0x5dc/0x1ab8
| softirqs last disabled at (0): 0x0
| Preemption disabled at:
| migrate_enable+0x30/0x130
| CPU: 111 PID: 0 Comm: swapper/111 Tainted: G W 6.0.0-rc4-rt6-[...]
| Call trace:
| __kmalloc+0xbc/0x1e8
| detect_cache_attributes+0x2d4/0x5f0
| update_siblings_masks+0x30/0x368
| store_cpu_topology+0x78/0xb8
| secondary_start_kernel+0xd0/0x198
| __secondary_switched+0xb0/0xb4
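The offending pattern is a possibly-sleeping operation reached from a
context that has preemption and interrupts disabled. Schematically (an
illustrative sketch only, not the actual topology code):

	/* Secondary CPU bring-up runs with preemption/IRQs off. */
	preempt_disable();
	p = kmalloc(size, GFP_KERNEL);	/* may sleep: triggers the
					 * "sleeping function called
					 * from invalid context" BUG
					 * on PREEMPT_RT */
	preempt_enable();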
Pierre fixed this issue upstream in 6.3; the original series is as follows:
https://lore.kernel.org/all/167404285593.885445.6219705651301997538.b4-ty@a…
We also encountered the same issue on the 6.1 stable branch and need to
backport this series.
Pierre Gondois (6):
cacheinfo: Use RISC-V's init_cache_level() as generic OF
implementation
cacheinfo: Return error code in init_of_cache_level()
cacheinfo: Check 'cache-unified' property to count cache leaves
ACPI: PPTT: Remove acpi_find_cache_levels()
ACPI: PPTT: Update acpi_find_last_cache_level() to
acpi_get_cache_info()
arch_topology: Build cacheinfo from primary CPU
arch/arm64/kernel/cacheinfo.c | 11 ++-
arch/riscv/kernel/cacheinfo.c | 42 -----------
drivers/acpi/pptt.c | 93 +++++++++++++----------
drivers/base/arch_topology.c | 12 ++-
drivers/base/cacheinfo.c | 134 +++++++++++++++++++++++++++++-----
include/linux/cacheinfo.h | 11 ++-
6 files changed, 196 insertions(+), 107 deletions(-)
--
2.25.1
A few cleanups and a bugfix that are either suitable after the swap
table phase I or were found during code review.
Patch 1 is a bugfix and needs to be included in the stable branch;
the rest have no behavior change.
---
Kairui Song (4):
mm, swap: do not perform synchronous discard during allocation
mm, swap: rename helper for setup bad slots
mm, swap: cleanup swap entry allocation parameter
mm/migrate, swap: drop usage of folio_index
include/linux/swap.h | 4 ++--
mm/migrate.c | 4 ++--
mm/shmem.c | 2 +-
mm/swap.h | 21 -----------------
mm/swapfile.c | 64 ++++++++++++++++++++++++++++++++++++----------------
mm/vmscan.c | 4 ++--
6 files changed, 52 insertions(+), 47 deletions(-)
---
base-commit: 53e573001f2b5168f9b65d2b79e9563a3b479c17
change-id: 20251007-swap-clean-after-swap-table-p1-b9a7635ee3fa
Best regards,
--
Kairui Song <kasong(a)tencent.com>
From: Tim Hostetler <thostet(a)google.com>
The device returns a valid bit in the LSB of the low timestamp byte of
the completion descriptor, which the driver should check before setting
the SKB's hardware timestamp. If the timestamp is not valid, do not
hardware timestamp the SKB.
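For reference, the driver extends the 32-bit hardware timestamp against
the last 64-bit NIC counter read; a minimal sketch of that arithmetic
(hypothetical helper name, mirroring the diff below):

	/* Valid only if the reference was read within ~2 seconds of
	 * packet reception; the signed 32-bit difference handles
	 * wraparound in either direction. */
	static u64 gve_extend_hwts32(u64 last_read, u32 hwts)
	{
		s32 diff = hwts - (u32)last_read;

		return last_read + diff;
	}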
Cc: stable(a)vger.kernel.org
Fixes: b2c7aeb49056 ("gve: Implement ndo_hwtstamp_get/set for RX timestamping")
Reviewed-by: Joshua Washington <joshwash(a)google.com>
Signed-off-by: Tim Hostetler <thostet(a)google.com>
Signed-off-by: Harshitha Ramamurthy <hramamurthy(a)google.com>
---
drivers/net/ethernet/google/gve/gve.h | 2 ++
drivers/net/ethernet/google/gve/gve_desc_dqo.h | 3 ++-
drivers/net/ethernet/google/gve/gve_rx_dqo.c | 18 ++++++++++++------
3 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index bceaf9b05cb4..4cc6dcbfd367 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -100,6 +100,8 @@
*/
#define GVE_DQO_QPL_ONDEMAND_ALLOC_THRESHOLD 96
+#define GVE_DQO_RX_HWTSTAMP_VALID 0x1
+
/* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */
struct gve_rx_desc_queue {
struct gve_rx_desc *desc_ring; /* the descriptor ring */
diff --git a/drivers/net/ethernet/google/gve/gve_desc_dqo.h b/drivers/net/ethernet/google/gve/gve_desc_dqo.h
index d17da841b5a0..f7786b03c744 100644
--- a/drivers/net/ethernet/google/gve/gve_desc_dqo.h
+++ b/drivers/net/ethernet/google/gve/gve_desc_dqo.h
@@ -236,7 +236,8 @@ struct gve_rx_compl_desc_dqo {
u8 status_error1;
- __le16 reserved5;
+ u8 reserved5;
+ u8 ts_sub_nsecs_low;
__le16 buf_id; /* Buffer ID which was sent on the buffer queue. */
union {
diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c
index 7380c2b7a2d8..02e25be8a50d 100644
--- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c
+++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c
@@ -456,14 +456,20 @@ static void gve_rx_skb_hash(struct sk_buff *skb,
* Note that this means if the time delta between packet reception and the last
* clock read is greater than ~2 seconds, this will provide invalid results.
*/
-static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx, u32 hwts)
+static void gve_rx_skb_hwtstamp(struct gve_rx_ring *rx,
+ const struct gve_rx_compl_desc_dqo *desc)
{
u64 last_read = READ_ONCE(rx->gve->last_sync_nic_counter);
struct sk_buff *skb = rx->ctx.skb_head;
- u32 low = (u32)last_read;
- s32 diff = hwts - low;
-
- skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(last_read + diff);
+ u32 ts, low;
+ s32 diff;
+
+ if (desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID) {
+ ts = le32_to_cpu(desc->ts);
+ low = (u32)last_read;
+ diff = ts - low;
+ skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(last_read + diff);
+ }
}
static void gve_rx_free_skb(struct napi_struct *napi, struct gve_rx_ring *rx)
@@ -919,7 +925,7 @@ static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi,
gve_rx_skb_csum(rx->ctx.skb_head, desc, ptype);
if (rx->gve->ts_config.rx_filter == HWTSTAMP_FILTER_ALL)
- gve_rx_skb_hwtstamp(rx, le32_to_cpu(desc->ts));
+ gve_rx_skb_hwtstamp(rx, desc);
/* RSC packets must set gso_size otherwise the TCP stack will complain
* that packets are larger than MTU.
--
2.51.0.740.g6adb054d12-goog
From: Lad Prabhakar <prabhakar.mahadev-lad.rj(a)bp.renesas.com>
On SoCs that only support the best-effort queue and not the network
control queue, calling alloc_etherdev_mqs() with fixed values for
TX/RX queues is not appropriate. Use the nc_queues flag from the
per-SoC match data to determine whether the network control queue
is available, and fall back to a single TX/RX queue when it is not.
This ensures correct queue allocation across all supported SoCs.
Fixes: a92f4f0662bf ("ravb: Add nc_queue to struct ravb_hw_info")
Cc: stable(a)vger.kernel.org
Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj(a)bp.renesas.com>
---
drivers/net/ethernet/renesas/ravb_main.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 69d382e8757d..a200e205825a 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -2926,13 +2926,14 @@ static int ravb_probe(struct platform_device *pdev)
return dev_err_probe(&pdev->dev, PTR_ERR(rstc),
"failed to get cpg reset\n");
+ info = of_device_get_match_data(&pdev->dev);
+
ndev = alloc_etherdev_mqs(sizeof(struct ravb_private),
- NUM_TX_QUEUE, NUM_RX_QUEUE);
+ info->nc_queues ? NUM_TX_QUEUE : 1,
+ info->nc_queues ? NUM_RX_QUEUE : 1);
if (!ndev)
return -ENOMEM;
- info = of_device_get_match_data(&pdev->dev);
-
ndev->features = info->net_features;
ndev->hw_features = info->net_hw_features;
ndev->vlan_features = info->vlan_features;
--
2.43.0
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 7b26da407420e5054e3f06c5d13271697add9423
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101534-attempt-stubbly-cf5f@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7b26da407420e5054e3f06c5d13271697add9423 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Fri, 19 Sep 2025 14:33:23 +0930
Subject: [PATCH] btrfs: fix the incorrect max_bytes value for
find_lock_delalloc_range()
[BUG]
With my local branch to enable bs > ps support for btrfs, sometimes I
hit the following ASSERT() inside submit_one_sector():
ASSERT(block_start != EXTENT_MAP_HOLE);
Please note that it's not yet possible to hit this ASSERT() in the
wild, as it requires btrfs bs > ps support, which is not even in the
development branch.
But on the other hand, there is also a very low chance of hitting the
above ASSERT() in bs < ps cases, so this is an existing bug affecting
not only the incoming bs > ps support but also the existing bs < ps
support.
[CAUSE]
Firstly, that ASSERT() means we're trying to submit a dirty block
without a real extent map or ordered extent backing it.
Furthermore, with extra debugging, the folio triggering such an
ASSERT() is always larger than the fs block size in my bs > ps case
(8K block size, 4K page size).
After some more debugging, the ASSERT() is triggered by the following
sequence:
extent_writepage()
| We got a 32K folio (4 fs blocks) at file offset 0, and the fs block
| size is 8K, page size is 4K.
| And there is another 8K folio at file offset 32K, which is also
| dirty.
| So the filemap layout looks like the following:
|
| "||" is the filio boundary in the filemap.
| "//| is the dirty range.
|
| 0 8K 16K 24K 32K 40K
| |////////| |//////////////////////||////////|
|
|- writepage_delalloc()
| |- find_lock_delalloc_range() for [0, 8K)
| | Now range [0, 8K) is properly locked.
| |
| |- find_lock_delalloc_range() for [16K, 40K)
| | |- btrfs_find_delalloc_range() returned range [16K, 40K)
| | |- lock_delalloc_folios() locked folio 0 successfully
| | |
| | | The filemap range [32K, 40K) got dropped from filemap.
| | |
| | |- lock_delalloc_folios() failed with -EAGAIN on folio 32K
| | | As the folio at 32K is dropped.
| | |
| | |- loops = 1;
| | |- max_bytes = PAGE_SIZE;
| | |- goto again;
| | | This will re-do the lookup for dirty delalloc ranges.
| | |
| | |- btrfs_find_delalloc_range() called with @max_bytes == 4K
| | | This is smaller than block size, so
| | | btrfs_find_delalloc_range() is unable to return any range.
| | \- return false;
| |
| \- Now only range [0, 8K) has an OE for it, but for dirty range
| [16K, 32K) it's dirty without an OE.
| This breaks the assumption that writepage_delalloc() will find
| and lock all dirty ranges inside the folio.
|
|- extent_writepage_io()
|- submit_one_sector() for [0, 8K)
| Succeeded
|
|- submit_one_sector() for [16K, 24K)
Triggering the ASSERT(), as there is no OE, and the original
extent map is a hole.
Please note that this also exposed the same problem for bs < ps
support, e.g. with 64K page size and 4K block size.
If we fail to lock a folio and fall back into the "loops = 1;"
branch, we will re-do the search using 64K as max_bytes,
which may fail again to lock the next folio and exit early without
handling all dirty blocks inside the folio.
[FIX]
Instead of using the fixed size PAGE_SIZE as @max_bytes, use
@sectorsize, so that we are ensured to find and lock any remaining
blocks inside the folio.
And since we're here, add an extra ASSERT() before calling
btrfs_find_delalloc_range() to make sure @max_bytes is at least no
smaller than a block, to avoid false negatives.
Cc: stable(a)vger.kernel.org # 5.15+
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0782533aad51..2b6027ebf265 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -393,6 +393,13 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
+
+ /*
+ * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
+ * return early without handling any dirty ranges.
+ */
+ ASSERT(max_bytes >= fs_info->sectorsize);
+
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
@@ -423,13 +430,14 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
delalloc_end);
ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
- /* some of the folios are gone, lets avoid looping by
- * shortening the size of the delalloc range we're searching
+ /*
+ * Some of the folios are gone, lets avoid looping by
+ * shortening the size of the delalloc range we're searching.
*/
btrfs_free_extent_state(cached_state);
cached_state = NULL;
if (!loops) {
- max_bytes = PAGE_SIZE;
+ max_bytes = fs_info->sectorsize;
loops = 1;
goto again;
} else {
L1 substates support requires additional steps to work; see e.g.
section '11.6.6.4 L1 Substate' in the RK3588 TRM V1.0.
These steps are currently missing from the driver.
While this has always been a problem when using e.g.
CONFIG_PCIEASPM_POWER_SUPERSAVE=y, the problem became more apparent after
commit f3ac2ff14834 ("PCI/ASPM: Enable all ClockPM and ASPM states for
devicetree platforms"), which enabled ASPM also for
CONFIG_PCIEASPM_DEFAULT=y.
Disable L1 substates until proper support is added.
Cc: stable(a)vger.kernel.org
Fixes: 0e898eb8df4e ("PCI: rockchip-dwc: Add Rockchip RK356X host controller driver")
Fixes: f3ac2ff14834 ("PCI/ASPM: Enable all ClockPM and ASPM states for devicetree platforms")
Signed-off-by: Niklas Cassel <cassel(a)kernel.org>
---
drivers/pci/controller/dwc/pcie-dw-rockchip.c | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/drivers/pci/controller/dwc/pcie-dw-rockchip.c b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
index 3e2752c7dd09..28e0fffe2542 100644
--- a/drivers/pci/controller/dwc/pcie-dw-rockchip.c
+++ b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
@@ -200,6 +200,26 @@ static bool rockchip_pcie_link_up(struct dw_pcie *pci)
return FIELD_GET(PCIE_LINKUP_MASK, val) == PCIE_LINKUP;
}
+/*
+ * See e.g. section '11.6.6.4 L1 Substate' in the RK3588 TRM V1.0 for the steps
+ * needed to support L1 substates. Currently, not a single rockchip platform
+ * performs these steps, so disable L1 substates until there is proper support.
+ */
+static void rockchip_pcie_disable_l1sub(struct dw_pcie *pci)
+{
+ u32 cap, l1subcap;
+
+ cap = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);
+ if (cap) {
+ l1subcap = dw_pcie_readl_dbi(pci, cap + PCI_L1SS_CAP);
+ l1subcap &= ~(PCI_L1SS_CAP_L1_PM_SS | PCI_L1SS_CAP_ASPM_L1_1 |
+ PCI_L1SS_CAP_ASPM_L1_2 | PCI_L1SS_CAP_PCIPM_L1_1 |
+ PCI_L1SS_CAP_PCIPM_L1_2);
+ dw_pcie_writel_dbi(pci, cap + PCI_L1SS_CAP, l1subcap);
+ l1subcap = dw_pcie_readl_dbi(pci, cap + PCI_L1SS_CAP);
+ }
+}
+
static void rockchip_pcie_enable_l0s(struct dw_pcie *pci)
{
u32 cap, lnkcap;
@@ -264,6 +284,7 @@ static int rockchip_pcie_host_init(struct dw_pcie_rp *pp)
irq_set_chained_handler_and_data(irq, rockchip_pcie_intx_handler,
rockchip);
+ rockchip_pcie_disable_l1sub(pci);
rockchip_pcie_enable_l0s(pci);
return 0;
@@ -301,6 +322,7 @@ static void rockchip_pcie_ep_init(struct dw_pcie_ep *ep)
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
enum pci_barno bar;
+ rockchip_pcie_disable_l1sub(pci);
rockchip_pcie_enable_l0s(pci);
rockchip_pcie_ep_hide_broken_ats_cap_rk3588(ep);
--
2.51.0
According to Peter, we've had for a very long time an issue on some
multitouch touchpads where the fingers were stuck in scrolling mode,
or in a 3-finger gesture mode. I was unable to debug it because it was
rather hard to reproduce.
Recently, some people raised the issue again on libinput, and this time
added a recording of the actual bug.
It turns out that the sticky finger quirk that was introduced back in
2017 was only checking the last report, and that those missing releases
also happen when moving from 3 to 1 finger (only 1 is released instead
of 2).
This solution seems to me to be the most sensible one: we could instead
add the NSMU quirk to win8 multitouch touchpads, but that would involve
a lot more computation at each report for rather annoying corner cases.
Link: https://gitlab.freedesktop.org/libinput/libinput/-/issues/1194
Signed-off-by: Benjamin Tissoires <bentiss(a)kernel.org>
---
Benjamin Tissoires (2):
HID: multitouch: fix sticky fingers
selftests/hid: add tests for missing release on the Dell Synaptics
drivers/hid/hid-multitouch.c | 27 ++++++-----
.../testing/selftests/hid/tests/test_multitouch.py | 55 ++++++++++++++++++++++
2 files changed, 69 insertions(+), 13 deletions(-)
---
base-commit: 54ba6d9b1393a0061600c0e49c8ebef65d60a8b2
change-id: 20250926-fix-sticky-fingers-8ae88436ae82
Best regards,
--
Benjamin Tissoires <bentiss(a)kernel.org>
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 7b26da407420e5054e3f06c5d13271697add9423
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101530-composed-concave-1075@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 72d271a7baa7062cb27e774ac37c5459c6d20e22
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101543-quake-judicial-9e2e@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 72d271a7baa7062cb27e774ac37c5459c6d20e22 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar(a)cyphar.com>
Date: Thu, 7 Aug 2025 03:55:23 +1000
Subject: [PATCH] fscontext: do not consume log entries when returning
-EMSGSIZE
Userspace generally expects APIs that return -EMSGSIZE to allow for them
to adjust their buffer size and retry the operation. However, the
fscontext log would previously clear the message even in the -EMSGSIZE
case.
Given that it is very cheap for us to check whether the buffer is too
small before we remove the message from the ring buffer, let's just do
that instead. While we're at it, refactor part of fscontext_read() into
a separate helper to make the ring buffer logic a bit easier to read.
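With the message preserved on -EMSGSIZE, userspace can use the usual
grow-and-retry pattern (a minimal sketch, assuming fd came from
fsopen(2)):

	/* Retry with a larger buffer; the log entry is no longer
	 * consumed by the failed read. */
	size_t size = 128;
	char *buf = malloc(size);
	ssize_t n;

	while ((n = read(fd, buf, size)) < 0 && errno == EMSGSIZE) {
		size *= 2;
		buf = realloc(buf, size);
	}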
Fixes: 007ec26cdc9f ("vfs: Implement logging through fs_context")
Cc: David Howells <dhowells(a)redhat.com>
Cc: stable(a)vger.kernel.org # v5.2+
Signed-off-by: Aleksa Sarai <cyphar(a)cyphar.com>
Link: https://lore.kernel.org/20250807-fscontext-log-cleanups-v3-1-8d91d6242dc3@c…
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 1aaf4cb2afb2..f645c99204eb 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -18,50 +18,56 @@
#include "internal.h"
#include "mount.h"
+static inline const char *fetch_message_locked(struct fc_log *log, size_t len,
+ bool *need_free)
+{
+ const char *p;
+ int index;
+
+ if (unlikely(log->head == log->tail))
+ return ERR_PTR(-ENODATA);
+
+ index = log->tail & (ARRAY_SIZE(log->buffer) - 1);
+ p = log->buffer[index];
+ if (unlikely(strlen(p) > len))
+ return ERR_PTR(-EMSGSIZE);
+
+ log->buffer[index] = NULL;
+ *need_free = log->need_free & (1 << index);
+ log->need_free &= ~(1 << index);
+ log->tail++;
+
+ return p;
+}
+
/*
* Allow the user to read back any error, warning or informational messages.
+ * Only one message is returned for each read(2) call.
*/
static ssize_t fscontext_read(struct file *file,
char __user *_buf, size_t len, loff_t *pos)
{
struct fs_context *fc = file->private_data;
- struct fc_log *log = fc->log.log;
- unsigned int logsize = ARRAY_SIZE(log->buffer);
- ssize_t ret;
- char *p;
+ ssize_t err;
+ const char *p __free(kfree) = NULL, *message;
bool need_free;
- int index, n;
+ int n;
- ret = mutex_lock_interruptible(&fc->uapi_mutex);
- if (ret < 0)
- return ret;
-
- if (log->head == log->tail) {
- mutex_unlock(&fc->uapi_mutex);
- return -ENODATA;
- }
-
- index = log->tail & (logsize - 1);
- p = log->buffer[index];
- need_free = log->need_free & (1 << index);
- log->buffer[index] = NULL;
- log->need_free &= ~(1 << index);
- log->tail++;
+ err = mutex_lock_interruptible(&fc->uapi_mutex);
+ if (err < 0)
+ return err;
+ message = fetch_message_locked(fc->log.log, len, &need_free);
mutex_unlock(&fc->uapi_mutex);
+ if (IS_ERR(message))
+ return PTR_ERR(message);
- ret = -EMSGSIZE;
- n = strlen(p);
- if (n > len)
- goto err_free;
- ret = -EFAULT;
- if (copy_to_user(_buf, p, n) != 0)
- goto err_free;
- ret = n;
-
-err_free:
if (need_free)
- kfree(p);
- return ret;
+ p = message;
+
+ n = strlen(message);
+ if (copy_to_user(_buf, message, n))
+ return -EFAULT;
+ return n;
}
static int fscontext_release(struct inode *inode, struct file *file)