The receive error handling code is shared between RSCI and all other
SCIF port types, but the RSCI overrun_reg is specified as a memory
offset, while for other SCIF types it is an enum value used to index
into the sci_port_params->regs array, as mentioned above the
sci_serial_in() function.
For RSCI, the overrun_reg is CSR (0x48), causing the sci_getreg() call
inside the sci_handle_fifo_overrun() function to index outside the
bounds of the regs array, which currently has a size of 20, as specified
by SCI_NR_REGS.
Because of this, we end up accessing memory outside of RSCI's
rsci_port_params structure, which, when interpreted as a plat_sci_reg,
happens to have a non-zero size, causing the following WARN when
sci_serial_in() is called, as the accidental size does not match the
supported register sizes.
The existence of the overrun_reg needs to be checked because
SCIx_SH3_SCIF_REGTYPE has overrun_reg set to SCLSR, but SCLSR is not
present in the regs array.
Avoid calling sci_getreg() for port types which don't use standard
register handling.
Use the ops->read_reg() and ops->write_reg() functions to properly read
and write registers for RSCI, and change the type of the status variable
to accommodate the 32-bit CSR register.
sci_getreg() and sci_serial_in() are also called with overrun_reg in the
sci_mpxed_interrupt() interrupt handler, but that code path is not used
for RSCI, as it does not have a muxed interrupt.
------------[ cut here ]------------
Invalid register access
WARNING: CPU: 0 PID: 0 at drivers/tty/serial/sh-sci.c:522 sci_serial_in+0x38/0xac
Modules linked in: renesas_usbhs at24 rzt2h_adc industrialio_adc sha256 cfg80211 bluetooth ecdh_generic ecc rfkill fuse drm backlight ipv6
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.17.0-rc1+ #30 PREEMPT
Hardware name: Renesas RZ/T2H EVK Board based on r9a09g077m44 (DT)
pstate: 604000c5 (nZCv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : sci_serial_in+0x38/0xac
lr : sci_serial_in+0x38/0xac
sp : ffff800080003e80
x29: ffff800080003e80 x28: ffff800082195b80 x27: 000000000000000d
x26: ffff8000821956d0 x25: 0000000000000000 x24: ffff800082195b80
x23: ffff000180e0d800 x22: 0000000000000010 x21: 0000000000000000
x20: 0000000000000010 x19: ffff000180e72000 x18: 000000000000000a
x17: ffff8002bcee7000 x16: ffff800080000000 x15: 0720072007200720
x14: 0720072007200720 x13: 0720072007200720 x12: 0720072007200720
x11: 0000000000000058 x10: 0000000000000018 x9 : ffff8000821a6a48
x8 : 0000000000057fa8 x7 : 0000000000000406 x6 : ffff8000821fea48
x5 : ffff00033ef88408 x4 : ffff8002bcee7000 x3 : ffff800082195b80
x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff800082195b80
Call trace:
sci_serial_in+0x38/0xac (P)
sci_handle_fifo_overrun.isra.0+0x70/0x134
sci_er_interrupt+0x50/0x39c
__handle_irq_event_percpu+0x48/0x140
handle_irq_event+0x44/0xb0
handle_fasteoi_irq+0xf4/0x1a0
handle_irq_desc+0x34/0x58
generic_handle_domain_irq+0x1c/0x28
gic_handle_irq+0x4c/0x140
call_on_irq_stack+0x30/0x48
do_interrupt_handler+0x80/0x84
el1_interrupt+0x34/0x68
el1h_64_irq_handler+0x18/0x24
el1h_64_irq+0x6c/0x70
default_idle_call+0x28/0x58 (P)
do_idle+0x1f8/0x250
cpu_startup_entry+0x34/0x3c
rest_init+0xd8/0xe0
console_on_rootfs+0x0/0x6c
__primary_switched+0x88/0x90
---[ end trace 0000000000000000 ]---
Cc: stable(a)vger.kernel.org
Fixes: 0666e3fe95ab ("serial: sh-sci: Add support for RZ/T2H SCI")
Signed-off-by: Cosmin Tanislav <cosmin-gabriel.tanislav.xa(a)renesas.com>
---
drivers/tty/serial/sh-sci.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index 538b2f991609..62bb62b82cbe 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -1014,16 +1014,18 @@ static int sci_handle_fifo_overrun(struct uart_port *port)
struct sci_port *s = to_sci_port(port);
const struct plat_sci_reg *reg;
int copied = 0;
- u16 status;
+ u32 status;
- reg = sci_getreg(port, s->params->overrun_reg);
- if (!reg->size)
- return 0;
+ if (s->type != SCI_PORT_RSCI) {
+ reg = sci_getreg(port, s->params->overrun_reg);
+ if (!reg->size)
+ return 0;
+ }
- status = sci_serial_in(port, s->params->overrun_reg);
+ status = s->ops->read_reg(port, s->params->overrun_reg);
if (status & s->params->overrun_mask) {
status &= ~s->params->overrun_mask;
- sci_serial_out(port, s->params->overrun_reg, status);
+ s->ops->write_reg(port, s->params->overrun_reg, status);
port->icount.overrun++;
--
2.51.0
From: Christian Hitz <christian.hitz(a)bbv.ch>
If a GPIO is used to control the chip's enable pin, it needs to be pulled
high before any i2c communication is attempted.
Currently, the enable GPIO handling is not correct.
Assume the enable GPIO is low when the probe function is entered. In this
case the device is in SHUTDOWN mode and does not react to i2c commands.
During probe the following sequence happens:
1. The call to lp50xx_reset() on line 548 has no effect as i2c is not
possible yet.
2. Then - on line 552 - lp50xx_enable_disable() is called. As
"priv->enable_gpio“ has not yet been initialized, setting the GPIO has
no effect. Also the i2c enable command is not executed as the device
is still in SHUTDOWN.
3. On line 556 the call to lp50xx_probe_dt() finally parses the rest of
the DT and the configured priv->enable_gpio is set up.
As a result the device is still in SHUTDOWN mode and not ready for
operation.
Split lp50xx_enable_disable() into distinct enable and disable functions
to enforce correct ordering between enable_gpio manipulations and i2c
commands.
Read enable_gpio configuration from DT before attempting to manipulate
enable_gpio.
Add delays to observe correct wait timing after manipulating enable_gpio
and before any i2c communication.
Fixes: 242b81170fb8 ("leds: lp50xx: Add the LP50XX family of the RGB LED driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Christian Hitz <christian.hitz(a)bbv.ch>
---
Changes in v2:
- Unconditionally reset in lp50xx_enable
- Define magic numbers
- Improve log message
---
drivers/leds/leds-lp50xx.c | 55 +++++++++++++++++++++++++++-----------
1 file changed, 40 insertions(+), 15 deletions(-)
diff --git a/drivers/leds/leds-lp50xx.c b/drivers/leds/leds-lp50xx.c
index 94f8ef6b482c..d3485d814cf4 100644
--- a/drivers/leds/leds-lp50xx.c
+++ b/drivers/leds/leds-lp50xx.c
@@ -50,6 +50,12 @@
#define LP50XX_SW_RESET 0xff
#define LP50XX_CHIP_EN BIT(6)
+#define LP50XX_CHIP_DISABLE 0x00
+#define LP50XX_START_TIME_US 500
+#define LP50XX_RESET_TIME_US 3
+
+#define LP50XX_EN_GPIO_LOW 0
+#define LP50XX_EN_GPIO_HIGH 1
/* There are 3 LED outputs per bank */
#define LP50XX_LEDS_PER_MODULE 3
@@ -371,19 +377,42 @@ static int lp50xx_reset(struct lp50xx *priv)
return regmap_write(priv->regmap, priv->chip_info->reset_reg, LP50XX_SW_RESET);
}
-static int lp50xx_enable_disable(struct lp50xx *priv, int enable_disable)
+static int lp50xx_enable(struct lp50xx *priv)
{
int ret;
- ret = gpiod_direction_output(priv->enable_gpio, enable_disable);
+ if (priv->enable_gpio) {
+ ret = gpiod_direction_output(priv->enable_gpio, LP50XX_EN_GPIO_HIGH);
+ if (ret)
+ return ret;
+
+ udelay(LP50XX_START_TIME_US);
+ }
+
+ ret = lp50xx_reset(priv);
if (ret)
return ret;
- if (enable_disable)
- return regmap_write(priv->regmap, LP50XX_DEV_CFG0, LP50XX_CHIP_EN);
- else
- return regmap_write(priv->regmap, LP50XX_DEV_CFG0, 0);
+ return regmap_write(priv->regmap, LP50XX_DEV_CFG0, LP50XX_CHIP_EN);
+}
+static int lp50xx_disable(struct lp50xx *priv)
+{
+ int ret;
+
+ ret = regmap_write(priv->regmap, LP50XX_DEV_CFG0, LP50XX_CHIP_DISABLE);
+ if (ret)
+ return ret;
+
+ if (priv->enable_gpio) {
+ ret = gpiod_direction_output(priv->enable_gpio, LP50XX_EN_GPIO_LOW);
+ if (ret)
+ return ret;
+
+ udelay(LP50XX_RESET_TIME_US);
+ }
+
+ return 0;
}
static int lp50xx_probe_leds(struct fwnode_handle *child, struct lp50xx *priv,
@@ -447,6 +476,10 @@ static int lp50xx_probe_dt(struct lp50xx *priv)
return dev_err_probe(priv->dev, PTR_ERR(priv->enable_gpio),
"Failed to get enable GPIO\n");
+ ret = lp50xx_enable(priv);
+ if (ret)
+ return ret;
+
priv->regulator = devm_regulator_get(priv->dev, "vled");
if (IS_ERR(priv->regulator))
priv->regulator = NULL;
@@ -547,14 +580,6 @@ static int lp50xx_probe(struct i2c_client *client)
return ret;
}
- ret = lp50xx_reset(led);
- if (ret)
- return ret;
-
- ret = lp50xx_enable_disable(led, 1);
- if (ret)
- return ret;
-
return lp50xx_probe_dt(led);
}
@@ -563,7 +588,7 @@ static void lp50xx_remove(struct i2c_client *client)
struct lp50xx *led = i2c_get_clientdata(client);
int ret;
- ret = lp50xx_enable_disable(led, 0);
+ ret = lp50xx_disable(led);
if (ret)
dev_err(led->dev, "Failed to disable chip\n");
--
2.51.1
The function qcom_slim_ngd_notify_slaves() calls of_slim_get_device() which
internally uses device_find_child() to obtain a device reference.
According to the device_find_child() documentation,
the caller must drop the reference with put_device() after use.
Found via static analysis and this is similar to commit 4e65bda8273c
("ASoC: wcd934x: fix error handling in wcd934x_codec_parse_data()")
Fixes: 917809e2280b ("slimbus: ngd: Add qcom SLIMBus NGD driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Miaoqian Lin <linmq006(a)gmail.com>
---
drivers/slimbus/qcom-ngd-ctrl.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/slimbus/qcom-ngd-ctrl.c b/drivers/slimbus/qcom-ngd-ctrl.c
index 4fb66986cc22..cd40ab839c54 100644
--- a/drivers/slimbus/qcom-ngd-ctrl.c
+++ b/drivers/slimbus/qcom-ngd-ctrl.c
@@ -1241,6 +1241,7 @@ static void qcom_slim_ngd_notify_slaves(struct qcom_slim_ngd_ctrl *ctrl)
if (slim_get_logical_addr(sbdev))
dev_err(ctrl->dev, "Failed to get logical address\n");
+ put_device(&sbdev->dev);
}
}
--
2.39.5 (Apple Git-154)
The L1 substates support requires additional steps to work, namely:
-Proper handling of the CLKREQ# sideband signal. (It is mostly handled by
hardware, but software still needs to set the clkreq fields in the
PCIE_CLIENT_POWER_CON register to match the hardware implementation.)
-Program the frequency of the aux clock into the
DSP_PCIE_PL_AUX_CLK_FREQ_OFF register. (During L1 substates the core_clk
is turned off and the aux_clk is used instead.)
These steps are currently missing from the driver.
For more details, see section '18.6.6.4 L1 Substate' in the RK3658 TRM 1.1
Part 2, or section '11.6.6.4 L1 Substate' in the RK3588 TRM 1.0 Part2.
While this has always been a problem when using e.g.
CONFIG_PCIEASPM_POWER_SUPERSAVE=y, or when modifying
/sys/bus/pci/devices/.../link/l1_2_aspm, the lacking driver support for L1
substates became more apparent after commit f3ac2ff14834 ("PCI/ASPM:
Enable all ClockPM and ASPM states for devicetree platforms"), which
enabled ASPM also for CONFIG_PCIEASPM_DEFAULT=y.
When using e.g. an NVMe drive connected to the PCIe controller, the
problem will be seen as:
nvme nvme0: controller is down; will reset: CSTS=0xffffffff, PCI_STATUS=0x10
nvme nvme0: Does your device have a faulty power saving mode enabled?
nvme nvme0: Try "nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off" and report a bug
Thus, prevent advertising L1 Substates support until proper driver support
is added.
Cc: stable(a)vger.kernel.org
Fixes: 0e898eb8df4e ("PCI: rockchip-dwc: Add Rockchip RK356X host controller driver")
Fixes: f3ac2ff14834 ("PCI/ASPM: Enable all ClockPM and ASPM states for devicetree platforms")
Acked-by: Shawn Lin <shawn.lin(a)rock-chips.com>
Signed-off-by: Niklas Cassel <cassel(a)kernel.org>
---
Changes since v2:
-Improve commit message (Bjorn)
drivers/pci/controller/dwc/pcie-dw-rockchip.c | 21 +++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/drivers/pci/controller/dwc/pcie-dw-rockchip.c b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
index 3e2752c7dd09..84f882abbca5 100644
--- a/drivers/pci/controller/dwc/pcie-dw-rockchip.c
+++ b/drivers/pci/controller/dwc/pcie-dw-rockchip.c
@@ -200,6 +200,25 @@ static bool rockchip_pcie_link_up(struct dw_pcie *pci)
return FIELD_GET(PCIE_LINKUP_MASK, val) == PCIE_LINKUP;
}
+/*
+ * See e.g. section '11.6.6.4 L1 Substate' in the RK3588 TRM V1.0 for the steps
+ * needed to support L1 substates. Currently, not a single rockchip platform
+ * performs these steps, so disable L1 substates until there is proper support.
+ */
+static void rockchip_pcie_disable_l1sub(struct dw_pcie *pci)
+{
+ u32 cap, l1subcap;
+
+ cap = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);
+ if (cap) {
+ l1subcap = dw_pcie_readl_dbi(pci, cap + PCI_L1SS_CAP);
+ l1subcap &= ~(PCI_L1SS_CAP_L1_PM_SS | PCI_L1SS_CAP_ASPM_L1_1 |
+ PCI_L1SS_CAP_ASPM_L1_2 | PCI_L1SS_CAP_PCIPM_L1_1 |
+ PCI_L1SS_CAP_PCIPM_L1_2);
+ dw_pcie_writel_dbi(pci, cap + PCI_L1SS_CAP, l1subcap);
+ }
+}
+
static void rockchip_pcie_enable_l0s(struct dw_pcie *pci)
{
u32 cap, lnkcap;
@@ -264,6 +283,7 @@ static int rockchip_pcie_host_init(struct dw_pcie_rp *pp)
irq_set_chained_handler_and_data(irq, rockchip_pcie_intx_handler,
rockchip);
+ rockchip_pcie_disable_l1sub(pci);
rockchip_pcie_enable_l0s(pci);
return 0;
@@ -301,6 +321,7 @@ static void rockchip_pcie_ep_init(struct dw_pcie_ep *ep)
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
enum pci_barno bar;
+ rockchip_pcie_disable_l1sub(pci);
rockchip_pcie_enable_l0s(pci);
rockchip_pcie_ep_hide_broken_ats_cap_rk3588(ep);
--
2.51.0
In hrtimer_interrupt(), interrupts are disabled when acquiring a spinlock,
which subsequently triggers an oops. During the oops call chain,
blocking_notifier_call_chain() invokes _cond_resched, ultimately leading
to a hard lockup.
Call Stack:
hrtimer_interrupt//raw_spin_lock_irqsave
__hrtimer_run_queues
page_fault
do_page_fault
bad_area_nosemaphore
no_context
oops_end
bust_spinlocks
unblank_screen
do_unblank_screen
fbcon_blank
fb_notifier_call_chain
blocking_notifier_call_chain
down_read
_cond_resched
If the system is in an oops state, use down_read_trylock instead of a
blocking lock acquisition. If the trylock fails, skip executing the
notifier callbacks to avoid potential deadlocks or unsafe operations
during the oops handling process.
Cc: stable(a)vger.kernel.org # 6.6
Fixes: fe9d4f576324 ("Add kernel/notifier.c")
Signed-off-by: Yi Yang <yiyang13(a)huawei.com>
---
kernel/notifier.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/kernel/notifier.c b/kernel/notifier.c
index b3ce28f39eb6..ebff2315fac2 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -384,9 +384,18 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
* is, we re-check the list after having taken the lock anyway:
*/
if (rcu_access_pointer(nh->head)) {
- down_read(&nh->rwsem);
- ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
- up_read(&nh->rwsem);
+ if (!oops_in_progress) {
+ down_read(&nh->rwsem);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
+ up_read(&nh->rwsem);
+ } else {
+ if (down_read_trylock(&nh->rwsem)) {
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
+ up_read(&nh->rwsem);
+ } else {
+ ret = NOTIFY_BAD;
+ }
+ }
}
return ret;
}
--
2.25.1
F2FS can mount filesystems with corrupted directory depth values that
get runtime-clamped to MAX_DIR_HASH_DEPTH. When RENAME_WHITEOUT
operations are performed on such directories, f2fs_rename performs
directory modifications (updating target entry and deleting source
entry) before attempting to add the whiteout entry via f2fs_add_link.
If f2fs_add_link fails due to the corrupted directory structure, the
function returns an error to VFS, but the partial directory
modifications have already been committed to disk. VFS assumes the
entire rename operation failed and does not update the dentry cache,
leaving stale mappings.
In the error path, VFS does not call d_move() to update the dentry
cache. This results in new_dentry still pointing to the old inode
(new_inode) which has already had its i_nlink decremented to zero.
The stale cache causes subsequent operations to incorrectly reference
the freed inode.
This causes subsequent operations to use cached dentry information that
no longer matches the on-disk state. When a second rename targets the
same entry, VFS attempts to decrement i_nlink on the stale inode, which
may already have i_nlink=0, triggering a WARNING in drop_nlink().
Example sequence:
1. First rename (RENAME_WHITEOUT): file2 → file1
- f2fs updates file1 entry on disk (points to inode 8)
- f2fs deletes file2 entry on disk
- f2fs_add_link(whiteout) fails (corrupted directory)
- Returns error to VFS
- VFS does not call d_move() due to error
- VFS cache still has: file1 → inode 7 (stale!)
- inode 7 has i_nlink=0 (already decremented)
2. Second rename: file3 → file1
- VFS uses stale cache: file1 → inode 7
- Tries to drop_nlink on inode 7 (i_nlink already 0)
- WARNING in drop_nlink()
Fix this by explicitly invalidating old_dentry and new_dentry when
f2fs_add_link fails during whiteout creation. This forces VFS to
refresh from disk on subsequent operations, ensuring cache consistency
even when the rename partially succeeds.
Reproducer:
1. Mount F2FS image with corrupted i_current_depth
2. renameat2(file2, file1, RENAME_WHITEOUT)
3. renameat2(file3, file1, 0)
4. System triggers WARNING in drop_nlink()
Fixes: 7e01e7ad746b ("f2fs: support RENAME_WHITEOUT")
Reported-by: syzbot+632cf32276a9a564188d(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=632cf32276a9a564188d
Suggested-by: Chao Yu <chao(a)kernel.org>
Link: https://lore.kernel.org/all/20251022233349.102728-1-kartikey406@gmail.com/ [v1]
Cc: stable(a)vger.kernel.org
Signed-off-by: Deepanshu Kartikey <kartikey406(a)gmail.com>
---
Changes in v2:
- Added detailed explanation about VFS not calling d_move() in error path,
resulting in new_dentry still pointing to inode with zeroed i_nlink
(suggested by Chao Yu)
- Added Fixes tag pointing to commit 7e01e7ad746b
- Added Cc: stable(a)vger.kernel.org for backporting to stable kernels
---
fs/f2fs/namei.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b882771e4699..712479b7b93d 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1053,9 +1053,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (whiteout) {
set_inode_flag(whiteout, FI_INC_LINK);
err = f2fs_add_link(old_dentry, whiteout);
- if (err)
+ if (err) {
+ d_invalidate(old_dentry);
+ d_invalidate(new_dentry);
goto put_out_dir;
-
+ }
spin_lock(&whiteout->i_lock);
whiteout->i_state &= ~I_LINKABLE;
spin_unlock(&whiteout->i_lock);
--
2.43.0
Commit e26ee4efbc79 ("fuse: allocate ff->release_args only if release is
needed") skips allocating ff->release_args if the server does not
implement open. However in doing so, fuse_prepare_release() now skips
grabbing the reference on the inode, which makes it possible for an
inode to be evicted from the dcache while there are inflight readahead
requests. This causes a deadlock if the server triggers reclaim while
servicing the readahead request and reclaim attempts to evict the inode
of the file being read ahead. Since the folio is locked during
readahead, when reclaim evicts the fuse inode and fuse_evict_inode()
attempts to remove all folios associated with the inode from the page
cache (truncate_inode_pages_range()), reclaim will block forever waiting
for the lock since readahead cannot relinquish the lock because it is
itself blocked in reclaim:
>>> stack_trace(1504735)
folio_wait_bit_common (mm/filemap.c:1308:4)
folio_lock (./include/linux/pagemap.h:1052:3)
truncate_inode_pages_range (mm/truncate.c:336:10)
fuse_evict_inode (fs/fuse/inode.c:161:2)
evict (fs/inode.c:704:3)
dentry_unlink_inode (fs/dcache.c:412:3)
__dentry_kill (fs/dcache.c:615:3)
shrink_kill (fs/dcache.c:1060:12)
shrink_dentry_list (fs/dcache.c:1087:3)
prune_dcache_sb (fs/dcache.c:1168:2)
super_cache_scan (fs/super.c:221:10)
do_shrink_slab (mm/shrinker.c:435:9)
shrink_slab (mm/shrinker.c:626:10)
shrink_node (mm/vmscan.c:5951:2)
shrink_zones (mm/vmscan.c:6195:3)
do_try_to_free_pages (mm/vmscan.c:6257:3)
do_swap_page (mm/memory.c:4136:11)
handle_pte_fault (mm/memory.c:5562:10)
handle_mm_fault (mm/memory.c:5870:9)
do_user_addr_fault (arch/x86/mm/fault.c:1338:10)
handle_page_fault (arch/x86/mm/fault.c:1481:3)
exc_page_fault (arch/x86/mm/fault.c:1539:2)
asm_exc_page_fault+0x22/0x27
Fix this deadlock by allocating ff->release_args and grabbing the
reference on the inode when preparing the file for release even if the
server does not implement open. The inode reference will be dropped when
the last reference on the fuse file is dropped (see fuse_file_put() ->
fuse_release_end()).
Fixes: e26ee4efbc79 ("fuse: allocate ff->release_args only if release is needed")
Cc: stable(a)vger.kernel.org
Signed-off-by: Joanne Koong <joannelkoong(a)gmail.com>
Reported-by: Omar Sandoval <osandov(a)fb.com>
---
fs/fuse/file.c | 40 ++++++++++++++++++++++++++--------------
1 file changed, 26 insertions(+), 14 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f1ef77a0be05..654e21ee93fb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -100,7 +100,7 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
kfree(ra);
}
-static void fuse_file_put(struct fuse_file *ff, bool sync)
+static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
{
if (refcount_dec_and_test(&ff->count)) {
struct fuse_release_args *ra = &ff->args->release_args;
@@ -110,7 +110,9 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
fuse_file_io_release(ff, ra->inode);
if (!args) {
- /* Do nothing when server does not implement 'open' */
+ /* Do nothing when server does not implement 'opendir' */
+ } else if (!isdir && ff->fm->fc->no_open) {
+ fuse_release_end(ff->fm, args, 0);
} else if (sync) {
fuse_simple_request(ff->fm, args);
fuse_release_end(ff->fm, args, 0);
@@ -131,8 +133,17 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
struct fuse_file *ff;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
bool open = isdir ? !fc->no_opendir : !fc->no_open;
+ bool release = !isdir || open;
- ff = fuse_file_alloc(fm, open);
+ /*
+ * ff->args->release_args still needs to be allocated (so we can hold an
+ * inode reference while there are pending inflight file operations when
+ * ->release() is called, see fuse_prepare_release()) even if
+ * fc->no_open is set else it becomes possible for reclaim to deadlock
+ * if while servicing the readahead request the server triggers reclaim
+ * and reclaim evicts the inode of the file being read ahead.
+ */
+ ff = fuse_file_alloc(fm, release);
if (!ff)
return ERR_PTR(-ENOMEM);
@@ -152,13 +163,14 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
fuse_file_free(ff);
return ERR_PTR(err);
} else {
- /* No release needed */
- kfree(ff->args);
- ff->args = NULL;
- if (isdir)
+ if (isdir) {
+ /* No release needed */
+ kfree(ff->args);
+ ff->args = NULL;
fc->no_opendir = 1;
- else
+ } else {
fc->no_open = 1;
+ }
}
}
@@ -363,7 +375,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
* own ref to the file, the IO completion has to drop the ref, which is
* how the fuse server can end up closing its clients' files.
*/
- fuse_file_put(ff, false);
+ fuse_file_put(ff, false, isdir);
}
void fuse_release_common(struct file *file, bool isdir)
@@ -394,7 +406,7 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
{
WARN_ON(refcount_read(&ff->count) > 1);
fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
- fuse_file_put(ff, true);
+ fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);
@@ -891,7 +903,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
folio_put(ap->folios[i]);
}
if (ia->ff)
- fuse_file_put(ia->ff, false);
+ fuse_file_put(ia->ff, false, false);
fuse_io_free(ia);
}
@@ -1815,7 +1827,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
if (wpa->bucket)
fuse_sync_bucket_dec(wpa->bucket);
- fuse_file_put(wpa->ia.ff, false);
+ fuse_file_put(wpa->ia.ff, false, false);
kfree(ap->folios);
kfree(wpa);
@@ -1968,7 +1980,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
- fuse_file_put(ff, false);
+ fuse_file_put(ff, false, false);
return err;
}
@@ -2186,7 +2198,7 @@ static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
}
if (data->ff)
- fuse_file_put(data->ff, false);
+ fuse_file_put(data->ff, false, false);
return error;
}
--
2.47.3