The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 42f9c66a6d0cc45758dab77233c5460e1cf003df
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101640-scouring-earthworm-26af@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 42f9c66a6d0cc45758dab77233c5460e1cf003df Mon Sep 17 00:00:00 2001
From: Niklas Cassel <cassel(a)kernel.org>
Date: Mon, 22 Sep 2025 16:08:25 +0200
Subject: [PATCH] PCI: tegra194: Reset BARs when running in PCIe endpoint mode
Tegra already defines all BARs except BAR0 as BAR_RESERVED. This is
sufficient for pci-epf-test to not allocate backing memory and to not call
set_bar() for those BARs. However, marking a BAR as BAR_RESERVED does not
mean that the BAR gets disabled.
The host side driver, pci_endpoint_test, simply does an ioremap for all
enabled BARs and will run tests against all enabled BARs, so it will run
tests against the BARs marked as BAR_RESERVED.
After running the BAR tests (which will write to all enabled BARs), the
inbound address translation is broken. This is because the tegra controller
exposes the ATU Port Logic Structure in BAR4, so when BAR4 is written, the
inbound address translation settings get overwritten.
To avoid this, implement the dw_pcie_ep_ops .init() callback and start off
by disabling all BARs (pci-epf-test will later enable/configure BARs that
are not defined as BAR_RESERVED).
This matches the behavior of other PCIe endpoint drivers: dra7xx, imx6,
layerscape-ep, artpec6, dw-rockchip, qcom-ep, rcar-gen4, and uniphier-ep.
With this, the PCI endpoint kselftest test case CONSECUTIVE_BAR_TEST (which
was specifically made to detect address translation issues) passes.
Fixes: c57247f940e8 ("PCI: tegra: Add support for PCIe endpoint mode in Tegra194")
Signed-off-by: Niklas Cassel <cassel(a)kernel.org>
Signed-off-by: Manivannan Sadhasivam <mani(a)kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/20250922140822.519796-7-cassel@kernel.org
diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c
index fe418b9bfbb4..359d92dca86a 100644
--- a/drivers/pci/controller/dwc/pcie-tegra194.c
+++ b/drivers/pci/controller/dwc/pcie-tegra194.c
@@ -1941,6 +1941,15 @@ static irqreturn_t tegra_pcie_ep_pex_rst_irq(int irq, void *arg)
return IRQ_HANDLED;
}
+static void tegra_pcie_ep_init(struct dw_pcie_ep *ep)
+{
+ struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+ enum pci_barno bar;
+
+ for (bar = 0; bar < PCI_STD_NUM_BARS; bar++)
+ dw_pcie_ep_reset_bar(pci, bar);
+};
+
static int tegra_pcie_ep_raise_intx_irq(struct tegra_pcie_dw *pcie, u16 irq)
{
/* Tegra194 supports only INTA */
@@ -2016,6 +2025,7 @@ tegra_pcie_ep_get_features(struct dw_pcie_ep *ep)
}
static const struct dw_pcie_ep_ops pcie_ep_ops = {
+ .init = tegra_pcie_ep_init,
.raise_irq = tegra_pcie_ep_raise_irq,
.get_features = tegra_pcie_ep_get_features,
};
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 1cbc5e25fb70e942a7a735a1f3d6dd391afc9b29
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101610-deserve-skittle-9bde@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1cbc5e25fb70e942a7a735a1f3d6dd391afc9b29 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas(a)wunner.de>
Date: Wed, 13 Aug 2025 07:11:02 +0200
Subject: [PATCH] PCI/ERR: Fix uevent on failure to recover
Upon failure to recover from a PCIe error through AER, DPC or EDR, a
uevent is sent to inform user space about disconnection of the bridge
whose subordinate devices failed to recover.
However the bridge itself is not disconnected. Instead, a uevent should
be sent for each of the subordinate devices.
Only if the "bridge" happens to be a Root Complex Event Collector or
Integrated Endpoint does it make sense to send a uevent for it (because
there are no subordinate devices).
Right now if there is a mix of subordinate devices with and without
pci_error_handlers, a BEGIN_RECOVERY event is sent for those with
pci_error_handlers but no FAILED_RECOVERY event is ever sent for them
afterwards. Fix it.
Fixes: 856e1eb9bdd4 ("PCI/AER: Add uevents in AER and EEH error/resume")
Signed-off-by: Lukas Wunner <lukas(a)wunner.de>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org # v4.16+
Link: https://patch.msgid.link/68fc527a380821b5d861dd554d2ce42cb739591c.175500815…
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index e795e5ae6b03..21d554359fb1 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -108,6 +108,12 @@ static int report_normal_detected(struct pci_dev *dev, void *data)
return report_error_detected(dev, pci_channel_io_normal, data);
}
+static int report_perm_failure_detected(struct pci_dev *dev, void *data)
+{
+ pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
+ return 0;
+}
+
static int report_mmio_enabled(struct pci_dev *dev, void *data)
{
struct pci_driver *pdrv;
@@ -272,7 +278,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
failed:
pci_walk_bridge(bridge, pci_pm_runtime_put, NULL);
- pci_uevent_ers(bridge, PCI_ERS_RESULT_DISCONNECT);
+ pci_walk_bridge(bridge, report_perm_failure_detected, NULL);
pci_info(bridge, "device recovery failed\n");
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 48991e4935078b05f80616c75d1ee2ea3ae18e58
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101641-squire-emboss-834e@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 48991e4935078b05f80616c75d1ee2ea3ae18e58 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris(a)google.com>
Date: Wed, 24 Sep 2025 09:57:11 -0700
Subject: [PATCH] PCI/sysfs: Ensure devices are powered for config reads
The "max_link_width", "current_link_speed", "current_link_width",
"secondary_bus_number", and "subordinate_bus_number" sysfs files all access
config registers, but they don't check the runtime PM state. If the device
is in D3cold or a parent bridge is suspended, we may see -EINVAL, bogus
values, or worse, depending on implementation details.
Wrap these access in pci_config_pm_runtime_{get,put}() like most of the
rest of the similar sysfs attributes.
Notably, "max_link_speed" does not access config registers; it returns a
cached value since d2bd39c0456b ("PCI: Store all PCIe Supported Link
Speeds").
Fixes: 56c1af4606f0 ("PCI: Add sysfs max_link_speed/width, current_link_speed/width, etc")
Signed-off-by: Brian Norris <briannorris(a)google.com>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/20250924095711.v2.1.Ibb5b6ca1e2c059e04ec53140cd98a…
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 5eea14c1f7f5..2b231ef1dac9 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -201,8 +201,14 @@ static ssize_t max_link_width_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct pci_dev *pdev = to_pci_dev(dev);
+ ssize_t ret;
- return sysfs_emit(buf, "%u\n", pcie_get_width_cap(pdev));
+ /* We read PCI_EXP_LNKCAP, so we need the device to be accessible. */
+ pci_config_pm_runtime_get(pdev);
+ ret = sysfs_emit(buf, "%u\n", pcie_get_width_cap(pdev));
+ pci_config_pm_runtime_put(pdev);
+
+ return ret;
}
static DEVICE_ATTR_RO(max_link_width);
@@ -214,7 +220,10 @@ static ssize_t current_link_speed_show(struct device *dev,
int err;
enum pci_bus_speed speed;
+ pci_config_pm_runtime_get(pci_dev);
err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, &linkstat);
+ pci_config_pm_runtime_put(pci_dev);
+
if (err)
return -EINVAL;
@@ -231,7 +240,10 @@ static ssize_t current_link_width_show(struct device *dev,
u16 linkstat;
int err;
+ pci_config_pm_runtime_get(pci_dev);
err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, &linkstat);
+ pci_config_pm_runtime_put(pci_dev);
+
if (err)
return -EINVAL;
@@ -247,7 +259,10 @@ static ssize_t secondary_bus_number_show(struct device *dev,
u8 sec_bus;
int err;
+ pci_config_pm_runtime_get(pci_dev);
err = pci_read_config_byte(pci_dev, PCI_SECONDARY_BUS, &sec_bus);
+ pci_config_pm_runtime_put(pci_dev);
+
if (err)
return -EINVAL;
@@ -263,7 +278,10 @@ static ssize_t subordinate_bus_number_show(struct device *dev,
u8 sub_bus;
int err;
+ pci_config_pm_runtime_get(pci_dev);
err = pci_read_config_byte(pci_dev, PCI_SUBORDINATE_BUS, &sub_bus);
+ pci_config_pm_runtime_put(pci_dev);
+
if (err)
return -EINVAL;
This patch fixes ce7a381697cb ("net: bonding: add broadcast_neighbor option for 802.3ad").
Before this commit, on the broadcast mode, all devices were traversed using the
bond_for_each_slave_rcu. This patch supports traversing devices by using all_slaves.
Therefore, we need to update the slave array when enslave or release salve.
Fixes: ce7a381697cb ("net: bonding: add broadcast_neighbor option for 802.3ad")
Cc: Jay Vosburgh <jv(a)jvosburgh.net>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Jakub Kicinski <kuba(a)kernel.org>
Cc: Paolo Abeni <pabeni(a)redhat.com>
Cc: Simon Horman <horms(a)kernel.org>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Andrew Lunn <andrew+netdev(a)lunn.ch>
Cc: Nikolay Aleksandrov <razor(a)blackwall.org>
Cc: Hangbin Liu <liuhangbin(a)gmail.com>
Cc: Jiri Slaby <jirislaby(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Tonghao Zhang <tonghao(a)bamaicloud.com>
---
drivers/net/bonding/bond_main.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 17c7542be6a5..2d6883296e32 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2384,7 +2384,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
unblock_netpoll_tx();
}
- if (bond_mode_can_use_xmit_hash(bond))
+ /* broadcast mode uses the all_slaves to loop through slaves. */
+ if (bond_mode_can_use_xmit_hash(bond) ||
+ BOND_MODE(bond) == BOND_MODE_BROADCAST)
bond_update_slave_arr(bond, NULL);
if (!slave_dev->netdev_ops->ndo_bpf ||
@@ -2560,7 +2562,8 @@ static int __bond_release_one(struct net_device *bond_dev,
bond_upper_dev_unlink(bond, slave);
- if (bond_mode_can_use_xmit_hash(bond))
+ if (bond_mode_can_use_xmit_hash(bond) ||
+ BOND_MODE(bond) == BOND_MODE_BROADCAST)
bond_update_slave_arr(bond, slave);
slave_info(bond_dev, slave_dev, "Releasing %s interface\n",
--
2.34.1
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 5d5f08fd0cd970184376bee07d59f635c8403f63
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101607-uncivil-cartload-f9ef@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d5f08fd0cd970184376bee07d59f635c8403f63 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006(a)gmail.com>
Date: Fri, 29 Aug 2025 16:30:15 +0800
Subject: [PATCH] xtensa: simdisk: add input size check in proc_write_simdisk
A malicious user could pass an arbitrarily bad value
to memdup_user_nul(), potentially causing kernel crash.
This follows the same pattern as commit ee76746387f6
("netdevsim: prevent bad user input in nsim_dev_health_break_write()")
Fixes: b6c7e873daf7 ("xtensa: ISS: add host file-based simulated disk")
Fixes: 16e5c1fc3604 ("convert a bunch of open-coded instances of memdup_user_nul()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Miaoqian Lin <linmq006(a)gmail.com>
Message-Id: <20250829083015.1992751-1-linmq006(a)gmail.com>
Signed-off-by: Max Filippov <jcmvbkbc(a)gmail.com>
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 6ed009318d24..3cafc8feddee 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -231,10 +231,14 @@ static ssize_t proc_read_simdisk(struct file *file, char __user *buf,
static ssize_t proc_write_simdisk(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char *tmp = memdup_user_nul(buf, count);
+ char *tmp;
struct simdisk *dev = pde_data(file_inode(file));
int err;
+ if (count == 0 || count > PAGE_SIZE)
+ return -EINVAL;
+
+ tmp = memdup_user_nul(buf, count);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 5d5f08fd0cd970184376bee07d59f635c8403f63
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101607-sift-throttle-b1aa@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d5f08fd0cd970184376bee07d59f635c8403f63 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006(a)gmail.com>
Date: Fri, 29 Aug 2025 16:30:15 +0800
Subject: [PATCH] xtensa: simdisk: add input size check in proc_write_simdisk
A malicious user could pass an arbitrarily bad value
to memdup_user_nul(), potentially causing kernel crash.
This follows the same pattern as commit ee76746387f6
("netdevsim: prevent bad user input in nsim_dev_health_break_write()")
Fixes: b6c7e873daf7 ("xtensa: ISS: add host file-based simulated disk")
Fixes: 16e5c1fc3604 ("convert a bunch of open-coded instances of memdup_user_nul()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Miaoqian Lin <linmq006(a)gmail.com>
Message-Id: <20250829083015.1992751-1-linmq006(a)gmail.com>
Signed-off-by: Max Filippov <jcmvbkbc(a)gmail.com>
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 6ed009318d24..3cafc8feddee 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -231,10 +231,14 @@ static ssize_t proc_read_simdisk(struct file *file, char __user *buf,
static ssize_t proc_write_simdisk(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char *tmp = memdup_user_nul(buf, count);
+ char *tmp;
struct simdisk *dev = pde_data(file_inode(file));
int err;
+ if (count == 0 || count > PAGE_SIZE)
+ return -EINVAL;
+
+ tmp = memdup_user_nul(buf, count);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 5d5f08fd0cd970184376bee07d59f635c8403f63
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101606-wildfire-moody-50d8@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5d5f08fd0cd970184376bee07d59f635c8403f63 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006(a)gmail.com>
Date: Fri, 29 Aug 2025 16:30:15 +0800
Subject: [PATCH] xtensa: simdisk: add input size check in proc_write_simdisk
A malicious user could pass an arbitrarily bad value
to memdup_user_nul(), potentially causing kernel crash.
This follows the same pattern as commit ee76746387f6
("netdevsim: prevent bad user input in nsim_dev_health_break_write()")
Fixes: b6c7e873daf7 ("xtensa: ISS: add host file-based simulated disk")
Fixes: 16e5c1fc3604 ("convert a bunch of open-coded instances of memdup_user_nul()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Miaoqian Lin <linmq006(a)gmail.com>
Message-Id: <20250829083015.1992751-1-linmq006(a)gmail.com>
Signed-off-by: Max Filippov <jcmvbkbc(a)gmail.com>
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 6ed009318d24..3cafc8feddee 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -231,10 +231,14 @@ static ssize_t proc_read_simdisk(struct file *file, char __user *buf,
static ssize_t proc_write_simdisk(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char *tmp = memdup_user_nul(buf, count);
+ char *tmp;
struct simdisk *dev = pde_data(file_inode(file));
int err;
+ if (count == 0 || count > PAGE_SIZE)
+ return -EINVAL;
+
+ tmp = memdup_user_nul(buf, count);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 3a4b9d027e4061766f618292df91760ea64a1fcc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101613-candle-babble-137f@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3a4b9d027e4061766f618292df91760ea64a1fcc Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang(a)kernel.org>
Date: Tue, 19 Aug 2025 19:42:24 +0800
Subject: [PATCH] pwm: berlin: Fix wrong register in suspend/resume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The 'enable' register should be BERLIN_PWM_EN rather than
BERLIN_PWM_ENABLE, otherwise, the driver accesses wrong address, there
will be cpu exception then kernel panic during suspend/resume.
Fixes: bbf0722c1c66 ("pwm: berlin: Add suspend/resume support")
Signed-off-by: Jisheng Zhang <jszhang(a)kernel.org>
Link: https://lore.kernel.org/r/20250819114224.31825-1-jszhang@kernel.org
Cc: stable(a)vger.kernel.org
Signed-off-by: Uwe Kleine-König <ukleinek(a)kernel.org>
diff --git a/drivers/pwm/pwm-berlin.c b/drivers/pwm/pwm-berlin.c
index 831aed228caf..858d36991374 100644
--- a/drivers/pwm/pwm-berlin.c
+++ b/drivers/pwm/pwm-berlin.c
@@ -234,7 +234,7 @@ static int berlin_pwm_suspend(struct device *dev)
for (i = 0; i < chip->npwm; i++) {
struct berlin_pwm_channel *channel = &bpc->channel[i];
- channel->enable = berlin_pwm_readl(bpc, i, BERLIN_PWM_ENABLE);
+ channel->enable = berlin_pwm_readl(bpc, i, BERLIN_PWM_EN);
channel->ctrl = berlin_pwm_readl(bpc, i, BERLIN_PWM_CONTROL);
channel->duty = berlin_pwm_readl(bpc, i, BERLIN_PWM_DUTY);
channel->tcnt = berlin_pwm_readl(bpc, i, BERLIN_PWM_TCNT);
@@ -262,7 +262,7 @@ static int berlin_pwm_resume(struct device *dev)
berlin_pwm_writel(bpc, i, channel->ctrl, BERLIN_PWM_CONTROL);
berlin_pwm_writel(bpc, i, channel->duty, BERLIN_PWM_DUTY);
berlin_pwm_writel(bpc, i, channel->tcnt, BERLIN_PWM_TCNT);
- berlin_pwm_writel(bpc, i, channel->enable, BERLIN_PWM_ENABLE);
+ berlin_pwm_writel(bpc, i, channel->enable, BERLIN_PWM_EN);
}
return 0;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 284fb19a3ffb1083c3ad9c00d29749d09dddb99c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101651-unskilled-shore-57a7@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 284fb19a3ffb1083c3ad9c00d29749d09dddb99c Mon Sep 17 00:00:00 2001
From: Devarsh Thakkar <devarsht(a)ti.com>
Date: Fri, 4 Jul 2025 18:29:14 +0530
Subject: [PATCH] phy: cadence: cdns-dphy: Fix PLL lock and O_CMN_READY polling
PLL lockup and O_CMN_READY assertion can only happen after common state
machine gets enabled by programming DPHY_CMN_SSM register, but driver was
polling them before the common state machine was enabled which is
incorrect. This is as per the DPHY initialization sequence as mentioned in
J721E TRM [1] at section "12.7.2.4.1.2.1 Start-up Sequence Timing Diagram".
It shows O_CMN_READY polling at the end after common configuration pin
setup where the common configuration pin setup step enables state machine
as referenced in "Table 12-1533. Common Configuration-Related Setup
mentions state machine"
To fix this :
- Add new function callbacks for polling on PLL lock and O_CMN_READY
assertion.
- As state machine and clocks get enabled in power_on callback only, move
the clock related programming part from configure callback to power_on
callback and poll for the PLL lockup and O_CMN_READY assertion after state
machine gets enabled.
- The configure callback only saves the PLL configuration received from the
client driver which will be applied later on in power_on callback.
- Add checks to ensure configure is called before power_on and state
machine is in disabled state before power_on callback is called.
- Disable state machine in power_off so that client driver can re-configure
the PLL by following up a power_off, configure, power_on sequence.
[1]: https://www.ti.com/lit/zip/spruil1
Cc: stable(a)vger.kernel.org
Fixes: 7a343c8bf4b5 ("phy: Add Cadence D-PHY support")
Signed-off-by: Devarsh Thakkar <devarsht(a)ti.com>
Tested-by: Harikrishna Shenoy <h-shenoy(a)ti.com>
Reviewed-by: Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
Link: https://lore.kernel.org/r/20250704125915.1224738-2-devarsht@ti.com
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/phy/cadence/cdns-dphy.c b/drivers/phy/cadence/cdns-dphy.c
index 33a42e72362e..da8de0a9d086 100644
--- a/drivers/phy/cadence/cdns-dphy.c
+++ b/drivers/phy/cadence/cdns-dphy.c
@@ -92,6 +92,8 @@ struct cdns_dphy_ops {
void (*set_pll_cfg)(struct cdns_dphy *dphy,
const struct cdns_dphy_cfg *cfg);
unsigned long (*get_wakeup_time_ns)(struct cdns_dphy *dphy);
+ int (*wait_for_pll_lock)(struct cdns_dphy *dphy);
+ int (*wait_for_cmn_ready)(struct cdns_dphy *dphy);
};
struct cdns_dphy {
@@ -101,6 +103,8 @@ struct cdns_dphy {
struct clk *pll_ref_clk;
const struct cdns_dphy_ops *ops;
struct phy *phy;
+ bool is_configured;
+ bool is_powered;
};
/* Order of bands is important since the index is the band number. */
@@ -186,6 +190,16 @@ static unsigned long cdns_dphy_get_wakeup_time_ns(struct cdns_dphy *dphy)
return dphy->ops->get_wakeup_time_ns(dphy);
}
+static int cdns_dphy_wait_for_pll_lock(struct cdns_dphy *dphy)
+{
+ return dphy->ops->wait_for_pll_lock ? dphy->ops->wait_for_pll_lock(dphy) : 0;
+}
+
+static int cdns_dphy_wait_for_cmn_ready(struct cdns_dphy *dphy)
+{
+ return dphy->ops->wait_for_cmn_ready ? dphy->ops->wait_for_cmn_ready(dphy) : 0;
+}
+
static unsigned long cdns_dphy_ref_get_wakeup_time_ns(struct cdns_dphy *dphy)
{
/* Default wakeup time is 800 ns (in a simulated environment). */
@@ -227,7 +241,6 @@ static unsigned long cdns_dphy_j721e_get_wakeup_time_ns(struct cdns_dphy *dphy)
static void cdns_dphy_j721e_set_pll_cfg(struct cdns_dphy *dphy,
const struct cdns_dphy_cfg *cfg)
{
- u32 status;
/*
* set the PWM and PLL Byteclk divider settings to recommended values
@@ -244,13 +257,6 @@ static void cdns_dphy_j721e_set_pll_cfg(struct cdns_dphy *dphy,
writel(DPHY_TX_J721E_WIZ_LANE_RSTB,
dphy->regs + DPHY_TX_J721E_WIZ_RST_CTRL);
-
- readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_PLL_CTRL, status,
- (status & DPHY_TX_WIZ_PLL_LOCK), 0, POLL_TIMEOUT_US);
-
- readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_STATUS, status,
- (status & DPHY_TX_WIZ_O_CMN_READY), 0,
- POLL_TIMEOUT_US);
}
static void cdns_dphy_j721e_set_psm_div(struct cdns_dphy *dphy, u8 div)
@@ -258,6 +264,23 @@ static void cdns_dphy_j721e_set_psm_div(struct cdns_dphy *dphy, u8 div)
writel(div, dphy->regs + DPHY_TX_J721E_WIZ_PSM_FREQ);
}
+static int cdns_dphy_j721e_wait_for_pll_lock(struct cdns_dphy *dphy)
+{
+ u32 status;
+
+ return readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_PLL_CTRL, status,
+ status & DPHY_TX_WIZ_PLL_LOCK, 0, POLL_TIMEOUT_US);
+}
+
+static int cdns_dphy_j721e_wait_for_cmn_ready(struct cdns_dphy *dphy)
+{
+ u32 status;
+
+ return readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_STATUS, status,
+ status & DPHY_TX_WIZ_O_CMN_READY, 0,
+ POLL_TIMEOUT_US);
+}
+
/*
* This is the reference implementation of DPHY hooks. Specific integration of
* this IP may have to re-implement some of them depending on how they decided
@@ -273,6 +296,8 @@ static const struct cdns_dphy_ops j721e_dphy_ops = {
.get_wakeup_time_ns = cdns_dphy_j721e_get_wakeup_time_ns,
.set_pll_cfg = cdns_dphy_j721e_set_pll_cfg,
.set_psm_div = cdns_dphy_j721e_set_psm_div,
+ .wait_for_pll_lock = cdns_dphy_j721e_wait_for_pll_lock,
+ .wait_for_cmn_ready = cdns_dphy_j721e_wait_for_cmn_ready,
};
static int cdns_dphy_config_from_opts(struct phy *phy,
@@ -328,21 +353,36 @@ static int cdns_dphy_validate(struct phy *phy, enum phy_mode mode, int submode,
static int cdns_dphy_configure(struct phy *phy, union phy_configure_opts *opts)
{
struct cdns_dphy *dphy = phy_get_drvdata(phy);
- struct cdns_dphy_cfg cfg = { 0 };
- int ret, band_ctrl;
- unsigned int reg;
+ int ret;
- ret = cdns_dphy_config_from_opts(phy, &opts->mipi_dphy, &cfg);
- if (ret)
- return ret;
+ ret = cdns_dphy_config_from_opts(phy, &opts->mipi_dphy, &dphy->cfg);
+ if (!ret)
+ dphy->is_configured = true;
+
+ return ret;
+}
+
+static int cdns_dphy_power_on(struct phy *phy)
+{
+ struct cdns_dphy *dphy = phy_get_drvdata(phy);
+ int ret;
+ u32 reg;
+
+ if (!dphy->is_configured || dphy->is_powered)
+ return -EINVAL;
+
+ clk_prepare_enable(dphy->psm_clk);
+ clk_prepare_enable(dphy->pll_ref_clk);
/*
* Configure the internal PSM clk divider so that the DPHY has a
* 1MHz clk (or something close).
*/
ret = cdns_dphy_setup_psm(dphy);
- if (ret)
- return ret;
+ if (ret) {
+ dev_err(&dphy->phy->dev, "Failed to setup PSM with error %d\n", ret);
+ goto err_power_on;
+ }
/*
* Configure attach clk lanes to data lanes: the DPHY has 2 clk lanes
@@ -357,40 +397,60 @@ static int cdns_dphy_configure(struct phy *phy, union phy_configure_opts *opts)
* Configure the DPHY PLL that will be used to generate the TX byte
* clk.
*/
- cdns_dphy_set_pll_cfg(dphy, &cfg);
+ cdns_dphy_set_pll_cfg(dphy, &dphy->cfg);
- band_ctrl = cdns_dphy_tx_get_band_ctrl(opts->mipi_dphy.hs_clk_rate);
- if (band_ctrl < 0)
- return band_ctrl;
+ ret = cdns_dphy_tx_get_band_ctrl(dphy->cfg.hs_clk_rate);
+ if (ret < 0) {
+ dev_err(&dphy->phy->dev, "Failed to get band control value with error %d\n", ret);
+ goto err_power_on;
+ }
- reg = FIELD_PREP(DPHY_BAND_CFG_LEFT_BAND, band_ctrl) |
- FIELD_PREP(DPHY_BAND_CFG_RIGHT_BAND, band_ctrl);
+ reg = FIELD_PREP(DPHY_BAND_CFG_LEFT_BAND, ret) |
+ FIELD_PREP(DPHY_BAND_CFG_RIGHT_BAND, ret);
writel(reg, dphy->regs + DPHY_BAND_CFG);
- return 0;
-}
-
-static int cdns_dphy_power_on(struct phy *phy)
-{
- struct cdns_dphy *dphy = phy_get_drvdata(phy);
-
- clk_prepare_enable(dphy->psm_clk);
- clk_prepare_enable(dphy->pll_ref_clk);
-
/* Start TX state machine. */
writel(DPHY_CMN_SSM_EN | DPHY_CMN_TX_MODE_EN,
dphy->regs + DPHY_CMN_SSM);
+ ret = cdns_dphy_wait_for_pll_lock(dphy);
+ if (ret) {
+ dev_err(&dphy->phy->dev, "Failed to lock PLL with error %d\n", ret);
+ goto err_power_on;
+ }
+
+ ret = cdns_dphy_wait_for_cmn_ready(dphy);
+ if (ret) {
+ dev_err(&dphy->phy->dev, "O_CMN_READY signal failed to assert with error %d\n",
+ ret);
+ goto err_power_on;
+ }
+
+ dphy->is_powered = true;
+
return 0;
+
+err_power_on:
+ clk_disable_unprepare(dphy->pll_ref_clk);
+ clk_disable_unprepare(dphy->psm_clk);
+
+ return ret;
}
static int cdns_dphy_power_off(struct phy *phy)
{
struct cdns_dphy *dphy = phy_get_drvdata(phy);
+ u32 reg;
clk_disable_unprepare(dphy->pll_ref_clk);
clk_disable_unprepare(dphy->psm_clk);
+ /* Stop TX state machine. */
+ reg = readl(dphy->regs + DPHY_CMN_SSM);
+ writel(reg & ~DPHY_CMN_SSM_EN, dphy->regs + DPHY_CMN_SSM);
+
+ dphy->is_powered = false;
+
return 0;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 284fb19a3ffb1083c3ad9c00d29749d09dddb99c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101651-cotton-movable-3a6d@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 284fb19a3ffb1083c3ad9c00d29749d09dddb99c Mon Sep 17 00:00:00 2001
From: Devarsh Thakkar <devarsht(a)ti.com>
Date: Fri, 4 Jul 2025 18:29:14 +0530
Subject: [PATCH] phy: cadence: cdns-dphy: Fix PLL lock and O_CMN_READY polling
PLL lockup and O_CMN_READY assertion can only happen after common state
machine gets enabled by programming DPHY_CMN_SSM register, but driver was
polling them before the common state machine was enabled which is
incorrect. This is as per the DPHY initialization sequence as mentioned in
J721E TRM [1] at section "12.7.2.4.1.2.1 Start-up Sequence Timing Diagram".
It shows O_CMN_READY polling at the end after common configuration pin
setup where the common configuration pin setup step enables state machine
as referenced in "Table 12-1533. Common Configuration-Related Setup
mentions state machine"
To fix this :
- Add new function callbacks for polling on PLL lock and O_CMN_READY
assertion.
- As state machine and clocks get enabled in power_on callback only, move
the clock related programming part from configure callback to power_on
callback and poll for the PLL lockup and O_CMN_READY assertion after state
machine gets enabled.
- The configure callback only saves the PLL configuration received from the
client driver which will be applied later on in power_on callback.
- Add checks to ensure configure is called before power_on and state
machine is in disabled state before power_on callback is called.
- Disable state machine in power_off so that client driver can re-configure
the PLL by following up a power_off, configure, power_on sequence.
[1]: https://www.ti.com/lit/zip/spruil1
Cc: stable(a)vger.kernel.org
Fixes: 7a343c8bf4b5 ("phy: Add Cadence D-PHY support")
Signed-off-by: Devarsh Thakkar <devarsht(a)ti.com>
Tested-by: Harikrishna Shenoy <h-shenoy(a)ti.com>
Reviewed-by: Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
Link: https://lore.kernel.org/r/20250704125915.1224738-2-devarsht@ti.com
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/phy/cadence/cdns-dphy.c b/drivers/phy/cadence/cdns-dphy.c
index 33a42e72362e..da8de0a9d086 100644
--- a/drivers/phy/cadence/cdns-dphy.c
+++ b/drivers/phy/cadence/cdns-dphy.c
@@ -92,6 +92,8 @@ struct cdns_dphy_ops {
void (*set_pll_cfg)(struct cdns_dphy *dphy,
const struct cdns_dphy_cfg *cfg);
unsigned long (*get_wakeup_time_ns)(struct cdns_dphy *dphy);
+ int (*wait_for_pll_lock)(struct cdns_dphy *dphy);
+ int (*wait_for_cmn_ready)(struct cdns_dphy *dphy);
};
struct cdns_dphy {
@@ -101,6 +103,8 @@ struct cdns_dphy {
struct clk *pll_ref_clk;
const struct cdns_dphy_ops *ops;
struct phy *phy;
+ bool is_configured;
+ bool is_powered;
};
/* Order of bands is important since the index is the band number. */
@@ -186,6 +190,16 @@ static unsigned long cdns_dphy_get_wakeup_time_ns(struct cdns_dphy *dphy)
return dphy->ops->get_wakeup_time_ns(dphy);
}
+static int cdns_dphy_wait_for_pll_lock(struct cdns_dphy *dphy)
+{
+ return dphy->ops->wait_for_pll_lock ? dphy->ops->wait_for_pll_lock(dphy) : 0;
+}
+
+static int cdns_dphy_wait_for_cmn_ready(struct cdns_dphy *dphy)
+{
+ return dphy->ops->wait_for_cmn_ready ? dphy->ops->wait_for_cmn_ready(dphy) : 0;
+}
+
static unsigned long cdns_dphy_ref_get_wakeup_time_ns(struct cdns_dphy *dphy)
{
/* Default wakeup time is 800 ns (in a simulated environment). */
@@ -227,7 +241,6 @@ static unsigned long cdns_dphy_j721e_get_wakeup_time_ns(struct cdns_dphy *dphy)
static void cdns_dphy_j721e_set_pll_cfg(struct cdns_dphy *dphy,
const struct cdns_dphy_cfg *cfg)
{
- u32 status;
/*
* set the PWM and PLL Byteclk divider settings to recommended values
@@ -244,13 +257,6 @@ static void cdns_dphy_j721e_set_pll_cfg(struct cdns_dphy *dphy,
writel(DPHY_TX_J721E_WIZ_LANE_RSTB,
dphy->regs + DPHY_TX_J721E_WIZ_RST_CTRL);
-
- readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_PLL_CTRL, status,
- (status & DPHY_TX_WIZ_PLL_LOCK), 0, POLL_TIMEOUT_US);
-
- readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_STATUS, status,
- (status & DPHY_TX_WIZ_O_CMN_READY), 0,
- POLL_TIMEOUT_US);
}
static void cdns_dphy_j721e_set_psm_div(struct cdns_dphy *dphy, u8 div)
@@ -258,6 +264,23 @@ static void cdns_dphy_j721e_set_psm_div(struct cdns_dphy *dphy, u8 div)
writel(div, dphy->regs + DPHY_TX_J721E_WIZ_PSM_FREQ);
}
+static int cdns_dphy_j721e_wait_for_pll_lock(struct cdns_dphy *dphy)
+{
+ u32 status;
+
+ return readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_PLL_CTRL, status,
+ status & DPHY_TX_WIZ_PLL_LOCK, 0, POLL_TIMEOUT_US);
+}
+
+static int cdns_dphy_j721e_wait_for_cmn_ready(struct cdns_dphy *dphy)
+{
+ u32 status;
+
+ return readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_STATUS, status,
+ status & DPHY_TX_WIZ_O_CMN_READY, 0,
+ POLL_TIMEOUT_US);
+}
+
/*
* This is the reference implementation of DPHY hooks. Specific integration of
* this IP may have to re-implement some of them depending on how they decided
@@ -273,6 +296,8 @@ static const struct cdns_dphy_ops j721e_dphy_ops = {
.get_wakeup_time_ns = cdns_dphy_j721e_get_wakeup_time_ns,
.set_pll_cfg = cdns_dphy_j721e_set_pll_cfg,
.set_psm_div = cdns_dphy_j721e_set_psm_div,
+ .wait_for_pll_lock = cdns_dphy_j721e_wait_for_pll_lock,
+ .wait_for_cmn_ready = cdns_dphy_j721e_wait_for_cmn_ready,
};
static int cdns_dphy_config_from_opts(struct phy *phy,
@@ -328,21 +353,36 @@ static int cdns_dphy_validate(struct phy *phy, enum phy_mode mode, int submode,
static int cdns_dphy_configure(struct phy *phy, union phy_configure_opts *opts)
{
struct cdns_dphy *dphy = phy_get_drvdata(phy);
- struct cdns_dphy_cfg cfg = { 0 };
- int ret, band_ctrl;
- unsigned int reg;
+ int ret;
- ret = cdns_dphy_config_from_opts(phy, &opts->mipi_dphy, &cfg);
- if (ret)
- return ret;
+ ret = cdns_dphy_config_from_opts(phy, &opts->mipi_dphy, &dphy->cfg);
+ if (!ret)
+ dphy->is_configured = true;
+
+ return ret;
+}
+
+static int cdns_dphy_power_on(struct phy *phy)
+{
+ struct cdns_dphy *dphy = phy_get_drvdata(phy);
+ int ret;
+ u32 reg;
+
+ if (!dphy->is_configured || dphy->is_powered)
+ return -EINVAL;
+
+ clk_prepare_enable(dphy->psm_clk);
+ clk_prepare_enable(dphy->pll_ref_clk);
/*
* Configure the internal PSM clk divider so that the DPHY has a
* 1MHz clk (or something close).
*/
ret = cdns_dphy_setup_psm(dphy);
- if (ret)
- return ret;
+ if (ret) {
+ dev_err(&dphy->phy->dev, "Failed to setup PSM with error %d\n", ret);
+ goto err_power_on;
+ }
/*
* Configure attach clk lanes to data lanes: the DPHY has 2 clk lanes
@@ -357,40 +397,60 @@ static int cdns_dphy_configure(struct phy *phy, union phy_configure_opts *opts)
* Configure the DPHY PLL that will be used to generate the TX byte
* clk.
*/
- cdns_dphy_set_pll_cfg(dphy, &cfg);
+ cdns_dphy_set_pll_cfg(dphy, &dphy->cfg);
- band_ctrl = cdns_dphy_tx_get_band_ctrl(opts->mipi_dphy.hs_clk_rate);
- if (band_ctrl < 0)
- return band_ctrl;
+ ret = cdns_dphy_tx_get_band_ctrl(dphy->cfg.hs_clk_rate);
+ if (ret < 0) {
+ dev_err(&dphy->phy->dev, "Failed to get band control value with error %d\n", ret);
+ goto err_power_on;
+ }
- reg = FIELD_PREP(DPHY_BAND_CFG_LEFT_BAND, band_ctrl) |
- FIELD_PREP(DPHY_BAND_CFG_RIGHT_BAND, band_ctrl);
+ reg = FIELD_PREP(DPHY_BAND_CFG_LEFT_BAND, ret) |
+ FIELD_PREP(DPHY_BAND_CFG_RIGHT_BAND, ret);
writel(reg, dphy->regs + DPHY_BAND_CFG);
- return 0;
-}
-
-static int cdns_dphy_power_on(struct phy *phy)
-{
- struct cdns_dphy *dphy = phy_get_drvdata(phy);
-
- clk_prepare_enable(dphy->psm_clk);
- clk_prepare_enable(dphy->pll_ref_clk);
-
/* Start TX state machine. */
writel(DPHY_CMN_SSM_EN | DPHY_CMN_TX_MODE_EN,
dphy->regs + DPHY_CMN_SSM);
+ ret = cdns_dphy_wait_for_pll_lock(dphy);
+ if (ret) {
+ dev_err(&dphy->phy->dev, "Failed to lock PLL with error %d\n", ret);
+ goto err_power_on;
+ }
+
+ ret = cdns_dphy_wait_for_cmn_ready(dphy);
+ if (ret) {
+ dev_err(&dphy->phy->dev, "O_CMN_READY signal failed to assert with error %d\n",
+ ret);
+ goto err_power_on;
+ }
+
+ dphy->is_powered = true;
+
return 0;
+
+err_power_on:
+ clk_disable_unprepare(dphy->pll_ref_clk);
+ clk_disable_unprepare(dphy->psm_clk);
+
+ return ret;
}
static int cdns_dphy_power_off(struct phy *phy)
{
struct cdns_dphy *dphy = phy_get_drvdata(phy);
+ u32 reg;
clk_disable_unprepare(dphy->pll_ref_clk);
clk_disable_unprepare(dphy->psm_clk);
+ /* Stop TX state machine. */
+ reg = readl(dphy->regs + DPHY_CMN_SSM);
+ writel(reg & ~DPHY_CMN_SSM_EN, dphy->regs + DPHY_CMN_SSM);
+
+ dphy->is_powered = false;
+
return 0;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 284fb19a3ffb1083c3ad9c00d29749d09dddb99c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101650-unloving-grinning-8caf@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 284fb19a3ffb1083c3ad9c00d29749d09dddb99c Mon Sep 17 00:00:00 2001
From: Devarsh Thakkar <devarsht(a)ti.com>
Date: Fri, 4 Jul 2025 18:29:14 +0530
Subject: [PATCH] phy: cadence: cdns-dphy: Fix PLL lock and O_CMN_READY polling
PLL lockup and O_CMN_READY assertion can only happen after common state
machine gets enabled by programming DPHY_CMN_SSM register, but driver was
polling them before the common state machine was enabled which is
incorrect. This is as per the DPHY initialization sequence as mentioned in
J721E TRM [1] at section "12.7.2.4.1.2.1 Start-up Sequence Timing Diagram".
It shows O_CMN_READY polling at the end after common configuration pin
setup where the common configuration pin setup step enables state machine
as referenced in "Table 12-1533. Common Configuration-Related Setup
mentions state machine"
To fix this :
- Add new function callbacks for polling on PLL lock and O_CMN_READY
assertion.
- As state machine and clocks get enabled in power_on callback only, move
the clock related programming part from configure callback to power_on
callback and poll for the PLL lockup and O_CMN_READY assertion after state
machine gets enabled.
- The configure callback only saves the PLL configuration received from the
client driver which will be applied later on in power_on callback.
- Add checks to ensure configure is called before power_on and state
machine is in disabled state before power_on callback is called.
- Disable state machine in power_off so that client driver can re-configure
the PLL by following up a power_off, configure, power_on sequence.
[1]: https://www.ti.com/lit/zip/spruil1
Cc: stable(a)vger.kernel.org
Fixes: 7a343c8bf4b5 ("phy: Add Cadence D-PHY support")
Signed-off-by: Devarsh Thakkar <devarsht(a)ti.com>
Tested-by: Harikrishna Shenoy <h-shenoy(a)ti.com>
Reviewed-by: Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
Link: https://lore.kernel.org/r/20250704125915.1224738-2-devarsht@ti.com
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/phy/cadence/cdns-dphy.c b/drivers/phy/cadence/cdns-dphy.c
index 33a42e72362e..da8de0a9d086 100644
--- a/drivers/phy/cadence/cdns-dphy.c
+++ b/drivers/phy/cadence/cdns-dphy.c
@@ -92,6 +92,8 @@ struct cdns_dphy_ops {
void (*set_pll_cfg)(struct cdns_dphy *dphy,
const struct cdns_dphy_cfg *cfg);
unsigned long (*get_wakeup_time_ns)(struct cdns_dphy *dphy);
+ int (*wait_for_pll_lock)(struct cdns_dphy *dphy);
+ int (*wait_for_cmn_ready)(struct cdns_dphy *dphy);
};
struct cdns_dphy {
@@ -101,6 +103,8 @@ struct cdns_dphy {
struct clk *pll_ref_clk;
const struct cdns_dphy_ops *ops;
struct phy *phy;
+ bool is_configured;
+ bool is_powered;
};
/* Order of bands is important since the index is the band number. */
@@ -186,6 +190,16 @@ static unsigned long cdns_dphy_get_wakeup_time_ns(struct cdns_dphy *dphy)
return dphy->ops->get_wakeup_time_ns(dphy);
}
+static int cdns_dphy_wait_for_pll_lock(struct cdns_dphy *dphy)
+{
+ return dphy->ops->wait_for_pll_lock ? dphy->ops->wait_for_pll_lock(dphy) : 0;
+}
+
+static int cdns_dphy_wait_for_cmn_ready(struct cdns_dphy *dphy)
+{
+ return dphy->ops->wait_for_cmn_ready ? dphy->ops->wait_for_cmn_ready(dphy) : 0;
+}
+
static unsigned long cdns_dphy_ref_get_wakeup_time_ns(struct cdns_dphy *dphy)
{
/* Default wakeup time is 800 ns (in a simulated environment). */
@@ -227,7 +241,6 @@ static unsigned long cdns_dphy_j721e_get_wakeup_time_ns(struct cdns_dphy *dphy)
static void cdns_dphy_j721e_set_pll_cfg(struct cdns_dphy *dphy,
const struct cdns_dphy_cfg *cfg)
{
- u32 status;
/*
* set the PWM and PLL Byteclk divider settings to recommended values
@@ -244,13 +257,6 @@ static void cdns_dphy_j721e_set_pll_cfg(struct cdns_dphy *dphy,
writel(DPHY_TX_J721E_WIZ_LANE_RSTB,
dphy->regs + DPHY_TX_J721E_WIZ_RST_CTRL);
-
- readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_PLL_CTRL, status,
- (status & DPHY_TX_WIZ_PLL_LOCK), 0, POLL_TIMEOUT_US);
-
- readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_STATUS, status,
- (status & DPHY_TX_WIZ_O_CMN_READY), 0,
- POLL_TIMEOUT_US);
}
static void cdns_dphy_j721e_set_psm_div(struct cdns_dphy *dphy, u8 div)
@@ -258,6 +264,23 @@ static void cdns_dphy_j721e_set_psm_div(struct cdns_dphy *dphy, u8 div)
writel(div, dphy->regs + DPHY_TX_J721E_WIZ_PSM_FREQ);
}
+static int cdns_dphy_j721e_wait_for_pll_lock(struct cdns_dphy *dphy)
+{
+ u32 status;
+
+ return readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_PLL_CTRL, status,
+ status & DPHY_TX_WIZ_PLL_LOCK, 0, POLL_TIMEOUT_US);
+}
+
+static int cdns_dphy_j721e_wait_for_cmn_ready(struct cdns_dphy *dphy)
+{
+ u32 status;
+
+ return readl_poll_timeout(dphy->regs + DPHY_TX_J721E_WIZ_STATUS, status,
+ status & DPHY_TX_WIZ_O_CMN_READY, 0,
+ POLL_TIMEOUT_US);
+}
+
/*
* This is the reference implementation of DPHY hooks. Specific integration of
* this IP may have to re-implement some of them depending on how they decided
@@ -273,6 +296,8 @@ static const struct cdns_dphy_ops j721e_dphy_ops = {
.get_wakeup_time_ns = cdns_dphy_j721e_get_wakeup_time_ns,
.set_pll_cfg = cdns_dphy_j721e_set_pll_cfg,
.set_psm_div = cdns_dphy_j721e_set_psm_div,
+ .wait_for_pll_lock = cdns_dphy_j721e_wait_for_pll_lock,
+ .wait_for_cmn_ready = cdns_dphy_j721e_wait_for_cmn_ready,
};
static int cdns_dphy_config_from_opts(struct phy *phy,
@@ -328,21 +353,36 @@ static int cdns_dphy_validate(struct phy *phy, enum phy_mode mode, int submode,
static int cdns_dphy_configure(struct phy *phy, union phy_configure_opts *opts)
{
struct cdns_dphy *dphy = phy_get_drvdata(phy);
- struct cdns_dphy_cfg cfg = { 0 };
- int ret, band_ctrl;
- unsigned int reg;
+ int ret;
- ret = cdns_dphy_config_from_opts(phy, &opts->mipi_dphy, &cfg);
- if (ret)
- return ret;
+ ret = cdns_dphy_config_from_opts(phy, &opts->mipi_dphy, &dphy->cfg);
+ if (!ret)
+ dphy->is_configured = true;
+
+ return ret;
+}
+
+static int cdns_dphy_power_on(struct phy *phy)
+{
+ struct cdns_dphy *dphy = phy_get_drvdata(phy);
+ int ret;
+ u32 reg;
+
+ if (!dphy->is_configured || dphy->is_powered)
+ return -EINVAL;
+
+ clk_prepare_enable(dphy->psm_clk);
+ clk_prepare_enable(dphy->pll_ref_clk);
/*
* Configure the internal PSM clk divider so that the DPHY has a
* 1MHz clk (or something close).
*/
ret = cdns_dphy_setup_psm(dphy);
- if (ret)
- return ret;
+ if (ret) {
+ dev_err(&dphy->phy->dev, "Failed to setup PSM with error %d\n", ret);
+ goto err_power_on;
+ }
/*
* Configure attach clk lanes to data lanes: the DPHY has 2 clk lanes
@@ -357,40 +397,60 @@ static int cdns_dphy_configure(struct phy *phy, union phy_configure_opts *opts)
* Configure the DPHY PLL that will be used to generate the TX byte
* clk.
*/
- cdns_dphy_set_pll_cfg(dphy, &cfg);
+ cdns_dphy_set_pll_cfg(dphy, &dphy->cfg);
- band_ctrl = cdns_dphy_tx_get_band_ctrl(opts->mipi_dphy.hs_clk_rate);
- if (band_ctrl < 0)
- return band_ctrl;
+ ret = cdns_dphy_tx_get_band_ctrl(dphy->cfg.hs_clk_rate);
+ if (ret < 0) {
+ dev_err(&dphy->phy->dev, "Failed to get band control value with error %d\n", ret);
+ goto err_power_on;
+ }
- reg = FIELD_PREP(DPHY_BAND_CFG_LEFT_BAND, band_ctrl) |
- FIELD_PREP(DPHY_BAND_CFG_RIGHT_BAND, band_ctrl);
+ reg = FIELD_PREP(DPHY_BAND_CFG_LEFT_BAND, ret) |
+ FIELD_PREP(DPHY_BAND_CFG_RIGHT_BAND, ret);
writel(reg, dphy->regs + DPHY_BAND_CFG);
- return 0;
-}
-
-static int cdns_dphy_power_on(struct phy *phy)
-{
- struct cdns_dphy *dphy = phy_get_drvdata(phy);
-
- clk_prepare_enable(dphy->psm_clk);
- clk_prepare_enable(dphy->pll_ref_clk);
-
/* Start TX state machine. */
writel(DPHY_CMN_SSM_EN | DPHY_CMN_TX_MODE_EN,
dphy->regs + DPHY_CMN_SSM);
+ ret = cdns_dphy_wait_for_pll_lock(dphy);
+ if (ret) {
+ dev_err(&dphy->phy->dev, "Failed to lock PLL with error %d\n", ret);
+ goto err_power_on;
+ }
+
+ ret = cdns_dphy_wait_for_cmn_ready(dphy);
+ if (ret) {
+ dev_err(&dphy->phy->dev, "O_CMN_READY signal failed to assert with error %d\n",
+ ret);
+ goto err_power_on;
+ }
+
+ dphy->is_powered = true;
+
return 0;
+
+err_power_on:
+ clk_disable_unprepare(dphy->pll_ref_clk);
+ clk_disable_unprepare(dphy->psm_clk);
+
+ return ret;
}
static int cdns_dphy_power_off(struct phy *phy)
{
struct cdns_dphy *dphy = phy_get_drvdata(phy);
+ u32 reg;
clk_disable_unprepare(dphy->pll_ref_clk);
clk_disable_unprepare(dphy->psm_clk);
+ /* Stop TX state machine. */
+ reg = readl(dphy->regs + DPHY_CMN_SSM);
+ writel(reg & ~DPHY_CMN_SSM_EN, dphy->regs + DPHY_CMN_SSM);
+
+ dphy->is_powered = false;
+
return 0;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 5ef7e24c742038a5d8c626fdc0e3a21834358341
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101631-encrust-snagged-88c4@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5ef7e24c742038a5d8c626fdc0e3a21834358341 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu(a)linux.intel.com>
Date: Thu, 18 Sep 2025 13:02:02 +0800
Subject: [PATCH] iommu/vt-d: PRS isn't usable if PDS isn't supported
The specification, Section 7.10, "Software Steps to Drain Page Requests &
Responses," requires software to submit an Invalidation Wait Descriptor
(inv_wait_dsc) with the Page-request Drain (PD=1) flag set, along with
the Invalidation Wait Completion Status Write flag (SW=1). It then waits
for the Invalidation Wait Descriptor's completion.
However, the PD field in the Invalidation Wait Descriptor is optional, as
stated in Section 6.5.2.9, "Invalidation Wait Descriptor":
"Page-request Drain (PD): Remapping hardware implementations reporting
Page-request draining as not supported (PDS = 0 in ECAP_REG) treat this
field as reserved."
This implies that if the IOMMU doesn't support the PDS capability, software
can't drain page requests and group responses as expected.
Do not enable PCI/PRI if the IOMMU doesn't support PDS.
Reported-by: Joel Granados <joel.granados(a)kernel.org>
Closes: https://lore.kernel.org/r/20250909-jag-pds-v1-1-ad8cba0e494e@kernel.org
Fixes: 66ac4db36f4c ("iommu/vt-d: Add page request draining support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Lu Baolu <baolu.lu(a)linux.intel.com>
Link: https://lore.kernel.org/r/20250915062946.120196-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <joerg.roedel(a)amd.com>
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 9c3ab9d9f69a..92759a8f8330 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3812,7 +3812,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev)
}
if (info->ats_supported && ecap_prs(iommu->ecap) &&
- pci_pri_supported(pdev))
+ ecap_pds(iommu->ecap) && pci_pri_supported(pdev))
info->pri_supported = 1;
}
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 26e5c67deb2e1f42a951f022fdf5b9f7eb747b01
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101631-recount-smashing-e39c@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 26e5c67deb2e1f42a951f022fdf5b9f7eb747b01 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Mon, 15 Sep 2025 17:24:17 -0700
Subject: [PATCH] fuse: fix livelock in synchronous file put from fuseblk
workers
I observed a hang when running generic/323 against a fuseblk server.
This test opens a file, initiates a lot of AIO writes to that file
descriptor, and closes the file descriptor before the writes complete.
Unsurprisingly, the AIO exerciser threads are mostly stuck waiting for
responses from the fuseblk server:
# cat /proc/372265/task/372313/stack
[<0>] request_wait_answer+0x1fe/0x2a0 [fuse]
[<0>] __fuse_simple_request+0xd3/0x2b0 [fuse]
[<0>] fuse_do_getattr+0xfc/0x1f0 [fuse]
[<0>] fuse_file_read_iter+0xbe/0x1c0 [fuse]
[<0>] aio_read+0x130/0x1e0
[<0>] io_submit_one+0x542/0x860
[<0>] __x64_sys_io_submit+0x98/0x1a0
[<0>] do_syscall_64+0x37/0xf0
[<0>] entry_SYSCALL_64_after_hwframe+0x4b/0x53
But the /weird/ part is that the fuseblk server threads are waiting for
responses from itself:
# cat /proc/372210/task/372232/stack
[<0>] request_wait_answer+0x1fe/0x2a0 [fuse]
[<0>] __fuse_simple_request+0xd3/0x2b0 [fuse]
[<0>] fuse_file_put+0x9a/0xd0 [fuse]
[<0>] fuse_release+0x36/0x50 [fuse]
[<0>] __fput+0xec/0x2b0
[<0>] task_work_run+0x55/0x90
[<0>] syscall_exit_to_user_mode+0xe9/0x100
[<0>] do_syscall_64+0x43/0xf0
[<0>] entry_SYSCALL_64_after_hwframe+0x4b/0x53
The fuseblk server is fuse2fs so there's nothing all that exciting in
the server itself. So why is the fuse server calling fuse_file_put?
The commit message for the fstest sheds some light on that:
"By closing the file descriptor before calling io_destroy, you pretty
much guarantee that the last put on the ioctx will be done in interrupt
context (during I/O completion).
Aha. AIO fgets a new struct file from the fd when it queues the ioctx.
The completion of the FUSE_WRITE command from userspace causes the fuse
server to call the AIO completion function. The completion puts the
struct file, queuing a delayed fput to the fuse server task. When the
fuse server task returns to userspace, it has to run the delayed fput,
which in the case of a fuseblk server, it does synchronously.
Sending the FUSE_RELEASE command sychronously from fuse server threads
is a bad idea because a client program can initiate enough simultaneous
AIOs such that all the fuse server threads end up in delayed_fput, and
now there aren't any threads left to handle the queued fuse commands.
Fix this by only using asynchronous fputs when closing files, and leave
a comment explaining why.
Cc: stable(a)vger.kernel.org # v2.6.38
Fixes: 5a18ec176c934c ("fuse: fix hang of single threaded fuseblk filesystem")
Signed-off-by: Darrick J. Wong <djwong(a)kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi(a)redhat.com>
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 54786f62a9d8..f1ef77a0be05 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -356,8 +356,14 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
* Make the release synchronous if this is a fuseblk mount,
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
+ *
+ * Always use the asynchronous file put because the current thread
+ * might be the fuse server. This can happen if a process starts some
+ * aio and closes the fd before the aio completes. Since aio takes its
+ * own ref to the file, the IO completion has to drop the ref, which is
+ * how the fuse server can end up closing its clients' files.
*/
- fuse_file_put(ff, ff->fm->fc->destroy);
+ fuse_file_put(ff, false);
}
void fuse_release_common(struct file *file, bool isdir)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 72b7ceca857f38a8ca7c5629feffc63769638974
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101653-unloved-prodigal-b5e0@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 72b7ceca857f38a8ca7c5629feffc63769638974 Mon Sep 17 00:00:00 2001
From: Shashank A P <shashank.ap(a)samsung.com>
Date: Mon, 1 Sep 2025 14:59:00 +0530
Subject: [PATCH] fs: quota: create dedicated workqueue for quota_release_work
There is a kernel panic due to WARN_ONCE when panic_on_warn is set.
This issue occurs when writeback is triggered due to sync call for an
opened file(ie, writeback reason is WB_REASON_SYNC). When f2fs balance
is needed at sync path, flush for quota_release_work is triggered.
By default quota_release_work is queued to "events_unbound" queue which
does not have WQ_MEM_RECLAIM flag. During f2fs balance "writeback"
workqueue tries to flush quota_release_work causing kernel panic due to
MEM_RECLAIM flag mismatch errors.
This patch creates dedicated workqueue with WQ_MEM_RECLAIM flag
for work quota_release_work.
------------[ cut here ]------------
WARNING: CPU: 4 PID: 14867 at kernel/workqueue.c:3721 check_flush_dependency+0x13c/0x148
Call trace:
check_flush_dependency+0x13c/0x148
__flush_work+0xd0/0x398
flush_delayed_work+0x44/0x5c
dquot_writeback_dquots+0x54/0x318
f2fs_do_quota_sync+0xb8/0x1a8
f2fs_write_checkpoint+0x3cc/0x99c
f2fs_gc+0x190/0x750
f2fs_balance_fs+0x110/0x168
f2fs_write_single_data_page+0x474/0x7dc
f2fs_write_data_pages+0x7d0/0xd0c
do_writepages+0xe0/0x2f4
__writeback_single_inode+0x44/0x4ac
writeback_sb_inodes+0x30c/0x538
wb_writeback+0xf4/0x440
wb_workfn+0x128/0x5d4
process_scheduled_works+0x1c4/0x45c
worker_thread+0x32c/0x3e8
kthread+0x11c/0x1b0
ret_from_fork+0x10/0x20
Kernel panic - not syncing: kernel: panic_on_warn set ...
Fixes: ac6f420291b3 ("quota: flush quota_release_work upon quota writeback")
CC: stable(a)vger.kernel.org
Signed-off-by: Shashank A P <shashank.ap(a)samsung.com>
Link: https://patch.msgid.link/20250901092905.2115-1-shashank.ap@samsung.com
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index df4a9b348769..6c4a6ee1fa2b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -162,6 +162,9 @@ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;
+/* workqueue for work quota_release_work*/
+static struct workqueue_struct *quota_unbound_wq;
+
void register_quota_format(struct quota_format_type *fmt)
{
spin_lock(&dq_list_lock);
@@ -881,7 +884,7 @@ void dqput(struct dquot *dquot)
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- queue_delayed_work(system_unbound_wq, "a_release_work, 1);
+ queue_delayed_work(quota_unbound_wq, "a_release_work, 1);
}
EXPORT_SYMBOL(dqput);
@@ -3041,6 +3044,11 @@ static int __init dquot_init(void)
shrinker_register(dqcache_shrinker);
+ quota_unbound_wq = alloc_workqueue("quota_events_unbound",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
+ if (!quota_unbound_wq)
+ panic("Cannot create quota_unbound_wq\n");
+
return 0;
}
fs_initcall(dquot_init);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 72b7ceca857f38a8ca7c5629feffc63769638974
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101652-coziness-jailer-d369@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 72b7ceca857f38a8ca7c5629feffc63769638974 Mon Sep 17 00:00:00 2001
From: Shashank A P <shashank.ap(a)samsung.com>
Date: Mon, 1 Sep 2025 14:59:00 +0530
Subject: [PATCH] fs: quota: create dedicated workqueue for quota_release_work
There is a kernel panic due to WARN_ONCE when panic_on_warn is set.
This issue occurs when writeback is triggered due to sync call for an
opened file(ie, writeback reason is WB_REASON_SYNC). When f2fs balance
is needed at sync path, flush for quota_release_work is triggered.
By default quota_release_work is queued to "events_unbound" queue which
does not have WQ_MEM_RECLAIM flag. During f2fs balance "writeback"
workqueue tries to flush quota_release_work causing kernel panic due to
MEM_RECLAIM flag mismatch errors.
This patch creates dedicated workqueue with WQ_MEM_RECLAIM flag
for work quota_release_work.
------------[ cut here ]------------
WARNING: CPU: 4 PID: 14867 at kernel/workqueue.c:3721 check_flush_dependency+0x13c/0x148
Call trace:
check_flush_dependency+0x13c/0x148
__flush_work+0xd0/0x398
flush_delayed_work+0x44/0x5c
dquot_writeback_dquots+0x54/0x318
f2fs_do_quota_sync+0xb8/0x1a8
f2fs_write_checkpoint+0x3cc/0x99c
f2fs_gc+0x190/0x750
f2fs_balance_fs+0x110/0x168
f2fs_write_single_data_page+0x474/0x7dc
f2fs_write_data_pages+0x7d0/0xd0c
do_writepages+0xe0/0x2f4
__writeback_single_inode+0x44/0x4ac
writeback_sb_inodes+0x30c/0x538
wb_writeback+0xf4/0x440
wb_workfn+0x128/0x5d4
process_scheduled_works+0x1c4/0x45c
worker_thread+0x32c/0x3e8
kthread+0x11c/0x1b0
ret_from_fork+0x10/0x20
Kernel panic - not syncing: kernel: panic_on_warn set ...
Fixes: ac6f420291b3 ("quota: flush quota_release_work upon quota writeback")
CC: stable(a)vger.kernel.org
Signed-off-by: Shashank A P <shashank.ap(a)samsung.com>
Link: https://patch.msgid.link/20250901092905.2115-1-shashank.ap@samsung.com
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index df4a9b348769..6c4a6ee1fa2b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -162,6 +162,9 @@ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;
+/* workqueue for work quota_release_work*/
+static struct workqueue_struct *quota_unbound_wq;
+
void register_quota_format(struct quota_format_type *fmt)
{
spin_lock(&dq_list_lock);
@@ -881,7 +884,7 @@ void dqput(struct dquot *dquot)
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- queue_delayed_work(system_unbound_wq, "a_release_work, 1);
+ queue_delayed_work(quota_unbound_wq, "a_release_work, 1);
}
EXPORT_SYMBOL(dqput);
@@ -3041,6 +3044,11 @@ static int __init dquot_init(void)
shrinker_register(dqcache_shrinker);
+ quota_unbound_wq = alloc_workqueue("quota_events_unbound",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
+ if (!quota_unbound_wq)
+ panic("Cannot create quota_unbound_wq\n");
+
return 0;
}
fs_initcall(dquot_init);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 72b7ceca857f38a8ca7c5629feffc63769638974
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101651-whiny-creme-9639@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 72b7ceca857f38a8ca7c5629feffc63769638974 Mon Sep 17 00:00:00 2001
From: Shashank A P <shashank.ap(a)samsung.com>
Date: Mon, 1 Sep 2025 14:59:00 +0530
Subject: [PATCH] fs: quota: create dedicated workqueue for quota_release_work
There is a kernel panic due to WARN_ONCE when panic_on_warn is set.
This issue occurs when writeback is triggered due to sync call for an
opened file(ie, writeback reason is WB_REASON_SYNC). When f2fs balance
is needed at sync path, flush for quota_release_work is triggered.
By default quota_release_work is queued to "events_unbound" queue which
does not have WQ_MEM_RECLAIM flag. During f2fs balance "writeback"
workqueue tries to flush quota_release_work causing kernel panic due to
MEM_RECLAIM flag mismatch errors.
This patch creates dedicated workqueue with WQ_MEM_RECLAIM flag
for work quota_release_work.
------------[ cut here ]------------
WARNING: CPU: 4 PID: 14867 at kernel/workqueue.c:3721 check_flush_dependency+0x13c/0x148
Call trace:
check_flush_dependency+0x13c/0x148
__flush_work+0xd0/0x398
flush_delayed_work+0x44/0x5c
dquot_writeback_dquots+0x54/0x318
f2fs_do_quota_sync+0xb8/0x1a8
f2fs_write_checkpoint+0x3cc/0x99c
f2fs_gc+0x190/0x750
f2fs_balance_fs+0x110/0x168
f2fs_write_single_data_page+0x474/0x7dc
f2fs_write_data_pages+0x7d0/0xd0c
do_writepages+0xe0/0x2f4
__writeback_single_inode+0x44/0x4ac
writeback_sb_inodes+0x30c/0x538
wb_writeback+0xf4/0x440
wb_workfn+0x128/0x5d4
process_scheduled_works+0x1c4/0x45c
worker_thread+0x32c/0x3e8
kthread+0x11c/0x1b0
ret_from_fork+0x10/0x20
Kernel panic - not syncing: kernel: panic_on_warn set ...
Fixes: ac6f420291b3 ("quota: flush quota_release_work upon quota writeback")
CC: stable(a)vger.kernel.org
Signed-off-by: Shashank A P <shashank.ap(a)samsung.com>
Link: https://patch.msgid.link/20250901092905.2115-1-shashank.ap@samsung.com
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index df4a9b348769..6c4a6ee1fa2b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -162,6 +162,9 @@ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;
+/* workqueue for work quota_release_work*/
+static struct workqueue_struct *quota_unbound_wq;
+
void register_quota_format(struct quota_format_type *fmt)
{
spin_lock(&dq_list_lock);
@@ -881,7 +884,7 @@ void dqput(struct dquot *dquot)
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- queue_delayed_work(system_unbound_wq, "a_release_work, 1);
+ queue_delayed_work(quota_unbound_wq, "a_release_work, 1);
}
EXPORT_SYMBOL(dqput);
@@ -3041,6 +3044,11 @@ static int __init dquot_init(void)
shrinker_register(dqcache_shrinker);
+ quota_unbound_wq = alloc_workqueue("quota_events_unbound",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
+ if (!quota_unbound_wq)
+ panic("Cannot create quota_unbound_wq\n");
+
return 0;
}
fs_initcall(dquot_init);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 72b7ceca857f38a8ca7c5629feffc63769638974
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101651-arrive-frail-3c0f@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 72b7ceca857f38a8ca7c5629feffc63769638974 Mon Sep 17 00:00:00 2001
From: Shashank A P <shashank.ap(a)samsung.com>
Date: Mon, 1 Sep 2025 14:59:00 +0530
Subject: [PATCH] fs: quota: create dedicated workqueue for quota_release_work
There is a kernel panic due to WARN_ONCE when panic_on_warn is set.
This issue occurs when writeback is triggered due to sync call for an
opened file(ie, writeback reason is WB_REASON_SYNC). When f2fs balance
is needed at sync path, flush for quota_release_work is triggered.
By default quota_release_work is queued to "events_unbound" queue which
does not have WQ_MEM_RECLAIM flag. During f2fs balance "writeback"
workqueue tries to flush quota_release_work causing kernel panic due to
MEM_RECLAIM flag mismatch errors.
This patch creates dedicated workqueue with WQ_MEM_RECLAIM flag
for work quota_release_work.
------------[ cut here ]------------
WARNING: CPU: 4 PID: 14867 at kernel/workqueue.c:3721 check_flush_dependency+0x13c/0x148
Call trace:
check_flush_dependency+0x13c/0x148
__flush_work+0xd0/0x398
flush_delayed_work+0x44/0x5c
dquot_writeback_dquots+0x54/0x318
f2fs_do_quota_sync+0xb8/0x1a8
f2fs_write_checkpoint+0x3cc/0x99c
f2fs_gc+0x190/0x750
f2fs_balance_fs+0x110/0x168
f2fs_write_single_data_page+0x474/0x7dc
f2fs_write_data_pages+0x7d0/0xd0c
do_writepages+0xe0/0x2f4
__writeback_single_inode+0x44/0x4ac
writeback_sb_inodes+0x30c/0x538
wb_writeback+0xf4/0x440
wb_workfn+0x128/0x5d4
process_scheduled_works+0x1c4/0x45c
worker_thread+0x32c/0x3e8
kthread+0x11c/0x1b0
ret_from_fork+0x10/0x20
Kernel panic - not syncing: kernel: panic_on_warn set ...
Fixes: ac6f420291b3 ("quota: flush quota_release_work upon quota writeback")
CC: stable(a)vger.kernel.org
Signed-off-by: Shashank A P <shashank.ap(a)samsung.com>
Link: https://patch.msgid.link/20250901092905.2115-1-shashank.ap@samsung.com
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index df4a9b348769..6c4a6ee1fa2b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -162,6 +162,9 @@ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;
+/* workqueue for work quota_release_work*/
+static struct workqueue_struct *quota_unbound_wq;
+
void register_quota_format(struct quota_format_type *fmt)
{
spin_lock(&dq_list_lock);
@@ -881,7 +884,7 @@ void dqput(struct dquot *dquot)
put_releasing_dquots(dquot);
atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- queue_delayed_work(system_unbound_wq, "a_release_work, 1);
+ queue_delayed_work(quota_unbound_wq, "a_release_work, 1);
}
EXPORT_SYMBOL(dqput);
@@ -3041,6 +3044,11 @@ static int __init dquot_init(void)
shrinker_register(dqcache_shrinker);
+ quota_unbound_wq = alloc_workqueue("quota_events_unbound",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
+ if (!quota_unbound_wq)
+ panic("Cannot create quota_unbound_wq\n");
+
return 0;
}
fs_initcall(dquot_init);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 0c43094f8cc9d3d99d835c0ac9c4fe1ccc62babd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101620-overcook-path-9b00@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0c43094f8cc9d3d99d835c0ac9c4fe1ccc62babd Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao(a)linutronix.de>
Date: Tue, 15 Jul 2025 14:46:34 +0200
Subject: [PATCH] eventpoll: Replace rwlock with spinlock
The ready event list of an epoll object is protected by read-write
semaphore:
- The consumer (waiter) acquires the write lock and takes items.
- the producer (waker) takes the read lock and adds items.
The point of this design is enabling epoll to scale well with large number
of producers, as multiple producers can hold the read lock at the same
time.
Unfortunately, this implementation may cause scheduling priority inversion
problem. Suppose the consumer has higher scheduling priority than the
producer. The consumer needs to acquire the write lock, but may be blocked
by the producer holding the read lock. Since read-write semaphore does not
support priority-boosting for the readers (even with CONFIG_PREEMPT_RT=y),
we have a case of priority inversion: a higher priority consumer is blocked
by a lower priority producer. This problem was reported in [1].
Furthermore, this could also cause stall problem, as described in [2].
Fix this problem by replacing rwlock with spinlock.
This reduces the event bandwidth, as the producers now have to contend with
each other for the spinlock. According to the benchmark from
https://github.com/rouming/test-tools/blob/master/stress-epoll.c:
On 12 x86 CPUs:
Before After Diff
threads events/ms events/ms
8 7162 4956 -31%
16 8733 5383 -38%
32 7968 5572 -30%
64 10652 5739 -46%
128 11236 5931 -47%
On 4 riscv CPUs:
Before After Diff
threads events/ms events/ms
8 2958 2833 -4%
16 3323 3097 -7%
32 3451 3240 -6%
64 3554 3178 -11%
128 3601 3235 -10%
Although the numbers look bad, it should be noted that this benchmark
creates multiple threads who do nothing except constantly generating new
epoll events, thus contention on the spinlock is high. For real workload,
the event rate is likely much lower, and the performance drop is not as
bad.
Using another benchmark (perf bench epoll wait) where spinlock contention
is lower, improvement is even observed on x86:
On 12 x86 CPUs:
Before: Averaged 110279 operations/sec (+- 1.09%), total secs = 8
After: Averaged 114577 operations/sec (+- 2.25%), total secs = 8
On 4 riscv CPUs:
Before: Averaged 175767 operations/sec (+- 0.62%), total secs = 8
After: Averaged 167396 operations/sec (+- 0.23%), total secs = 8
In conclusion, no one is likely to be upset over this change. After all,
spinlock was used originally for years, and the commit which converted to
rwlock didn't mention a real workload, just that the benchmark numbers are
nice.
This patch is not exactly the revert of commit a218cc491420 ("epoll: use
rwlock in order to reduce ep_poll_callback() contention"), because git
revert conflicts in some places which are not obvious on the resolution.
This patch is intended to be backported, therefore go with the obvious
approach:
- Replace rwlock_t with spinlock_t one to one
- Delete list_add_tail_lockless() and chain_epi_lockless(). These were
introduced to allow producers to concurrently add items to the list.
But now that spinlock no longer allows producers to touch the event
list concurrently, these two functions are not necessary anymore.
Fixes: a218cc491420 ("epoll: use rwlock in order to reduce ep_poll_callback() contention")
Signed-off-by: Nam Cao <namcao(a)linutronix.de>
Link: https://lore.kernel.org/ec92458ea357ec503c737ead0f10b2c6e4c37d47.1752581388…
Tested-by: K Prateek Nayak <kprateek.nayak(a)amd.com>
Cc: stable(a)vger.kernel.org
Reported-by: Frederic Weisbecker <frederic(a)kernel.org>
Closes: https://lore.kernel.org/linux-rt-users/20210825132754.GA895675@lothringen/ [1]
Reported-by: Valentin Schneider <vschneid(a)redhat.com>
Closes: https://lore.kernel.org/linux-rt-users/xhsmhttqvnall.mognet@vschneid.remote… [2]
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b22d6f819f78..ee7c4b683ec3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -46,10 +46,10 @@
*
* 1) epnested_mutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->lock (rwlock)
+ * 3) ep->lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a rwlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -195,7 +195,7 @@ struct eventpoll {
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
- rwlock_t lock;
+ spinlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
@@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
* in a lockless way.
*/
lockdep_assert_irqs_enabled();
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep,
@@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep,
{
struct epitem *epi, *nepi;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep,
wake_up(&ep->wq);
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_get(struct eventpoll *ep)
@@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
rb_erase_cached(&epi->rbn, &ep->rbr);
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep)
return -ENOMEM;
mutex_init(&ep->mtx);
- rwlock_init(&ep->lock);
+ spin_lock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1239,100 +1239,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_KCMP */
-/*
- * Adds a new entry to the tail of the list in a lockless way, i.e.
- * multiple CPUs are allowed to call this function concurrently.
- *
- * Beware: it is necessary to prevent any other modifications of the
- * existing list until all changes are completed, in other words
- * concurrent list_add_tail_lockless() calls should be protected
- * with a read lock, where write lock acts as a barrier which
- * makes sure all list_add_tail_lockless() calls are fully
- * completed.
- *
- * Also an element can be locklessly added to the list only in one
- * direction i.e. either to the tail or to the head, otherwise
- * concurrent access will corrupt the list.
- *
- * Return: %false if element has been already added to the list, %true
- * otherwise.
- */
-static inline bool list_add_tail_lockless(struct list_head *new,
- struct list_head *head)
-{
- struct list_head *prev;
-
- /*
- * This is simple 'new->next = head' operation, but cmpxchg()
- * is used in order to detect that same element has been just
- * added to the list from another CPU: the winner observes
- * new->next == new.
- */
- if (!try_cmpxchg(&new->next, &new, head))
- return false;
-
- /*
- * Initially ->next of a new element must be updated with the head
- * (we are inserting to the tail) and only then pointers are atomically
- * exchanged. XCHG guarantees memory ordering, thus ->next should be
- * updated before pointers are actually swapped and pointers are
- * swapped before prev->next is updated.
- */
-
- prev = xchg(&head->prev, new);
-
- /*
- * It is safe to modify prev->next and new->prev, because a new element
- * is added only to the tail and new->next is updated before XCHG.
- */
-
- prev->next = new;
- new->prev = prev;
-
- return true;
-}
-
-/*
- * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
- * i.e. multiple CPUs are allowed to call this function concurrently.
- *
- * Return: %false if epi element has been already chained, %true otherwise.
- */
-static inline bool chain_epi_lockless(struct epitem *epi)
-{
- struct eventpoll *ep = epi->ep;
-
- /* Fast preliminary check */
- if (epi->next != EP_UNACTIVE_PTR)
- return false;
-
- /* Check that the same epi has not been just chained from another CPU */
- if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
- return false;
-
- /* Atomically exchange tail */
- epi->next = xchg(&ep->ovflist, epi);
-
- return true;
-}
-
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
- *
- * This callback takes a read lock in order not to contend with concurrent
- * events from another file descriptor, thus all modifications to ->rdllist
- * or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_start/done_scan(), which stops all list modifications and guarantees
- * that lists state is seen correctly.
- *
- * Another thing worth to mention is that ep_poll_callback() can be called
- * concurrently for the same @epi from different CPUs if poll table was inited
- * with several wait queues entries. Plural wakeup from different CPUs of a
- * single wait queue is serialized by wq.lock, but the case when multiple wait
- * queues are used should be detected accordingly. This is detected using
- * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
@@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
unsigned long flags;
int ewake = 0;
- read_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (chain_epi_lockless(epi))
+ if (epi->next == EP_UNACTIVE_PTR) {
+ epi->next = READ_ONCE(ep->ovflist);
+ WRITE_ONCE(ep->ovflist, epi);
ep_pm_stay_awake_rcu(epi);
+ }
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
- if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
- ep_pm_stay_awake_rcu(epi);
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
pwake++;
out_unlock:
- read_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
}
/* We have to drop the new item inside our item list to keep track of it */
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
/* We have to call this outside the lock */
if (pwake)
@@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
@@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
if (!eavail)
timed_out = !ep_schedule_timeout(to) ||
@@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
@@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
}
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 0c43094f8cc9d3d99d835c0ac9c4fe1ccc62babd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101619-cesarean-cresting-01cb@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0c43094f8cc9d3d99d835c0ac9c4fe1ccc62babd Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao(a)linutronix.de>
Date: Tue, 15 Jul 2025 14:46:34 +0200
Subject: [PATCH] eventpoll: Replace rwlock with spinlock
The ready event list of an epoll object is protected by read-write
semaphore:
- The consumer (waiter) acquires the write lock and takes items.
- the producer (waker) takes the read lock and adds items.
The point of this design is enabling epoll to scale well with large number
of producers, as multiple producers can hold the read lock at the same
time.
Unfortunately, this implementation may cause scheduling priority inversion
problem. Suppose the consumer has higher scheduling priority than the
producer. The consumer needs to acquire the write lock, but may be blocked
by the producer holding the read lock. Since read-write semaphore does not
support priority-boosting for the readers (even with CONFIG_PREEMPT_RT=y),
we have a case of priority inversion: a higher priority consumer is blocked
by a lower priority producer. This problem was reported in [1].
Furthermore, this could also cause stall problem, as described in [2].
Fix this problem by replacing rwlock with spinlock.
This reduces the event bandwidth, as the producers now have to contend with
each other for the spinlock. According to the benchmark from
https://github.com/rouming/test-tools/blob/master/stress-epoll.c:
On 12 x86 CPUs:
Before After Diff
threads events/ms events/ms
8 7162 4956 -31%
16 8733 5383 -38%
32 7968 5572 -30%
64 10652 5739 -46%
128 11236 5931 -47%
On 4 riscv CPUs:
Before After Diff
threads events/ms events/ms
8 2958 2833 -4%
16 3323 3097 -7%
32 3451 3240 -6%
64 3554 3178 -11%
128 3601 3235 -10%
Although the numbers look bad, it should be noted that this benchmark
creates multiple threads who do nothing except constantly generating new
epoll events, thus contention on the spinlock is high. For real workload,
the event rate is likely much lower, and the performance drop is not as
bad.
Using another benchmark (perf bench epoll wait) where spinlock contention
is lower, improvement is even observed on x86:
On 12 x86 CPUs:
Before: Averaged 110279 operations/sec (+- 1.09%), total secs = 8
After: Averaged 114577 operations/sec (+- 2.25%), total secs = 8
On 4 riscv CPUs:
Before: Averaged 175767 operations/sec (+- 0.62%), total secs = 8
After: Averaged 167396 operations/sec (+- 0.23%), total secs = 8
In conclusion, no one is likely to be upset over this change. After all,
spinlock was used originally for years, and the commit which converted to
rwlock didn't mention a real workload, just that the benchmark numbers are
nice.
This patch is not exactly the revert of commit a218cc491420 ("epoll: use
rwlock in order to reduce ep_poll_callback() contention"), because git
revert conflicts in some places which are not obvious on the resolution.
This patch is intended to be backported, therefore go with the obvious
approach:
- Replace rwlock_t with spinlock_t one to one
- Delete list_add_tail_lockless() and chain_epi_lockless(). These were
introduced to allow producers to concurrently add items to the list.
But now that spinlock no longer allows producers to touch the event
list concurrently, these two functions are not necessary anymore.
Fixes: a218cc491420 ("epoll: use rwlock in order to reduce ep_poll_callback() contention")
Signed-off-by: Nam Cao <namcao(a)linutronix.de>
Link: https://lore.kernel.org/ec92458ea357ec503c737ead0f10b2c6e4c37d47.1752581388…
Tested-by: K Prateek Nayak <kprateek.nayak(a)amd.com>
Cc: stable(a)vger.kernel.org
Reported-by: Frederic Weisbecker <frederic(a)kernel.org>
Closes: https://lore.kernel.org/linux-rt-users/20210825132754.GA895675@lothringen/ [1]
Reported-by: Valentin Schneider <vschneid(a)redhat.com>
Closes: https://lore.kernel.org/linux-rt-users/xhsmhttqvnall.mognet@vschneid.remote… [2]
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b22d6f819f78..ee7c4b683ec3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -46,10 +46,10 @@
*
* 1) epnested_mutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->lock (rwlock)
+ * 3) ep->lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a rwlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -195,7 +195,7 @@ struct eventpoll {
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
- rwlock_t lock;
+ spinlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
@@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
* in a lockless way.
*/
lockdep_assert_irqs_enabled();
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep,
@@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep,
{
struct epitem *epi, *nepi;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep,
wake_up(&ep->wq);
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_get(struct eventpoll *ep)
@@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
rb_erase_cached(&epi->rbn, &ep->rbr);
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep)
return -ENOMEM;
mutex_init(&ep->mtx);
- rwlock_init(&ep->lock);
+ spin_lock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1239,100 +1239,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_KCMP */
-/*
- * Adds a new entry to the tail of the list in a lockless way, i.e.
- * multiple CPUs are allowed to call this function concurrently.
- *
- * Beware: it is necessary to prevent any other modifications of the
- * existing list until all changes are completed, in other words
- * concurrent list_add_tail_lockless() calls should be protected
- * with a read lock, where write lock acts as a barrier which
- * makes sure all list_add_tail_lockless() calls are fully
- * completed.
- *
- * Also an element can be locklessly added to the list only in one
- * direction i.e. either to the tail or to the head, otherwise
- * concurrent access will corrupt the list.
- *
- * Return: %false if element has been already added to the list, %true
- * otherwise.
- */
-static inline bool list_add_tail_lockless(struct list_head *new,
- struct list_head *head)
-{
- struct list_head *prev;
-
- /*
- * This is simple 'new->next = head' operation, but cmpxchg()
- * is used in order to detect that same element has been just
- * added to the list from another CPU: the winner observes
- * new->next == new.
- */
- if (!try_cmpxchg(&new->next, &new, head))
- return false;
-
- /*
- * Initially ->next of a new element must be updated with the head
- * (we are inserting to the tail) and only then pointers are atomically
- * exchanged. XCHG guarantees memory ordering, thus ->next should be
- * updated before pointers are actually swapped and pointers are
- * swapped before prev->next is updated.
- */
-
- prev = xchg(&head->prev, new);
-
- /*
- * It is safe to modify prev->next and new->prev, because a new element
- * is added only to the tail and new->next is updated before XCHG.
- */
-
- prev->next = new;
- new->prev = prev;
-
- return true;
-}
-
-/*
- * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
- * i.e. multiple CPUs are allowed to call this function concurrently.
- *
- * Return: %false if epi element has been already chained, %true otherwise.
- */
-static inline bool chain_epi_lockless(struct epitem *epi)
-{
- struct eventpoll *ep = epi->ep;
-
- /* Fast preliminary check */
- if (epi->next != EP_UNACTIVE_PTR)
- return false;
-
- /* Check that the same epi has not been just chained from another CPU */
- if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
- return false;
-
- /* Atomically exchange tail */
- epi->next = xchg(&ep->ovflist, epi);
-
- return true;
-}
-
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
- *
- * This callback takes a read lock in order not to contend with concurrent
- * events from another file descriptor, thus all modifications to ->rdllist
- * or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_start/done_scan(), which stops all list modifications and guarantees
- * that lists state is seen correctly.
- *
- * Another thing worth to mention is that ep_poll_callback() can be called
- * concurrently for the same @epi from different CPUs if poll table was inited
- * with several wait queues entries. Plural wakeup from different CPUs of a
- * single wait queue is serialized by wq.lock, but the case when multiple wait
- * queues are used should be detected accordingly. This is detected using
- * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
@@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
unsigned long flags;
int ewake = 0;
- read_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (chain_epi_lockless(epi))
+ if (epi->next == EP_UNACTIVE_PTR) {
+ epi->next = READ_ONCE(ep->ovflist);
+ WRITE_ONCE(ep->ovflist, epi);
ep_pm_stay_awake_rcu(epi);
+ }
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
- if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
- ep_pm_stay_awake_rcu(epi);
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
pwake++;
out_unlock:
- read_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
}
/* We have to drop the new item inside our item list to keep track of it */
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
/* We have to call this outside the lock */
if (pwake)
@@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
@@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
if (!eavail)
timed_out = !ep_schedule_timeout(to) ||
@@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
@@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
}
}