From: Marc Kleine-Budde <mkl(a)pengutronix.de>
[ Upstream commit ac2b81eb8b2d104033560daea886ee84531e3d0a ]
When changing the interface from CAN-CC to CAN-FD mode the old
coalescing parameters are re-used. This might cause problem, as the
configured parameters are too big for CAN-FD mode.
During testing an invalid TX coalescing configuration has been seen.
The problem should be been fixed in the previous patch, but add a
safeguard here to ensure that the number of TEF coalescing buffers (if
configured) is exactly the half of all TEF buffers.
Link: https://lore.kernel.org/all/20240805-mcp251xfd-fix-ringconfig-v1-2-72086f0c…
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c
index 4d0246a0779a..f677a4cd00f3 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c
@@ -279,7 +279,7 @@ int mcp251xfd_ring_init(struct mcp251xfd_priv *priv)
const struct mcp251xfd_rx_ring *rx_ring;
u16 base = 0, ram_used;
u8 fifo_nr = 1;
- int i;
+ int err = 0, i;
netdev_reset_queue(priv->ndev);
@@ -375,10 +375,18 @@ int mcp251xfd_ring_init(struct mcp251xfd_priv *priv)
netdev_err(priv->ndev,
"Error during ring configuration, using more RAM (%u bytes) than available (%u bytes).\n",
ram_used, MCP251XFD_RAM_SIZE);
- return -ENOMEM;
+ err = -ENOMEM;
}
- return 0;
+ if (priv->tx_obj_num_coalesce_irq &&
+ priv->tx_obj_num_coalesce_irq * 2 != priv->tx->obj_num) {
+ netdev_err(priv->ndev,
+ "Error during ring configuration, number of TEF coalescing buffers (%u) must be half of TEF buffers (%u).\n",
+ priv->tx_obj_num_coalesce_irq, priv->tx->obj_num);
+ err = -EINVAL;
+ }
+
+ return err;
}
void mcp251xfd_ring_free(struct mcp251xfd_priv *priv)
--
2.43.0
From: Marc Kleine-Budde <mkl(a)pengutronix.de>
[ Upstream commit ac2b81eb8b2d104033560daea886ee84531e3d0a ]
When changing the interface from CAN-CC to CAN-FD mode the old
coalescing parameters are re-used. This might cause problem, as the
configured parameters are too big for CAN-FD mode.
During testing an invalid TX coalescing configuration has been seen.
The problem should be been fixed in the previous patch, but add a
safeguard here to ensure that the number of TEF coalescing buffers (if
configured) is exactly the half of all TEF buffers.
Link: https://lore.kernel.org/all/20240805-mcp251xfd-fix-ringconfig-v1-2-72086f0c…
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c
index 4cb79a4f2461..489d1439563a 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c
@@ -289,7 +289,7 @@ int mcp251xfd_ring_init(struct mcp251xfd_priv *priv)
const struct mcp251xfd_rx_ring *rx_ring;
u16 base = 0, ram_used;
u8 fifo_nr = 1;
- int i;
+ int err = 0, i;
netdev_reset_queue(priv->ndev);
@@ -385,10 +385,18 @@ int mcp251xfd_ring_init(struct mcp251xfd_priv *priv)
netdev_err(priv->ndev,
"Error during ring configuration, using more RAM (%u bytes) than available (%u bytes).\n",
ram_used, MCP251XFD_RAM_SIZE);
- return -ENOMEM;
+ err = -ENOMEM;
}
- return 0;
+ if (priv->tx_obj_num_coalesce_irq &&
+ priv->tx_obj_num_coalesce_irq * 2 != priv->tx->obj_num) {
+ netdev_err(priv->ndev,
+ "Error during ring configuration, number of TEF coalescing buffers (%u) must be half of TEF buffers (%u).\n",
+ priv->tx_obj_num_coalesce_irq, priv->tx->obj_num);
+ err = -EINVAL;
+ }
+
+ return err;
}
void mcp251xfd_ring_free(struct mcp251xfd_priv *priv)
--
2.43.0
From: Larysa Zaremba <larysa.zaremba(a)intel.com>
[ Upstream commit f50c68763436bc8f805712a7c5ceaf58cfcf5f07 ]
If VSI rebuild is pending, .ndo_bpf() can attach/detach the XDP program on
VSI without applying new ring configuration. When unconfiguring the VSI, we
can encounter the state in which there is an XDP program but no XDP rings
to destroy or there will be XDP rings that need to be destroyed, but no XDP
program to indicate their presence.
When unconfiguring, rely on the presence of XDP rings rather then XDP
program, as they better represent the current state that has to be
destroyed.
Reviewed-by: Wojciech Drewek <wojciech.drewek(a)intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller(a)intel.com>
Tested-by: Chandan Kumar Rout <chandanx.rout(a)intel.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski(a)intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba(a)intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/ethernet/intel/ice/ice_lib.c | 4 ++--
drivers/net/ethernet/intel/ice/ice_main.c | 4 ++--
drivers/net/ethernet/intel/ice/ice_xsk.c | 6 +++---
3 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 7629b0190578..73d03535561a 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -2426,7 +2426,7 @@ void ice_vsi_decfg(struct ice_vsi *vsi)
dev_err(ice_pf_to_dev(pf), "Failed to remove RDMA scheduler config for VSI %u, err %d\n",
vsi->vsi_num, err);
- if (ice_is_xdp_ena_vsi(vsi))
+ if (vsi->xdp_rings)
/* return value check can be skipped here, it always returns
* 0 if reset is in progress
*/
@@ -2528,7 +2528,7 @@ static void ice_vsi_release_msix(struct ice_vsi *vsi)
for (q = 0; q < q_vector->num_ring_tx; q++) {
ice_write_itr(&q_vector->tx, 0);
wr32(hw, QINT_TQCTL(vsi->txq_map[txq]), 0);
- if (ice_is_xdp_ena_vsi(vsi)) {
+ if (vsi->xdp_rings) {
u32 xdp_txq = txq + vsi->num_xdp_txq;
wr32(hw, QINT_TQCTL(vsi->txq_map[xdp_txq]), 0);
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index f16d13e9ff6e..448b854d1128 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -7222,7 +7222,7 @@ int ice_down(struct ice_vsi *vsi)
if (tx_err)
netdev_err(vsi->netdev, "Failed stop Tx rings, VSI %d error %d\n",
vsi->vsi_num, tx_err);
- if (!tx_err && ice_is_xdp_ena_vsi(vsi)) {
+ if (!tx_err && vsi->xdp_rings) {
tx_err = ice_vsi_stop_xdp_tx_rings(vsi);
if (tx_err)
netdev_err(vsi->netdev, "Failed stop XDP rings, VSI %d error %d\n",
@@ -7239,7 +7239,7 @@ int ice_down(struct ice_vsi *vsi)
ice_for_each_txq(vsi, i)
ice_clean_tx_ring(vsi->tx_rings[i]);
- if (ice_is_xdp_ena_vsi(vsi))
+ if (vsi->xdp_rings)
ice_for_each_xdp_txq(vsi, i)
ice_clean_tx_ring(vsi->xdp_rings[i]);
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 240a7bec242b..c2aa6f589937 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -39,7 +39,7 @@ static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx)
sizeof(vsi_stat->rx_ring_stats[q_idx]->rx_stats));
memset(&vsi_stat->tx_ring_stats[q_idx]->stats, 0,
sizeof(vsi_stat->tx_ring_stats[q_idx]->stats));
- if (ice_is_xdp_ena_vsi(vsi))
+ if (vsi->xdp_rings)
memset(&vsi->xdp_rings[q_idx]->ring_stats->stats, 0,
sizeof(vsi->xdp_rings[q_idx]->ring_stats->stats));
}
@@ -52,7 +52,7 @@ static void ice_qp_reset_stats(struct ice_vsi *vsi, u16 q_idx)
static void ice_qp_clean_rings(struct ice_vsi *vsi, u16 q_idx)
{
ice_clean_tx_ring(vsi->tx_rings[q_idx]);
- if (ice_is_xdp_ena_vsi(vsi))
+ if (vsi->xdp_rings)
ice_clean_tx_ring(vsi->xdp_rings[q_idx]);
ice_clean_rx_ring(vsi->rx_rings[q_idx]);
}
@@ -194,7 +194,7 @@ static int ice_qp_dis(struct ice_vsi *vsi, u16 q_idx)
err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, 0, tx_ring, &txq_meta);
if (!fail)
fail = err;
- if (ice_is_xdp_ena_vsi(vsi)) {
+ if (vsi->xdp_rings) {
struct ice_tx_ring *xdp_ring = vsi->xdp_rings[q_idx];
memset(&txq_meta, 0, sizeof(txq_meta));
--
2.43.0
From: Paulo Alcantara <pc(a)manguebit.com>
[ Upstream commit 7ccc1465465d78e6411b7bd730d06e7435802b5c ]
Call cifs_reconnect() to wake up processes waiting on negotiate
protocol to handle the case where server abruptly shut down and had no
chance to properly close the socket.
Simple reproducer:
ssh 192.168.2.100 pkill -STOP smbd
mount.cifs //192.168.2.100/test /mnt -o ... [never returns]
Cc: Rickard Andersson <rickaran(a)axis.com>
Signed-off-by: Paulo Alcantara (Red Hat) <pc(a)manguebit.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/smb/client/connect.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index d2307162a2de..e325e06357ff 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -656,6 +656,19 @@ allocate_buffers(struct TCP_Server_Info *server)
static bool
server_unresponsive(struct TCP_Server_Info *server)
{
+ /*
+ * If we're in the process of mounting a share or reconnecting a session
+ * and the server abruptly shut down (e.g. socket wasn't closed, packet
+ * had been ACK'ed but no SMB response), don't wait longer than 20s to
+ * negotiate protocol.
+ */
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus == CifsInNegotiate &&
+ time_after(jiffies, server->lstrp + 20 * HZ)) {
+ spin_unlock(&server->srv_lock);
+ cifs_reconnect(server, false);
+ return true;
+ }
/*
* We need to wait 3 echo intervals to make sure we handle such
* situations right:
@@ -667,7 +680,6 @@ server_unresponsive(struct TCP_Server_Info *server)
* 65s kernel_recvmsg times out, and we see that we haven't gotten
* a response in >60s.
*/
- spin_lock(&server->srv_lock);
if ((server->tcpStatus == CifsGood ||
server->tcpStatus == CifsNeedNegotiate) &&
(!server->ops->can_echo || server->ops->can_echo(server)) &&
--
2.43.0
From: Baochen Qiang <quic_bqiang(a)quicinc.com>
[ Upstream commit d3e154d7776ba57ab679fb816fb87b627fba21c9 ]
This reverts commit 7f0343b7b8710436c1e6355c71782d32ada47e0c.
We are going to revert commit 166a490f59ac ("wifi: ath11k: support hibernation"), on
which this commit depends. With that commit reverted, this one is not needed any
more, so revert this commit first.
Signed-off-by: Baochen Qiang <quic_bqiang(a)quicinc.com>
Acked-by: Jeff Johnson <quic_jjohnson(a)quicinc.com>
Signed-off-by: Kalle Valo <quic_kvalo(a)quicinc.com>
Link: https://patch.msgid.link/20240830073420.5790-2-quic_bqiang@quicinc.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/wireless/ath/ath11k/core.c | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c
index 47554c361963..a14d0c65000a 100644
--- a/drivers/net/wireless/ath/ath11k/core.c
+++ b/drivers/net/wireless/ath/ath11k/core.c
@@ -1009,16 +1009,6 @@ int ath11k_core_resume(struct ath11k_base *ab)
return -ETIMEDOUT;
}
- if (ab->hw_params.current_cc_support &&
- ar->alpha2[0] != 0 && ar->alpha2[1] != 0) {
- ret = ath11k_reg_set_cc(ar);
- if (ret) {
- ath11k_warn(ab, "failed to set country code during resume: %d\n",
- ret);
- return ret;
- }
- }
-
ret = ath11k_dp_rx_pktlog_start(ab);
if (ret)
ath11k_warn(ab, "failed to start rx pktlog during resume: %d\n",
--
2.43.0
From: "hongchi.peng" <hongchi.peng(a)siengine.com>
[ Upstream commit 258905cb9a6414be5c9ca4aa20ef855f8dc894d4 ]
We use komeda_crtc_normalize_zpos to normalize zpos of affected planes
to their blending zorder in CU. If there's only one slave plane in
affected planes and its layer_split property is enabled, order++ for
its split layer, so that when calculating the normalized_zpos
of master planes, the split layer of the slave plane is included, but
the max_slave_zorder does not include the split layer and keep zero
because there's only one slave plane in affacted planes, although we
actually use two slave layers in this commit.
In most cases, this bug does not result in a commit failure, but assume
the following situation:
slave_layer 0: zpos = 0, layer split enabled, normalized_zpos =
0;(use slave_layer 2 as its split layer)
master_layer 0: zpos = 2, layer_split enabled, normalized_zpos =
2;(use master_layer 2 as its split layer)
master_layer 1: zpos = 4, normalized_zpos = 4;
master_layer 3: zpos = 5, normalized_zpos = 5;
kcrtc_st->max_slave_zorder = 0;
When we use master_layer 3 as a input of CU in function
komeda_compiz_set_input and check it with function
komeda_component_check_input, the parameter idx is equal to
normailzed_zpos minus max_slave_zorder, the value of idx is 5
and is euqal to CU's max_active_inputs, so that
komeda_component_check_input returns a -EINVAL value.
To fix the bug described above, when calculating the max_slave_zorder
with the layer_split enabled, count the split layer in this calculation
directly.
Signed-off-by: hongchi.peng <hongchi.peng(a)siengine.com>
Acked-by: Liviu Dudau <liviu.dudau(a)arm.com>
Signed-off-by: Liviu Dudau <liviu.dudau(a)arm.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240826024517.3739-1-hongchi…
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/arm/display/komeda/komeda_kms.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_kms.c b/drivers/gpu/drm/arm/display/komeda/komeda_kms.c
index fe46b0ebefea..e5eb5d672bcd 100644
--- a/drivers/gpu/drm/arm/display/komeda/komeda_kms.c
+++ b/drivers/gpu/drm/arm/display/komeda/komeda_kms.c
@@ -160,6 +160,7 @@ static int komeda_crtc_normalize_zpos(struct drm_crtc *crtc,
struct drm_plane *plane;
struct list_head zorder_list;
int order = 0, err;
+ u32 slave_zpos = 0;
DRM_DEBUG_ATOMIC("[CRTC:%d:%s] calculating normalized zpos values\n",
crtc->base.id, crtc->name);
@@ -199,10 +200,13 @@ static int komeda_crtc_normalize_zpos(struct drm_crtc *crtc,
plane_st->zpos, plane_st->normalized_zpos);
/* calculate max slave zorder */
- if (has_bit(drm_plane_index(plane), kcrtc->slave_planes))
+ if (has_bit(drm_plane_index(plane), kcrtc->slave_planes)) {
+ slave_zpos = plane_st->normalized_zpos;
+ if (to_kplane_st(plane_st)->layer_split)
+ slave_zpos++;
kcrtc_st->max_slave_zorder =
- max(plane_st->normalized_zpos,
- kcrtc_st->max_slave_zorder);
+ max(slave_zpos, kcrtc_st->max_slave_zorder);
+ }
}
crtc_st->zpos_changed = true;
--
2.43.0
From: Hans de Goede <hdegoede(a)redhat.com>
[ Upstream commit 839a4ec06f75cec8fec2cc5fc14e921d0c3f7369 ]
There are 2G and 4G RAM versions of the Lenovo Yoga Tab 3 X90F and it
turns out that the 2G version has a DMI product name of
"CHERRYVIEW D1 PLATFORM" where as the 4G version has
"CHERRYVIEW C0 PLATFORM". The sys-vendor + product-version check are
unique enough that the product-name check is not necessary.
Drop the product-name check so that the existing DMI match for the 4G
RAM version also matches the 2G RAM version.
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
Link: https://patch.msgid.link/20240823074305.16873-1-hdegoede@redhat.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
sound/soc/intel/common/soc-acpi-intel-cht-match.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/sound/soc/intel/common/soc-acpi-intel-cht-match.c b/sound/soc/intel/common/soc-acpi-intel-cht-match.c
index 5e2ec60e2954..e4c3492a0c28 100644
--- a/sound/soc/intel/common/soc-acpi-intel-cht-match.c
+++ b/sound/soc/intel/common/soc-acpi-intel-cht-match.c
@@ -84,7 +84,6 @@ static const struct dmi_system_id lenovo_yoga_tab3_x90[] = {
/* Lenovo Yoga Tab 3 Pro YT3-X90, codec missing from DSDT */
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Intel Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "CHERRYVIEW D1 PLATFORM"),
DMI_MATCH(DMI_PRODUCT_VERSION, "Blade3-10A-001"),
},
},
--
2.43.0
From: Marc Kleine-Budde <mkl(a)pengutronix.de>
[ Upstream commit ac2b81eb8b2d104033560daea886ee84531e3d0a ]
When changing the interface from CAN-CC to CAN-FD mode the old
coalescing parameters are re-used. This might cause problem, as the
configured parameters are too big for CAN-FD mode.
During testing an invalid TX coalescing configuration has been seen.
The problem should be been fixed in the previous patch, but add a
safeguard here to ensure that the number of TEF coalescing buffers (if
configured) is exactly the half of all TEF buffers.
Link: https://lore.kernel.org/all/20240805-mcp251xfd-fix-ringconfig-v1-2-72086f0c…
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c
index 4cb79a4f2461..489d1439563a 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c
@@ -289,7 +289,7 @@ int mcp251xfd_ring_init(struct mcp251xfd_priv *priv)
const struct mcp251xfd_rx_ring *rx_ring;
u16 base = 0, ram_used;
u8 fifo_nr = 1;
- int i;
+ int err = 0, i;
netdev_reset_queue(priv->ndev);
@@ -385,10 +385,18 @@ int mcp251xfd_ring_init(struct mcp251xfd_priv *priv)
netdev_err(priv->ndev,
"Error during ring configuration, using more RAM (%u bytes) than available (%u bytes).\n",
ram_used, MCP251XFD_RAM_SIZE);
- return -ENOMEM;
+ err = -ENOMEM;
}
- return 0;
+ if (priv->tx_obj_num_coalesce_irq &&
+ priv->tx_obj_num_coalesce_irq * 2 != priv->tx->obj_num) {
+ netdev_err(priv->ndev,
+ "Error during ring configuration, number of TEF coalescing buffers (%u) must be half of TEF buffers (%u).\n",
+ priv->tx_obj_num_coalesce_irq, priv->tx->obj_num);
+ err = -EINVAL;
+ }
+
+ return err;
}
void mcp251xfd_ring_free(struct mcp251xfd_priv *priv)
--
2.43.0
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x a5a3c952e82c1ada12bf8c55b73af26f1a454bd2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090831-camera-backlands-a643@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
a5a3c952e82c ("rust: macros: provide correct provenance when constructing THIS_MODULE")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a5a3c952e82c1ada12bf8c55b73af26f1a454bd2 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng(a)gmail.com>
Date: Wed, 28 Aug 2024 11:01:29 -0700
Subject: [PATCH] rust: macros: provide correct provenance when constructing
THIS_MODULE
Currently while defining `THIS_MODULE` symbol in `module!()`, the
pointer used to construct `ThisModule` is derived from an immutable
reference of `__this_module`, which means the pointer doesn't have
the provenance for writing, and that means any write to that pointer
is UB regardless of data races or not. However, the usage of
`THIS_MODULE` includes passing this pointer to functions that may write
to it (probably in unsafe code), and this will create soundness issues.
One way to fix this is using `addr_of_mut!()` but that requires the
unstable feature "const_mut_refs". So instead of `addr_of_mut()!`,
an extern static `Opaque` is used here: since `Opaque<T>` is transparent
to `T`, an extern static `Opaque` will just wrap the C symbol (defined
in a C compile unit) in an `Opaque`, which provides a pointer with
writable provenance via `Opaque::get()`. This fix the potential UBs
because of pointer provenance unmatched.
Reported-by: Alice Ryhl <aliceryhl(a)google.com>
Signed-off-by: Boqun Feng <boqun.feng(a)gmail.com>
Reviewed-by: Alice Ryhl <aliceryhl(a)google.com>
Reviewed-by: Trevor Gross <tmgross(a)umich.edu>
Reviewed-by: Benno Lossin <benno.lossin(a)proton.me>
Reviewed-by: Gary Guo <gary(a)garyguo.net>
Closes: https://rust-for-linux.zulipchat.com/#narrow/stream/x/topic/x/near/465412664
Fixes: 1fbde52bde73 ("rust: add `macros` crate")
Cc: stable(a)vger.kernel.org # 6.6.x: be2ca1e03965: ("rust: types: Make Opaque::get const")
Link: https://lore.kernel.org/r/20240828180129.4046355-1-boqun.feng@gmail.com
[ Fixed two typos, reworded title. - Miguel ]
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
diff --git a/rust/macros/module.rs b/rust/macros/module.rs
index 411dc103d82e..7a5b899e47b7 100644
--- a/rust/macros/module.rs
+++ b/rust/macros/module.rs
@@ -217,7 +217,11 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream {
// freed until the module is unloaded.
#[cfg(MODULE)]
static THIS_MODULE: kernel::ThisModule = unsafe {{
- kernel::ThisModule::from_ptr(&kernel::bindings::__this_module as *const _ as *mut _)
+ extern \"C\" {{
+ static __this_module: kernel::types::Opaque<kernel::bindings::module>;
+ }}
+
+ kernel::ThisModule::from_ptr(__this_module.get())
}};
#[cfg(not(MODULE))]
static THIS_MODULE: kernel::ThisModule = unsafe {{
From: Hans de Goede <hdegoede(a)redhat.com>
commit 815d5121927093017947fd76e627da03f0f70be7 upstream.
There is only one "goto done;" in set_device_flags() and this happens
*before* hci_dev_lock() is called, move the done label to after the
hci_dev_unlock() to fix the following unlock balance:
[ 31.493567] =====================================
[ 31.493571] WARNING: bad unlock balance detected!
[ 31.493576] 5.17.0-rc2+ #13 Tainted: G C E
[ 31.493581] -------------------------------------
[ 31.493584] bluetoothd/685 is trying to release lock (&hdev->lock) at:
[ 31.493594] [<ffffffffc07603f5>] set_device_flags+0x65/0x1f0 [bluetooth]
[ 31.493684] but there are no more locks to release!
Note this bug has been around for a couple of years, but before
commit fe92ee6425a2 ("Bluetooth: hci_core: Rework hci_conn_params flags")
supported_flags was hardcoded to "((1U << HCI_CONN_FLAG_MAX) - 1)" so
the check for unsupported flags which does the "goto done;" never
triggered.
Fixes: fe92ee6425a2 ("Bluetooth: hci_core: Rework hci_conn_params flags")
Cc: Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
Signed-off-by: Marcel Holtmann <marcel(a)holtmann.org>
[fp: the check for unsupported flags can actually be triggered before
commit fe92ee6425a2 ("Bluetooth: hci_core: Rework hci_conn_params
flags") by "Set Device Flags - Invalid Parameter 2" Bluez test as
current_flags value comes from userspace]
Signed-off-by: Fedor Pchelkin <pchelkin(a)ispras.ru>
---
net/bluetooth/mgmt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 0078e33e12ba..5d2289cf2bb1 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4138,9 +4138,9 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
}
}
-done:
hci_dev_unlock(hdev);
+done:
if (status == MGMT_STATUS_SUCCESS)
device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type,
supported_flags, current_flags);
--
2.39.2
On Mon, 9 Sep 2024 08:48:14 -0400
Sasha Levin <sashal(a)kernel.org> wrote:
> This is a note to let you know that I've just added the patch titled
>
> rust: macros: provide correct provenance when constructing THIS_MODULE
>
> to the 6.1-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> rust-macros-provide-correct-provenance-when-construc.patch
> and it can be found in the queue-6.1 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
Hi Sasha,
The `Opaque` type doesn't exist yet in 6.1, so this patch should not be
applied to it.
Best,
Gary
>
> commit 5b320b29ddf985d8de92c3afa9aebe13ecd5cfad
> Author: Boqun Feng <boqun.feng(a)gmail.com>
> Date: Wed Aug 28 11:01:29 2024 -0700
>
> rust: macros: provide correct provenance when constructing THIS_MODULE
>
> [ Upstream commit a5a3c952e82c1ada12bf8c55b73af26f1a454bd2 ]
>
> Currently while defining `THIS_MODULE` symbol in `module!()`, the
> pointer used to construct `ThisModule` is derived from an immutable
> reference of `__this_module`, which means the pointer doesn't have
> the provenance for writing, and that means any write to that pointer
> is UB regardless of data races or not. However, the usage of
> `THIS_MODULE` includes passing this pointer to functions that may write
> to it (probably in unsafe code), and this will create soundness issues.
>
> One way to fix this is using `addr_of_mut!()` but that requires the
> unstable feature "const_mut_refs". So instead of `addr_of_mut()!`,
> an extern static `Opaque` is used here: since `Opaque<T>` is transparent
> to `T`, an extern static `Opaque` will just wrap the C symbol (defined
> in a C compile unit) in an `Opaque`, which provides a pointer with
> writable provenance via `Opaque::get()`. This fix the potential UBs
> because of pointer provenance unmatched.
>
> Reported-by: Alice Ryhl <aliceryhl(a)google.com>
> Signed-off-by: Boqun Feng <boqun.feng(a)gmail.com>
> Reviewed-by: Alice Ryhl <aliceryhl(a)google.com>
> Reviewed-by: Trevor Gross <tmgross(a)umich.edu>
> Reviewed-by: Benno Lossin <benno.lossin(a)proton.me>
> Reviewed-by: Gary Guo <gary(a)garyguo.net>
> Closes: https://rust-for-linux.zulipchat.com/#narrow/stream/x/topic/x/near/465412664
> Fixes: 1fbde52bde73 ("rust: add `macros` crate")
> Cc: stable(a)vger.kernel.org # 6.6.x: be2ca1e03965: ("rust: types: Make Opaque::get const")
> Link: https://lore.kernel.org/r/20240828180129.4046355-1-boqun.feng@gmail.com
> [ Fixed two typos, reworded title. - Miguel ]
> Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/rust/macros/module.rs b/rust/macros/module.rs
> index 031028b3dc41..071b96639a2e 100644
> --- a/rust/macros/module.rs
> +++ b/rust/macros/module.rs
> @@ -183,7 +183,11 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream {
> // freed until the module is unloaded.
> #[cfg(MODULE)]
> static THIS_MODULE: kernel::ThisModule = unsafe {{
> - kernel::ThisModule::from_ptr(&kernel::bindings::__this_module as *const _ as *mut _)
> + extern \"C\" {{
> + static __this_module: kernel::types::Opaque<kernel::bindings::module>;
> + }}
> +
> + kernel::ThisModule::from_ptr(__this_module.get())
> }};
> #[cfg(not(MODULE))]
> static THIS_MODULE: kernel::ThisModule = unsafe {{
init() was removed from ramgp102 when reworking the memory detection, as
it was thought that the code was only necessary when the driver performs
mclk changes, which nouveau doesn't support on pascal.
However, it turns out that we still need to execute this on some GPUs to
restore settings after DEVINIT, so revert to the original behaviour.
v2: fix tags in commit message, cc stable
Closes: https://gitlab.freedesktop.org/drm/nouveau/-/issues/319
Fixes: 2c0c15a22fa0 ("drm/nouveau/fb/gp102-ga100: switch to simpler vram size detection method")
Cc: <stable(a)vger.kernel.org> # 6.6+
Signed-off-by: Ben Skeggs <bskeggs(a)nvidia.com>
---
drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h | 2 ++
drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp100.c | 2 +-
drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp102.c | 1 +
3 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h
index 50f0c1914f58..4c3f74396579 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h
@@ -46,6 +46,8 @@ u32 gm107_ram_probe_fbp(const struct nvkm_ram_func *,
u32 gm200_ram_probe_fbp_amount(const struct nvkm_ram_func *, u32,
struct nvkm_device *, int, int *);
+int gp100_ram_init(struct nvkm_ram *);
+
/* RAM type-specific MR calculation routines */
int nvkm_sddr2_calc(struct nvkm_ram *);
int nvkm_sddr3_calc(struct nvkm_ram *);
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp100.c
index 378f6fb70990..8987a21e81d1 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp100.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp100.c
@@ -27,7 +27,7 @@
#include <subdev/bios/init.h>
#include <subdev/bios/rammap.h>
-static int
+int
gp100_ram_init(struct nvkm_ram *ram)
{
struct nvkm_subdev *subdev = &ram->fb->subdev;
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp102.c
index 8550f5e47347..b6b6ee59019d 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp102.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp102.c
@@ -5,6 +5,7 @@
static const struct nvkm_ram_func
gp102_ram = {
+ .init = gp100_ram_init,
};
int
--
2.45.1
In __iodyn_find_io_region(), pcmcia_make_resource() is assigned to
res and used in pci_bus_alloc_resource(). There is a dereference of res
in pci_bus_alloc_resource(), which could lead to a NULL pointer
dereference on failure of pcmcia_make_resource().
Fix this bug by adding a check of res.
Found by code review, complie tested only.
Cc: stable(a)vger.kernel.org
Fixes: 49b1153adfe1 ("pcmcia: move all pcmcia_resource_ops providers into one module")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
drivers/pcmcia/rsrc_iodyn.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/pcmcia/rsrc_iodyn.c b/drivers/pcmcia/rsrc_iodyn.c
index b04b16496b0c..2677b577c1f8 100644
--- a/drivers/pcmcia/rsrc_iodyn.c
+++ b/drivers/pcmcia/rsrc_iodyn.c
@@ -62,6 +62,9 @@ static struct resource *__iodyn_find_io_region(struct pcmcia_socket *s,
unsigned long min = base;
int ret;
+ if (!res)
+ return NULL;
+
data.mask = align - 1;
data.offset = base & data.mask;
--
2.25.1
This reverts commit 2e42b7f817acd6e8d78226445eb6fe44fe79c12a.
If the GC victim section has a pinned block when fallocate() trigger
FG_GC, the section is not able to be recycled. And this will return
-EAGAIN cause fallocate() failed, even though there are much spare space
as user see. As the GC policy prone to chose the same victim,
fallocate() may not successed at a long period.
This scenario has been found during Android OTA.
Link: https://lore.kernel.org/linux-f2fs-devel/20231030094024.263707-1-bo.wu@vivo…
CC: stable(a)vger.kernel.org
Signed-off-by: Wu Bo <bo.wu(a)vivo.com>
---
fs/f2fs/file.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b58ab1157b7e..19915faccee9 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1725,7 +1725,7 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset,
f2fs_down_write(&sbi->gc_lock);
stat_inc_gc_call_count(sbi, FOREGROUND);
err = f2fs_gc(sbi, &gc_control);
- if (err && err != -ENODATA)
+ if (err && err != -ENODATA && err != -EAGAIN)
goto out_err;
}
--
2.25.1
From: Daniel Borkmann <daniel(a)iogearbox.net>
commit 626dfed5fa3bfb41e0dffd796032b555b69f9cde upstream.
When using a BPF program on kernel_connect(), the call can return -EPERM. This
causes xs_tcp_setup_socket() to loop forever, filling up the syslog and causing
the kernel to potentially freeze up.
Neil suggested:
This will propagate -EPERM up into other layers which might not be ready
to handle it. It might be safer to map EPERM to an error we would be more
likely to expect from the network system - such as ECONNREFUSED or ENETDOWN.
ECONNREFUSED as error seems reasonable. For programs setting a different error
can be out of reach (see handling in 4fbac77d2d09) in particular on kernels
which do not have f10d05966196 ("bpf: Make BPF_PROG_RUN_ARRAY return -err
instead of allow boolean"), thus given that it is better to simply remap for
consistent behavior. UDP does handle EPERM in xs_udp_send_request().
Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect")
Fixes: 4fbac77d2d09 ("bpf: Hooks for sys_bind")
Co-developed-by: Lex Siegel <usiegl00(a)gmail.com>
Signed-off-by: Lex Siegel <usiegl00(a)gmail.com>
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Cc: Neil Brown <neilb(a)suse.de>
Cc: Trond Myklebust <trondmy(a)kernel.org>
Cc: Anna Schumaker <anna(a)kernel.org>
Link: https://github.com/cilium/cilium/issues/33395
Link: https://lore.kernel.org/bpf/171374175513.12877.8993642908082014881@noble.ne…
Link: https://patch.msgid.link/9069ec1d59e4b2129fc23433349fd5580ad43921.172007507…
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Signed-off-by: Hugo SIMELIERE <hsimeliere.opensource(a)witekio.com>
---
net/sunrpc/xprtsock.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 938c649c5c9f..625b5f69d3ca 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2466,6 +2466,13 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -EALREADY:
xprt_unlock_connect(xprt, transport);
return;
+ case -EPERM:
+ /* Happens, for instance, if a BPF program is preventing
+ * the connect. Remap the error so upper layers can better
+ * deal with it.
+ */
+ status = -ECONNREFUSED;
+ /* fall through */
case -EINVAL:
/* Happens, for instance, if the user specified a link
* local IPv6 address without a scope-id.
--
2.43.0
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 9972605a238339b85bd16b084eed5f18414d22db
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024081218-demote-shakily-f31c@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
9972605a2383 ("memcg: protect concurrent access to mem_cgroup_idr")
6f0df8e16eb5 ("memcontrol: ensure memcg acquired by id is properly set up")
e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
7348cc91821b ("mm: multi-gen LRU: remove aging fairness safeguard")
a579086c99ed ("mm: multi-gen LRU: remove eviction fairness safeguard")
adb8213014b2 ("mm: memcg: fix stale protection of reclaim target memcg")
57e9cc50f4dd ("mm: vmscan: split khugepaged stats from direct reclaim stats")
e4fea72b1438 ("mglru: mm/vmscan.c: fix imprecise comments")
d396def5d86d ("memcg: rearrange code")
410f8e82689e ("memcg: extract memcg_vmstats from struct mem_cgroup")
d6c3af7d8a2b ("mm: multi-gen LRU: debugfs interface")
1332a809d95a ("mm: multi-gen LRU: thrashing prevention")
354ed5974429 ("mm: multi-gen LRU: kill switch")
f76c83378851 ("mm: multi-gen LRU: optimize multiple memcgs")
bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap")
ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
ec1c86b25f4b ("mm: multi-gen LRU: groundwork")
f1e1a7be4718 ("mm/vmscan.c: refactor shrink_node()")
d3629af59f41 ("mm/vmscan: make the annotations of refaults code at the right place")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9972605a238339b85bd16b084eed5f18414d22db Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt(a)linux.dev>
Date: Fri, 2 Aug 2024 16:58:22 -0700
Subject: [PATCH] memcg: protect concurrent access to mem_cgroup_idr
Commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after
many small jobs") decoupled the memcg IDs from the CSS ID space to fix the
cgroup creation failures. It introduced IDR to maintain the memcg ID
space. The IDR depends on external synchronization mechanisms for
modifications. For the mem_cgroup_idr, the idr_alloc() and idr_replace()
happen within css callback and thus are protected through cgroup_mutex
from concurrent modifications. However idr_remove() for mem_cgroup_idr
was not protected against concurrency and can be run concurrently for
different memcgs when they hit their refcnt to zero. Fix that.
We have been seeing list_lru based kernel crashes at a low frequency in
our fleet for a long time. These crashes were in different part of
list_lru code including list_lru_add(), list_lru_del() and reparenting
code. Upon further inspection, it looked like for a given object (dentry
and inode), the super_block's list_lru didn't have list_lru_one for the
memcg of that object. The initial suspicions were either the object is
not allocated through kmem_cache_alloc_lru() or somehow
memcg_list_lru_alloc() failed to allocate list_lru_one() for a memcg but
returned success. No evidence were found for these cases.
Looking more deeply, we started seeing situations where valid memcg's id
is not present in mem_cgroup_idr and in some cases multiple valid memcgs
have same id and mem_cgroup_idr is pointing to one of them. So, the most
reasonable explanation is that these situations can happen due to race
between multiple idr_remove() calls or race between
idr_alloc()/idr_replace() and idr_remove(). These races are causing
multiple memcgs to acquire the same ID and then offlining of one of them
would cleanup list_lrus on the system for all of them. Later access from
other memcgs to the list_lru cause crashes due to missing list_lru_one.
Link: https://lkml.kernel.org/r/20240802235822.1830976-1-shakeel.butt@linux.dev
Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs")
Signed-off-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Acked-by: Muchun Song <muchun.song(a)linux.dev>
Reviewed-by: Roman Gushchin <roman.gushchin(a)linux.dev>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 960371788687..f29157288b7d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3386,11 +3386,28 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
static DEFINE_IDR(mem_cgroup_idr);
+static DEFINE_SPINLOCK(memcg_idr_lock);
+
+static int mem_cgroup_alloc_id(void)
+{
+ int ret;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&memcg_idr_lock);
+ ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1,
+ GFP_NOWAIT);
+ spin_unlock(&memcg_idr_lock);
+ idr_preload_end();
+ return ret;
+}
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
{
if (memcg->id.id > 0) {
+ spin_lock(&memcg_idr_lock);
idr_remove(&mem_cgroup_idr, memcg->id.id);
+ spin_unlock(&memcg_idr_lock);
+
memcg->id.id = 0;
}
}
@@ -3524,8 +3541,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg)
return ERR_PTR(error);
- memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
- 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
+ memcg->id.id = mem_cgroup_alloc_id();
if (memcg->id.id < 0) {
error = memcg->id.id;
goto fail;
@@ -3667,7 +3683,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
* publish it here at the end of onlining. This matches the
* regular ID destruction during offlining.
*/
+ spin_lock(&memcg_idr_lock);
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ spin_unlock(&memcg_idr_lock);
return 0;
offline_kmem:
From: Filipe Manana <fdmanana(a)suse.com>
commit cd9253c23aedd61eb5ff11f37a36247cd46faf86 upstream.
If we have 2 threads that are using the same file descriptor and one of
them is doing direct IO writes while the other is doing fsync, we have a
race where we can end up either:
1) Attempt a fsync without holding the inode's lock, triggering an
assertion failures when assertions are enabled;
2) Do an invalid memory access from the fsync task because the file private
points to memory allocated on stack by the direct IO task and it may be
used by the fsync task after the stack was destroyed.
The race happens like this:
1) A user space program opens a file descriptor with O_DIRECT;
2) The program spawns 2 threads using libpthread for example;
3) One of the threads uses the file descriptor to do direct IO writes,
while the other calls fsync using the same file descriptor.
4) Call task A the thread doing direct IO writes and task B the thread
doing fsyncs;
5) Task A does a direct IO write, and at btrfs_direct_write() sets the
file's private to an on stack allocated private with the member
'fsync_skip_inode_lock' set to true;
6) Task B enters btrfs_sync_file() and sees that there's a private
structure associated to the file which has 'fsync_skip_inode_lock' set
to true, so it skips locking the inode's vfs lock;
7) Task A completes the direct IO write, and resets the file's private to
NULL since it had no prior private and our private was stack allocated.
Then it unlocks the inode's vfs lock;
8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
assertion that checks the inode's vfs lock is held fails, since task B
never locked it and task A has already unlocked it.
The stack trace produced is the following:
Aug 21 11:46:43 kerberos kernel: assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
Aug 21 11:46:43 kerberos kernel: ------------[ cut here ]------------
Aug 21 11:46:43 kerberos kernel: kernel BUG at fs/btrfs/ordered-data.c:983!
Aug 21 11:46:43 kerberos kernel: Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
Aug 21 11:46:43 kerberos kernel: CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
Aug 21 11:46:43 kerberos kernel: Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
Aug 21 11:46:43 kerberos kernel: RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
Aug 21 11:46:43 kerberos kernel: Code: 50 d6 86 c0 e8 (...)
Aug 21 11:46:43 kerberos kernel: RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
Aug 21 11:46:43 kerberos kernel: RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
Aug 21 11:46:43 kerberos kernel: RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
Aug 21 11:46:43 kerberos kernel: RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
Aug 21 11:46:43 kerberos kernel: R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
Aug 21 11:46:43 kerberos kernel: R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
Aug 21 11:46:43 kerberos kernel: FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
Aug 21 11:46:43 kerberos kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Aug 21 11:46:43 kerberos kernel: CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
Aug 21 11:46:43 kerberos kernel: Call Trace:
Aug 21 11:46:43 kerberos kernel: <TASK>
Aug 21 11:46:43 kerberos kernel: ? __die_body.cold+0x14/0x24
Aug 21 11:46:43 kerberos kernel: ? die+0x2e/0x50
Aug 21 11:46:43 kerberos kernel: ? do_trap+0xca/0x110
Aug 21 11:46:43 kerberos kernel: ? do_error_trap+0x6a/0x90
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? exc_invalid_op+0x50/0x70
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? asm_exc_invalid_op+0x1a/0x20
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? __seccomp_filter+0x31d/0x4f0
Aug 21 11:46:43 kerberos kernel: __x64_sys_fdatasync+0x4f/0x90
Aug 21 11:46:43 kerberos kernel: do_syscall_64+0x82/0x160
Aug 21 11:46:43 kerberos kernel: ? do_futex+0xcb/0x190
Aug 21 11:46:43 kerberos kernel: ? __x64_sys_futex+0x10e/0x1d0
Aug 21 11:46:43 kerberos kernel: ? switch_fpu_return+0x4f/0xd0
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: entry_SYSCALL_64_after_hwframe+0x76/0x7e
Another problem here is if task B grabs the private pointer and then uses
it after task A has finished, since the private was allocated in the stack
of trask A, it results in some invalid memory access with a hard to predict
result.
This issue, triggering the assertion, was observed with QEMU workloads by
two users in the Link tags below.
Fix this by not relying on a file's private to pass information to fsync
that it should skip locking the inode and instead pass this information
through a special value stored in current->journal_info. This is safe
because in the relevant section of the direct IO write path we are not
holding a transaction handle, so current->journal_info is NULL.
The following C program triggers the issue:
$ cat repro.c
/* Get the O_DIRECT definition. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
static int fd;
static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
{
while (count > 0) {
ssize_t ret;
ret = pwrite(fd, buf, count, offset);
if (ret < 0) {
if (errno == EINTR)
continue;
return ret;
}
count -= ret;
buf += ret;
}
return 0;
}
static void *fsync_loop(void *arg)
{
while (1) {
int ret;
ret = fsync(fd);
if (ret != 0) {
perror("Fsync failed");
exit(6);
}
}
}
int main(int argc, char *argv[])
{
long pagesize;
void *write_buf;
pthread_t fsyncer;
int ret;
if (argc != 2) {
fprintf(stderr, "Use: %s <file path>\n", argv[0]);
return 1;
}
fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
if (fd == -1) {
perror("Failed to open/create file");
return 1;
}
pagesize = sysconf(_SC_PAGE_SIZE);
if (pagesize == -1) {
perror("Failed to get page size");
return 2;
}
ret = posix_memalign(&write_buf, pagesize, pagesize);
if (ret) {
perror("Failed to allocate buffer");
return 3;
}
ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
if (ret != 0) {
fprintf(stderr, "Failed to create writer thread: %d\n", ret);
return 4;
}
while (1) {
ret = do_write(fd, write_buf, pagesize, 0);
if (ret != 0) {
perror("Write failed");
exit(5);
}
}
return 0;
}
$ mkfs.btrfs -f /dev/sdi
$ mount /dev/sdi /mnt/sdi
$ timeout 10 ./repro /mnt/sdi/foo
Usually the race is triggered within less than 1 second. A test case for
fstests will follow soon.
Reported-by: Paulo Dias <paulo.miguel.dias(a)gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
Reported-by: Andreas Jahn <jahn-andi(a)web.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
Reported-by: syzbot+4704b3cc972bd76024f1(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/ctree.h | 1 -
fs/btrfs/file.c | 25 ++++++++++---------------
fs/btrfs/transaction.h | 6 ++++++
3 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f19c6aa3ea4b..17ebcf19b444 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1383,7 +1383,6 @@ struct btrfs_drop_extents_args {
struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
- bool fsync_skip_inode_lock;
};
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c44dfb4370d7..44160d4ad53e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1992,13 +1992,6 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (IS_ERR_OR_NULL(dio)) {
err = PTR_ERR_OR_ZERO(dio);
} else {
- struct btrfs_file_private stack_private = { 0 };
- struct btrfs_file_private *private;
- const bool have_private = (file->private_data != NULL);
-
- if (!have_private)
- file->private_data = &stack_private;
-
/*
* If we have a synchoronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
@@ -2007,13 +2000,10 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
- private = file->private_data;
- private->fsync_skip_inode_lock = true;
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
err = iomap_dio_complete(dio);
- private->fsync_skip_inode_lock = false;
-
- if (!have_private)
- file->private_data = NULL;
+ current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
@@ -2195,7 +2185,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2205,7 +2194,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
+ bool skip_ilock = false;
+
+ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
+ skip_ilock = true;
+ current->journal_info = NULL;
+ lockdep_assert_held(&inode->i_rwsem);
+ }
trace_btrfs_sync_file(file, datasync);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0ded32bbd001..a06bc6ad4764 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -11,6 +11,12 @@
#include "delayed-ref.h"
#include "ctree.h"
+/*
+ * Signal that a direct IO write is in progress, to avoid deadlock for sync
+ * direct IO writes when fsync is called during the direct IO write path.
+ */
+#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
+
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
TRANS_STATE_COMMIT_START,
--
2.43.0
From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com>
Since drm_sched_entity_modify_sched() can modify the entities run queue,
lets make sure to only dereference the pointer once so both adding and
waking up are guaranteed to be consistent.
Alternative of moving the spin_unlock to after the wake up would for now
be more problematic since the same lock is taken inside
drm_sched_rq_update_fifo().
v2:
* Improve commit message. (Philipp)
* Cache the scheduler pointer directly. (Christian)
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com>
Fixes: b37aced31eb0 ("drm/scheduler: implement a function to modify sched list")
Cc: Christian König <christian.koenig(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Cc: Luben Tuikov <ltuikov89(a)gmail.com>
Cc: Matthew Brost <matthew.brost(a)intel.com>
Cc: David Airlie <airlied(a)gmail.com>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: Philipp Stanner <pstanner(a)redhat.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v5.7+
---
drivers/gpu/drm/scheduler/sched_entity.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index ae8be30472cd..76e422548d40 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -599,6 +599,9 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
/* first job wakes up scheduler */
if (first) {
+ struct drm_gpu_scheduler *sched;
+ struct drm_sched_rq *rq;
+
/* Add the entity to the run queue */
spin_lock(&entity->rq_lock);
if (entity->stopped) {
@@ -608,13 +611,16 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
return;
}
- drm_sched_rq_add_entity(entity->rq, entity);
+ rq = entity->rq;
+ sched = rq->sched;
+
+ drm_sched_rq_add_entity(rq, entity);
spin_unlock(&entity->rq_lock);
if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
drm_sched_rq_update_fifo(entity, submit_ts);
- drm_sched_wakeup(entity->rq->sched, entity);
+ drm_sched_wakeup(sched, entity);
}
}
EXPORT_SYMBOL(drm_sched_entity_push_job);
--
2.46.0
From: Daniel Borkmann <daniel(a)iogearbox.net>
commit 626dfed5fa3bfb41e0dffd796032b555b69f9cde upstream.
When using a BPF program on kernel_connect(), the call can return -EPERM. This
causes xs_tcp_setup_socket() to loop forever, filling up the syslog and causing
the kernel to potentially freeze up.
Neil suggested:
This will propagate -EPERM up into other layers which might not be ready
to handle it. It might be safer to map EPERM to an error we would be more
likely to expect from the network system - such as ECONNREFUSED or ENETDOWN.
ECONNREFUSED as error seems reasonable. For programs setting a different error
can be out of reach (see handling in 4fbac77d2d09) in particular on kernels
which do not have f10d05966196 ("bpf: Make BPF_PROG_RUN_ARRAY return -err
instead of allow boolean"), thus given that it is better to simply remap for
consistent behavior. UDP does handle EPERM in xs_udp_send_request().
Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect")
Fixes: 4fbac77d2d09 ("bpf: Hooks for sys_bind")
Co-developed-by: Lex Siegel <usiegl00(a)gmail.com>
Signed-off-by: Lex Siegel <usiegl00(a)gmail.com>
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Cc: Neil Brown <neilb(a)suse.de>
Cc: Trond Myklebust <trondmy(a)kernel.org>
Cc: Anna Schumaker <anna(a)kernel.org>
Link: https://github.com/cilium/cilium/issues/33395
Link: https://lore.kernel.org/bpf/171374175513.12877.8993642908082014881@noble.ne…
Link: https://patch.msgid.link/9069ec1d59e4b2129fc23433349fd5580ad43921.172007507…
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Signed-off-by: Hugo SIMELIERE <hsimeliere.opensource(a)witekio.com>
---
net/sunrpc/xprtsock.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 0666f981618a..e0cd6d735053 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2314,6 +2314,13 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -EALREADY:
xprt_unlock_connect(xprt, transport);
return;
+ case -EPERM:
+ /* Happens, for instance, if a BPF program is preventing
+ * the connect. Remap the error so upper layers can better
+ * deal with it.
+ */
+ status = -ECONNREFUSED;
+ fallthrough;
case -EINVAL:
/* Happens, for instance, if the user specified a link
* local IPv6 address without a scope-id.
--
2.43.0
On Mon, Sep 09 2024 at 05:03, Jamie Heilman wrote:
> 3db03fb4995e ("x86/mm: Fix pti_clone_entry_text() for i386") which got
> landed in 6.6.46, has introduced two back to back warnings on boot on
> my 32bit system (found on 6.6.50):
Right.
> Reverting that commit removes the warnings (tested against 6.6.50).
> The follow-on commit of c48b5a4cf312 ("x86/mm: Fix PTI for i386 some
> more") doesn't apply cleanly to 6.6.50, but I did try out a build of
> 6.11-rc7 and that works fine too with no warnings on boot.
See backport below.
Thanks,
tglx
---
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Tue Aug 6 20:48:43 2024 +0200
Subject: [PATCH 6.6.y, 6.1.y, 5.10.y] x86/mm: Fix PTI for i386 some more
commit c48b5a4cf3125adb679e28ef093f66ff81368d05 upstream.
So it turns out that we have to do two passes of
pti_clone_entry_text(), once before initcalls, such that device and
late initcalls can use user-mode-helper / modprobe and once after
free_initmem() / mark_readonly().
Now obviously mark_readonly() can cause PMD splits, and
pti_clone_pgtable() doesn't like that much.
Allow the late clone to split PMDs so that pagetables stay in sync.
[peterz: Changelog and comments]
Reported-by: Guenter Roeck <linux(a)roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Guenter Roeck <linux(a)roeck-us.net>
Link: https://lkml.kernel.org/r/20240806184843.GX37996@noisy.programming.kicks-as…
---
arch/x86/mm/pti.c | 45 +++++++++++++++++++++++++++++----------------
1 file changed, 29 insertions(+), 16 deletions(-)
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -241,7 +241,7 @@ static pmd_t *pti_user_pagetable_walk_pm
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
-static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd;
@@ -251,10 +251,15 @@ static pte_t *pti_user_pagetable_walk_pt
if (!pmd)
return NULL;
- /* We can't do anything sensible if we hit a large mapping. */
+ /* Large PMD mapping found */
if (pmd_large(*pmd)) {
- WARN_ON(1);
- return NULL;
+ /* Clear the PMD if we hit a large mapping from the first round */
+ if (late_text) {
+ set_pmd(pmd, __pmd(0));
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
}
if (pmd_none(*pmd)) {
@@ -283,7 +288,7 @@ static void __init pti_setup_vsyscall(vo
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
return;
- target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false);
if (WARN_ON(!target_pte))
return;
@@ -301,7 +306,7 @@ enum pti_clone_level {
static void
pti_clone_pgtable(unsigned long start, unsigned long end,
- enum pti_clone_level level)
+ enum pti_clone_level level, bool late_text)
{
unsigned long addr;
@@ -390,7 +395,7 @@ pti_clone_pgtable(unsigned long start, u
return;
/* Allocate PTE in the user page-table */
- target_pte = pti_user_pagetable_walk_pte(addr);
+ target_pte = pti_user_pagetable_walk_pte(addr, late_text);
if (WARN_ON(!target_pte))
return;
@@ -453,7 +458,7 @@ static void __init pti_clone_user_shared
phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
pte_t *target_pte;
- target_pte = pti_user_pagetable_walk_pte(va);
+ target_pte = pti_user_pagetable_walk_pte(va, false);
if (WARN_ON(!target_pte))
return;
@@ -476,7 +481,7 @@ static void __init pti_clone_user_shared
start = CPU_ENTRY_AREA_BASE;
end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
- pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD, false);
}
#endif /* CONFIG_X86_64 */
@@ -493,11 +498,11 @@ static void __init pti_setup_espfix64(vo
/*
* Clone the populated PMDs of the entry text and force it RO.
*/
-static void pti_clone_entry_text(void)
+static void pti_clone_entry_text(bool late)
{
pti_clone_pgtable((unsigned long) __entry_text_start,
(unsigned long) __entry_text_end,
- PTI_LEVEL_KERNEL_IMAGE);
+ PTI_LEVEL_KERNEL_IMAGE, late);
}
/*
@@ -572,7 +577,7 @@ static void pti_clone_kernel_text(void)
* pti_set_kernel_image_nonglobal() did to clear the
* global bit.
*/
- pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false);
/*
* pti_clone_pgtable() will set the global bit in any PMDs
@@ -639,8 +644,15 @@ void __init pti_init(void)
/* Undo all global bits from the init pagetables in head_64.S: */
pti_set_kernel_image_nonglobal();
+
/* Replace some of the global bits just for shared entry text: */
- pti_clone_entry_text();
+ /*
+ * This is very early in boot. Device and Late initcalls can do
+ * modprobe before free_initmem() and mark_readonly(). This
+ * pti_clone_entry_text() allows those user-mode-helpers to function,
+ * but notably the text is still RW.
+ */
+ pti_clone_entry_text(false);
pti_setup_espfix64();
pti_setup_vsyscall();
}
@@ -657,10 +669,11 @@ void pti_finalize(void)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
/*
- * We need to clone everything (again) that maps parts of the
- * kernel image.
+ * This is after free_initmem() (all initcalls are done) and we've done
+ * mark_readonly(). Text is now NX which might've split some PMDs
+ * relative to the early clone.
*/
- pti_clone_entry_text();
+ pti_clone_entry_text(true);
pti_clone_kernel_text();
debug_checkwx_user();
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x d33d26036a0274b472299d7dcdaa5fb34329f91b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090850-nuclear-radar-ea2b@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
d33d26036a02 ("rtmutex: Drop rt_mutex::wait_lock before scheduling")
add461325ec5 ("locking/rtmutex: Extend the rtmutex core to support ww_mutex")
1c143c4b65da ("locking/rtmutex: Provide the spin/rwlock core lock function")
e17ba59b7e8e ("locking/rtmutex: Guard regular sleeping locks specific functions")
7980aa397cc0 ("locking/rtmutex: Use rt_mutex_wake_q_head")
c014ef69b3ac ("locking/rtmutex: Add wake_state to rt_mutex_waiter")
42254105dfe8 ("locking/rwsem: Add rtmutex based R/W semaphore implementation")
ebbdc41e90ff ("locking/rtmutex: Provide rt_mutex_slowlock_locked()")
830e6acc8a1c ("locking/rtmutex: Split out the inner parts of 'struct rtmutex'")
531ae4b06a73 ("locking/rtmutex: Split API from implementation")
785159301bed ("locking/rtmutex: Convert macros to inlines")
b41cda037655 ("locking/rtmutex: Set proper wait context for lockdep")
2f064a59a11f ("sched: Change task_struct::state")
d6c23bb3a2ad ("sched: Add get_current_state()")
b03fbd4ff24c ("sched: Introduce task_is_running()")
a9e906b71f96 ("Merge branch 'sched/urgent' into sched/core, to pick up fixes")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d33d26036a0274b472299d7dcdaa5fb34329f91b Mon Sep 17 00:00:00 2001
From: Roland Xu <mu001999(a)outlook.com>
Date: Thu, 15 Aug 2024 10:58:13 +0800
Subject: [PATCH] rtmutex: Drop rt_mutex::wait_lock before scheduling
rt_mutex_handle_deadlock() is called with rt_mutex::wait_lock held. In the
good case it returns with the lock held and in the deadlock case it emits a
warning and goes into an endless scheduling loop with the lock held, which
triggers the 'scheduling in atomic' warning.
Unlock rt_mutex::wait_lock in the dead lock case before issuing the warning
and dropping into the schedule for ever loop.
[ tglx: Moved unlock before the WARN(), removed the pointless comment,
massaged changelog, added Fixes tag ]
Fixes: 3d5c9340d194 ("rtmutex: Handle deadlock detection smarter")
Signed-off-by: Roland Xu <mu001999(a)outlook.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/ME0P300MB063599BEF0743B8FA339C2CECC802@ME0P300M…
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 88d08eeb8bc0..fba1229f1de6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1644,6 +1644,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
}
static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_base *lock,
struct rt_mutex_waiter *w)
{
/*
@@ -1656,10 +1657,10 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
if (build_ww_mutex() && w->ww_ctx)
return;
- /*
- * Yell loudly and stop the task right here.
- */
+ raw_spin_unlock_irq(&lock->wait_lock);
+
WARN(1, "rtmutex deadlock detected\n");
+
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
rt_mutex_schedule();
@@ -1713,7 +1714,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
} else {
__set_current_state(TASK_RUNNING);
remove_waiter(lock, waiter);
- rt_mutex_handle_deadlock(ret, chwalk, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
}
/*
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x aea62c744a9ae2a8247c54ec42138405216414da
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090852-importer-unadorned-f55b@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
aea62c744a9a ("mmc: cqhci: Fix checking of CQHCI_HALT state")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aea62c744a9ae2a8247c54ec42138405216414da Mon Sep 17 00:00:00 2001
From: Seunghwan Baek <sh8267.baek(a)samsung.com>
Date: Thu, 29 Aug 2024 15:18:22 +0900
Subject: [PATCH] mmc: cqhci: Fix checking of CQHCI_HALT state
To check if mmc cqe is in halt state, need to check set/clear of CQHCI_HALT
bit. At this time, we need to check with &, not &&.
Fixes: a4080225f51d ("mmc: cqhci: support for command queue enabled host")
Cc: stable(a)vger.kernel.org
Signed-off-by: Seunghwan Baek <sh8267.baek(a)samsung.com>
Reviewed-by: Ritesh Harjani <ritesh.list(a)gmail.com>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Link: https://lore.kernel.org/r/20240829061823.3718-2-sh8267.baek@samsung.com
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c
index c14d7251d0bb..a02da26a1efd 100644
--- a/drivers/mmc/host/cqhci-core.c
+++ b/drivers/mmc/host/cqhci-core.c
@@ -617,7 +617,7 @@ static int cqhci_request(struct mmc_host *mmc, struct mmc_request *mrq)
cqhci_writel(cq_host, 0, CQHCI_CTL);
mmc->cqe_on = true;
pr_debug("%s: cqhci: CQE on\n", mmc_hostname(mmc));
- if (cqhci_readl(cq_host, CQHCI_CTL) && CQHCI_HALT) {
+ if (cqhci_readl(cq_host, CQHCI_CTL) & CQHCI_HALT) {
pr_err("%s: cqhci: CQE failed to exit halt state\n",
mmc_hostname(mmc));
}
From: Willem de Bruijn <willemb(a)google.com>
Backport the following commit, because it fixes an existing backport
that has caused multiple reports of breakage on 5.15 based kernels:
net: drop bad gso csum_start and offset in virtio_net_hdr
To backport without conflicts, also backport its two dependencies:
net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation
gso: fix dodgy bit handling for GSO_UDP_L4
Also backport the one patch in netdev-net/main that references one
of the above in its Fixes tag:
net: change maximum number of UDP segments to 128
All four patches also exist in 6.1.109
include/linux/udp.h | 2 +-
include/linux/virtio_net.h | 35 +++++++++++++++++-----------
net/ipv4/tcp_offload.c | 3 +++
net/ipv4/udp_offload.c | 17 +++++++++++---
tools/testing/selftests/net/udpgso.c | 2 +-
5 files changed, 40 insertions(+), 19 deletions(-)
--
2.46.0.598.g6f2099f65c-goog
read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is
bigger than the variable hv_sched_clock_offset, which is cached during
early boot, but depending on the timing this assumption may be false
when a hibernated VM starts again (the clock counter starts from 0
again) and is resuming back (Note: hv_init_tsc_clocksource() is not
called during hibernation/resume); consequently,
read_hv_sched_clock_tsc() may return a negative integer (which is
interpreted as a huge positive integer since the return type is u64)
and new kernel messages are prefixed with huge timestamps before
read_hv_sched_clock_tsc() grows big enough (which typically takes
several seconds).
Fix the issue by saving the Hyper-V clock counter just before the
suspend, and using it to correct the hv_sched_clock_offset in
resume. Override x86_platform.save_sched_clock_state and
x86_platform.restore_sched_clock_state so that we don't
have to touch the common x86 code.
Note: if Invariant TSC is available, the issue doesn't happen because
1) we don't register read_hv_sched_clock_tsc() for sched clock:
See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework
clocksource and sched clock setup");
2) the common x86 code adjusts TSC similarly: see
__restore_processor_state() -> tsc_verify_tsc_adjust(true) and
x86_platform.restore_sched_clock_state().
Cc: stable(a)vger.kernel.org
Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation")
Co-developed-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: Naman Jain <namjain(a)linux.microsoft.com>
---
drivers/clocksource/hyperv_timer.c | 64 +++++++++++++++++++++++++++++-
1 file changed, 63 insertions(+), 1 deletion(-)
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index b2a080647e41..7aa44b8aae2e 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -27,7 +27,10 @@
#include <asm/mshyperv.h>
static struct clock_event_device __percpu *hv_clock_event;
-static u64 hv_sched_clock_offset __ro_after_init;
+
+/* Can have negative values, after resume from hibernation, so keep them s64 */
+static s64 hv_sched_clock_offset __read_mostly;
+static s64 hv_sched_clock_offset_saved;
/*
* If false, we're using the old mechanism for stimer0 interrupts
@@ -51,6 +54,9 @@ static int stimer0_irq = -1;
static int stimer0_message_sint;
static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt);
+static void (*old_save_sched_clock_state)(void);
+static void (*old_restore_sched_clock_state)(void);
+
/*
* Common code for stimer0 interrupts coming via Direct Mode or
* as a VMbus message.
@@ -434,6 +440,39 @@ static u64 noinstr read_hv_sched_clock_tsc(void)
(NSEC_PER_SEC / HV_CLOCK_HZ);
}
+/*
+ * Hyper-V clock counter resets during hibernation. Save and restore clock
+ * offset during suspend/resume, while also considering the time passed
+ * before suspend. This is to make sure that sched_clock using hv tsc page
+ * based clocksource, proceeds from where it left off during suspend and
+ * it shows correct time for the timestamps of kernel messages after resume.
+ */
+static void save_hv_clock_tsc_state(void)
+{
+ hv_sched_clock_offset_saved = hv_read_reference_counter();
+}
+
+static void restore_hv_clock_tsc_state(void)
+{
+ /*
+ * Time passed before suspend = hv_sched_clock_offset_saved
+ * - hv_sched_clock_offset (old)
+ *
+ * After Hyper-V clock counter resets, hv_sched_clock_offset needs a correction.
+ *
+ * New time = hv_read_reference_counter() (future) - hv_sched_clock_offset (new)
+ * New time = Time passed before suspend + hv_read_reference_counter() (future)
+ * - hv_read_reference_counter() (now)
+ *
+ * Solving the above two equations gives:
+ *
+ * hv_sched_clock_offset (new) = hv_sched_clock_offset (old)
+ * - hv_sched_clock_offset_saved
+ * + hv_read_reference_counter() (now))
+ */
+ hv_sched_clock_offset -= hv_sched_clock_offset_saved - hv_read_reference_counter();
+}
+
static void suspend_hv_clock_tsc(struct clocksource *arg)
{
union hv_reference_tsc_msr tsc_msr;
@@ -456,6 +495,24 @@ static void resume_hv_clock_tsc(struct clocksource *arg)
hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
}
+/*
+ * Functions to override save_sched_clock_state and restore_sched_clock_state
+ * functions of x86_platform. The Hyper-V clock counter is reset during
+ * suspend-resume and the offset used to measure time needs to be
+ * corrected, post resume.
+ */
+static void hv_save_sched_clock_state(void)
+{
+ save_hv_clock_tsc_state();
+ old_save_sched_clock_state();
+}
+
+static void hv_restore_sched_clock_state(void)
+{
+ restore_hv_clock_tsc_state();
+ old_restore_sched_clock_state();
+}
+
#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
static int hv_cs_enable(struct clocksource *cs)
{
@@ -539,6 +596,11 @@ static void __init hv_init_tsc_clocksource(void)
hv_read_reference_counter = read_hv_clock_tsc;
+ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
+ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
+ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
+ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+
/*
* TSC page mapping works differently in root compared to guest.
* - In guest partition the guest PFN has to be passed to the
base-commit: da3ea35007d0af457a0afc87e84fddaebc4e0b63
--
2.25.1
If the net_conf pointer is NULL and the code attempts to access its
fields without a check, it will lead to a null pointer dereference.
Add a NULL check before dereferencing the pointer.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Fixes: 44ed167da748 ("drbd: rcu_read_lock() and rcu_dereference() for tconn->net_conf")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mikhail Lobanov <m.lobanov(a)rosalinux.ru>
---
drivers/block/drbd/drbd_state.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 287a8d1d3f70..87cf5883078f 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -876,7 +876,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
ns.disk == D_OUTDATED)
rv = SS_CONNECTED_OUTDATES;
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
(nc->verify_alg[0] == 0))
rv = SS_NO_VERIFY_ALG;
--
2.43.0
From: Willem de Bruijn <willemb(a)google.com>
Tighten csum_start and csum_offset checks in virtio_net_hdr_to_skb
for GSO packets.
The function already checks that a checksum requested with
VIRTIO_NET_HDR_F_NEEDS_CSUM is in skb linear. But for GSO packets
this might not hold for segs after segmentation.
Syzkaller demonstrated to reach this warning in skb_checksum_help
offset = skb_checksum_start_offset(skb);
ret = -EINVAL;
if (WARN_ON_ONCE(offset >= skb_headlen(skb)))
By injecting a TSO packet:
WARNING: CPU: 1 PID: 3539 at net/core/dev.c:3284 skb_checksum_help+0x3d0/0x5b0
ip_do_fragment+0x209/0x1b20 net/ipv4/ip_output.c:774
ip_finish_output_gso net/ipv4/ip_output.c:279 [inline]
__ip_finish_output+0x2bd/0x4b0 net/ipv4/ip_output.c:301
iptunnel_xmit+0x50c/0x930 net/ipv4/ip_tunnel_core.c:82
ip_tunnel_xmit+0x2296/0x2c70 net/ipv4/ip_tunnel.c:813
__gre_xmit net/ipv4/ip_gre.c:469 [inline]
ipgre_xmit+0x759/0xa60 net/ipv4/ip_gre.c:661
__netdev_start_xmit include/linux/netdevice.h:4850 [inline]
netdev_start_xmit include/linux/netdevice.h:4864 [inline]
xmit_one net/core/dev.c:3595 [inline]
dev_hard_start_xmit+0x261/0x8c0 net/core/dev.c:3611
__dev_queue_xmit+0x1b97/0x3c90 net/core/dev.c:4261
packet_snd net/packet/af_packet.c:3073 [inline]
The geometry of the bad input packet at tcp_gso_segment:
[ 52.003050][ T8403] skb len=12202 headroom=244 headlen=12093 tailroom=0
[ 52.003050][ T8403] mac=(168,24) mac_len=24 net=(192,52) trans=244
[ 52.003050][ T8403] shinfo(txflags=0 nr_frags=1 gso(size=1552 type=3 segs=0))
[ 52.003050][ T8403] csum(0x60000c7 start=199 offset=1536
ip_summed=3 complete_sw=0 valid=0 level=0)
Mitigate with stricter input validation.
csum_offset: for GSO packets, deduce the correct value from gso_type.
This is already done for USO. Extend it to TSO. Let UFO be:
udp[46]_ufo_fragment ignores these fields and always computes the
checksum in software.
csum_start: finding the real offset requires parsing to the transport
header. Do not add a parser, use existing segmentation parsing. Thanks
to SKB_GSO_DODGY, that also catches bad packets that are hw offloaded.
Again test both TSO and USO. Do not test UFO for the above reason, and
do not test UDP tunnel offload.
GSO packet are almost always CHECKSUM_PARTIAL. USO packets may be
CHECKSUM_NONE since commit 10154dbded6d6 ("udp: Allow GSO transmit
from devices with no checksum offload"), but then still these fields
are initialized correctly in udp4_hwcsum/udp6_hwcsum_outgoing. So no
need to test for ip_summed == CHECKSUM_PARTIAL first.
This revises an existing fix mentioned in the Fixes tag, which broke
small packets with GSO offload, as detected by kselftests.
Link: https://syzkaller.appspot.com/bug?extid=e1db31216c789f552871
Link: https://lore.kernel.org/netdev/20240723223109.2196886-1-kuba@kernel.org
Fixes: e269d79c7d35 ("net: missing check virtio")
Cc: stable(a)vger.kernel.org
Signed-off-by: Willem de Bruijn <willemb(a)google.com>
---
v1->v2
- skb_transport_header instead of skb->transport_header (edumazet@)
- typo: migitate -> mitigate
---
include/linux/virtio_net.h | 16 +++++-----------
net/ipv4/tcp_offload.c | 3 +++
net/ipv4/udp_offload.c | 4 ++++
3 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index d1d7825318c32..6c395a2600e8d 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -56,7 +56,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
unsigned int thlen = 0;
unsigned int p_off = 0;
unsigned int ip_proto;
- u64 ret, remainder, gso_size;
if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
@@ -99,16 +98,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16));
- if (hdr->gso_size) {
- gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size);
- ret = div64_u64_rem(skb->len, gso_size, &remainder);
- if (!(ret && (hdr->gso_size > needed) &&
- ((remainder > needed) || (remainder == 0)))) {
- return -EINVAL;
- }
- skb_shinfo(skb)->tx_flags |= SKBFL_SHARED_FRAG;
- }
-
if (!pskb_may_pull(skb, needed))
return -EINVAL;
@@ -182,6 +171,11 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
if (gso_type != SKB_GSO_UDP_L4)
return -EINVAL;
break;
+ case SKB_GSO_TCPV4:
+ case SKB_GSO_TCPV6:
+ if (skb->csum_offset != offsetof(struct tcphdr, check))
+ return -EINVAL;
+ break;
}
/* Kernel has a special handling for GSO_BY_FRAGS. */
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 4b791e74529e1..e4ad3311e1489 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -140,6 +140,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (thlen < sizeof(*th))
goto out;
+ if (unlikely(skb_checksum_start(skb) != skb_transport_header(skb)))
+ goto out;
+
if (!pskb_may_pull(skb, thlen))
goto out;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index aa2e0a28ca613..bc8a9da750fed 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -278,6 +278,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
if (gso_skb->len <= sizeof(*uh) + mss)
return ERR_PTR(-EINVAL);
+ if (unlikely(skb_checksum_start(gso_skb) !=
+ skb_transport_header(gso_skb)))
+ return ERR_PTR(-EINVAL);
+
if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh),
--
2.46.0.rc1.232.g9752f9e123-goog
The quilt patch titled
Subject: mm/damon/vaddr: protect vma traversal in __damon_va_thre_regions() with rcu read lock
has been removed from the -mm tree. Its filename was
mm-damon-vaddr-protect-vma-traversal-in-__damon_va_thre_regions-with-rcu-read-lock.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: mm/damon/vaddr: protect vma traversal in __damon_va_thre_regions() with rcu read lock
Date: Wed, 4 Sep 2024 17:12:04 -0700
Traversing VMAs of a given maple tree should be protected by rcu read
lock. However, __damon_va_three_regions() is not doing the protection.
Hold the lock.
Link: https://lkml.kernel.org/r/20240905001204.1481-1-sj@kernel.org
Fixes: d0cf3dd47f0d ("damon: convert __damon_va_three_regions to use the VMA iterator")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Reported-by: Guenter Roeck <linux(a)roeck-us.net>
Closes: https://lore.kernel.org/b83651a0-5b24-4206-b860-cb54ffdf209b@roeck-us.net
Tested-by: Guenter Roeck <linux(a)roeck-us.net>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/vaddr.c | 2 ++
1 file changed, 2 insertions(+)
--- a/mm/damon/vaddr.c~mm-damon-vaddr-protect-vma-traversal-in-__damon_va_thre_regions-with-rcu-read-lock
+++ a/mm/damon/vaddr.c
@@ -126,6 +126,7 @@ static int __damon_va_three_regions(stru
* If this is too slow, it can be optimised to examine the maple
* tree gaps.
*/
+ rcu_read_lock();
for_each_vma(vmi, vma) {
unsigned long gap;
@@ -146,6 +147,7 @@ static int __damon_va_three_regions(stru
next:
prev = vma;
}
+ rcu_read_unlock();
if (!sz_range(&second_gap) || !sz_range(&first_gap))
return -EINVAL;
_
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
maple_tree-mark-three-functions-as-__maybe_unused.patch
The quilt patch titled
Subject: ocfs2: cancel dqi_sync_work before freeing oinfo
has been removed from the -mm tree. Its filename was
ocfs2-cancel-dqi_sync_work-before-freeing-oinfo.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Subject: ocfs2: cancel dqi_sync_work before freeing oinfo
Date: Wed, 4 Sep 2024 15:10:03 +0800
ocfs2_global_read_info() will initialize and schedule dqi_sync_work at the
end, if error occurs after successfully reading global quota, it will
trigger the following warning with CONFIG_DEBUG_OBJECTS_* enabled:
ODEBUG: free active (active state 0) object: 00000000d8b0ce28 object type: timer_list hint: qsync_work_fn+0x0/0x16c
This reports that there is an active delayed work when freeing oinfo in
error handling, so cancel dqi_sync_work first. BTW, return status instead
of -1 when .read_file_info fails.
Link: https://syzkaller.appspot.com/bug?extid=f7af59df5d6b25f0febd
Link: https://lkml.kernel.org/r/20240904071004.2067695-1-joseph.qi@linux.alibaba.…
Fixes: 171bf93ce11f ("ocfs2: Periodic quota syncing")
Signed-off-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao(a)suse.com>
Reported-by: syzbot+f7af59df5d6b25f0febd(a)syzkaller.appspotmail.com
Tested-by: syzbot+f7af59df5d6b25f0febd(a)syzkaller.appspotmail.com
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/quota_local.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
--- a/fs/ocfs2/quota_local.c~ocfs2-cancel-dqi_sync_work-before-freeing-oinfo
+++ a/fs/ocfs2/quota_local.c
@@ -692,7 +692,7 @@ static int ocfs2_local_read_info(struct
int status;
struct buffer_head *bh = NULL;
struct ocfs2_quota_recovery *rec;
- int locked = 0;
+ int locked = 0, global_read = 0;
info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
@@ -700,6 +700,7 @@ static int ocfs2_local_read_info(struct
if (!oinfo) {
mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
" info.");
+ status = -ENOMEM;
goto out_err;
}
info->dqi_priv = oinfo;
@@ -712,6 +713,7 @@ static int ocfs2_local_read_info(struct
status = ocfs2_global_read_info(sb, type);
if (status < 0)
goto out_err;
+ global_read = 1;
status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
if (status < 0) {
@@ -782,10 +784,12 @@ out_err:
if (locked)
ocfs2_inode_unlock(lqinode, 1);
ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+ if (global_read)
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
kfree(oinfo);
}
brelse(bh);
- return -1;
+ return status;
}
/* Write local info to quota file */
_
Patches currently in -mm which might be from joseph.qi(a)linux.alibaba.com are
ocfs2-cleanup-return-value-and-mlog-in-ocfs2_global_read_info.patch
The quilt patch titled
Subject: ocfs2: fix possible null-ptr-deref in ocfs2_set_buffer_uptodate
has been removed from the -mm tree. Its filename was
ocfs2-fix-possible-null-ptr-deref-in-ocfs2_set_buffer_uptodate.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lizhi Xu <lizhi.xu(a)windriver.com>
Subject: ocfs2: fix possible null-ptr-deref in ocfs2_set_buffer_uptodate
Date: Mon, 2 Sep 2024 10:36:36 +0800
When doing cleanup, if flags without OCFS2_BH_READAHEAD, it may trigger
NULL pointer dereference in the following ocfs2_set_buffer_uptodate() if
bh is NULL.
Link: https://lkml.kernel.org/r/20240902023636.1843422-3-joseph.qi@linux.alibaba.…
Fixes: cf76c78595ca ("ocfs2: don't put and assigning null to bh allocated outside")
Signed-off-by: Lizhi Xu <lizhi.xu(a)windriver.com>
Signed-off-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reported-by: Heming Zhao <heming.zhao(a)suse.com>
Suggested-by: Heming Zhao <heming.zhao(a)suse.com>
Cc: <stable(a)vger.kernel.org> [4.20+]
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/buffer_head_io.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/fs/ocfs2/buffer_head_io.c~ocfs2-fix-possible-null-ptr-deref-in-ocfs2_set_buffer_uptodate
+++ a/fs/ocfs2/buffer_head_io.c
@@ -388,7 +388,8 @@ read_failure:
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
- ocfs2_set_buffer_uptodate(ci, bh);
+ if (bh)
+ ocfs2_set_buffer_uptodate(ci, bh);
}
ocfs2_metadata_cache_io_unlock(ci);
_
Patches currently in -mm which might be from lizhi.xu(a)windriver.com are
The quilt patch titled
Subject: ocfs2: remove unreasonable unlock in ocfs2_read_blocks
has been removed from the -mm tree. Its filename was
ocfs2-remove-unreasonable-unlock-in-ocfs2_read_blocks.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lizhi Xu <lizhi.xu(a)windriver.com>
Subject: ocfs2: remove unreasonable unlock in ocfs2_read_blocks
Date: Mon, 2 Sep 2024 10:36:35 +0800
Patch series "Misc fixes for ocfs2_read_blocks", v5.
This series contains 2 fixes for ocfs2_read_blocks(). The first patch fix
the issue reported by syzbot, which detects bad unlock balance in
ocfs2_read_blocks(). The second patch fixes an issue reported by Heming
Zhao when reviewing above fix.
This patch (of 2):
There was a lock release before exiting, so remove the unreasonable unlock.
Link: https://lkml.kernel.org/r/20240902023636.1843422-1-joseph.qi@linux.alibaba.…
Link: https://lkml.kernel.org/r/20240902023636.1843422-2-joseph.qi@linux.alibaba.…
Fixes: cf76c78595ca ("ocfs2: don't put and assigning null to bh allocated outside")
Signed-off-by: Lizhi Xu <lizhi.xu(a)windriver.com>
Signed-off-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao(a)suse.com>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reported-by: syzbot+ab134185af9ef88dfed5(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=ab134185af9ef88dfed5
Tested-by: syzbot+ab134185af9ef88dfed5(a)syzkaller.appspotmail.com
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org> [4.20+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/buffer_head_io.c | 1 -
1 file changed, 1 deletion(-)
--- a/fs/ocfs2/buffer_head_io.c~ocfs2-remove-unreasonable-unlock-in-ocfs2_read_blocks
+++ a/fs/ocfs2/buffer_head_io.c
@@ -235,7 +235,6 @@ int ocfs2_read_blocks(struct ocfs2_cachi
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
- ocfs2_metadata_cache_io_unlock(ci);
status = -ENOMEM;
mlog_errno(status);
/* Don't forget to put previous bh! */
_
Patches currently in -mm which might be from lizhi.xu(a)windriver.com are
The quilt patch titled
Subject: ocfs2: fix null-ptr-deref when journal load failed.
has been removed from the -mm tree. Its filename was
ocfs2-fix-null-ptr-deref-when-journal-load-failed.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Julian Sun <sunjunchao2870(a)gmail.com>
Subject: ocfs2: fix null-ptr-deref when journal load failed.
Date: Mon, 2 Sep 2024 11:08:44 +0800
During the mounting process, if journal_reset() fails because of too short
journal, then lead to jbd2_journal_load() fails with NULL j_sb_buffer.
Subsequently, ocfs2_journal_shutdown() calls
jbd2_journal_flush()->jbd2_cleanup_journal_tail()->
__jbd2_update_log_tail()->jbd2_journal_update_sb_log_tail()
->lock_buffer(journal->j_sb_buffer), resulting in a null-pointer
dereference error.
To resolve this issue, we should check the JBD2_LOADED flag to ensure the
journal was properly loaded. Additionally, use journal instead of
osb->journal directly to simplify the code.
Link: https://syzkaller.appspot.com/bug?extid=05b9b39d8bdfe1a0861f
Link: https://lkml.kernel.org/r/20240902030844.422725-1-sunjunchao2870@gmail.com
Fixes: f6f50e28f0cb ("jbd2: Fail to load a journal if it is too short")
Signed-off-by: Julian Sun <sunjunchao2870(a)gmail.com>
Reported-by: syzbot+05b9b39d8bdfe1a0861f(a)syzkaller.appspotmail.com
Suggested-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/journal.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/fs/ocfs2/journal.c~ocfs2-fix-null-ptr-deref-when-journal-load-failed
+++ a/fs/ocfs2/journal.c
@@ -1055,7 +1055,7 @@ void ocfs2_journal_shutdown(struct ocfs2
if (!igrab(inode))
BUG();
- num_running_trans = atomic_read(&(osb->journal->j_num_trans));
+ num_running_trans = atomic_read(&(journal->j_num_trans));
trace_ocfs2_journal_shutdown(num_running_trans);
/* Do a commit_cache here. It will flush our journal, *and*
@@ -1074,9 +1074,10 @@ void ocfs2_journal_shutdown(struct ocfs2
osb->commit_task = NULL;
}
- BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
+ BUG_ON(atomic_read(&(journal->j_num_trans)) != 0);
- if (ocfs2_mount_local(osb)) {
+ if (ocfs2_mount_local(osb) &&
+ (journal->j_journal->j_flags & JBD2_LOADED)) {
jbd2_journal_lock_updates(journal->j_journal);
status = jbd2_journal_flush(journal->j_journal, 0);
jbd2_journal_unlock_updates(journal->j_journal);
_
Patches currently in -mm which might be from sunjunchao2870(a)gmail.com are
The mwifiex chips support simultaneous Accesspoint and station mode,
but this only works when all are using the same channel. The downstream
driver uses ECSA which makes the Accesspoint automatically switch to the
channel the station is going to use. Until this is implemented in the
mwifiex driver at least catch this situation and bail out with an error.
Userspace doesn't have a meaningful way to figure out what went wrong,
so print an error message to give the user a clue.
Without this patch the driver would timeout on the
HostCmd_CMD_802_11_ASSOCIATE command when creating a station with a
channel different from the one that an existing accesspoint uses.
Signed-off-by: Sascha Hauer <s.hauer(a)pengutronix.de>
Cc: stable(a)vger.kernel.org
---
drivers/net/wireless/marvell/mwifiex/cfg80211.c | 52 ++++++++++++++++++++++++
drivers/net/wireless/marvell/mwifiex/main.h | 1 +
drivers/net/wireless/marvell/mwifiex/sta_ioctl.c | 3 ++
3 files changed, 56 insertions(+)
diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 5697a02e6b8d3..0d3bf624cd3de 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -2054,6 +2054,55 @@ static int mwifiex_cfg80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
return 0;
}
+bool mwifiex_channel_conflict(struct mwifiex_private *priv, struct ieee80211_channel *ch)
+{
+ struct mwifiex_adapter *adapter = priv->adapter;
+ struct mwifiex_current_bss_params *bss_params;
+ u8 band;
+ int freq, i;
+
+ for (i = 0; i < adapter->priv_num; i++) {
+ struct mwifiex_private *p = adapter->priv[i];
+ struct ieee80211_channel *used = NULL;
+
+ if (p == priv)
+ continue;
+
+ switch (GET_BSS_ROLE(p)) {
+ case MWIFIEX_BSS_ROLE_UAP:
+ if (!netif_carrier_ok(p->netdev))
+ break;
+
+ if (!cfg80211_chandef_valid(&p->bss_chandef))
+ break;
+
+ used = p->bss_chandef.chan;
+
+ break;
+ case MWIFIEX_BSS_ROLE_STA:
+ if (!p->media_connected)
+ break;
+
+ bss_params = &p->curr_bss_params;
+ band = mwifiex_band_to_radio_type(bss_params->band);
+ freq = ieee80211_channel_to_frequency(bss_params->bss_descriptor.channel,
+ band);
+
+ used = ieee80211_get_channel(priv->wdev.wiphy, freq);
+
+ break;
+ }
+
+ if (used && !ieee80211_channel_equal(used, ch)) {
+ mwifiex_dbg(priv->adapter, MSG,
+ "all AP and STA must operate on same channel\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
/* cfg80211 operation handler for start_ap.
* Function sets beacon period, DTIM period, SSID and security into
* AP config structure.
@@ -2069,6 +2118,9 @@ static int mwifiex_cfg80211_start_ap(struct wiphy *wiphy,
if (GET_BSS_ROLE(priv) != MWIFIEX_BSS_ROLE_UAP)
return -1;
+ if (!mwifiex_channel_conflict(priv, params->chandef.chan))
+ return -EBUSY;
+
bss_cfg = kzalloc(sizeof(struct mwifiex_uap_bss_param), GFP_KERNEL);
if (!bss_cfg)
return -ENOMEM;
diff --git a/drivers/net/wireless/marvell/mwifiex/main.h b/drivers/net/wireless/marvell/mwifiex/main.h
index 529863edd7a25..b68dbf884156b 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.h
+++ b/drivers/net/wireless/marvell/mwifiex/main.h
@@ -1697,6 +1697,7 @@ int mwifiex_set_mac_address(struct mwifiex_private *priv,
struct net_device *dev,
bool external, u8 *new_mac);
void mwifiex_devdump_tmo_func(unsigned long function_context);
+bool mwifiex_channel_conflict(struct mwifiex_private *priv, struct ieee80211_channel *ch);
#ifdef CONFIG_DEBUG_FS
void mwifiex_debugfs_init(void);
diff --git a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
index d3cba6895f8ce..9794816d8a0c6 100644
--- a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
+++ b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
@@ -291,6 +291,9 @@ int mwifiex_bss_start(struct mwifiex_private *priv, struct cfg80211_bss *bss,
if (!bss_desc)
return -1;
+ if (!mwifiex_channel_conflict(priv, bss->channel))
+ return -EBUSY;
+
if (mwifiex_band_to_radio_type(bss_desc->bss_band) ==
HostCmd_SCAN_RADIO_TYPE_BG) {
config_bands = BAND_B | BAND_G | BAND_GN;
---
base-commit: 67a72043aa2e6f60f7bbe7bfa598ba168f16d04f
change-id: 20240830-mwifiex-check-channel-f411a156bbe0
Best regards,
--
Sascha Hauer <s.hauer(a)pengutronix.de>
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 9149c9b0c7e046273141e41eebd8a517416144ac
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090942-clunky-disobey-80a9@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
9149c9b0c7e0 ("usb: dwc3: core: update LC timer as per USB Spec V3.2")
63d7f9810a38 ("usb: dwc3: core: Enable GUCTL1 bit 10 for fixing termination error after resume bug")
843714bb37d9 ("usb: dwc3: Decouple USB 2.0 L1 & L2 events")
9af21dd6faeb ("usb: dwc3: Add support for DWC_usb32 IP")
8bb14308a869 ("usb: dwc3: core: Use role-switch default dr_mode")
d94ea5319813 ("usb: dwc3: gadget: Properly set maxpacket limit")
586f4335700f ("usb: dwc3: Fix GTXFIFOSIZ.TXFDEP macro name")
7ba6b09fda5e ("usb: dwc3: core: add support for disabling SS instances in park mode")
5eb5afb07853 ("usb: dwc3: use proper initializers for property entries")
9ba3aca8fe82 ("usb: dwc3: Disable phy suspend after power-on reset")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9149c9b0c7e046273141e41eebd8a517416144ac Mon Sep 17 00:00:00 2001
From: Faisal Hassan <quic_faisalh(a)quicinc.com>
Date: Thu, 29 Aug 2024 15:15:02 +0530
Subject: [PATCH] usb: dwc3: core: update LC timer as per USB Spec V3.2
This fix addresses STAR 9001285599, which only affects DWC_usb3 version
3.20a. The timer value for PM_LC_TIMER in DWC_usb3 3.20a for the Link
ECN changes is incorrect. If the PM TIMER ECN is enabled via GUCTL2[19],
the link compliance test (TD7.21) may fail. If the ECN is not enabled
(GUCTL2[19] = 0), the controller will use the old timer value (5us),
which is still acceptable for the link compliance test. Therefore, clear
GUCTL2[19] to pass the USB link compliance test: TD 7.21.
Cc: stable(a)vger.kernel.org
Signed-off-by: Faisal Hassan <quic_faisalh(a)quicinc.com>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Link: https://lore.kernel.org/r/20240829094502.26502-1-quic_faisalh@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index ccc3895dbd7f..9eb085f359ce 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -1386,6 +1386,21 @@ static int dwc3_core_init(struct dwc3 *dwc)
dwc3_writel(dwc->regs, DWC3_GUCTL2, reg);
}
+ /*
+ * STAR 9001285599: This issue affects DWC_usb3 version 3.20a
+ * only. If the PM TIMER ECM is enabled through GUCTL2[19], the
+ * link compliance test (TD7.21) may fail. If the ECN is not
+ * enabled (GUCTL2[19] = 0), the controller will use the old timer
+ * value (5us), which is still acceptable for the link compliance
+ * test. Therefore, do not enable PM TIMER ECM in 3.20a by
+ * setting GUCTL2[19] by default; instead, use GUCTL2[19] = 0.
+ */
+ if (DWC3_VER_IS(DWC3, 320A)) {
+ reg = dwc3_readl(dwc->regs, DWC3_GUCTL2);
+ reg &= ~DWC3_GUCTL2_LC_TIMER;
+ dwc3_writel(dwc->regs, DWC3_GUCTL2, reg);
+ }
+
/*
* When configured in HOST mode, after issuing U3/L2 exit controller
* fails to send proper CRC checksum in CRC5 feild. Because of this
diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h
index 1e561fd8b86e..c71240e8f7c7 100644
--- a/drivers/usb/dwc3/core.h
+++ b/drivers/usb/dwc3/core.h
@@ -421,6 +421,7 @@
/* Global User Control Register 2 */
#define DWC3_GUCTL2_RST_ACTBITLATER BIT(14)
+#define DWC3_GUCTL2_LC_TIMER BIT(19)
/* Global User Control Register 3 */
#define DWC3_GUCTL3_SPLITDISABLE BIT(14)
@@ -1269,6 +1270,7 @@ struct dwc3 {
#define DWC3_REVISION_290A 0x5533290a
#define DWC3_REVISION_300A 0x5533300a
#define DWC3_REVISION_310A 0x5533310a
+#define DWC3_REVISION_320A 0x5533320a
#define DWC3_REVISION_330A 0x5533330a
#define DWC31_REVISION_ANY 0x0
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 9149c9b0c7e046273141e41eebd8a517416144ac
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090943-justness-geologist-75e9@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
9149c9b0c7e0 ("usb: dwc3: core: update LC timer as per USB Spec V3.2")
63d7f9810a38 ("usb: dwc3: core: Enable GUCTL1 bit 10 for fixing termination error after resume bug")
843714bb37d9 ("usb: dwc3: Decouple USB 2.0 L1 & L2 events")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9149c9b0c7e046273141e41eebd8a517416144ac Mon Sep 17 00:00:00 2001
From: Faisal Hassan <quic_faisalh(a)quicinc.com>
Date: Thu, 29 Aug 2024 15:15:02 +0530
Subject: [PATCH] usb: dwc3: core: update LC timer as per USB Spec V3.2
This fix addresses STAR 9001285599, which only affects DWC_usb3 version
3.20a. The timer value for PM_LC_TIMER in DWC_usb3 3.20a for the Link
ECN changes is incorrect. If the PM TIMER ECN is enabled via GUCTL2[19],
the link compliance test (TD7.21) may fail. If the ECN is not enabled
(GUCTL2[19] = 0), the controller will use the old timer value (5us),
which is still acceptable for the link compliance test. Therefore, clear
GUCTL2[19] to pass the USB link compliance test: TD 7.21.
Cc: stable(a)vger.kernel.org
Signed-off-by: Faisal Hassan <quic_faisalh(a)quicinc.com>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Link: https://lore.kernel.org/r/20240829094502.26502-1-quic_faisalh@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index ccc3895dbd7f..9eb085f359ce 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -1386,6 +1386,21 @@ static int dwc3_core_init(struct dwc3 *dwc)
dwc3_writel(dwc->regs, DWC3_GUCTL2, reg);
}
+ /*
+ * STAR 9001285599: This issue affects DWC_usb3 version 3.20a
+ * only. If the PM TIMER ECM is enabled through GUCTL2[19], the
+ * link compliance test (TD7.21) may fail. If the ECN is not
+ * enabled (GUCTL2[19] = 0), the controller will use the old timer
+ * value (5us), which is still acceptable for the link compliance
+ * test. Therefore, do not enable PM TIMER ECM in 3.20a by
+ * setting GUCTL2[19] by default; instead, use GUCTL2[19] = 0.
+ */
+ if (DWC3_VER_IS(DWC3, 320A)) {
+ reg = dwc3_readl(dwc->regs, DWC3_GUCTL2);
+ reg &= ~DWC3_GUCTL2_LC_TIMER;
+ dwc3_writel(dwc->regs, DWC3_GUCTL2, reg);
+ }
+
/*
* When configured in HOST mode, after issuing U3/L2 exit controller
* fails to send proper CRC checksum in CRC5 feild. Because of this
diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h
index 1e561fd8b86e..c71240e8f7c7 100644
--- a/drivers/usb/dwc3/core.h
+++ b/drivers/usb/dwc3/core.h
@@ -421,6 +421,7 @@
/* Global User Control Register 2 */
#define DWC3_GUCTL2_RST_ACTBITLATER BIT(14)
+#define DWC3_GUCTL2_LC_TIMER BIT(19)
/* Global User Control Register 3 */
#define DWC3_GUCTL3_SPLITDISABLE BIT(14)
@@ -1269,6 +1270,7 @@ struct dwc3 {
#define DWC3_REVISION_290A 0x5533290a
#define DWC3_REVISION_300A 0x5533300a
#define DWC3_REVISION_310A 0x5533310a
+#define DWC3_REVISION_320A 0x5533320a
#define DWC3_REVISION_330A 0x5533330a
#define DWC31_REVISION_ANY 0x0
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 96f9ab0d5933c1c00142dd052f259fce0bc3ced2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090907-pancreas-remodeler-f80d@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
96f9ab0d5933 ("iio: adc: ad7124: fix chip ID mismatch")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 96f9ab0d5933c1c00142dd052f259fce0bc3ced2 Mon Sep 17 00:00:00 2001
From: Dumitru Ceclan <mitrutzceclan(a)gmail.com>
Date: Wed, 31 Jul 2024 15:37:22 +0300
Subject: [PATCH] iio: adc: ad7124: fix chip ID mismatch
The ad7124_soft_reset() function has the assumption that the chip will
assert the "power-on reset" bit in the STATUS register after a software
reset without any delay. The POR bit =0 is used to check if the chip
initialization is done.
A chip ID mismatch probe error appears intermittently when the probe
continues too soon and the ID register does not contain the expected
value.
Fix by adding a 200us delay after the software reset command is issued.
Fixes: b3af341bbd96 ("iio: adc: Add ad7124 support")
Signed-off-by: Dumitru Ceclan <dumitru.ceclan(a)analog.com>
Reviewed-by: Nuno Sa <nuno.sa(a)analog.com>
Link: https://patch.msgid.link/20240731-ad7124-fix-v1-1-46a76aa4b9be@analog.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c
index 3beed78496c5..c0b82f64c976 100644
--- a/drivers/iio/adc/ad7124.c
+++ b/drivers/iio/adc/ad7124.c
@@ -764,6 +764,7 @@ static int ad7124_soft_reset(struct ad7124_state *st)
if (ret < 0)
return ret;
+ fsleep(200);
timeout = 100;
do {
ret = ad_sd_read_reg(&st->sd, AD7124_STATUS, 1, &readval);
Hi,
We are offering you the visitors contact list of Big Data LDN Expo 2024.
We have 12,137 Verified Contact List with discount.
List Contains: Contact Name, Title, Phone Number, Fax Number, Physical address, Company Name, Company URL, Employee Size, Revenue Size, Industry, and more…
Let me know if you’re interested so that I can share you the pricing for the same.
Kind Regards,
Jacob Smith
Senior Marketing Executive
If you do not wish to receive our emails, please reply with "Not Interested."
On 9/9/24 16:36, Charles Keepax wrote:
> On Wed, Sep 04, 2024 at 04:52:28PM +0200, Krzysztof Kozlowski wrote:
>> This reverts commit ab8d66d132bc8f1992d3eb6cab8d32dda6733c84 because it
>> breaks codecs using non-continuous masks in source and sink ports. The
>> commit missed the point that port numbers are not used as indices for
>> iterating over prop.sink_ports or prop.source_ports.
>>
>> Soundwire core and existing codecs expect that the array passed as
>> prop.sink_ports and prop.source_ports is continuous. The port mask still
>> might be non-continuous, but that's unrelated.
>>
>> Reported-by: Bard Liao <yung-chuan.liao(a)linux.intel.com>
>> Closes: https://lore.kernel.org/all/b6c75eee-761d-44c8-8413-2a5b34ee2f98@linux.inte…
>> Fixes: ab8d66d132bc ("soundwire: stream: fix programming slave ports for non-continous port maps")
>> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
>>
>> ---
>
> Would be good to merge this as soon as we can, this is causing
> soundwire regressions from rc6 onwards.
the revert also needs to happen in -stable. 6.10.8 is broken as well.
https://github.com/thesofproject/linux/issues/5168
These are a few patches broken out from [1]. Kalle requested to limit
the number of patches per series to approximately 12 and Francesco to
move the fixes to the front of the series, so here we go.
First two patches are fixes. First one is for host mlme support which
currently is in wireless-next, so no stable tag needed, second one has a
stable tag.
The remaining patches except the last one I have chosen to upstream
first. I'll continue with the other patches after having this series
in shape and merged.
The last one is a new patch not included in [1].
Sascha
[1] https://lore.kernel.org/all/20240820-mwifiex-cleanup-v1-0-320d8de4a4b7@peng…
Signed-off-by: Sascha Hauer <s.hauer(a)pengutronix.de>
---
Sascha Hauer (12):
wifi: mwifiex: add missing locking
wifi: mwifiex: fix MAC address handling
wifi: mwifiex: deduplicate code in mwifiex_cmd_tx_rate_cfg()
wifi: mwifiex: use adapter as context pointer for mwifiex_hs_activated_event()
wifi: mwifiex: drop unnecessary initialization
wifi: mwifiex: make region_code_mapping_t const
wifi: mwifiex: pass adapter to mwifiex_dnld_cmd_to_fw()
wifi: mwifiex: simplify mwifiex_setup_ht_caps()
wifi: mwifiex: fix indention
wifi: mwifiex: make locally used function static
wifi: mwifiex: move common settings out of switch/case
wifi: mwifiex: drop asynchronous init waiting code
drivers/net/wireless/marvell/mwifiex/cfg80211.c | 38 ++++------
drivers/net/wireless/marvell/mwifiex/cfp.c | 4 +-
drivers/net/wireless/marvell/mwifiex/cmdevt.c | 76 +++++++-------------
drivers/net/wireless/marvell/mwifiex/init.c | 19 ++---
drivers/net/wireless/marvell/mwifiex/main.c | 94 +++++++++----------------
drivers/net/wireless/marvell/mwifiex/main.h | 16 ++---
drivers/net/wireless/marvell/mwifiex/sta_cmd.c | 49 ++++---------
drivers/net/wireless/marvell/mwifiex/txrx.c | 3 +-
drivers/net/wireless/marvell/mwifiex/util.c | 22 +-----
drivers/net/wireless/marvell/mwifiex/wmm.c | 12 ++--
10 files changed, 105 insertions(+), 228 deletions(-)
---
base-commit: 67a72043aa2e6f60f7bbe7bfa598ba168f16d04f
change-id: 20240826-mwifiex-cleanup-1-b5035c7faff6
Best regards,
--
Sascha Hauer <s.hauer(a)pengutronix.de>
It's incorrect to assume that LBR can/should only be used with sampling
events. BPF subsystem provides bpf_get_branch_snapshot() BPF helper,
which expects a properly setup and activated perf event which allows
kernel to capture LBR data.
For instance, retsnoop tool ([0]) makes an extensive use of this
functionality and sets up perf event as follows:
struct perf_event_attr attr;
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.type = PERF_TYPE_HARDWARE;
attr.config = PERF_COUNT_HW_CPU_CYCLES;
attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
attr.branch_sample_type = PERF_SAMPLE_BRANCH_KERNEL;
Commit referenced in Fixes tag broke this setup by making invalid assumption
that LBR is useful only for sampling events. Remove that assumption.
Note, earlier we removed a similar assumption on AMD side of LBR support,
see [1] for details.
[0] https://github.com/anakryiko/retsnoop
[1] 9794563d4d05 ("perf/x86/amd: Don't reject non-sampling events with configured LBR")
Cc: stable(a)vger.kernel.org # 6.8+
Fixes: 85846b27072d ("perf/x86: Add PERF_X86_EVENT_NEEDS_BRANCH_STACK flag")
Signed-off-by: Andrii Nakryiko <andrii(a)kernel.org>
---
arch/x86/events/intel/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9e519d8a810a..f82a342b8852 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3972,7 +3972,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
x86_pmu.pebs_aliases(event);
}
- if (needs_branch_stack(event) && is_sampling_event(event))
+ if (needs_branch_stack(event))
event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK;
if (branch_sample_counters(event)) {
--
2.43.5
On Mon, Sep 9, 2024 at 2:48 PM Sasha Levin <sashal(a)kernel.org> wrote:
> This is a note to let you know that I've just added the patch titled
>
> userfaultfd: fix checks for huge PMDs
>
> to the 6.1-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
Thanks for the backport!
Are you also doing the backport for older trees, or should someone
else take care of that?
The following commit has been merged into the core/debugobjects branch of tip:
Commit-ID: 684d28feb8546d1e9597aa363c3bfcf52fe250b7
Gitweb: https://git.kernel.org/tip/684d28feb8546d1e9597aa363c3bfcf52fe250b7
Author: Zhen Lei <thunder.leizhen(a)huawei.com>
AuthorDate: Wed, 04 Sep 2024 21:39:40 +08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Mon, 09 Sep 2024 16:40:25 +02:00
debugobjects: Fix conditions in fill_pool()
fill_pool() uses 'obj_pool_min_free' to decide whether objects should be
handed back to the kmem cache. But 'obj_pool_min_free' records the lowest
historical value of the number of objects in the object pool and not the
minimum number of objects which should be kept in the pool.
Use 'debug_objects_pool_min_level' instead, which holds the minimum number
which was scaled to the number of CPUs at boot time.
[ tglx: Massage change log ]
Fixes: d26bf5056fc0 ("debugobjects: Reduce number of pool_lock acquisitions in fill_pool()")
Fixes: 36c4ead6f6df ("debugobjects: Add global free list and the counter")
Signed-off-by: Zhen Lei <thunder.leizhen(a)huawei.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240904133944.2124-3-thunder.leizhen@huawei.com
---
lib/debugobjects.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 7226fdb..6329a86 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -142,13 +142,14 @@ static void fill_pool(void)
* READ_ONCE()s pair with the WRITE_ONCE()s in pool_lock critical
* sections.
*/
- while (READ_ONCE(obj_nr_tofree) && (READ_ONCE(obj_pool_free) < obj_pool_min_free)) {
+ while (READ_ONCE(obj_nr_tofree) &&
+ READ_ONCE(obj_pool_free) < debug_objects_pool_min_level) {
raw_spin_lock_irqsave(&pool_lock, flags);
/*
* Recheck with the lock held as the worker thread might have
* won the race and freed the global free list already.
*/
- while (obj_nr_tofree && (obj_pool_free < obj_pool_min_free)) {
+ while (obj_nr_tofree && (obj_pool_free < debug_objects_pool_min_level)) {
obj = hlist_entry(obj_to_free.first, typeof(*obj), node);
hlist_del(&obj->node);
WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1);
Two bitmasks in 'struct sdw_slave_prop' - 'source_ports' and
'sink_ports' - define which ports to program in
sdw_program_slave_port_params(). The masks are used to get the
appropriate data port properties ('struct sdw_get_slave_dpn_prop') from
an array.
Bitmasks can be non-continuous or can start from index different than 0,
thus when looking for matching port property for given port, we must
iterate over mask bits, not from 0 up to number of ports.
This fixes allocation and programming slave ports, when a source or sink
masks start from further index.
Fixes: f8101c74aa54 ("soundwire: Add Master and Slave port programming")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
---
drivers/soundwire/stream.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c
index 7aa4900dcf31..f275143d7b18 100644
--- a/drivers/soundwire/stream.c
+++ b/drivers/soundwire/stream.c
@@ -1291,18 +1291,18 @@ struct sdw_dpn_prop *sdw_get_slave_dpn_prop(struct sdw_slave *slave,
unsigned int port_num)
{
struct sdw_dpn_prop *dpn_prop;
- u8 num_ports;
+ unsigned long mask;
int i;
if (direction == SDW_DATA_DIR_TX) {
- num_ports = hweight32(slave->prop.source_ports);
+ mask = slave->prop.source_ports;
dpn_prop = slave->prop.src_dpn_prop;
} else {
- num_ports = hweight32(slave->prop.sink_ports);
+ mask = slave->prop.sink_ports;
dpn_prop = slave->prop.sink_dpn_prop;
}
- for (i = 0; i < num_ports; i++) {
+ for_each_set_bit(i, &mask, 32) {
if (dpn_prop[i].num == port_num)
return &dpn_prop[i];
}
--
2.43.0
This patch addresses an issue with improper reference count handling in the
ice_sriov_set_msix_vec_count() function.
First, the function calls ice_get_vf_by_id(), which increments the
reference count of the vf pointer. If the subsequent call to
ice_get_vf_vsi() fails, the function currently returns an error without
decrementing the reference count of the vf pointer, leading to a reference
count leak. The correct behavior, as implemented in this patch, is to
decrement the reference count using ice_put_vf(vf) before returning an
error when vsi is NULL.
Second, the function calls ice_sriov_get_irqs(), which sets
vf->first_vector_idx. If this call returns a negative value, indicating an
error, the function returns an error without decrementing the reference
count of the vf pointer, resulting in another reference count leak. The
patch addresses this by adding a call to ice_put_vf(vf) before returning
an error when vf->first_vector_idx < 0.
This bug was identified by an experimental static analysis tool developed
by our team. The tool specializes in analyzing reference count operations
and identifying potential mismanagement of reference counts. In this case,
the tool flagged the missing decrement operation as a potential issue,
leading to this patch.
Fixes: 4035c72dc1ba ("ice: reconfig host after changing MSI-X on VF")
Fixes: 4d38cb44bd32 ("ice: manage VFs MSI-X using resource tracking")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <hanguidong02(a)outlook.com>
---
v2:
* In this patch v2, an additional resource leak was addressed when
vf->first_vector_idx < 0. The issue is now fixed by adding ice_put_vf(vf)
before returning an error.
Thanks to Simon Horman for identifying this additional leak scenario.
---
drivers/net/ethernet/intel/ice/ice_sriov.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c
index 55ef33208456..fbf18ac97875 100644
--- a/drivers/net/ethernet/intel/ice/ice_sriov.c
+++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
@@ -1096,8 +1096,10 @@ int ice_sriov_set_msix_vec_count(struct pci_dev *vf_dev, int msix_vec_count)
return -ENOENT;
vsi = ice_get_vf_vsi(vf);
- if (!vsi)
+ if (!vsi) {
+ ice_put_vf(vf);
return -ENOENT;
+ }
prev_msix = vf->num_msix;
prev_queues = vf->num_vf_qs;
@@ -1142,8 +1144,10 @@ int ice_sriov_set_msix_vec_count(struct pci_dev *vf_dev, int msix_vec_count)
vf->num_msix = prev_msix;
vf->num_vf_qs = prev_queues;
vf->first_vector_idx = ice_sriov_get_irqs(pf, vf->num_msix);
- if (vf->first_vector_idx < 0)
+ if (vf->first_vector_idx < 0) {
+ ice_put_vf(vf);
return -EINVAL;
+ }
if (needs_rebuild) {
ice_vf_reconfig_vsi(vf);
--
2.25.1
From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com>
Without the locking amdgpu currently can race
amdgpu_ctx_set_entity_priority() and drm_sched_job_arm(), leading to the
latter accesing potentially inconsitent entity->sched_list and
entity->num_sched_list pair.
The comment on drm_sched_entity_modify_sched() however says:
"""
* Note that this must be called under the same common lock for @entity as
* drm_sched_job_arm() and drm_sched_entity_push_job(), or the driver needs to
* guarantee through some other means that this is never called while new jobs
* can be pushed to @entity.
"""
It is unclear if that is referring to this race or something else.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com>
Fixes: b37aced31eb0 ("drm/scheduler: implement a function to modify sched list")
Cc: Christian König <christian.koenig(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Cc: Luben Tuikov <ltuikov89(a)gmail.com>
Cc: Matthew Brost <matthew.brost(a)intel.com>
Cc: David Airlie <airlied(a)gmail.com>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v5.7+
---
drivers/gpu/drm/scheduler/sched_entity.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 58c8161289fe..ae8be30472cd 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -133,8 +133,10 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
{
WARN_ON(!num_sched_list || !sched_list);
+ spin_lock(&entity->rq_lock);
entity->sched_list = sched_list;
entity->num_sched_list = num_sched_list;
+ spin_unlock(&entity->rq_lock);
}
EXPORT_SYMBOL(drm_sched_entity_modify_sched);
--
2.46.0
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 9972605a238339b85bd16b084eed5f18414d22db
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024081211-owl-snowdrop-d2aa@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
9972605a2383 ("memcg: protect concurrent access to mem_cgroup_idr")
6f0df8e16eb5 ("memcontrol: ensure memcg acquired by id is properly set up")
e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
7348cc91821b ("mm: multi-gen LRU: remove aging fairness safeguard")
a579086c99ed ("mm: multi-gen LRU: remove eviction fairness safeguard")
adb8213014b2 ("mm: memcg: fix stale protection of reclaim target memcg")
57e9cc50f4dd ("mm: vmscan: split khugepaged stats from direct reclaim stats")
e4fea72b1438 ("mglru: mm/vmscan.c: fix imprecise comments")
d396def5d86d ("memcg: rearrange code")
410f8e82689e ("memcg: extract memcg_vmstats from struct mem_cgroup")
d6c3af7d8a2b ("mm: multi-gen LRU: debugfs interface")
1332a809d95a ("mm: multi-gen LRU: thrashing prevention")
354ed5974429 ("mm: multi-gen LRU: kill switch")
f76c83378851 ("mm: multi-gen LRU: optimize multiple memcgs")
bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap")
ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
ec1c86b25f4b ("mm: multi-gen LRU: groundwork")
f1e1a7be4718 ("mm/vmscan.c: refactor shrink_node()")
d3629af59f41 ("mm/vmscan: make the annotations of refaults code at the right place")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9972605a238339b85bd16b084eed5f18414d22db Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt(a)linux.dev>
Date: Fri, 2 Aug 2024 16:58:22 -0700
Subject: [PATCH] memcg: protect concurrent access to mem_cgroup_idr
Commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after
many small jobs") decoupled the memcg IDs from the CSS ID space to fix the
cgroup creation failures. It introduced IDR to maintain the memcg ID
space. The IDR depends on external synchronization mechanisms for
modifications. For the mem_cgroup_idr, the idr_alloc() and idr_replace()
happen within css callback and thus are protected through cgroup_mutex
from concurrent modifications. However idr_remove() for mem_cgroup_idr
was not protected against concurrency and can be run concurrently for
different memcgs when they hit their refcnt to zero. Fix that.
We have been seeing list_lru based kernel crashes at a low frequency in
our fleet for a long time. These crashes were in different part of
list_lru code including list_lru_add(), list_lru_del() and reparenting
code. Upon further inspection, it looked like for a given object (dentry
and inode), the super_block's list_lru didn't have list_lru_one for the
memcg of that object. The initial suspicions were either the object is
not allocated through kmem_cache_alloc_lru() or somehow
memcg_list_lru_alloc() failed to allocate list_lru_one() for a memcg but
returned success. No evidence were found for these cases.
Looking more deeply, we started seeing situations where valid memcg's id
is not present in mem_cgroup_idr and in some cases multiple valid memcgs
have same id and mem_cgroup_idr is pointing to one of them. So, the most
reasonable explanation is that these situations can happen due to race
between multiple idr_remove() calls or race between
idr_alloc()/idr_replace() and idr_remove(). These races are causing
multiple memcgs to acquire the same ID and then offlining of one of them
would cleanup list_lrus on the system for all of them. Later access from
other memcgs to the list_lru cause crashes due to missing list_lru_one.
Link: https://lkml.kernel.org/r/20240802235822.1830976-1-shakeel.butt@linux.dev
Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs")
Signed-off-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Acked-by: Muchun Song <muchun.song(a)linux.dev>
Reviewed-by: Roman Gushchin <roman.gushchin(a)linux.dev>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 960371788687..f29157288b7d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3386,11 +3386,28 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
static DEFINE_IDR(mem_cgroup_idr);
+static DEFINE_SPINLOCK(memcg_idr_lock);
+
+static int mem_cgroup_alloc_id(void)
+{
+ int ret;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&memcg_idr_lock);
+ ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1,
+ GFP_NOWAIT);
+ spin_unlock(&memcg_idr_lock);
+ idr_preload_end();
+ return ret;
+}
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
{
if (memcg->id.id > 0) {
+ spin_lock(&memcg_idr_lock);
idr_remove(&mem_cgroup_idr, memcg->id.id);
+ spin_unlock(&memcg_idr_lock);
+
memcg->id.id = 0;
}
}
@@ -3524,8 +3541,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg)
return ERR_PTR(error);
- memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
- 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
+ memcg->id.id = mem_cgroup_alloc_id();
if (memcg->id.id < 0) {
error = memcg->id.id;
goto fail;
@@ -3667,7 +3683,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
* publish it here at the end of onlining. This matches the
* regular ID destruction during offlining.
*/
+ spin_lock(&memcg_idr_lock);
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ spin_unlock(&memcg_idr_lock);
return 0;
offline_kmem:
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 9972605a238339b85bd16b084eed5f18414d22db
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024081259-plow-freezing-a93e@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
9972605a2383 ("memcg: protect concurrent access to mem_cgroup_idr")
6f0df8e16eb5 ("memcontrol: ensure memcg acquired by id is properly set up")
e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
7348cc91821b ("mm: multi-gen LRU: remove aging fairness safeguard")
a579086c99ed ("mm: multi-gen LRU: remove eviction fairness safeguard")
adb8213014b2 ("mm: memcg: fix stale protection of reclaim target memcg")
57e9cc50f4dd ("mm: vmscan: split khugepaged stats from direct reclaim stats")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9972605a238339b85bd16b084eed5f18414d22db Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt(a)linux.dev>
Date: Fri, 2 Aug 2024 16:58:22 -0700
Subject: [PATCH] memcg: protect concurrent access to mem_cgroup_idr
Commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after
many small jobs") decoupled the memcg IDs from the CSS ID space to fix the
cgroup creation failures. It introduced IDR to maintain the memcg ID
space. The IDR depends on external synchronization mechanisms for
modifications. For the mem_cgroup_idr, the idr_alloc() and idr_replace()
happen within css callback and thus are protected through cgroup_mutex
from concurrent modifications. However idr_remove() for mem_cgroup_idr
was not protected against concurrency and can be run concurrently for
different memcgs when they hit their refcnt to zero. Fix that.
We have been seeing list_lru based kernel crashes at a low frequency in
our fleet for a long time. These crashes were in different part of
list_lru code including list_lru_add(), list_lru_del() and reparenting
code. Upon further inspection, it looked like for a given object (dentry
and inode), the super_block's list_lru didn't have list_lru_one for the
memcg of that object. The initial suspicions were either the object is
not allocated through kmem_cache_alloc_lru() or somehow
memcg_list_lru_alloc() failed to allocate list_lru_one() for a memcg but
returned success. No evidence were found for these cases.
Looking more deeply, we started seeing situations where valid memcg's id
is not present in mem_cgroup_idr and in some cases multiple valid memcgs
have same id and mem_cgroup_idr is pointing to one of them. So, the most
reasonable explanation is that these situations can happen due to race
between multiple idr_remove() calls or race between
idr_alloc()/idr_replace() and idr_remove(). These races are causing
multiple memcgs to acquire the same ID and then offlining of one of them
would cleanup list_lrus on the system for all of them. Later access from
other memcgs to the list_lru cause crashes due to missing list_lru_one.
Link: https://lkml.kernel.org/r/20240802235822.1830976-1-shakeel.butt@linux.dev
Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs")
Signed-off-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Acked-by: Muchun Song <muchun.song(a)linux.dev>
Reviewed-by: Roman Gushchin <roman.gushchin(a)linux.dev>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 960371788687..f29157288b7d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3386,11 +3386,28 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
static DEFINE_IDR(mem_cgroup_idr);
+static DEFINE_SPINLOCK(memcg_idr_lock);
+
+static int mem_cgroup_alloc_id(void)
+{
+ int ret;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&memcg_idr_lock);
+ ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1,
+ GFP_NOWAIT);
+ spin_unlock(&memcg_idr_lock);
+ idr_preload_end();
+ return ret;
+}
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
{
if (memcg->id.id > 0) {
+ spin_lock(&memcg_idr_lock);
idr_remove(&mem_cgroup_idr, memcg->id.id);
+ spin_unlock(&memcg_idr_lock);
+
memcg->id.id = 0;
}
}
@@ -3524,8 +3541,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg)
return ERR_PTR(error);
- memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
- 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
+ memcg->id.id = mem_cgroup_alloc_id();
if (memcg->id.id < 0) {
error = memcg->id.id;
goto fail;
@@ -3667,7 +3683,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
* publish it here at the end of onlining. This matches the
* regular ID destruction during offlining.
*/
+ spin_lock(&memcg_idr_lock);
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ spin_unlock(&memcg_idr_lock);
return 0;
offline_kmem:
From: Filipe Manana <fdmanana(a)suse.com>
commit cd9253c23aedd61eb5ff11f37a36247cd46faf86 upstream.
If we have 2 threads that are using the same file descriptor and one of
them is doing direct IO writes while the other is doing fsync, we have a
race where we can end up either:
1) Attempt a fsync without holding the inode's lock, triggering an
assertion failures when assertions are enabled;
2) Do an invalid memory access from the fsync task because the file private
points to memory allocated on stack by the direct IO task and it may be
used by the fsync task after the stack was destroyed.
The race happens like this:
1) A user space program opens a file descriptor with O_DIRECT;
2) The program spawns 2 threads using libpthread for example;
3) One of the threads uses the file descriptor to do direct IO writes,
while the other calls fsync using the same file descriptor.
4) Call task A the thread doing direct IO writes and task B the thread
doing fsyncs;
5) Task A does a direct IO write, and at btrfs_direct_write() sets the
file's private to an on stack allocated private with the member
'fsync_skip_inode_lock' set to true;
6) Task B enters btrfs_sync_file() and sees that there's a private
structure associated to the file which has 'fsync_skip_inode_lock' set
to true, so it skips locking the inode's vfs lock;
7) Task A completes the direct IO write, and resets the file's private to
NULL since it had no prior private and our private was stack allocated.
Then it unlocks the inode's vfs lock;
8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
assertion that checks the inode's vfs lock is held fails, since task B
never locked it and task A has already unlocked it.
The stack trace produced is the following:
Aug 21 11:46:43 kerberos kernel: assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
Aug 21 11:46:43 kerberos kernel: ------------[ cut here ]------------
Aug 21 11:46:43 kerberos kernel: kernel BUG at fs/btrfs/ordered-data.c:983!
Aug 21 11:46:43 kerberos kernel: Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
Aug 21 11:46:43 kerberos kernel: CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
Aug 21 11:46:43 kerberos kernel: Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
Aug 21 11:46:43 kerberos kernel: RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
Aug 21 11:46:43 kerberos kernel: Code: 50 d6 86 c0 e8 (...)
Aug 21 11:46:43 kerberos kernel: RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
Aug 21 11:46:43 kerberos kernel: RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
Aug 21 11:46:43 kerberos kernel: RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
Aug 21 11:46:43 kerberos kernel: RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
Aug 21 11:46:43 kerberos kernel: R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
Aug 21 11:46:43 kerberos kernel: R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
Aug 21 11:46:43 kerberos kernel: FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
Aug 21 11:46:43 kerberos kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Aug 21 11:46:43 kerberos kernel: CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
Aug 21 11:46:43 kerberos kernel: Call Trace:
Aug 21 11:46:43 kerberos kernel: <TASK>
Aug 21 11:46:43 kerberos kernel: ? __die_body.cold+0x14/0x24
Aug 21 11:46:43 kerberos kernel: ? die+0x2e/0x50
Aug 21 11:46:43 kerberos kernel: ? do_trap+0xca/0x110
Aug 21 11:46:43 kerberos kernel: ? do_error_trap+0x6a/0x90
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? exc_invalid_op+0x50/0x70
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? asm_exc_invalid_op+0x1a/0x20
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? __seccomp_filter+0x31d/0x4f0
Aug 21 11:46:43 kerberos kernel: __x64_sys_fdatasync+0x4f/0x90
Aug 21 11:46:43 kerberos kernel: do_syscall_64+0x82/0x160
Aug 21 11:46:43 kerberos kernel: ? do_futex+0xcb/0x190
Aug 21 11:46:43 kerberos kernel: ? __x64_sys_futex+0x10e/0x1d0
Aug 21 11:46:43 kerberos kernel: ? switch_fpu_return+0x4f/0xd0
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: entry_SYSCALL_64_after_hwframe+0x76/0x7e
Another problem here is if task B grabs the private pointer and then uses
it after task A has finished, since the private was allocated in the stack
of trask A, it results in some invalid memory access with a hard to predict
result.
This issue, triggering the assertion, was observed with QEMU workloads by
two users in the Link tags below.
Fix this by not relying on a file's private to pass information to fsync
that it should skip locking the inode and instead pass this information
through a special value stored in current->journal_info. This is safe
because in the relevant section of the direct IO write path we are not
holding a transaction handle, so current->journal_info is NULL.
The following C program triggers the issue:
$ cat repro.c
/* Get the O_DIRECT definition. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
static int fd;
static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
{
while (count > 0) {
ssize_t ret;
ret = pwrite(fd, buf, count, offset);
if (ret < 0) {
if (errno == EINTR)
continue;
return ret;
}
count -= ret;
buf += ret;
}
return 0;
}
static void *fsync_loop(void *arg)
{
while (1) {
int ret;
ret = fsync(fd);
if (ret != 0) {
perror("Fsync failed");
exit(6);
}
}
}
int main(int argc, char *argv[])
{
long pagesize;
void *write_buf;
pthread_t fsyncer;
int ret;
if (argc != 2) {
fprintf(stderr, "Use: %s <file path>\n", argv[0]);
return 1;
}
fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
if (fd == -1) {
perror("Failed to open/create file");
return 1;
}
pagesize = sysconf(_SC_PAGE_SIZE);
if (pagesize == -1) {
perror("Failed to get page size");
return 2;
}
ret = posix_memalign(&write_buf, pagesize, pagesize);
if (ret) {
perror("Failed to allocate buffer");
return 3;
}
ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
if (ret != 0) {
fprintf(stderr, "Failed to create writer thread: %d\n", ret);
return 4;
}
while (1) {
ret = do_write(fd, write_buf, pagesize, 0);
if (ret != 0) {
perror("Write failed");
exit(5);
}
}
return 0;
}
$ mkfs.btrfs -f /dev/sdi
$ mount /dev/sdi /mnt/sdi
$ timeout 10 ./repro /mnt/sdi/foo
Usually the race is triggered within less than 1 second. A test case for
fstests will follow soon.
Reported-by: Paulo Dias <paulo.miguel.dias(a)gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
Reported-by: Andreas Jahn <jahn-andi(a)web.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
Reported-by: syzbot+4704b3cc972bd76024f1(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/ctree.h | 1 -
fs/btrfs/file.c | 25 ++++++++++---------------
fs/btrfs/transaction.h | 6 ++++++
3 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 853b1f96b1fd..cca1acf2e037 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1553,7 +1553,6 @@ struct btrfs_drop_extents_args {
struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
- bool fsync_skip_inode_lock;
};
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e23d178f9778..c8231677c79e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1534,13 +1534,6 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (IS_ERR_OR_NULL(dio)) {
err = PTR_ERR_OR_ZERO(dio);
} else {
- struct btrfs_file_private stack_private = { 0 };
- struct btrfs_file_private *private;
- const bool have_private = (file->private_data != NULL);
-
- if (!have_private)
- file->private_data = &stack_private;
-
/*
* If we have a synchoronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
@@ -1549,13 +1542,10 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
- private = file->private_data;
- private->fsync_skip_inode_lock = true;
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
err = iomap_dio_complete(dio);
- private->fsync_skip_inode_lock = false;
-
- if (!have_private)
- file->private_data = NULL;
+ current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
@@ -1795,7 +1785,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1805,7 +1794,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
+ bool skip_ilock = false;
+
+ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
+ skip_ilock = true;
+ current->journal_info = NULL;
+ lockdep_assert_held(&inode->i_rwsem);
+ }
trace_btrfs_sync_file(file, datasync);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 970ff316069d..8b88446df36d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -11,6 +11,12 @@
#include "delayed-ref.h"
#include "ctree.h"
+/*
+ * Signal that a direct IO write is in progress, to avoid deadlock for sync
+ * direct IO writes when fsync is called during the direct IO write path.
+ */
+#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
+
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
TRANS_STATE_COMMIT_START,
--
2.43.0
From: Filipe Manana <fdmanana(a)suse.com>
commit cd9253c23aedd61eb5ff11f37a36247cd46faf86 upstream.
If we have 2 threads that are using the same file descriptor and one of
them is doing direct IO writes while the other is doing fsync, we have a
race where we can end up either:
1) Attempt a fsync without holding the inode's lock, triggering an
assertion failures when assertions are enabled;
2) Do an invalid memory access from the fsync task because the file private
points to memory allocated on stack by the direct IO task and it may be
used by the fsync task after the stack was destroyed.
The race happens like this:
1) A user space program opens a file descriptor with O_DIRECT;
2) The program spawns 2 threads using libpthread for example;
3) One of the threads uses the file descriptor to do direct IO writes,
while the other calls fsync using the same file descriptor.
4) Call task A the thread doing direct IO writes and task B the thread
doing fsyncs;
5) Task A does a direct IO write, and at btrfs_direct_write() sets the
file's private to an on stack allocated private with the member
'fsync_skip_inode_lock' set to true;
6) Task B enters btrfs_sync_file() and sees that there's a private
structure associated to the file which has 'fsync_skip_inode_lock' set
to true, so it skips locking the inode's vfs lock;
7) Task A completes the direct IO write, and resets the file's private to
NULL since it had no prior private and our private was stack allocated.
Then it unlocks the inode's vfs lock;
8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
assertion that checks the inode's vfs lock is held fails, since task B
never locked it and task A has already unlocked it.
The stack trace produced is the following:
Aug 21 11:46:43 kerberos kernel: assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
Aug 21 11:46:43 kerberos kernel: ------------[ cut here ]------------
Aug 21 11:46:43 kerberos kernel: kernel BUG at fs/btrfs/ordered-data.c:983!
Aug 21 11:46:43 kerberos kernel: Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
Aug 21 11:46:43 kerberos kernel: CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
Aug 21 11:46:43 kerberos kernel: Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
Aug 21 11:46:43 kerberos kernel: RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
Aug 21 11:46:43 kerberos kernel: Code: 50 d6 86 c0 e8 (...)
Aug 21 11:46:43 kerberos kernel: RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
Aug 21 11:46:43 kerberos kernel: RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
Aug 21 11:46:43 kerberos kernel: RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
Aug 21 11:46:43 kerberos kernel: RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
Aug 21 11:46:43 kerberos kernel: R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
Aug 21 11:46:43 kerberos kernel: R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
Aug 21 11:46:43 kerberos kernel: FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
Aug 21 11:46:43 kerberos kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Aug 21 11:46:43 kerberos kernel: CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
Aug 21 11:46:43 kerberos kernel: Call Trace:
Aug 21 11:46:43 kerberos kernel: <TASK>
Aug 21 11:46:43 kerberos kernel: ? __die_body.cold+0x14/0x24
Aug 21 11:46:43 kerberos kernel: ? die+0x2e/0x50
Aug 21 11:46:43 kerberos kernel: ? do_trap+0xca/0x110
Aug 21 11:46:43 kerberos kernel: ? do_error_trap+0x6a/0x90
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? exc_invalid_op+0x50/0x70
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? asm_exc_invalid_op+0x1a/0x20
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? __seccomp_filter+0x31d/0x4f0
Aug 21 11:46:43 kerberos kernel: __x64_sys_fdatasync+0x4f/0x90
Aug 21 11:46:43 kerberos kernel: do_syscall_64+0x82/0x160
Aug 21 11:46:43 kerberos kernel: ? do_futex+0xcb/0x190
Aug 21 11:46:43 kerberos kernel: ? __x64_sys_futex+0x10e/0x1d0
Aug 21 11:46:43 kerberos kernel: ? switch_fpu_return+0x4f/0xd0
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: entry_SYSCALL_64_after_hwframe+0x76/0x7e
Another problem here is if task B grabs the private pointer and then uses
it after task A has finished, since the private was allocated in the stack
of trask A, it results in some invalid memory access with a hard to predict
result.
This issue, triggering the assertion, was observed with QEMU workloads by
two users in the Link tags below.
Fix this by not relying on a file's private to pass information to fsync
that it should skip locking the inode and instead pass this information
through a special value stored in current->journal_info. This is safe
because in the relevant section of the direct IO write path we are not
holding a transaction handle, so current->journal_info is NULL.
The following C program triggers the issue:
$ cat repro.c
/* Get the O_DIRECT definition. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
static int fd;
static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
{
while (count > 0) {
ssize_t ret;
ret = pwrite(fd, buf, count, offset);
if (ret < 0) {
if (errno == EINTR)
continue;
return ret;
}
count -= ret;
buf += ret;
}
return 0;
}
static void *fsync_loop(void *arg)
{
while (1) {
int ret;
ret = fsync(fd);
if (ret != 0) {
perror("Fsync failed");
exit(6);
}
}
}
int main(int argc, char *argv[])
{
long pagesize;
void *write_buf;
pthread_t fsyncer;
int ret;
if (argc != 2) {
fprintf(stderr, "Use: %s <file path>\n", argv[0]);
return 1;
}
fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
if (fd == -1) {
perror("Failed to open/create file");
return 1;
}
pagesize = sysconf(_SC_PAGE_SIZE);
if (pagesize == -1) {
perror("Failed to get page size");
return 2;
}
ret = posix_memalign(&write_buf, pagesize, pagesize);
if (ret) {
perror("Failed to allocate buffer");
return 3;
}
ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
if (ret != 0) {
fprintf(stderr, "Failed to create writer thread: %d\n", ret);
return 4;
}
while (1) {
ret = do_write(fd, write_buf, pagesize, 0);
if (ret != 0) {
perror("Write failed");
exit(5);
}
}
return 0;
}
$ mkfs.btrfs -f /dev/sdi
$ mount /dev/sdi /mnt/sdi
$ timeout 10 ./repro /mnt/sdi/foo
Usually the race is triggered within less than 1 second. A test case for
fstests will follow soon.
Reported-by: Paulo Dias <paulo.miguel.dias(a)gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
Reported-by: Andreas Jahn <jahn-andi(a)web.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
Reported-by: syzbot+4704b3cc972bd76024f1(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/ctree.h | 1 -
fs/btrfs/file.c | 25 ++++++++++---------------
fs/btrfs/transaction.h | 6 ++++++
3 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 86c7f8ce1715..06333a74d6c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -445,7 +445,6 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
- bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 952cf145c629..15fd8c00f4c0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1543,13 +1543,6 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (IS_ERR_OR_NULL(dio)) {
err = PTR_ERR_OR_ZERO(dio);
} else {
- struct btrfs_file_private stack_private = { 0 };
- struct btrfs_file_private *private;
- const bool have_private = (file->private_data != NULL);
-
- if (!have_private)
- file->private_data = &stack_private;
-
/*
* If we have a synchoronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
@@ -1558,13 +1551,10 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
- private = file->private_data;
- private->fsync_skip_inode_lock = true;
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
err = iomap_dio_complete(dio);
- private->fsync_skip_inode_lock = false;
-
- if (!have_private)
- file->private_data = NULL;
+ current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
@@ -1796,7 +1786,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1806,7 +1795,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
+ bool skip_ilock = false;
+
+ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
+ skip_ilock = true;
+ current->journal_info = NULL;
+ lockdep_assert_held(&inode->i_rwsem);
+ }
trace_btrfs_sync_file(file, datasync);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 238a0ab85df9..7623db359881 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -12,6 +12,12 @@
#include "ctree.h"
#include "misc.h"
+/*
+ * Signal that a direct IO write is in progress, to avoid deadlock for sync
+ * direct IO writes when fsync is called during the direct IO write path.
+ */
+#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
+
/* Radix-tree tag for roots that are part of the trasaction. */
#define BTRFS_ROOT_TRANS_TAG 0
--
2.43.0
From: Filipe Manana <fdmanana(a)suse.com>
commit cd9253c23aedd61eb5ff11f37a36247cd46faf86 upstream.
If we have 2 threads that are using the same file descriptor and one of
them is doing direct IO writes while the other is doing fsync, we have a
race where we can end up either:
1) Attempt a fsync without holding the inode's lock, triggering an
assertion failures when assertions are enabled;
2) Do an invalid memory access from the fsync task because the file private
points to memory allocated on stack by the direct IO task and it may be
used by the fsync task after the stack was destroyed.
The race happens like this:
1) A user space program opens a file descriptor with O_DIRECT;
2) The program spawns 2 threads using libpthread for example;
3) One of the threads uses the file descriptor to do direct IO writes,
while the other calls fsync using the same file descriptor.
4) Call task A the thread doing direct IO writes and task B the thread
doing fsyncs;
5) Task A does a direct IO write, and at btrfs_direct_write() sets the
file's private to an on stack allocated private with the member
'fsync_skip_inode_lock' set to true;
6) Task B enters btrfs_sync_file() and sees that there's a private
structure associated to the file which has 'fsync_skip_inode_lock' set
to true, so it skips locking the inode's vfs lock;
7) Task A completes the direct IO write, and resets the file's private to
NULL since it had no prior private and our private was stack allocated.
Then it unlocks the inode's vfs lock;
8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
assertion that checks the inode's vfs lock is held fails, since task B
never locked it and task A has already unlocked it.
The stack trace produced is the following:
Aug 21 11:46:43 kerberos kernel: assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
Aug 21 11:46:43 kerberos kernel: ------------[ cut here ]------------
Aug 21 11:46:43 kerberos kernel: kernel BUG at fs/btrfs/ordered-data.c:983!
Aug 21 11:46:43 kerberos kernel: Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
Aug 21 11:46:43 kerberos kernel: CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
Aug 21 11:46:43 kerberos kernel: Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
Aug 21 11:46:43 kerberos kernel: RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
Aug 21 11:46:43 kerberos kernel: Code: 50 d6 86 c0 e8 (...)
Aug 21 11:46:43 kerberos kernel: RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
Aug 21 11:46:43 kerberos kernel: RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
Aug 21 11:46:43 kerberos kernel: RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
Aug 21 11:46:43 kerberos kernel: RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
Aug 21 11:46:43 kerberos kernel: R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
Aug 21 11:46:43 kerberos kernel: R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
Aug 21 11:46:43 kerberos kernel: FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
Aug 21 11:46:43 kerberos kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Aug 21 11:46:43 kerberos kernel: CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
Aug 21 11:46:43 kerberos kernel: Call Trace:
Aug 21 11:46:43 kerberos kernel: <TASK>
Aug 21 11:46:43 kerberos kernel: ? __die_body.cold+0x14/0x24
Aug 21 11:46:43 kerberos kernel: ? die+0x2e/0x50
Aug 21 11:46:43 kerberos kernel: ? do_trap+0xca/0x110
Aug 21 11:46:43 kerberos kernel: ? do_error_trap+0x6a/0x90
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? exc_invalid_op+0x50/0x70
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? asm_exc_invalid_op+0x1a/0x20
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
Aug 21 11:46:43 kerberos kernel: ? __seccomp_filter+0x31d/0x4f0
Aug 21 11:46:43 kerberos kernel: __x64_sys_fdatasync+0x4f/0x90
Aug 21 11:46:43 kerberos kernel: do_syscall_64+0x82/0x160
Aug 21 11:46:43 kerberos kernel: ? do_futex+0xcb/0x190
Aug 21 11:46:43 kerberos kernel: ? __x64_sys_futex+0x10e/0x1d0
Aug 21 11:46:43 kerberos kernel: ? switch_fpu_return+0x4f/0xd0
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: ? syscall_exit_to_user_mode+0x72/0x220
Aug 21 11:46:43 kerberos kernel: ? do_syscall_64+0x8e/0x160
Aug 21 11:46:43 kerberos kernel: entry_SYSCALL_64_after_hwframe+0x76/0x7e
Another problem here is if task B grabs the private pointer and then uses
it after task A has finished, since the private was allocated in the stack
of trask A, it results in some invalid memory access with a hard to predict
result.
This issue, triggering the assertion, was observed with QEMU workloads by
two users in the Link tags below.
Fix this by not relying on a file's private to pass information to fsync
that it should skip locking the inode and instead pass this information
through a special value stored in current->journal_info. This is safe
because in the relevant section of the direct IO write path we are not
holding a transaction handle, so current->journal_info is NULL.
The following C program triggers the issue:
$ cat repro.c
/* Get the O_DIRECT definition. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
static int fd;
static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
{
while (count > 0) {
ssize_t ret;
ret = pwrite(fd, buf, count, offset);
if (ret < 0) {
if (errno == EINTR)
continue;
return ret;
}
count -= ret;
buf += ret;
}
return 0;
}
static void *fsync_loop(void *arg)
{
while (1) {
int ret;
ret = fsync(fd);
if (ret != 0) {
perror("Fsync failed");
exit(6);
}
}
}
int main(int argc, char *argv[])
{
long pagesize;
void *write_buf;
pthread_t fsyncer;
int ret;
if (argc != 2) {
fprintf(stderr, "Use: %s <file path>\n", argv[0]);
return 1;
}
fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
if (fd == -1) {
perror("Failed to open/create file");
return 1;
}
pagesize = sysconf(_SC_PAGE_SIZE);
if (pagesize == -1) {
perror("Failed to get page size");
return 2;
}
ret = posix_memalign(&write_buf, pagesize, pagesize);
if (ret) {
perror("Failed to allocate buffer");
return 3;
}
ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
if (ret != 0) {
fprintf(stderr, "Failed to create writer thread: %d\n", ret);
return 4;
}
while (1) {
ret = do_write(fd, write_buf, pagesize, 0);
if (ret != 0) {
perror("Write failed");
exit(5);
}
}
return 0;
}
$ mkfs.btrfs -f /dev/sdi
$ mount /dev/sdi /mnt/sdi
$ timeout 10 ./repro /mnt/sdi/foo
Usually the race is triggered within less than 1 second. A test case for
fstests will follow soon.
Reported-by: Paulo Dias <paulo.miguel.dias(a)gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
Reported-by: Andreas Jahn <jahn-andi(a)web.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
Reported-by: syzbot+4704b3cc972bd76024f1(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/ctree.h | 1 -
fs/btrfs/file.c | 25 ++++++++++---------------
fs/btrfs/transaction.h | 6 ++++++
3 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a56209d275c1..b2e4b30b8fae 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -457,7 +457,6 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
- bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ca434f0cd27f..66dfee873906 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1558,13 +1558,6 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
} else {
- struct btrfs_file_private stack_private = { 0 };
- struct btrfs_file_private *private;
- const bool have_private = (file->private_data != NULL);
-
- if (!have_private)
- file->private_data = &stack_private;
-
/*
* If we have a synchoronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
@@ -1573,13 +1566,10 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
- private = file->private_data;
- private->fsync_skip_inode_lock = true;
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
ret = iomap_dio_complete(dio);
- private->fsync_skip_inode_lock = false;
-
- if (!have_private)
- file->private_data = NULL;
+ current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
@@ -1811,7 +1801,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -1821,7 +1810,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
+ bool skip_ilock = false;
+
+ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
+ skip_ilock = true;
+ current->journal_info = NULL;
+ lockdep_assert_held(&inode->i_rwsem);
+ }
trace_btrfs_sync_file(file, datasync);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4e451ab173b1..62ec85f4b777 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,6 +27,12 @@ struct btrfs_root_item;
struct btrfs_root;
struct btrfs_path;
+/*
+ * Signal that a direct IO write is in progress, to avoid deadlock for sync
+ * direct IO writes when fsync is called during the direct IO write path.
+ */
+#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
+
/* Radix-tree tag for roots that are part of the trasaction. */
#define BTRFS_ROOT_TRANS_TAG 0
--
2.43.0
From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com>
Since drm_sched_entity_modify_sched() can modify the entities run queue
lets make sure to only derefernce the pointer once so both adding and
waking up are guaranteed to be consistent.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com>
Fixes: b37aced31eb0 ("drm/scheduler: implement a function to modify sched list")
Cc: Christian König <christian.koenig(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Cc: Luben Tuikov <ltuikov89(a)gmail.com>
Cc: Matthew Brost <matthew.brost(a)intel.com>
Cc: David Airlie <airlied(a)gmail.com>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v5.7+
---
drivers/gpu/drm/scheduler/sched_entity.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index ae8be30472cd..62b07ef7630a 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -599,6 +599,8 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
/* first job wakes up scheduler */
if (first) {
+ struct drm_sched_rq *rq;
+
/* Add the entity to the run queue */
spin_lock(&entity->rq_lock);
if (entity->stopped) {
@@ -608,13 +610,15 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
return;
}
- drm_sched_rq_add_entity(entity->rq, entity);
+ rq = entity->rq;
+
+ drm_sched_rq_add_entity(rq, entity);
spin_unlock(&entity->rq_lock);
if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
drm_sched_rq_update_fifo(entity, submit_ts);
- drm_sched_wakeup(entity->rq->sched, entity);
+ drm_sched_wakeup(rq->sched, entity);
}
}
EXPORT_SYMBOL(drm_sched_entity_push_job);
--
2.46.0
Memory access #VEs are hard for Linux to handle in contexts like the
entry code or NMIs. But other OSes need them for functionality.
There's a static (pre-guest-boot) way for a VMM to choose one or the
other. But VMMs don't always know which OS they are booting, so they
choose to deliver those #VEs so the "other" OSes will work. That,
unfortunately has left us in the lurch and exposed to these
hard-to-handle #VEs.
The TDX module has introduced a new feature. Even if the static
configuration is set to "send nasty #VEs", the kernel can dynamically
request that they be disabled. Once they are disabled, access to private
memory that is not in the Mapped state in the Secure-EPT (SEPT) will
result in an exit to the VMM rather than injecting a #VE.
Check if the feature is available and disable SEPT #VE if possible.
If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE
attribute is no longer reliable. It reflects the initial state of the
control for the TD, but it will not be updated if someone (e.g. bootloader)
changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
determine if SEPT #VEs are enabled or disabled.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Fixes: 373e715e31bf ("x86/tdx: Panic on bad configs that #VE on "private" memory access")
Cc: stable(a)vger.kernel.org
Acked-by: Kai Huang <kai.huang(a)intel.com>
---
arch/x86/coco/tdx/tdx.c | 76 ++++++++++++++++++++++++-------
arch/x86/include/asm/shared/tdx.h | 10 +++-
2 files changed, 69 insertions(+), 17 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 08ce488b54d0..f969f4f5ebf8 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -78,7 +78,7 @@ static inline void tdcall(u64 fn, struct tdx_module_args *args)
}
/* Read TD-scoped metadata */
-static inline u64 __maybe_unused tdg_vm_rd(u64 field, u64 *value)
+static inline u64 tdg_vm_rd(u64 field, u64 *value)
{
struct tdx_module_args args = {
.rdx = field,
@@ -193,6 +193,62 @@ static void __noreturn tdx_panic(const char *msg)
__tdx_hypercall(&args);
}
+/*
+ * The kernel cannot handle #VEs when accessing normal kernel memory. Ensure
+ * that no #VE will be delivered for accesses to TD-private memory.
+ *
+ * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
+ * controls if the guest will receive such #VE with TD attribute
+ * ATTR_SEPT_VE_DISABLE.
+ *
+ * Newer TDX modules allow the guest to control if it wants to receive SEPT
+ * violation #VEs.
+ *
+ * Check if the feature is available and disable SEPT #VE if possible.
+ *
+ * If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE
+ * attribute is no longer reliable. It reflects the initial state of the
+ * control for the TD, but it will not be updated if someone (e.g. bootloader)
+ * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
+ * determine if SEPT #VEs are enabled or disabled.
+ */
+static void disable_sept_ve(u64 td_attr)
+{
+ const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
+ bool debug = td_attr & ATTR_DEBUG;
+ u64 config, controls;
+
+ /* Is this TD allowed to disable SEPT #VE */
+ tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
+ if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
+ /* No SEPT #VE controls for the guest: check the attribute */
+ if (td_attr & ATTR_SEPT_VE_DISABLE)
+ return;
+
+ /* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
+ if (debug)
+ pr_warn("%s\n", msg);
+ else
+ tdx_panic(msg);
+ return;
+ }
+
+ /* Check if SEPT #VE has been disabled before us */
+ tdg_vm_rd(TDCS_TD_CTLS, &controls);
+ if (controls & TD_CTLS_PENDING_VE_DISABLE)
+ return;
+
+ /* Keep #VEs enabled for splats in debugging environments */
+ if (debug)
+ return;
+
+ /* Disable SEPT #VEs */
+ tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE,
+ TD_CTLS_PENDING_VE_DISABLE);
+
+ return;
+}
+
static void tdx_setup(u64 *cc_mask)
{
struct tdx_module_args args = {};
@@ -218,24 +274,12 @@ static void tdx_setup(u64 *cc_mask)
gpa_width = args.rcx & GENMASK(5, 0);
*cc_mask = BIT_ULL(gpa_width - 1);
+ td_attr = args.rdx;
+
/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
- /*
- * The kernel can not handle #VE's when accessing normal kernel
- * memory. Ensure that no #VE will be delivered for accesses to
- * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
- */
- td_attr = args.rdx;
- if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
- const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
-
- /* Relax SEPT_VE_DISABLE check for debug TD. */
- if (td_attr & ATTR_DEBUG)
- pr_warn("%s\n", msg);
- else
- tdx_panic(msg);
- }
+ disable_sept_ve(td_attr);
}
/*
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index 7e12cfa28bec..fecb2a6e864b 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -19,9 +19,17 @@
#define TDG_VM_RD 7
#define TDG_VM_WR 8
-/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
+/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */
+#define TDCS_CONFIG_FLAGS 0x1110000300000016
+#define TDCS_TD_CTLS 0x1110000300000017
#define TDCS_NOTIFY_ENABLES 0x9100000000000010
+/* TDCS_CONFIG_FLAGS bits */
+#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1)
+
+/* TDCS_TD_CTLS bits */
+#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0)
+
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001
#define TDVMCALL_GET_QUOTE 0x10002
--
2.45.2
Rename tdx_parse_tdinfo() to tdx_setup() and move setting NOTIFY_ENABLES
there.
The function will be extended to adjust TD configuration.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy(a)linux.intel.com>
Reviewed-by: Kai Huang <kai.huang(a)intel.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/coco/tdx/tdx.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 64717a96a936..08ce488b54d0 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -193,7 +193,7 @@ static void __noreturn tdx_panic(const char *msg)
__tdx_hypercall(&args);
}
-static void tdx_parse_tdinfo(u64 *cc_mask)
+static void tdx_setup(u64 *cc_mask)
{
struct tdx_module_args args = {};
unsigned int gpa_width;
@@ -218,6 +218,9 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
gpa_width = args.rcx & GENMASK(5, 0);
*cc_mask = BIT_ULL(gpa_width - 1);
+ /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
+ tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
+
/*
* The kernel can not handle #VE's when accessing normal kernel
* memory. Ensure that no #VE will be delivered for accesses to
@@ -964,11 +967,11 @@ void __init tdx_early_init(void)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
cc_vendor = CC_VENDOR_INTEL;
- tdx_parse_tdinfo(&cc_mask);
- cc_set_mask(cc_mask);
- /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
- tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
+ /* Configure the TD */
+ tdx_setup(&cc_mask);
+
+ cc_set_mask(cc_mask);
/*
* All bits above GPA width are reserved and kernel treats shared bit
--
2.45.2
From: Jason Andryuk <jason.andryuk(a)amd.com>
Hi Arthur,
Can you give the patch below a try? If it works, please respond with a
Tested-by. I'll then submit it with your Reported-by and Tested-by.
Thanks,
Jason
[PATCH] fbdev/xen-fbfront: Assign fb_info->device
Probing xen-fbfront faults in video_is_primary_device(). The passed-in
struct device is NULL since xen-fbfront doesn't assign it and the
memory is kzalloc()-ed. Assign fb_info->device to avoid this.
This was exposed by the conversion of fb_is_primary_device() to
video_is_primary_device() which dropped a NULL check for struct device.
Fixes: f178e96de7f0 ("arch: Remove struct fb_info from video helpers")
CC: stable(a)vger.kernel.org
Signed-off-by: Jason Andryuk <jason.andryuk(a)amd.com>
---
The other option would be to re-instate the NULL check in
video_is_primary_device()
---
drivers/video/fbdev/xen-fbfront.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/video/fbdev/xen-fbfront.c b/drivers/video/fbdev/xen-fbfront.c
index 66d4628a96ae..c90f48ebb15e 100644
--- a/drivers/video/fbdev/xen-fbfront.c
+++ b/drivers/video/fbdev/xen-fbfront.c
@@ -407,6 +407,7 @@ static int xenfb_probe(struct xenbus_device *dev,
/* complete the abuse: */
fb_info->pseudo_palette = fb_info->par;
fb_info->par = info;
+ fb_info->device = &dev->dev;
fb_info->screen_buffer = info->fb;
--
2.43.0
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 2ab9d830262c132ab5db2f571003d80850d56b2a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090922-directed-majorette-f8ad@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
2ab9d830262c ("perf/aux: Fix AUX buffer serialization")
c1e8d7c6a7a6 ("mmap locking API: convert mmap_sem comments")
d8ed45c5dcd4 ("mmap locking API: use coccinelle to convert mmap_sem rwsem call sites")
5a36f0f3f518 ("Merge tag 'vfio-v5.8-rc1' of git://github.com/awilliam/linux-vfio")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2ab9d830262c132ab5db2f571003d80850d56b2a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Mon, 2 Sep 2024 10:14:24 +0200
Subject: [PATCH] perf/aux: Fix AUX buffer serialization
Ole reported that event->mmap_mutex is strictly insufficient to
serialize the AUX buffer, add a per RB mutex to fully serialize it.
Note that in the lock order comment the perf_event::mmap_mutex order
was already wrong, that is, it nesting under mmap_lock is not new with
this patch.
Fixes: 45bfb2e50471 ("perf: Add AUX area to ring buffer for raw data streams")
Reported-by: Ole <ole(a)binarygecko.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c973e3c11e03..8a6c6bbcd658 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1255,8 +1255,9 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::mutex
* perf_event::child_mutex;
* perf_event_context::lock
- * perf_event::mmap_mutex
* mmap_lock
+ * perf_event::mmap_mutex
+ * perf_buffer::aux_mutex
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
@@ -6373,12 +6374,11 @@ static void perf_mmap_close(struct vm_area_struct *vma)
event->pmu->event_unmapped(event, vma->vm_mm);
/*
- * rb->aux_mmap_count will always drop before rb->mmap_count and
- * event->mmap_count, so it is ok to use event->mmap_mutex to
- * serialize with perf_mmap here.
+ * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
+ * to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
@@ -6395,7 +6395,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
rb_free_aux(rb);
WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
- mutex_unlock(&event->mmap_mutex);
+ mutex_unlock(&rb->aux_mutex);
}
if (atomic_dec_and_test(&rb->mmap_count))
@@ -6483,6 +6483,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
+ struct mutex *aux_mutex = NULL;
struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
unsigned long vma_size;
@@ -6531,6 +6532,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!rb)
goto aux_unlock;
+ aux_mutex = &rb->aux_mutex;
+ mutex_lock(aux_mutex);
+
aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);
@@ -6681,6 +6685,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
atomic_dec(&rb->mmap_count);
}
aux_unlock:
+ if (aux_mutex)
+ mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
/*
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 451514442a1b..e072d995d670 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -40,6 +40,7 @@ struct perf_buffer {
struct user_struct *mmap_user;
/* AUX area */
+ struct mutex aux_mutex;
long aux_head;
unsigned int aux_nest;
long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 8cadf97bc290..4f46f688d0d4 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -337,6 +337,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
*/
if (!rb->nr_pages)
rb->paused = 1;
+
+ mutex_init(&rb->aux_mutex);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 2ab9d830262c132ab5db2f571003d80850d56b2a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090922-footbath-barrack-c29b@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
2ab9d830262c ("perf/aux: Fix AUX buffer serialization")
c1e8d7c6a7a6 ("mmap locking API: convert mmap_sem comments")
d8ed45c5dcd4 ("mmap locking API: use coccinelle to convert mmap_sem rwsem call sites")
5a36f0f3f518 ("Merge tag 'vfio-v5.8-rc1' of git://github.com/awilliam/linux-vfio")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2ab9d830262c132ab5db2f571003d80850d56b2a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Mon, 2 Sep 2024 10:14:24 +0200
Subject: [PATCH] perf/aux: Fix AUX buffer serialization
Ole reported that event->mmap_mutex is strictly insufficient to
serialize the AUX buffer, add a per RB mutex to fully serialize it.
Note that in the lock order comment the perf_event::mmap_mutex order
was already wrong, that is, it nesting under mmap_lock is not new with
this patch.
Fixes: 45bfb2e50471 ("perf: Add AUX area to ring buffer for raw data streams")
Reported-by: Ole <ole(a)binarygecko.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c973e3c11e03..8a6c6bbcd658 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1255,8 +1255,9 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::mutex
* perf_event::child_mutex;
* perf_event_context::lock
- * perf_event::mmap_mutex
* mmap_lock
+ * perf_event::mmap_mutex
+ * perf_buffer::aux_mutex
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
@@ -6373,12 +6374,11 @@ static void perf_mmap_close(struct vm_area_struct *vma)
event->pmu->event_unmapped(event, vma->vm_mm);
/*
- * rb->aux_mmap_count will always drop before rb->mmap_count and
- * event->mmap_count, so it is ok to use event->mmap_mutex to
- * serialize with perf_mmap here.
+ * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
+ * to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
@@ -6395,7 +6395,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
rb_free_aux(rb);
WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
- mutex_unlock(&event->mmap_mutex);
+ mutex_unlock(&rb->aux_mutex);
}
if (atomic_dec_and_test(&rb->mmap_count))
@@ -6483,6 +6483,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
+ struct mutex *aux_mutex = NULL;
struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
unsigned long vma_size;
@@ -6531,6 +6532,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!rb)
goto aux_unlock;
+ aux_mutex = &rb->aux_mutex;
+ mutex_lock(aux_mutex);
+
aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);
@@ -6681,6 +6685,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
atomic_dec(&rb->mmap_count);
}
aux_unlock:
+ if (aux_mutex)
+ mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
/*
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 451514442a1b..e072d995d670 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -40,6 +40,7 @@ struct perf_buffer {
struct user_struct *mmap_user;
/* AUX area */
+ struct mutex aux_mutex;
long aux_head;
unsigned int aux_nest;
long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 8cadf97bc290..4f46f688d0d4 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -337,6 +337,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
*/
if (!rb->nr_pages)
rb->paused = 1;
+
+ mutex_init(&rb->aux_mutex);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 471ef0b5a8aaca4296108e756b970acfc499ede4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090938-resale-impose-3699@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
471ef0b5a8aa ("clocksource/drivers/timer-of: Remove percpu irq related code")
0f1a7b3fac05 ("timer-of: don't use conditional expression with mixed 'void' types")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 471ef0b5a8aaca4296108e756b970acfc499ede4 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano(a)linaro.org>
Date: Mon, 19 Aug 2024 12:03:35 +0200
Subject: [PATCH] clocksource/drivers/timer-of: Remove percpu irq related code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
GCC's named address space checks errors out with:
drivers/clocksource/timer-of.c: In function ‘timer_of_irq_exit’:
drivers/clocksource/timer-of.c:29:46: error: passing argument 2 of
‘free_percpu_irq’ from pointer to non-enclosed address space
29 | free_percpu_irq(of_irq->irq, clkevt);
| ^~~~~~
In file included from drivers/clocksource/timer-of.c:8:
./include/linux/interrupt.h:201:43: note: expected ‘__seg_gs void *’
but argument is of type ‘struct clock_event_device *’
201 | extern void free_percpu_irq(unsigned int, void __percpu *);
| ^~~~~~~~~~~~~~~
drivers/clocksource/timer-of.c: In function ‘timer_of_irq_init’:
drivers/clocksource/timer-of.c:74:51: error: passing argument 4 of
‘request_percpu_irq’ from pointer to non-enclosed address space
74 | np->full_name, clkevt) :
| ^~~~~~
./include/linux/interrupt.h:190:56: note: expected ‘__seg_gs void *’
but argument is of type ‘struct clock_event_device *’
190 | const char *devname, void __percpu *percpu_dev_id)
Sparse warns about:
timer-of.c:29:46: warning: incorrect type in argument 2 (different address spaces)
timer-of.c:29:46: expected void [noderef] __percpu *
timer-of.c:29:46: got struct clock_event_device *clkevt
timer-of.c:74:51: warning: incorrect type in argument 4 (different address spaces)
timer-of.c:74:51: expected void [noderef] __percpu *percpu_dev_id
timer-of.c:74:51: got struct clock_event_device *clkevt
It appears the code is incorrect as reported by Uros Bizjak:
"The referred code is questionable as it tries to reuse
the clkevent pointer once as percpu pointer and once as generic
pointer, which should be avoided."
This change removes the percpu related code as no drivers is using it.
[Daniel: Fixed the description]
Fixes: dc11bae785295 ("clocksource/drivers: Add timer-of common init routine")
Reported-by: Uros Bizjak <ubizjak(a)gmail.com>
Tested-by: Uros Bizjak <ubizjak(a)gmail.com>
Link: https://lore.kernel.org/r/20240819100335.2394751-1-daniel.lezcano@linaro.org
Signed-off-by: Daniel Lezcano <daniel.lezcano(a)linaro.org>
diff --git a/drivers/clocksource/timer-of.c b/drivers/clocksource/timer-of.c
index c3f54d9912be..420202bf76e4 100644
--- a/drivers/clocksource/timer-of.c
+++ b/drivers/clocksource/timer-of.c
@@ -25,10 +25,7 @@ static __init void timer_of_irq_exit(struct of_timer_irq *of_irq)
struct clock_event_device *clkevt = &to->clkevt;
- if (of_irq->percpu)
- free_percpu_irq(of_irq->irq, clkevt);
- else
- free_irq(of_irq->irq, clkevt);
+ free_irq(of_irq->irq, clkevt);
}
/**
@@ -42,9 +39,6 @@ static __init void timer_of_irq_exit(struct of_timer_irq *of_irq)
* - Get interrupt number by name
* - Get interrupt number by index
*
- * When the interrupt is per CPU, 'request_percpu_irq()' is called,
- * otherwise 'request_irq()' is used.
- *
* Returns 0 on success, < 0 otherwise
*/
static __init int timer_of_irq_init(struct device_node *np,
@@ -69,12 +63,9 @@ static __init int timer_of_irq_init(struct device_node *np,
return -EINVAL;
}
- ret = of_irq->percpu ?
- request_percpu_irq(of_irq->irq, of_irq->handler,
- np->full_name, clkevt) :
- request_irq(of_irq->irq, of_irq->handler,
- of_irq->flags ? of_irq->flags : IRQF_TIMER,
- np->full_name, clkevt);
+ ret = request_irq(of_irq->irq, of_irq->handler,
+ of_irq->flags ? of_irq->flags : IRQF_TIMER,
+ np->full_name, clkevt);
if (ret) {
pr_err("Failed to request irq %d for %pOF\n", of_irq->irq, np);
return ret;
diff --git a/drivers/clocksource/timer-of.h b/drivers/clocksource/timer-of.h
index a5478f3e8589..01a2c6b7db06 100644
--- a/drivers/clocksource/timer-of.h
+++ b/drivers/clocksource/timer-of.h
@@ -11,7 +11,6 @@
struct of_timer_irq {
int irq;
int index;
- int percpu;
const char *name;
unsigned long flags;
irq_handler_t handler;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 8679e8b4a1ebdb40c4429e49368d29353e07b601
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090922-marathon-revival-7771@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
8679e8b4a1eb ("nvmem: u-boot-env: error if NVMEM device is too small")
6bafe07c9306 ("nvmem: u-boot-env: improve coding style")
a832556d23c5 ("nvmem: u-boot-env: use nvmem device helpers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8679e8b4a1ebdb40c4429e49368d29353e07b601 Mon Sep 17 00:00:00 2001
From: John Thomson <git(a)johnthomson.fastmail.com.au>
Date: Mon, 2 Sep 2024 15:25:08 +0100
Subject: [PATCH] nvmem: u-boot-env: error if NVMEM device is too small
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Verify data size before trying to parse it to avoid reading out of
buffer. This could happen in case of problems at MTD level or invalid DT
bindings.
Signed-off-by: John Thomson <git(a)johnthomson.fastmail.com.au>
Cc: stable <stable(a)kernel.org>
Fixes: d5542923f200 ("nvmem: add driver handling U-Boot environment variables")
[rmilecki: simplify commit description & rebase]
Signed-off-by: Rafał Miłecki <rafal(a)milecki.pl>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla(a)linaro.org>
Link: https://lore.kernel.org/r/20240902142510.71096-2-srinivas.kandagatla@linaro…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/nvmem/u-boot-env.c b/drivers/nvmem/u-boot-env.c
index 936e39b20b38..593f0bf4a395 100644
--- a/drivers/nvmem/u-boot-env.c
+++ b/drivers/nvmem/u-boot-env.c
@@ -176,6 +176,13 @@ static int u_boot_env_parse(struct u_boot_env *priv)
data_offset = offsetof(struct u_boot_env_image_broadcom, data);
break;
}
+
+ if (dev_size < data_offset) {
+ dev_err(dev, "Device too small for u-boot-env\n");
+ err = -EIO;
+ goto err_kfree;
+ }
+
crc32_addr = (__le32 *)(buf + crc32_offset);
crc32 = le32_to_cpu(*crc32_addr);
crc32_data_len = dev_size - crc32_data_offset;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 8679e8b4a1ebdb40c4429e49368d29353e07b601
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090921-cycling-overfed-49be@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
8679e8b4a1eb ("nvmem: u-boot-env: error if NVMEM device is too small")
6bafe07c9306 ("nvmem: u-boot-env: improve coding style")
a832556d23c5 ("nvmem: u-boot-env: use nvmem device helpers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8679e8b4a1ebdb40c4429e49368d29353e07b601 Mon Sep 17 00:00:00 2001
From: John Thomson <git(a)johnthomson.fastmail.com.au>
Date: Mon, 2 Sep 2024 15:25:08 +0100
Subject: [PATCH] nvmem: u-boot-env: error if NVMEM device is too small
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Verify data size before trying to parse it to avoid reading out of
buffer. This could happen in case of problems at MTD level or invalid DT
bindings.
Signed-off-by: John Thomson <git(a)johnthomson.fastmail.com.au>
Cc: stable <stable(a)kernel.org>
Fixes: d5542923f200 ("nvmem: add driver handling U-Boot environment variables")
[rmilecki: simplify commit description & rebase]
Signed-off-by: Rafał Miłecki <rafal(a)milecki.pl>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla(a)linaro.org>
Link: https://lore.kernel.org/r/20240902142510.71096-2-srinivas.kandagatla@linaro…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/nvmem/u-boot-env.c b/drivers/nvmem/u-boot-env.c
index 936e39b20b38..593f0bf4a395 100644
--- a/drivers/nvmem/u-boot-env.c
+++ b/drivers/nvmem/u-boot-env.c
@@ -176,6 +176,13 @@ static int u_boot_env_parse(struct u_boot_env *priv)
data_offset = offsetof(struct u_boot_env_image_broadcom, data);
break;
}
+
+ if (dev_size < data_offset) {
+ dev_err(dev, "Device too small for u-boot-env\n");
+ err = -EIO;
+ goto err_kfree;
+ }
+
crc32_addr = (__le32 *)(buf + crc32_offset);
crc32 = le32_to_cpu(*crc32_addr);
crc32_data_len = dev_size - crc32_data_offset;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 00dcf2fa449f23a263343d7fe051741bdde65d0b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090925-chokehold-enhance-1d11@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
00dcf2fa449f ("usb: dwc3: Avoid waking up gadget during startxfer")
047161686b81 ("usb: dwc3: Add remote wakeup handling")
a02a26eb0aea ("usb: dwc3: gadget: Ignore Update Transfer cmd params")
63c4c320ccf7 ("usb: dwc3: gadget: Check for L1/L2/U3 for Start Transfer")
40edb52298df ("usb: dwc3: avoid NULL access of usb_gadget_driver")
c560e76319a9 ("usb: dwc3: gadget: Fix START_TRANSFER link state check")
475e8be53d04 ("usb: dwc3: gadget: Check for disabled LPM quirk")
82c46b8ed9dc ("usb: dwc3: gadget: Introduce a DWC3 VBUS draw callback")
f580170f135a ("usb: dwc3: Add splitdisable quirk for Hisilicon Kirin Soc")
e81a7018d93a ("usb: dwc3: allocate gadget structure dynamically")
c5a7092f4015 ("usb: dwc3: gadget: make starting isoc transfers more robust")
9af21dd6faeb ("usb: dwc3: Add support for DWC_usb32 IP")
8bb14308a869 ("usb: dwc3: core: Use role-switch default dr_mode")
d0550cd20e52 ("usb: dwc3: gadget: Do link recovery for SS and SSP")
d94ea5319813 ("usb: dwc3: gadget: Properly set maxpacket limit")
586f4335700f ("usb: dwc3: Fix GTXFIFOSIZ.TXFDEP macro name")
5eb5afb07853 ("usb: dwc3: use proper initializers for property entries")
9ba3aca8fe82 ("usb: dwc3: Disable phy suspend after power-on reset")
a0a465569b45 ("usb: dwc3: remove generic PHY calibrate() calls")
c09b73cfac2a ("usb: dwc3: don't set gadget->is_otg flag")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 00dcf2fa449f23a263343d7fe051741bdde65d0b Mon Sep 17 00:00:00 2001
From: Prashanth K <quic_prashk(a)quicinc.com>
Date: Wed, 28 Aug 2024 12:13:02 +0530
Subject: [PATCH] usb: dwc3: Avoid waking up gadget during startxfer
When operating in High-Speed, it is observed that DSTS[USBLNKST] doesn't
update link state immediately after receiving the wakeup interrupt. Since
wakeup event handler calls the resume callbacks, there is a chance that
function drivers can perform an ep queue, which in turn tries to perform
remote wakeup from send_gadget_ep_cmd(STARTXFER). This happens because
DSTS[[21:18] wasn't updated to U0 yet, it's observed that the latency of
DSTS can be in order of milli-seconds. Hence avoid calling gadget_wakeup
during startxfer to prevent unnecessarily issuing remote wakeup to host.
Fixes: c36d8e947a56 ("usb: dwc3: gadget: put link to U0 before Start Transfer")
Cc: stable(a)vger.kernel.org
Suggested-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Prashanth K <quic_prashk(a)quicinc.com>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Link: https://lore.kernel.org/r/20240828064302.3796315-1-quic_prashk@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 89fc690fdf34..291bc549935b 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -287,6 +287,23 @@ static int __dwc3_gadget_wakeup(struct dwc3 *dwc, bool async);
*
* Caller should handle locking. This function will issue @cmd with given
* @params to @dep and wait for its completion.
+ *
+ * According to the programming guide, if the link state is in L1/L2/U3,
+ * then sending the Start Transfer command may not complete. The
+ * programming guide suggested to bring the link state back to ON/U0 by
+ * performing remote wakeup prior to sending the command. However, don't
+ * initiate remote wakeup when the user/function does not send wakeup
+ * request via wakeup ops. Send the command when it's allowed.
+ *
+ * Notes:
+ * For L1 link state, issuing a command requires the clearing of
+ * GUSB2PHYCFG.SUSPENDUSB2, which turns on the signal required to complete
+ * the given command (usually within 50us). This should happen within the
+ * command timeout set by driver. No additional step is needed.
+ *
+ * For L2 or U3 link state, the gadget is in USB suspend. Care should be
+ * taken when sending Start Transfer command to ensure that it's done after
+ * USB resume.
*/
int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
struct dwc3_gadget_ep_cmd_params *params)
@@ -327,30 +344,6 @@ int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg);
}
- if (DWC3_DEPCMD_CMD(cmd) == DWC3_DEPCMD_STARTTRANSFER) {
- int link_state;
-
- /*
- * Initiate remote wakeup if the link state is in U3 when
- * operating in SS/SSP or L1/L2 when operating in HS/FS. If the
- * link state is in U1/U2, no remote wakeup is needed. The Start
- * Transfer command will initiate the link recovery.
- */
- link_state = dwc3_gadget_get_link_state(dwc);
- switch (link_state) {
- case DWC3_LINK_STATE_U2:
- if (dwc->gadget->speed >= USB_SPEED_SUPER)
- break;
-
- fallthrough;
- case DWC3_LINK_STATE_U3:
- ret = __dwc3_gadget_wakeup(dwc, false);
- dev_WARN_ONCE(dwc->dev, ret, "wakeup failed --> %d\n",
- ret);
- break;
- }
- }
-
/*
* For some commands such as Update Transfer command, DEPCMDPARn
* registers are reserved. Since the driver often sends Update Transfer
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 00dcf2fa449f23a263343d7fe051741bdde65d0b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090924-crescent-edging-00be@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
00dcf2fa449f ("usb: dwc3: Avoid waking up gadget during startxfer")
047161686b81 ("usb: dwc3: Add remote wakeup handling")
a02a26eb0aea ("usb: dwc3: gadget: Ignore Update Transfer cmd params")
63c4c320ccf7 ("usb: dwc3: gadget: Check for L1/L2/U3 for Start Transfer")
40edb52298df ("usb: dwc3: avoid NULL access of usb_gadget_driver")
c560e76319a9 ("usb: dwc3: gadget: Fix START_TRANSFER link state check")
475e8be53d04 ("usb: dwc3: gadget: Check for disabled LPM quirk")
82c46b8ed9dc ("usb: dwc3: gadget: Introduce a DWC3 VBUS draw callback")
f580170f135a ("usb: dwc3: Add splitdisable quirk for Hisilicon Kirin Soc")
e81a7018d93a ("usb: dwc3: allocate gadget structure dynamically")
c5a7092f4015 ("usb: dwc3: gadget: make starting isoc transfers more robust")
9af21dd6faeb ("usb: dwc3: Add support for DWC_usb32 IP")
8bb14308a869 ("usb: dwc3: core: Use role-switch default dr_mode")
d0550cd20e52 ("usb: dwc3: gadget: Do link recovery for SS and SSP")
d94ea5319813 ("usb: dwc3: gadget: Properly set maxpacket limit")
586f4335700f ("usb: dwc3: Fix GTXFIFOSIZ.TXFDEP macro name")
5eb5afb07853 ("usb: dwc3: use proper initializers for property entries")
9ba3aca8fe82 ("usb: dwc3: Disable phy suspend after power-on reset")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 00dcf2fa449f23a263343d7fe051741bdde65d0b Mon Sep 17 00:00:00 2001
From: Prashanth K <quic_prashk(a)quicinc.com>
Date: Wed, 28 Aug 2024 12:13:02 +0530
Subject: [PATCH] usb: dwc3: Avoid waking up gadget during startxfer
When operating in High-Speed, it is observed that DSTS[USBLNKST] doesn't
update link state immediately after receiving the wakeup interrupt. Since
wakeup event handler calls the resume callbacks, there is a chance that
function drivers can perform an ep queue, which in turn tries to perform
remote wakeup from send_gadget_ep_cmd(STARTXFER). This happens because
DSTS[[21:18] wasn't updated to U0 yet, it's observed that the latency of
DSTS can be in order of milli-seconds. Hence avoid calling gadget_wakeup
during startxfer to prevent unnecessarily issuing remote wakeup to host.
Fixes: c36d8e947a56 ("usb: dwc3: gadget: put link to U0 before Start Transfer")
Cc: stable(a)vger.kernel.org
Suggested-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Prashanth K <quic_prashk(a)quicinc.com>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Link: https://lore.kernel.org/r/20240828064302.3796315-1-quic_prashk@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 89fc690fdf34..291bc549935b 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -287,6 +287,23 @@ static int __dwc3_gadget_wakeup(struct dwc3 *dwc, bool async);
*
* Caller should handle locking. This function will issue @cmd with given
* @params to @dep and wait for its completion.
+ *
+ * According to the programming guide, if the link state is in L1/L2/U3,
+ * then sending the Start Transfer command may not complete. The
+ * programming guide suggested to bring the link state back to ON/U0 by
+ * performing remote wakeup prior to sending the command. However, don't
+ * initiate remote wakeup when the user/function does not send wakeup
+ * request via wakeup ops. Send the command when it's allowed.
+ *
+ * Notes:
+ * For L1 link state, issuing a command requires the clearing of
+ * GUSB2PHYCFG.SUSPENDUSB2, which turns on the signal required to complete
+ * the given command (usually within 50us). This should happen within the
+ * command timeout set by driver. No additional step is needed.
+ *
+ * For L2 or U3 link state, the gadget is in USB suspend. Care should be
+ * taken when sending Start Transfer command to ensure that it's done after
+ * USB resume.
*/
int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
struct dwc3_gadget_ep_cmd_params *params)
@@ -327,30 +344,6 @@ int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg);
}
- if (DWC3_DEPCMD_CMD(cmd) == DWC3_DEPCMD_STARTTRANSFER) {
- int link_state;
-
- /*
- * Initiate remote wakeup if the link state is in U3 when
- * operating in SS/SSP or L1/L2 when operating in HS/FS. If the
- * link state is in U1/U2, no remote wakeup is needed. The Start
- * Transfer command will initiate the link recovery.
- */
- link_state = dwc3_gadget_get_link_state(dwc);
- switch (link_state) {
- case DWC3_LINK_STATE_U2:
- if (dwc->gadget->speed >= USB_SPEED_SUPER)
- break;
-
- fallthrough;
- case DWC3_LINK_STATE_U3:
- ret = __dwc3_gadget_wakeup(dwc, false);
- dev_WARN_ONCE(dwc->dev, ret, "wakeup failed --> %d\n",
- ret);
- break;
- }
- }
-
/*
* For some commands such as Update Transfer command, DEPCMDPARn
* registers are reserved. Since the driver often sends Update Transfer
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 00dcf2fa449f23a263343d7fe051741bdde65d0b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090923-trustful-helium-3f09@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
00dcf2fa449f ("usb: dwc3: Avoid waking up gadget during startxfer")
047161686b81 ("usb: dwc3: Add remote wakeup handling")
a02a26eb0aea ("usb: dwc3: gadget: Ignore Update Transfer cmd params")
63c4c320ccf7 ("usb: dwc3: gadget: Check for L1/L2/U3 for Start Transfer")
40edb52298df ("usb: dwc3: avoid NULL access of usb_gadget_driver")
c560e76319a9 ("usb: dwc3: gadget: Fix START_TRANSFER link state check")
475e8be53d04 ("usb: dwc3: gadget: Check for disabled LPM quirk")
82c46b8ed9dc ("usb: dwc3: gadget: Introduce a DWC3 VBUS draw callback")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 00dcf2fa449f23a263343d7fe051741bdde65d0b Mon Sep 17 00:00:00 2001
From: Prashanth K <quic_prashk(a)quicinc.com>
Date: Wed, 28 Aug 2024 12:13:02 +0530
Subject: [PATCH] usb: dwc3: Avoid waking up gadget during startxfer
When operating in High-Speed, it is observed that DSTS[USBLNKST] doesn't
update link state immediately after receiving the wakeup interrupt. Since
wakeup event handler calls the resume callbacks, there is a chance that
function drivers can perform an ep queue, which in turn tries to perform
remote wakeup from send_gadget_ep_cmd(STARTXFER). This happens because
DSTS[[21:18] wasn't updated to U0 yet, it's observed that the latency of
DSTS can be in order of milli-seconds. Hence avoid calling gadget_wakeup
during startxfer to prevent unnecessarily issuing remote wakeup to host.
Fixes: c36d8e947a56 ("usb: dwc3: gadget: put link to U0 before Start Transfer")
Cc: stable(a)vger.kernel.org
Suggested-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Prashanth K <quic_prashk(a)quicinc.com>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Link: https://lore.kernel.org/r/20240828064302.3796315-1-quic_prashk@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 89fc690fdf34..291bc549935b 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -287,6 +287,23 @@ static int __dwc3_gadget_wakeup(struct dwc3 *dwc, bool async);
*
* Caller should handle locking. This function will issue @cmd with given
* @params to @dep and wait for its completion.
+ *
+ * According to the programming guide, if the link state is in L1/L2/U3,
+ * then sending the Start Transfer command may not complete. The
+ * programming guide suggested to bring the link state back to ON/U0 by
+ * performing remote wakeup prior to sending the command. However, don't
+ * initiate remote wakeup when the user/function does not send wakeup
+ * request via wakeup ops. Send the command when it's allowed.
+ *
+ * Notes:
+ * For L1 link state, issuing a command requires the clearing of
+ * GUSB2PHYCFG.SUSPENDUSB2, which turns on the signal required to complete
+ * the given command (usually within 50us). This should happen within the
+ * command timeout set by driver. No additional step is needed.
+ *
+ * For L2 or U3 link state, the gadget is in USB suspend. Care should be
+ * taken when sending Start Transfer command to ensure that it's done after
+ * USB resume.
*/
int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
struct dwc3_gadget_ep_cmd_params *params)
@@ -327,30 +344,6 @@ int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg);
}
- if (DWC3_DEPCMD_CMD(cmd) == DWC3_DEPCMD_STARTTRANSFER) {
- int link_state;
-
- /*
- * Initiate remote wakeup if the link state is in U3 when
- * operating in SS/SSP or L1/L2 when operating in HS/FS. If the
- * link state is in U1/U2, no remote wakeup is needed. The Start
- * Transfer command will initiate the link recovery.
- */
- link_state = dwc3_gadget_get_link_state(dwc);
- switch (link_state) {
- case DWC3_LINK_STATE_U2:
- if (dwc->gadget->speed >= USB_SPEED_SUPER)
- break;
-
- fallthrough;
- case DWC3_LINK_STATE_U3:
- ret = __dwc3_gadget_wakeup(dwc, false);
- dev_WARN_ONCE(dwc->dev, ret, "wakeup failed --> %d\n",
- ret);
- break;
- }
- }
-
/*
* For some commands such as Update Transfer command, DEPCMDPARn
* registers are reserved. Since the driver often sends Update Transfer
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 00dcf2fa449f23a263343d7fe051741bdde65d0b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090922-shrewdly-bright-07e5@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
00dcf2fa449f ("usb: dwc3: Avoid waking up gadget during startxfer")
047161686b81 ("usb: dwc3: Add remote wakeup handling")
a02a26eb0aea ("usb: dwc3: gadget: Ignore Update Transfer cmd params")
63c4c320ccf7 ("usb: dwc3: gadget: Check for L1/L2/U3 for Start Transfer")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 00dcf2fa449f23a263343d7fe051741bdde65d0b Mon Sep 17 00:00:00 2001
From: Prashanth K <quic_prashk(a)quicinc.com>
Date: Wed, 28 Aug 2024 12:13:02 +0530
Subject: [PATCH] usb: dwc3: Avoid waking up gadget during startxfer
When operating in High-Speed, it is observed that DSTS[USBLNKST] doesn't
update link state immediately after receiving the wakeup interrupt. Since
wakeup event handler calls the resume callbacks, there is a chance that
function drivers can perform an ep queue, which in turn tries to perform
remote wakeup from send_gadget_ep_cmd(STARTXFER). This happens because
DSTS[[21:18] wasn't updated to U0 yet, it's observed that the latency of
DSTS can be in order of milli-seconds. Hence avoid calling gadget_wakeup
during startxfer to prevent unnecessarily issuing remote wakeup to host.
Fixes: c36d8e947a56 ("usb: dwc3: gadget: put link to U0 before Start Transfer")
Cc: stable(a)vger.kernel.org
Suggested-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Prashanth K <quic_prashk(a)quicinc.com>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Link: https://lore.kernel.org/r/20240828064302.3796315-1-quic_prashk@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 89fc690fdf34..291bc549935b 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -287,6 +287,23 @@ static int __dwc3_gadget_wakeup(struct dwc3 *dwc, bool async);
*
* Caller should handle locking. This function will issue @cmd with given
* @params to @dep and wait for its completion.
+ *
+ * According to the programming guide, if the link state is in L1/L2/U3,
+ * then sending the Start Transfer command may not complete. The
+ * programming guide suggested to bring the link state back to ON/U0 by
+ * performing remote wakeup prior to sending the command. However, don't
+ * initiate remote wakeup when the user/function does not send wakeup
+ * request via wakeup ops. Send the command when it's allowed.
+ *
+ * Notes:
+ * For L1 link state, issuing a command requires the clearing of
+ * GUSB2PHYCFG.SUSPENDUSB2, which turns on the signal required to complete
+ * the given command (usually within 50us). This should happen within the
+ * command timeout set by driver. No additional step is needed.
+ *
+ * For L2 or U3 link state, the gadget is in USB suspend. Care should be
+ * taken when sending Start Transfer command to ensure that it's done after
+ * USB resume.
*/
int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
struct dwc3_gadget_ep_cmd_params *params)
@@ -327,30 +344,6 @@ int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg);
}
- if (DWC3_DEPCMD_CMD(cmd) == DWC3_DEPCMD_STARTTRANSFER) {
- int link_state;
-
- /*
- * Initiate remote wakeup if the link state is in U3 when
- * operating in SS/SSP or L1/L2 when operating in HS/FS. If the
- * link state is in U1/U2, no remote wakeup is needed. The Start
- * Transfer command will initiate the link recovery.
- */
- link_state = dwc3_gadget_get_link_state(dwc);
- switch (link_state) {
- case DWC3_LINK_STATE_U2:
- if (dwc->gadget->speed >= USB_SPEED_SUPER)
- break;
-
- fallthrough;
- case DWC3_LINK_STATE_U3:
- ret = __dwc3_gadget_wakeup(dwc, false);
- dev_WARN_ONCE(dwc->dev, ret, "wakeup failed --> %d\n",
- ret);
- break;
- }
- }
-
/*
* For some commands such as Update Transfer command, DEPCMDPARn
* registers are reserved. Since the driver often sends Update Transfer
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 00dcf2fa449f23a263343d7fe051741bdde65d0b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090921-chastity-calzone-c81f@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
00dcf2fa449f ("usb: dwc3: Avoid waking up gadget during startxfer")
047161686b81 ("usb: dwc3: Add remote wakeup handling")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 00dcf2fa449f23a263343d7fe051741bdde65d0b Mon Sep 17 00:00:00 2001
From: Prashanth K <quic_prashk(a)quicinc.com>
Date: Wed, 28 Aug 2024 12:13:02 +0530
Subject: [PATCH] usb: dwc3: Avoid waking up gadget during startxfer
When operating in High-Speed, it is observed that DSTS[USBLNKST] doesn't
update link state immediately after receiving the wakeup interrupt. Since
wakeup event handler calls the resume callbacks, there is a chance that
function drivers can perform an ep queue, which in turn tries to perform
remote wakeup from send_gadget_ep_cmd(STARTXFER). This happens because
DSTS[[21:18] wasn't updated to U0 yet, it's observed that the latency of
DSTS can be in order of milli-seconds. Hence avoid calling gadget_wakeup
during startxfer to prevent unnecessarily issuing remote wakeup to host.
Fixes: c36d8e947a56 ("usb: dwc3: gadget: put link to U0 before Start Transfer")
Cc: stable(a)vger.kernel.org
Suggested-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Prashanth K <quic_prashk(a)quicinc.com>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Link: https://lore.kernel.org/r/20240828064302.3796315-1-quic_prashk@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 89fc690fdf34..291bc549935b 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -287,6 +287,23 @@ static int __dwc3_gadget_wakeup(struct dwc3 *dwc, bool async);
*
* Caller should handle locking. This function will issue @cmd with given
* @params to @dep and wait for its completion.
+ *
+ * According to the programming guide, if the link state is in L1/L2/U3,
+ * then sending the Start Transfer command may not complete. The
+ * programming guide suggested to bring the link state back to ON/U0 by
+ * performing remote wakeup prior to sending the command. However, don't
+ * initiate remote wakeup when the user/function does not send wakeup
+ * request via wakeup ops. Send the command when it's allowed.
+ *
+ * Notes:
+ * For L1 link state, issuing a command requires the clearing of
+ * GUSB2PHYCFG.SUSPENDUSB2, which turns on the signal required to complete
+ * the given command (usually within 50us). This should happen within the
+ * command timeout set by driver. No additional step is needed.
+ *
+ * For L2 or U3 link state, the gadget is in USB suspend. Care should be
+ * taken when sending Start Transfer command to ensure that it's done after
+ * USB resume.
*/
int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
struct dwc3_gadget_ep_cmd_params *params)
@@ -327,30 +344,6 @@ int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, unsigned int cmd,
dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg);
}
- if (DWC3_DEPCMD_CMD(cmd) == DWC3_DEPCMD_STARTTRANSFER) {
- int link_state;
-
- /*
- * Initiate remote wakeup if the link state is in U3 when
- * operating in SS/SSP or L1/L2 when operating in HS/FS. If the
- * link state is in U1/U2, no remote wakeup is needed. The Start
- * Transfer command will initiate the link recovery.
- */
- link_state = dwc3_gadget_get_link_state(dwc);
- switch (link_state) {
- case DWC3_LINK_STATE_U2:
- if (dwc->gadget->speed >= USB_SPEED_SUPER)
- break;
-
- fallthrough;
- case DWC3_LINK_STATE_U3:
- ret = __dwc3_gadget_wakeup(dwc, false);
- dev_WARN_ONCE(dwc->dev, ret, "wakeup failed --> %d\n",
- ret);
- break;
- }
- }
-
/*
* For some commands such as Update Transfer command, DEPCMDPARn
* registers are reserved. Since the driver often sends Update Transfer
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 61cbfb5368dd50ed0d65ce21d305aa923581db2b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090916-dazzling-showman-af36@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
61cbfb5368dd ("iio: adc: ad7124: fix DT configuration parsing")
a6eaf02b8274 ("iio: adc: ad7124: Switch from of specific to fwnode based property handling")
1240c94ce819 ("iio: adc: Explicitly include correct DT includes")
4c077429b422 ("iio: mlx90614: Sort headers")
a99544c6c883 ("iio: adc: palmas: add support for iio threshold events")
2d48dbdfc7d4 ("iio: adc: palmas: move eventX_enable into palmas_adc_event")
7501a3a97e4f ("iio: adc: palmas: use iio_event_direction for threshold polarity")
d2ab4eea732d ("iio: adc: palmas: replace "wakeup" with "event"")
79d9622d622d ("iio: adc: palmas: remove adc_wakeupX_data")
6d52b0e70698 ("iio: adc: palmas: Take probe fully device managed.")
49f76c499d38 ("iio: adc: palmas_gpadc: fix NULL dereference on rmmod")
3a258747a01f ("iio: adc: ad7124: Silence no spi_device_id warnings")
5cfe8a1c2577 ("iio: adc: ad7124: add sequencer support")
fd5ba89e4cd4 ("iio: adc: ad7124: Add update_scan_mode")
3f1a9c392d69 ("iio:adc:palmas_gpadc: Switch from CONFIG_PM_SLEEP guards to pm_sleep_ptr()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 61cbfb5368dd50ed0d65ce21d305aa923581db2b Mon Sep 17 00:00:00 2001
From: Dumitru Ceclan <mitrutzceclan(a)gmail.com>
Date: Tue, 6 Aug 2024 11:51:33 +0300
Subject: [PATCH] iio: adc: ad7124: fix DT configuration parsing
The cfg pointer is set before reading the channel number that the
configuration should point to. This causes configurations to be shifted
by one channel.
For example setting bipolar to the first channel defined in the DT will
cause bipolar mode to be active on the second defined channel.
Fix by moving the cfg pointer setting after reading the channel number.
Fixes: 7b8d045e497a ("iio: adc: ad7124: allow more than 8 channels")
Signed-off-by: Dumitru Ceclan <dumitru.ceclan(a)analog.com>
Reviewed-by: Nuno Sa <nuno.sa(a)analog.com>
Link: https://patch.msgid.link/20240806085133.114547-1-dumitru.ceclan@analog.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c
index afb5f4d741e6..108e9ccab1ef 100644
--- a/drivers/iio/adc/ad7124.c
+++ b/drivers/iio/adc/ad7124.c
@@ -844,8 +844,6 @@ static int ad7124_parse_channel_config(struct iio_dev *indio_dev,
st->channels = channels;
device_for_each_child_node_scoped(dev, child) {
- cfg = &st->channels[channel].cfg;
-
ret = fwnode_property_read_u32(child, "reg", &channel);
if (ret)
return ret;
@@ -863,6 +861,7 @@ static int ad7124_parse_channel_config(struct iio_dev *indio_dev,
st->channels[channel].ain = AD7124_CHANNEL_AINP(ain[0]) |
AD7124_CHANNEL_AINM(ain[1]);
+ cfg = &st->channels[channel].cfg;
cfg->bipolar = fwnode_property_read_bool(child, "bipolar");
ret = fwnode_property_read_u32(child, "adi,reference-select", &tmp);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 61cbfb5368dd50ed0d65ce21d305aa923581db2b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090915-luridness-parameter-3447@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
61cbfb5368dd ("iio: adc: ad7124: fix DT configuration parsing")
a6eaf02b8274 ("iio: adc: ad7124: Switch from of specific to fwnode based property handling")
1240c94ce819 ("iio: adc: Explicitly include correct DT includes")
4c077429b422 ("iio: mlx90614: Sort headers")
a99544c6c883 ("iio: adc: palmas: add support for iio threshold events")
2d48dbdfc7d4 ("iio: adc: palmas: move eventX_enable into palmas_adc_event")
7501a3a97e4f ("iio: adc: palmas: use iio_event_direction for threshold polarity")
d2ab4eea732d ("iio: adc: palmas: replace "wakeup" with "event"")
79d9622d622d ("iio: adc: palmas: remove adc_wakeupX_data")
6d52b0e70698 ("iio: adc: palmas: Take probe fully device managed.")
49f76c499d38 ("iio: adc: palmas_gpadc: fix NULL dereference on rmmod")
3a258747a01f ("iio: adc: ad7124: Silence no spi_device_id warnings")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 61cbfb5368dd50ed0d65ce21d305aa923581db2b Mon Sep 17 00:00:00 2001
From: Dumitru Ceclan <mitrutzceclan(a)gmail.com>
Date: Tue, 6 Aug 2024 11:51:33 +0300
Subject: [PATCH] iio: adc: ad7124: fix DT configuration parsing
The cfg pointer is set before reading the channel number that the
configuration should point to. This causes configurations to be shifted
by one channel.
For example setting bipolar to the first channel defined in the DT will
cause bipolar mode to be active on the second defined channel.
Fix by moving the cfg pointer setting after reading the channel number.
Fixes: 7b8d045e497a ("iio: adc: ad7124: allow more than 8 channels")
Signed-off-by: Dumitru Ceclan <dumitru.ceclan(a)analog.com>
Reviewed-by: Nuno Sa <nuno.sa(a)analog.com>
Link: https://patch.msgid.link/20240806085133.114547-1-dumitru.ceclan@analog.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c
index afb5f4d741e6..108e9ccab1ef 100644
--- a/drivers/iio/adc/ad7124.c
+++ b/drivers/iio/adc/ad7124.c
@@ -844,8 +844,6 @@ static int ad7124_parse_channel_config(struct iio_dev *indio_dev,
st->channels = channels;
device_for_each_child_node_scoped(dev, child) {
- cfg = &st->channels[channel].cfg;
-
ret = fwnode_property_read_u32(child, "reg", &channel);
if (ret)
return ret;
@@ -863,6 +861,7 @@ static int ad7124_parse_channel_config(struct iio_dev *indio_dev,
st->channels[channel].ain = AD7124_CHANNEL_AINP(ain[0]) |
AD7124_CHANNEL_AINM(ain[1]);
+ cfg = &st->channels[channel].cfg;
cfg->bipolar = fwnode_property_read_bool(child, "bipolar");
ret = fwnode_property_read_u32(child, "adi,reference-select", &tmp);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 90826e08468ba7fb35d8b39645b22d9e80004afe
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090946-heading-mortality-97cb@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
90826e08468b ("iio: adc: ad7606: remove frstdata check for serial mode")
7989b4bb23fe ("iio: adc: ad7616: Add support for AD7616 ADC")
6bf229abce75 ("iio: adc: ad7606: Move oversampling options in chip info and rework *_avail attributes")
2985a5d88455 ("staging: iio: adc: ad7606: Move out of staging")
54160ae3b2d3 ("staging: iio: adc: ad7606: Misc style fixes (no functional change)")
cc49bd1652a4 ("staging: iio: adc: ad7606: Add support for threaded irq")
2bbf53e3e506 ("staging: iio: adc: ad7606: Simplify the Kconfing menu")
43f9b204edf0 ("staging: iio: adc: ad7606: Add OF device ID table")
41f71e5e7daf ("staging: iio: adc: ad7606: Use find_closest() macro")
c0683bfd3772 ("staging: iio: adc: ad7606: Use devm functions in probe")
557e585c3fdb ("staging: iio: adc: ad7606: Use wait-for-completion handler")
7c0bc65c8403 ("Merge tag 'iio-for-4.21a' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio into staging-testing")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 90826e08468ba7fb35d8b39645b22d9e80004afe Mon Sep 17 00:00:00 2001
From: Guillaume Stols <gstols(a)baylibre.com>
Date: Tue, 2 Jul 2024 12:52:51 +0000
Subject: [PATCH] iio: adc: ad7606: remove frstdata check for serial mode
The current implementation attempts to recover from an eventual glitch
in the clock by checking frstdata state after reading the first
channel's sample: If frstdata is low, it will reset the chip and
return -EIO.
This will only work in parallel mode, where frstdata pin is set low
after the 2nd sample read starts.
For the serial mode, according to the datasheet, "The FRSTDATA output
returns to a logic low following the 16th SCLK falling edge.", thus
after the Xth pulse, X being the number of bits in a sample, the check
will always be true, and the driver will not work at all in serial
mode if frstdata(optional) is defined in the devicetree as it will
reset the chip, and return -EIO every time read_sample is called.
Hence, this check must be removed for serial mode.
Fixes: b9618c0cacd7 ("staging: IIO: ADC: New driver for AD7606/AD7606-6/AD7606-4")
Signed-off-by: Guillaume Stols <gstols(a)baylibre.com>
Reviewed-by: Nuno Sa <nuno.sa(a)analog.com>
Link: https://patch.msgid.link/20240702-cleanup-ad7606-v3-1-18d5ea18770e@baylibre…
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/ad7606.c b/drivers/iio/adc/ad7606.c
index 3a417595294f..c321c6ef48df 100644
--- a/drivers/iio/adc/ad7606.c
+++ b/drivers/iio/adc/ad7606.c
@@ -49,7 +49,7 @@ static const unsigned int ad7616_oversampling_avail[8] = {
1, 2, 4, 8, 16, 32, 64, 128,
};
-static int ad7606_reset(struct ad7606_state *st)
+int ad7606_reset(struct ad7606_state *st)
{
if (st->gpio_reset) {
gpiod_set_value(st->gpio_reset, 1);
@@ -60,6 +60,7 @@ static int ad7606_reset(struct ad7606_state *st)
return -ENODEV;
}
+EXPORT_SYMBOL_NS_GPL(ad7606_reset, IIO_AD7606);
static int ad7606_reg_access(struct iio_dev *indio_dev,
unsigned int reg,
@@ -88,31 +89,6 @@ static int ad7606_read_samples(struct ad7606_state *st)
{
unsigned int num = st->chip_info->num_channels - 1;
u16 *data = st->data;
- int ret;
-
- /*
- * The frstdata signal is set to high while and after reading the sample
- * of the first channel and low for all other channels. This can be used
- * to check that the incoming data is correctly aligned. During normal
- * operation the data should never become unaligned, but some glitch or
- * electrostatic discharge might cause an extra read or clock cycle.
- * Monitoring the frstdata signal allows to recover from such failure
- * situations.
- */
-
- if (st->gpio_frstdata) {
- ret = st->bops->read_block(st->dev, 1, data);
- if (ret)
- return ret;
-
- if (!gpiod_get_value(st->gpio_frstdata)) {
- ad7606_reset(st);
- return -EIO;
- }
-
- data++;
- num--;
- }
return st->bops->read_block(st->dev, num, data);
}
diff --git a/drivers/iio/adc/ad7606.h b/drivers/iio/adc/ad7606.h
index 0c6a88cc4695..6649e84d25de 100644
--- a/drivers/iio/adc/ad7606.h
+++ b/drivers/iio/adc/ad7606.h
@@ -151,6 +151,8 @@ int ad7606_probe(struct device *dev, int irq, void __iomem *base_address,
const char *name, unsigned int id,
const struct ad7606_bus_ops *bops);
+int ad7606_reset(struct ad7606_state *st);
+
enum ad7606_supported_device_ids {
ID_AD7605_4,
ID_AD7606_8,
diff --git a/drivers/iio/adc/ad7606_par.c b/drivers/iio/adc/ad7606_par.c
index d8408052262e..6bc587b20f05 100644
--- a/drivers/iio/adc/ad7606_par.c
+++ b/drivers/iio/adc/ad7606_par.c
@@ -7,6 +7,7 @@
#include <linux/mod_devicetable.h>
#include <linux/module.h>
+#include <linux/gpio/consumer.h>
#include <linux/platform_device.h>
#include <linux/types.h>
#include <linux/err.h>
@@ -21,8 +22,29 @@ static int ad7606_par16_read_block(struct device *dev,
struct iio_dev *indio_dev = dev_get_drvdata(dev);
struct ad7606_state *st = iio_priv(indio_dev);
- insw((unsigned long)st->base_address, buf, count);
+ /*
+ * On the parallel interface, the frstdata signal is set to high while
+ * and after reading the sample of the first channel and low for all
+ * other channels. This can be used to check that the incoming data is
+ * correctly aligned. During normal operation the data should never
+ * become unaligned, but some glitch or electrostatic discharge might
+ * cause an extra read or clock cycle. Monitoring the frstdata signal
+ * allows to recover from such failure situations.
+ */
+ int num = count;
+ u16 *_buf = buf;
+
+ if (st->gpio_frstdata) {
+ insw((unsigned long)st->base_address, _buf, 1);
+ if (!gpiod_get_value(st->gpio_frstdata)) {
+ ad7606_reset(st);
+ return -EIO;
+ }
+ _buf++;
+ num--;
+ }
+ insw((unsigned long)st->base_address, _buf, num);
return 0;
}
@@ -35,8 +57,28 @@ static int ad7606_par8_read_block(struct device *dev,
{
struct iio_dev *indio_dev = dev_get_drvdata(dev);
struct ad7606_state *st = iio_priv(indio_dev);
+ /*
+ * On the parallel interface, the frstdata signal is set to high while
+ * and after reading the sample of the first channel and low for all
+ * other channels. This can be used to check that the incoming data is
+ * correctly aligned. During normal operation the data should never
+ * become unaligned, but some glitch or electrostatic discharge might
+ * cause an extra read or clock cycle. Monitoring the frstdata signal
+ * allows to recover from such failure situations.
+ */
+ int num = count;
+ u16 *_buf = buf;
- insb((unsigned long)st->base_address, buf, count * 2);
+ if (st->gpio_frstdata) {
+ insb((unsigned long)st->base_address, _buf, 2);
+ if (!gpiod_get_value(st->gpio_frstdata)) {
+ ad7606_reset(st);
+ return -EIO;
+ }
+ _buf++;
+ num--;
+ }
+ insb((unsigned long)st->base_address, _buf, num * 2);
return 0;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x b48aa991758999d4e8f9296c5bbe388f293ef465
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090940-shale-handcart-eb5d@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
b48aa9917589 ("staging: iio: frequency: ad9834: Validate frequency parameter value")
8e8040c52e63 ("staging: iio: frequency: ad9833: Load clock using clock framework")
80109c32348d ("staging: iio: frequency: ad9833: Get frequency value statically")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b48aa991758999d4e8f9296c5bbe388f293ef465 Mon Sep 17 00:00:00 2001
From: Aleksandr Mishin <amishin(a)t-argos.ru>
Date: Wed, 3 Jul 2024 18:45:06 +0300
Subject: [PATCH] staging: iio: frequency: ad9834: Validate frequency parameter
value
In ad9834_write_frequency() clk_get_rate() can return 0. In such case
ad9834_calc_freqreg() call will lead to division by zero. Checking
'if (fout > (clk_freq / 2))' doesn't protect in case of 'fout' is 0.
ad9834_write_frequency() is called from ad9834_write(), where fout is
taken from text buffer, which can contain any value.
Modify parameters checking.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Fixes: 12b9d5bf76bf ("Staging: IIO: DDS: AD9833 / AD9834 driver")
Suggested-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Aleksandr Mishin <amishin(a)t-argos.ru>
Reviewed-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Link: https://patch.msgid.link/20240703154506.25584-1-amishin@t-argos.ru
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/staging/iio/frequency/ad9834.c b/drivers/staging/iio/frequency/ad9834.c
index a7a5cdcc6590..47e7d7e6d920 100644
--- a/drivers/staging/iio/frequency/ad9834.c
+++ b/drivers/staging/iio/frequency/ad9834.c
@@ -114,7 +114,7 @@ static int ad9834_write_frequency(struct ad9834_state *st,
clk_freq = clk_get_rate(st->mclk);
- if (fout > (clk_freq / 2))
+ if (!clk_freq || fout > (clk_freq / 2))
return -EINVAL;
regval = ad9834_calc_freqreg(clk_freq, fout);
On Sun, 08 Sep 2024 14:36:37 +0100,
Sasha Levin <sashal(a)kernel.org> wrote:
>
> This is a note to let you know that I've just added the patch titled
>
> irqchip/gic-v4: Make sure a VPE is locked when VMAPP is issued
>
> to the 6.6-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> irqchip-gic-v4-make-sure-a-vpe-is-locked-when-vmapp-.patch
> and it can be found in the queue-6.6 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit 1a232324773145ff7ce59b6a1b52b3247223f9d4
> Author: Marc Zyngier <maz(a)kernel.org>
> Date: Fri Jul 5 10:31:55 2024 +0100
>
> irqchip/gic-v4: Make sure a VPE is locked when VMAPP is issued
>
> [ Upstream commit a84a07fa3100d7ad46a3d6882af25a3df9c9e7e3 ]
>
> In order to make sure that vpe->col_idx is correctly sampled when a VMAPP
> command is issued, the vpe_lock must be held for the VPE. This is now
> possible since the introduction of the per-VM vmapp_lock, which can be
> taken before vpe_lock in the correct locking order.
>
> Signed-off-by: Marc Zyngier <maz(a)kernel.org>
> Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
> Tested-by: Nianyao Tang <tangnianyao(a)huawei.com>
> Link: https://lore.kernel.org/r/20240705093155.871070-4-maz@kernel.org
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index e25dea0e50c7..1e0f0e1bf481 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -1804,7 +1804,9 @@ static void its_map_vm(struct its_node *its, struct its_vm *vm)
> for (i = 0; i < vm->nr_vpes; i++) {
> struct its_vpe *vpe = vm->vpes[i];
>
> - its_send_vmapp(its, vpe, true);
> + scoped_guard(raw_spinlock, &vpe->vpe_lock)
> + its_send_vmapp(its, vpe, true);
> +
> its_send_vinvall(its, vpe);
> }
> }
> @@ -1825,8 +1827,10 @@ static void its_unmap_vm(struct its_node *its, struct its_vm *vm)
> if (!--vm->vlpi_count[its->list_nr]) {
> int i;
>
> - for (i = 0; i < vm->nr_vpes; i++)
> + for (i = 0; i < vm->nr_vpes; i++) {
> + guard(raw_spinlock)(&vm->vpes[i]->vpe_lock);
> its_send_vmapp(its, vm->vpes[i], false);
> + }
> }
>
> raw_spin_unlock_irqrestore(&vmovp_lock, flags);
>
No please.
Not only you are missing the essential part of the series (the patch
introducing the per-VM lock that this change relies on), you are also
missing the fixes that followed.
So please drop this patch from the 6.6 and 6.1 queues.
M.
--
Without deviation from the norm, progress is not possible.
On Sun, Sep 08, 2024 at 09:25:58AM GMT, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> bcachefs: Add error code to defer option parsing
???
Sasha, this and the other patch aren't bugfixes at all, they're prep
work for the new mount API, i.e. feature work.
Please just drop the bcachefs patches from stable entirely; the lockless
IO patch revert is a fix but I'll be sending that with a couple other
fixes in a day or so.
Am Sonntag, dem 08.09.2024 um 09:37 -0400 schrieb Sasha Levin:
> This is a note to let you know that I've just added the patch titled
>
> wifi: mt76: mt7921: fix NULL pointer access in mt7921_ipv6_addr_change
>
> to the 6.6-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> wifi-mt76-mt7921-fix-null-pointer-access-in-mt7921_i.patch
> and it can be found in the queue-6.6 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit 857d7854c40324bfc70a6d32c9eb0792bc7c0b56
> Author: Bert Karwatzki <spasswolf(a)web.de>
> Date: Mon Aug 12 12:45:41 2024 +0200
>
> wifi: mt76: mt7921: fix NULL pointer access in mt7921_ipv6_addr_change
>
> [ Upstream commit 479ffee68d59c599f8aed8fa2dcc8e13e7bd13c3 ]
>
> When disabling wifi mt7921_ipv6_addr_change() is called as a notifier.
> At this point mvif->phy is already NULL so we cannot use it here.
>
> Signed-off-by: Bert Karwatzki <spasswolf(a)web.de>
> Signed-off-by: Felix Fietkau <nbd(a)nbd.name>
> Signed-off-by: Kalle Valo <kvalo(a)kernel.org>
> Link: https://patch.msgid.link/20240812104542.80760-1-spasswolf@web.de
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> index 6a5c2cae087d..6dec54431312 100644
> --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
> @@ -1095,7 +1095,7 @@ static void mt7921_ipv6_addr_change(struct ieee80211_hw *hw,
> struct inet6_dev *idev)
> {
> struct mt792x_vif *mvif = (struct mt792x_vif *)vif->drv_priv;
> - struct mt792x_dev *dev = mvif->phy->dev;
> + struct mt792x_dev *dev = mt792x_hw_dev(hw);
> struct inet6_ifaddr *ifa;
> struct in6_addr ns_addrs[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
> struct sk_buff *skb;
The patch is only fixes a NULL pointer if the tree also contains this commit:
commit 574e609c4e6a0843a9ed53de79e00da8fb3e7437
Author: Felix Fietkau <nbd(a)nbd.name>
Date: Thu Jul 4 15:09:47 2024 +0200
wifi: mac80211: clear vif drv_priv after remove_interface when stopping
Avoid reusing stale driver data when an interface is brought down and up
again. In order to avoid having to duplicate the memset in every single
driver, do it here.
Signed-off-by: Felix Fietkau <nbd(a)nbd.name>
Link: https://patch.msgid.link/20240704130947.48609-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
In trees which do not contain this the patch is not necessary.
Bert Karwatzki
The patch "intel: legacy: Partial revert of field get conversion"
(commit ba54b1a276a6b69d80649942fe5334d19851443e in mainline) fixes a
broken refactoring that prevents Wake-on-LAN from working on some e1000e
devices.
v6.10 already includes that fix and v6.1 and earlier did not yet contain
the offending refactoring, so it should only be necessary to apply this
to 6.6.
Thanks!
The patch below does not apply to the 6.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.10.y
git checkout FETCH_HEAD
git cherry-pick -x e399257349098bf7c84343f99efb2bc9c22eb9fd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090839-crimp-posted-6a31@gregkh' --subject-prefix 'PATCH 6.10.y' HEAD^..
Possible dependencies:
e39925734909 ("mm/memcontrol: respect zswap.writeback setting from parent cg too")
2b33a97c94bc ("mm: zswap: rename is_zswap_enabled() to zswap_is_enabled()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e399257349098bf7c84343f99efb2bc9c22eb9fd Mon Sep 17 00:00:00 2001
From: Mike Yuan <me(a)yhndnzj.com>
Date: Fri, 23 Aug 2024 16:27:06 +0000
Subject: [PATCH] mm/memcontrol: respect zswap.writeback setting from parent cg
too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Currently, the behavior of zswap.writeback wrt. the cgroup hierarchy
seems a bit odd. Unlike zswap.max, it doesn't honor the value from parent
cgroups. This surfaced when people tried to globally disable zswap
writeback, i.e. reserve physical swap space only for hibernation [1] -
disabling zswap.writeback only for the root cgroup results in subcgroups
with zswap.writeback=1 still performing writeback.
The inconsistency became more noticeable after I introduced the
MemoryZSwapWriteback= systemd unit setting [2] for controlling the knob.
The patch assumed that the kernel would enforce the value of parent
cgroups. It could probably be workarounded from systemd's side, by going
up the slice unit tree and inheriting the value. Yet I think it's more
sensible to make it behave consistently with zswap.max and friends.
[1] https://wiki.archlinux.org/title/Power_management/Suspend_and_hibernate#Dis…
[2] https://github.com/systemd/systemd/pull/31734
Link: https://lkml.kernel.org/r/20240823162506.12117-1-me@yhndnzj.com
Fixes: 501a06fe8e4c ("zswap: memcontrol: implement zswap writeback disabling")
Signed-off-by: Mike Yuan <me(a)yhndnzj.com>
Reviewed-by: Nhat Pham <nphamcs(a)gmail.com>
Acked-by: Yosry Ahmed <yosryahmed(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Michal Koutný <mkoutny(a)suse.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Roman Gushchin <roman.gushchin(a)linux.dev>
Cc: Shakeel Butt <shakeel.butt(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 86311c2907cd..95c18bc17083 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1717,9 +1717,10 @@ The following nested keys are defined.
entries fault back in or are written out to disk.
memory.zswap.writeback
- A read-write single value file. The default value is "1". The
- initial value of the root cgroup is 1, and when a new cgroup is
- created, it inherits the current value of its parent.
+ A read-write single value file. The default value is "1".
+ Note that this setting is hierarchical, i.e. the writeback would be
+ implicitly disabled for child cgroups if the upper hierarchy
+ does so.
When this is set to 0, all swapping attempts to swapping devices
are disabled. This included both zswap writebacks, and swapping due
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f29157288b7d..d563fb515766 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3613,8 +3613,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
memcg1_soft_limit_reset(memcg);
#ifdef CONFIG_ZSWAP
memcg->zswap_max = PAGE_COUNTER_MAX;
- WRITE_ONCE(memcg->zswap_writeback,
- !parent || READ_ONCE(parent->zswap_writeback));
+ WRITE_ONCE(memcg->zswap_writeback, true);
#endif
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
@@ -5320,7 +5319,14 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
/* if zswap is disabled, do not block pages going to the swapping device */
- return !zswap_is_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback);
+ if (!zswap_is_enabled())
+ return true;
+
+ for (; memcg; memcg = parent_mem_cgroup(memcg))
+ if (!READ_ONCE(memcg->zswap_writeback))
+ return false;
+
+ return true;
}
static u64 zswap_current_read(struct cgroup_subsys_state *css,
On Tue, 2024-08-13 at 08:25 -0700, Darrick J. Wong wrote:
> On Tue, Aug 13, 2024 at 04:59:25PM +0200, Christoph Hellwig wrote:
> > On Tue, Aug 13, 2024 at 07:19:28PM +1000, Dave Chinner wrote:
> > > In hindsight, this was a wholly avoidable bug - a single patch made
> > > two different API modifications that only differed by a single
> > > letter, and one of the 23 conversions missed a single letter. If
> > > that was two patches - one for the finobt conversion, the second for
> > > the inobt conversion, the bug would have been plainly obvious during
> > > review....
> >
> > Maybe we should avoid identifiers that close anyway :)
> >
> > The change looks good:
> >
> > Reviewed-by: Christoph Hellwig <hch(a)lst.de>
>
> Looks good to me too
> Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Is it (now commit 95179935bea) going to -stable too?
Setting the PF_NO_SETAFFINITY flag creates problems in combination with
cpuset operations (see commit messages for details). To mitigate this, fixes have
been written to remove the flag from the poller threads, which landed in v6.3. We
need them in v6.1 as well.
Best regards,
Felix Moessbauer
Siemens AG
Jens Axboe (1):
io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers
Michal Koutný (1):
io_uring/sqpoll: Do not set PF_NO_SETAFFINITY on sqpoll threads
io_uring/io-wq.c | 16 +++++++++++-----
io_uring/sqpoll.c | 1 -
2 files changed, 11 insertions(+), 6 deletions(-)
--
2.39.2
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x ef34a6ea0cab1800f4b3c9c3c2cefd5091e03379
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024082642-google-strongman-27a7@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
ef34a6ea0cab ("mptcp: pm: re-using ID of unused flushed subflows")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ef34a6ea0cab1800f4b3c9c3c2cefd5091e03379 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe(a)kernel.org>
Date: Mon, 19 Aug 2024 21:45:23 +0200
Subject: [PATCH] mptcp: pm: re-using ID of unused flushed subflows
If no subflows are attached to the 'subflow' endpoints that are being
flushed, the corresponding addr IDs will not be marked as available
again.
Mark all ID as being available when flushing all the 'subflow'
endpoints, and reset local_addr_used counter to cover these cases.
Note that mptcp_pm_remove_addrs_and_subflows() helper is only called for
flushing operations, not to remove a specific set of addresses and
subflows.
Fixes: 06faa2271034 ("mptcp: remove multi addresses and subflows in PM")
Cc: stable(a)vger.kernel.org
Reviewed-by: Mat Martineau <martineau(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Link: https://patch.msgid.link/20240819-net-mptcp-pm-reusing-id-v1-5-38035d40de5b…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 8b232a210a06..2c26696b820e 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -1623,8 +1623,15 @@ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
mptcp_pm_remove_addr(msk, &alist);
spin_unlock_bh(&msk->pm.lock);
}
+
if (slist.nr)
mptcp_pm_remove_subflow(msk, &slist);
+
+ /* Reset counters: maybe some subflows have been removed before */
+ spin_lock_bh(&msk->pm.lock);
+ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ msk->pm.local_addr_used = 0;
+ spin_unlock_bh(&msk->pm.lock);
}
static void mptcp_nl_remove_addrs_list(struct net *net,
From: Chuck Lever <chuck.lever(a)oracle.com>
Following up on
https://lore.kernel.org/linux-nfs/d4b235df-4ee5-4824-9d48-e3b3c1f1f4d1@orac…
Here is a backport series targeting origin/linux-5.10.y that closes
the information leak described in the above thread.
Review comments welcome.
Chuck Lever (6):
NFSD: Refactor nfsd_reply_cache_free_locked()
NFSD: Rename nfsd_reply_cache_alloc()
NFSD: Replace nfsd_prune_bucket()
NFSD: Refactor the duplicate reply cache shrinker
NFSD: Rewrite synopsis of nfsd_percpu_counters_init()
NFSD: Fix frame size warning in svc_export_parse()
Jeff Layton (2):
nfsd: move reply cache initialization into nfsd startup
nfsd: move init of percpu reply_cache_stats counters back to
nfsd_init_net
Josef Bacik (10):
sunrpc: don't change ->sv_stats if it doesn't exist
nfsd: stop setting ->pg_stats for unused stats
sunrpc: pass in the sv_stats struct through svc_create_pooled
sunrpc: remove ->pg_stats from svc_program
sunrpc: use the struct net as the svc proc private
nfsd: rename NFSD_NET_* to NFSD_STATS_*
nfsd: expose /proc/net/sunrpc/nfsd in net namespaces
nfsd: make all of the nfsd stats per-network namespace
nfsd: remove nfsd_stats, make th_cnt a global counter
nfsd: make svc_stat per-network namespace instead of global
NeilBrown (1):
NFSD: simplify error paths in nfsd_svc()
fs/lockd/svc.c | 3 -
fs/nfs/callback.c | 3 -
fs/nfsd/export.c | 32 ++++--
fs/nfsd/export.h | 4 +-
fs/nfsd/netns.h | 25 ++++-
fs/nfsd/nfs4proc.c | 6 +-
fs/nfsd/nfscache.c | 202 ++++++++++++++++++++++---------------
fs/nfsd/nfsctl.c | 24 ++---
fs/nfsd/nfsd.h | 1 +
fs/nfsd/nfsfh.c | 3 +-
fs/nfsd/nfssvc.c | 38 ++++---
fs/nfsd/stats.c | 52 ++++------
fs/nfsd/stats.h | 83 ++++++---------
fs/nfsd/trace.h | 22 ++++
fs/nfsd/vfs.c | 6 +-
include/linux/sunrpc/svc.h | 5 +-
net/sunrpc/stats.c | 2 +-
net/sunrpc/svc.c | 36 ++++---
18 files changed, 306 insertions(+), 241 deletions(-)
--
2.45.1
The patch below does not apply to the 6.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.10.y
git checkout FETCH_HEAD
git cherry-pick -x 562755501d44cfbbe82703a62cb41502bd067bd1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090812-ample-stowaway-5c06@gregkh' --subject-prefix 'PATCH 6.10.y' HEAD^..
Possible dependencies:
562755501d44 ("ALSA: hda/realtek: extend quirks for Clevo V5[46]0")
03c5c350e38d ("ALSA: hda/realtek: Add support for new HP G12 laptops")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 562755501d44cfbbe82703a62cb41502bd067bd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
<marmarek(a)invisiblethingslab.com>
Date: Tue, 3 Sep 2024 14:49:31 +0200
Subject: [PATCH] ALSA: hda/realtek: extend quirks for Clevo V5[46]0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The mic in those laptops suffers too high gain resulting in mostly (fan
or else) noise being recorded. In addition to the existing fixup about
mic detection, apply also limiting its boost. While at it, extend the
quirk to also V5[46]0TNE models, which have the same issue.
Signed-off-by: Marek Marczykowski-Górecki <marmarek(a)invisiblethingslab.com>
Cc: <stable(a)vger.kernel.org>
Link: https://patch.msgid.link/20240903124939.6213-1-marmarek@invisiblethingslab.…
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index ff62702a8226..fd7711d69823 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -7638,6 +7638,7 @@ enum {
ALC287_FIXUP_LENOVO_14ARP8_LEGION_IAH7,
ALC287_FIXUP_LENOVO_SSID_17AA3820,
ALCXXX_FIXUP_CS35LXX,
+ ALC245_FIXUP_CLEVO_NOISY_MIC,
};
/* A special fixup for Lenovo C940 and Yoga Duet 7;
@@ -9977,6 +9978,12 @@ static const struct hda_fixup alc269_fixups[] = {
.type = HDA_FIXUP_FUNC,
.v.func = cs35lxx_autodet_fixup,
},
+ [ALC245_FIXUP_CLEVO_NOISY_MIC] = {
+ .type = HDA_FIXUP_FUNC,
+ .v.func = alc269_fixup_limit_int_mic_boost,
+ .chained = true,
+ .chain_id = ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE,
+ },
};
static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -10626,7 +10633,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x1558, 0xa600, "Clevo NL50NU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1558, 0xa650, "Clevo NP[567]0SN[CD]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1558, 0xa671, "Clevo NP70SN[CDE]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
- SND_PCI_QUIRK(0x1558, 0xa763, "Clevo V54x_6x_TU", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
+ SND_PCI_QUIRK(0x1558, 0xa741, "Clevo V54x_6x_TNE", ALC245_FIXUP_CLEVO_NOISY_MIC),
+ SND_PCI_QUIRK(0x1558, 0xa763, "Clevo V54x_6x_TU", ALC245_FIXUP_CLEVO_NOISY_MIC),
SND_PCI_QUIRK(0x1558, 0xb018, "Clevo NP50D[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1558, 0xb019, "Clevo NH77D[BE]Q", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1558, 0xb022, "Clevo NH77D[DC][QW]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 65444581a4aecf0e96b4691bb20fc75c602f5863
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090809-wrongly-repulsive-5a71@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
65444581a4ae ("drm/amd/display: Determine IPS mode by ASIC and PMFW versions")
234e94555800 ("drm/amd/display: Enable copying of bounding box data from VBIOS DMUB")
afca033f10d3 ("drm/amd/display: Add periodic detection for IPS")
05c5ffaac770 ("drm/amd/display: gpuvm handling in DML21")
9ba971b25316 ("drm/amd/display: Re-enable IPS2 for static screen")
70839da63605 ("drm/amd/display: Add new DCN401 sources")
14813934b629 ("drm/amd/display: Allow RCG for Static Screen + LVP for DCN35")
e779f4587f61 ("drm/amd/display: Add handling for DC power mode")
cc263c3a0c9f ("drm/amd/display: remove context->dml2 dependency from DML21 wrapper")
d62d5551dd61 ("drm/amd/display: Backup and restore only on full updates")
2d5bb791e24f ("drm/amd/display: Implement update_planes_and_stream_v3 sequence")
4f5b8d78ca43 ("drm/amd/display: Init DPPCLK from SMU on dcn32")
2728e9c7c842 ("drm/amd/display: add DC changes for DCN351")
d2dea1f14038 ("drm/amd/display: Generalize new minimal transition path")
0701117efd1e ("Revert "drm/amd/display: For FPO and SubVP/DRR configs program vmin/max sel"")
a9b1a4f684b3 ("drm/amd/display: Add more checks for exiting idle in DC")
13b3d6bdbeb4 ("drm/amd/display: add debugfs disallow edp psr")
dcbf438d4834 ("drm/amd/display: Unify optimize_required flags and VRR adjustments")
1630c6ded587 ("drm/amd/display: "Enable IPS by default"")
8457bddc266c ("drm/amd/display: Revert "Rework DC Z10 restore"")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 65444581a4aecf0e96b4691bb20fc75c602f5863 Mon Sep 17 00:00:00 2001
From: Leo Li <sunpeng.li(a)amd.com>
Date: Tue, 27 Aug 2024 11:29:53 -0400
Subject: [PATCH] drm/amd/display: Determine IPS mode by ASIC and PMFW versions
[Why]
DCN IPS interoperates with other system idle power features, such as
Zstates.
On DCN35, there is a known issue where system Z8 + DCN IPS2 causes a
hard hang. We observe this on systems where the SBIOS allows Z8.
Though there is a SBIOS fix, there's no guarantee that users will get it
any time soon, or even install it. A workaround is needed to prevent
this from rearing its head in the wild.
[How]
For DCN35, check the pmfw version to determine whether the SBIOS has the
fix. If not, set IPS1+RCG as the deepest possible state in all cases
except for s0ix and display off (DPMS). Otherwise, enable all IPS
Signed-off-by: Leo Li <sunpeng.li(a)amd.com>
Reviewed-by: Harry Wentland <harry.wentland(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 28d43d0895896f84c038d906d244e0a95eb243ec)
Cc: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 983a977632ff..e6cea5b9bdb3 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -1752,6 +1752,30 @@ static struct dml2_soc_bb *dm_dmub_get_vbios_bounding_box(struct amdgpu_device *
return bb;
}
+static enum dmub_ips_disable_type dm_get_default_ips_mode(
+ struct amdgpu_device *adev)
+{
+ /*
+ * On DCN35 systems with Z8 enabled, it's possible for IPS2 + Z8 to
+ * cause a hard hang. A fix exists for newer PMFW.
+ *
+ * As a workaround, for non-fixed PMFW, force IPS1+RCG as the deepest
+ * IPS state in all cases, except for s0ix and all displays off (DPMS),
+ * where IPS2 is allowed.
+ *
+ * When checking pmfw version, use the major and minor only.
+ */
+ if (amdgpu_ip_version(adev, DCE_HWIP, 0) == IP_VERSION(3, 5, 0) &&
+ (adev->pm.fw_version & 0x00FFFF00) < 0x005D6300)
+ return DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF;
+
+ if (amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 5, 0))
+ return DMUB_IPS_ENABLE;
+
+ /* ASICs older than DCN35 do not have IPSs */
+ return DMUB_IPS_DISABLE_ALL;
+}
+
static int amdgpu_dm_init(struct amdgpu_device *adev)
{
struct dc_init_data init_data;
@@ -1863,7 +1887,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
if (amdgpu_dc_debug_mask & DC_DISABLE_IPS)
init_data.flags.disable_ips = DMUB_IPS_DISABLE_ALL;
else
- init_data.flags.disable_ips = DMUB_IPS_ENABLE;
+ init_data.flags.disable_ips = dm_get_default_ips_mode(adev);
init_data.flags.disable_ips_in_vpb = 0;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 65444581a4aecf0e96b4691bb20fc75c602f5863
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090805-visa-hankering-f46e@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
65444581a4ae ("drm/amd/display: Determine IPS mode by ASIC and PMFW versions")
234e94555800 ("drm/amd/display: Enable copying of bounding box data from VBIOS DMUB")
afca033f10d3 ("drm/amd/display: Add periodic detection for IPS")
05c5ffaac770 ("drm/amd/display: gpuvm handling in DML21")
9ba971b25316 ("drm/amd/display: Re-enable IPS2 for static screen")
70839da63605 ("drm/amd/display: Add new DCN401 sources")
14813934b629 ("drm/amd/display: Allow RCG for Static Screen + LVP for DCN35")
e779f4587f61 ("drm/amd/display: Add handling for DC power mode")
cc263c3a0c9f ("drm/amd/display: remove context->dml2 dependency from DML21 wrapper")
d62d5551dd61 ("drm/amd/display: Backup and restore only on full updates")
2d5bb791e24f ("drm/amd/display: Implement update_planes_and_stream_v3 sequence")
4f5b8d78ca43 ("drm/amd/display: Init DPPCLK from SMU on dcn32")
2728e9c7c842 ("drm/amd/display: add DC changes for DCN351")
d2dea1f14038 ("drm/amd/display: Generalize new minimal transition path")
0701117efd1e ("Revert "drm/amd/display: For FPO and SubVP/DRR configs program vmin/max sel"")
a9b1a4f684b3 ("drm/amd/display: Add more checks for exiting idle in DC")
13b3d6bdbeb4 ("drm/amd/display: add debugfs disallow edp psr")
dcbf438d4834 ("drm/amd/display: Unify optimize_required flags and VRR adjustments")
1630c6ded587 ("drm/amd/display: "Enable IPS by default"")
8457bddc266c ("drm/amd/display: Revert "Rework DC Z10 restore"")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 65444581a4aecf0e96b4691bb20fc75c602f5863 Mon Sep 17 00:00:00 2001
From: Leo Li <sunpeng.li(a)amd.com>
Date: Tue, 27 Aug 2024 11:29:53 -0400
Subject: [PATCH] drm/amd/display: Determine IPS mode by ASIC and PMFW versions
[Why]
DCN IPS interoperates with other system idle power features, such as
Zstates.
On DCN35, there is a known issue where system Z8 + DCN IPS2 causes a
hard hang. We observe this on systems where the SBIOS allows Z8.
Though there is a SBIOS fix, there's no guarantee that users will get it
any time soon, or even install it. A workaround is needed to prevent
this from rearing its head in the wild.
[How]
For DCN35, check the pmfw version to determine whether the SBIOS has the
fix. If not, set IPS1+RCG as the deepest possible state in all cases
except for s0ix and display off (DPMS). Otherwise, enable all IPS
Signed-off-by: Leo Li <sunpeng.li(a)amd.com>
Reviewed-by: Harry Wentland <harry.wentland(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 28d43d0895896f84c038d906d244e0a95eb243ec)
Cc: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 983a977632ff..e6cea5b9bdb3 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -1752,6 +1752,30 @@ static struct dml2_soc_bb *dm_dmub_get_vbios_bounding_box(struct amdgpu_device *
return bb;
}
+static enum dmub_ips_disable_type dm_get_default_ips_mode(
+ struct amdgpu_device *adev)
+{
+ /*
+ * On DCN35 systems with Z8 enabled, it's possible for IPS2 + Z8 to
+ * cause a hard hang. A fix exists for newer PMFW.
+ *
+ * As a workaround, for non-fixed PMFW, force IPS1+RCG as the deepest
+ * IPS state in all cases, except for s0ix and all displays off (DPMS),
+ * where IPS2 is allowed.
+ *
+ * When checking pmfw version, use the major and minor only.
+ */
+ if (amdgpu_ip_version(adev, DCE_HWIP, 0) == IP_VERSION(3, 5, 0) &&
+ (adev->pm.fw_version & 0x00FFFF00) < 0x005D6300)
+ return DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF;
+
+ if (amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 5, 0))
+ return DMUB_IPS_ENABLE;
+
+ /* ASICs older than DCN35 do not have IPSs */
+ return DMUB_IPS_DISABLE_ALL;
+}
+
static int amdgpu_dm_init(struct amdgpu_device *adev)
{
struct dc_init_data init_data;
@@ -1863,7 +1887,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
if (amdgpu_dc_debug_mask & DC_DISABLE_IPS)
init_data.flags.disable_ips = DMUB_IPS_DISABLE_ALL;
else
- init_data.flags.disable_ips = DMUB_IPS_ENABLE;
+ init_data.flags.disable_ips = dm_get_default_ips_mode(adev);
init_data.flags.disable_ips_in_vpb = 0;
The patch below does not apply to the 6.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.10.y
git checkout FETCH_HEAD
git cherry-pick -x 65444581a4aecf0e96b4691bb20fc75c602f5863
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090802-spill-spooky-94c0@gregkh' --subject-prefix 'PATCH 6.10.y' HEAD^..
Possible dependencies:
65444581a4ae ("drm/amd/display: Determine IPS mode by ASIC and PMFW versions")
234e94555800 ("drm/amd/display: Enable copying of bounding box data from VBIOS DMUB")
afca033f10d3 ("drm/amd/display: Add periodic detection for IPS")
05c5ffaac770 ("drm/amd/display: gpuvm handling in DML21")
9ba971b25316 ("drm/amd/display: Re-enable IPS2 for static screen")
70839da63605 ("drm/amd/display: Add new DCN401 sources")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 65444581a4aecf0e96b4691bb20fc75c602f5863 Mon Sep 17 00:00:00 2001
From: Leo Li <sunpeng.li(a)amd.com>
Date: Tue, 27 Aug 2024 11:29:53 -0400
Subject: [PATCH] drm/amd/display: Determine IPS mode by ASIC and PMFW versions
[Why]
DCN IPS interoperates with other system idle power features, such as
Zstates.
On DCN35, there is a known issue where system Z8 + DCN IPS2 causes a
hard hang. We observe this on systems where the SBIOS allows Z8.
Though there is a SBIOS fix, there's no guarantee that users will get it
any time soon, or even install it. A workaround is needed to prevent
this from rearing its head in the wild.
[How]
For DCN35, check the pmfw version to determine whether the SBIOS has the
fix. If not, set IPS1+RCG as the deepest possible state in all cases
except for s0ix and display off (DPMS). Otherwise, enable all IPS
Signed-off-by: Leo Li <sunpeng.li(a)amd.com>
Reviewed-by: Harry Wentland <harry.wentland(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 28d43d0895896f84c038d906d244e0a95eb243ec)
Cc: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 983a977632ff..e6cea5b9bdb3 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -1752,6 +1752,30 @@ static struct dml2_soc_bb *dm_dmub_get_vbios_bounding_box(struct amdgpu_device *
return bb;
}
+static enum dmub_ips_disable_type dm_get_default_ips_mode(
+ struct amdgpu_device *adev)
+{
+ /*
+ * On DCN35 systems with Z8 enabled, it's possible for IPS2 + Z8 to
+ * cause a hard hang. A fix exists for newer PMFW.
+ *
+ * As a workaround, for non-fixed PMFW, force IPS1+RCG as the deepest
+ * IPS state in all cases, except for s0ix and all displays off (DPMS),
+ * where IPS2 is allowed.
+ *
+ * When checking pmfw version, use the major and minor only.
+ */
+ if (amdgpu_ip_version(adev, DCE_HWIP, 0) == IP_VERSION(3, 5, 0) &&
+ (adev->pm.fw_version & 0x00FFFF00) < 0x005D6300)
+ return DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF;
+
+ if (amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 5, 0))
+ return DMUB_IPS_ENABLE;
+
+ /* ASICs older than DCN35 do not have IPSs */
+ return DMUB_IPS_DISABLE_ALL;
+}
+
static int amdgpu_dm_init(struct amdgpu_device *adev)
{
struct dc_init_data init_data;
@@ -1863,7 +1887,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
if (amdgpu_dc_debug_mask & DC_DISABLE_IPS)
init_data.flags.disable_ips = DMUB_IPS_DISABLE_ALL;
else
- init_data.flags.disable_ips = DMUB_IPS_ENABLE;
+ init_data.flags.disable_ips = dm_get_default_ips_mode(adev);
init_data.flags.disable_ips_in_vpb = 0;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 38e3285dbd07db44487bbaca8c383a5d7f3c11f3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090844-speech-subzero-1de7@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
38e3285dbd07 ("drm/amd/display: Block timing sync for different signals in PMO")
70839da63605 ("drm/amd/display: Add new DCN401 sources")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 38e3285dbd07db44487bbaca8c383a5d7f3c11f3 Mon Sep 17 00:00:00 2001
From: Dillon Varone <dillon.varone(a)amd.com>
Date: Thu, 22 Aug 2024 17:52:57 -0400
Subject: [PATCH] drm/amd/display: Block timing sync for different signals in
PMO
PMO assumes that like timings can be synchronized, but DC only allows
this if the signal types match.
Reviewed-by: Austin Zheng <austin.zheng(a)amd.com>
Signed-off-by: Dillon Varone <dillon.varone(a)amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 29d3d6af43135de7bec677f334292ca8dab53d67)
Cc: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
index 603036df68ba..6547cc2c2a77 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
@@ -811,7 +811,8 @@ static void build_synchronized_timing_groups(
for (j = i + 1; j < display_config->display_config.num_streams; j++) {
if (memcmp(master_timing,
&display_config->display_config.stream_descriptors[j].timing,
- sizeof(struct dml2_timing_cfg)) == 0) {
+ sizeof(struct dml2_timing_cfg)) == 0 &&
+ display_config->display_config.stream_descriptors[i].output.output_encoder == display_config->display_config.stream_descriptors[j].output.output_encoder) {
set_bit_in_bitfield(&pmo->scratch.pmo_dcn4.synchronized_timing_group_masks[timing_group_idx], j);
set_bit_in_bitfield(&stream_mapped_mask, j);
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 38e3285dbd07db44487bbaca8c383a5d7f3c11f3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090842-uncounted-lustrous-8af4@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
38e3285dbd07 ("drm/amd/display: Block timing sync for different signals in PMO")
70839da63605 ("drm/amd/display: Add new DCN401 sources")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 38e3285dbd07db44487bbaca8c383a5d7f3c11f3 Mon Sep 17 00:00:00 2001
From: Dillon Varone <dillon.varone(a)amd.com>
Date: Thu, 22 Aug 2024 17:52:57 -0400
Subject: [PATCH] drm/amd/display: Block timing sync for different signals in
PMO
PMO assumes that like timings can be synchronized, but DC only allows
this if the signal types match.
Reviewed-by: Austin Zheng <austin.zheng(a)amd.com>
Signed-off-by: Dillon Varone <dillon.varone(a)amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 29d3d6af43135de7bec677f334292ca8dab53d67)
Cc: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
index 603036df68ba..6547cc2c2a77 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
@@ -811,7 +811,8 @@ static void build_synchronized_timing_groups(
for (j = i + 1; j < display_config->display_config.num_streams; j++) {
if (memcmp(master_timing,
&display_config->display_config.stream_descriptors[j].timing,
- sizeof(struct dml2_timing_cfg)) == 0) {
+ sizeof(struct dml2_timing_cfg)) == 0 &&
+ display_config->display_config.stream_descriptors[i].output.output_encoder == display_config->display_config.stream_descriptors[j].output.output_encoder) {
set_bit_in_bitfield(&pmo->scratch.pmo_dcn4.synchronized_timing_group_masks[timing_group_idx], j);
set_bit_in_bitfield(&stream_mapped_mask, j);
}
The patch below does not apply to the 6.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.10.y
git checkout FETCH_HEAD
git cherry-pick -x 38e3285dbd07db44487bbaca8c383a5d7f3c11f3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090839-curry-shallot-c638@gregkh' --subject-prefix 'PATCH 6.10.y' HEAD^..
Possible dependencies:
38e3285dbd07 ("drm/amd/display: Block timing sync for different signals in PMO")
70839da63605 ("drm/amd/display: Add new DCN401 sources")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 38e3285dbd07db44487bbaca8c383a5d7f3c11f3 Mon Sep 17 00:00:00 2001
From: Dillon Varone <dillon.varone(a)amd.com>
Date: Thu, 22 Aug 2024 17:52:57 -0400
Subject: [PATCH] drm/amd/display: Block timing sync for different signals in
PMO
PMO assumes that like timings can be synchronized, but DC only allows
this if the signal types match.
Reviewed-by: Austin Zheng <austin.zheng(a)amd.com>
Signed-off-by: Dillon Varone <dillon.varone(a)amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 29d3d6af43135de7bec677f334292ca8dab53d67)
Cc: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
index 603036df68ba..6547cc2c2a77 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
@@ -811,7 +811,8 @@ static void build_synchronized_timing_groups(
for (j = i + 1; j < display_config->display_config.num_streams; j++) {
if (memcmp(master_timing,
&display_config->display_config.stream_descriptors[j].timing,
- sizeof(struct dml2_timing_cfg)) == 0) {
+ sizeof(struct dml2_timing_cfg)) == 0 &&
+ display_config->display_config.stream_descriptors[i].output.output_encoder == display_config->display_config.stream_descriptors[j].output.output_encoder) {
set_bit_in_bitfield(&pmo->scratch.pmo_dcn4.synchronized_timing_group_masks[timing_group_idx], j);
set_bit_in_bitfield(&stream_mapped_mask, j);
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x e8705632435ae2f2253b65d3786da389982e8813
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090848-sharpness-hunk-c88f@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
e8705632435a ("drm/i915: Fix readout degamma_lut mismatch on ilk/snb")
da8c3cdb016c ("drm/i915: Rename bigjoiner master/slave to bigjoiner primary/secondary")
fb4943574f92 ("drm/i915: Rename all bigjoiner to joiner")
578ff98403ce ("drm/i915: Allow bigjoiner for MST")
3607b30836ae ("drm/i915: Handle joined pipes inside hsw_crtc_enable()")
e16bcbb01186 ("drm/i915: Handle joined pipes inside hsw_crtc_disable()")
2b8ad19d3ed6 ("drm/i915: Introduce intel_crtc_joined_pipe_mask()")
e43b4f7980f8 ("drm/i915: Pass connector to intel_dp_need_bigjoiner()")
5a1527ed8b43 ("drm/i915/mst: Check intel_dp_joiner_needs_dsc()")
aa099402f98b ("drm/i915: Extract intel_dp_joiner_needs_dsc()")
c0b8afc3a777 ("drm/i915: s/intel_dp_can_bigjoiner()/intel_dp_has_bigjoiner()/")
e02ef5553d9b ("drm/i915: Update pipes in reverse order for bigjoiner")
3a5e09d82f97 ("drm/i915: Fix intel_modeset_pipe_config_late() for bigjoiner")
f9d5e51db656 ("drm/i915/vrr: Disable VRR when using bigjoiner")
ef79820db723 ("drm/i915: Disable live M/N updates when using bigjoiner")
b37e1347b991 ("drm/i915: Disable port sync when bigjoiner is used")
372fa0c79d3f ("drm/i915/psr: Disable PSR when bigjoiner is used")
7a3f171c8f6a ("drm/i915: Extract glk_need_scaler_clock_gating_wa()")
c922a47913f9 ("drm/i915: Clean up glk_pipe_scaler_clock_gating_wa()")
e9fa99dd47a4 ("drm/i915: Shuffle DP .mode_valid() checks")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e8705632435ae2f2253b65d3786da389982e8813 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala(a)linux.intel.com>
Date: Wed, 10 Jul 2024 15:41:37 +0300
Subject: [PATCH] drm/i915: Fix readout degamma_lut mismatch on ilk/snb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
On ilk/snb the pipe may be configured to place the LUT before or
after the CSC depending on various factors, but as there is only
one LUT (no split mode like on IVB+) we only advertise a gamma_lut
and no degamma_lut in the uapi to avoid confusing userspace.
This can cause a problem during readout if the VBIOS/GOP enabled
the LUT in the pre CSC configuration. The current code blindly
assigns the results of the readout to the degamma_lut, which will
cause a failure during the next atomic_check() as we aren't expecting
anything to be in degamma_lut since it's not visible to userspace.
Fix the problem by assigning whatever LUT we read out from the
hardware into gamma_lut.
Cc: stable(a)vger.kernel.org
Fixes: d2559299d339 ("drm/i915: Make ilk_read_luts() capable of degamma readout")
Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11608
Signed-off-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240710124137.16773-1-ville.…
Reviewed-by: Uma Shankar <uma.shankar(a)intel.com>
(cherry picked from commit 33eca84db6e31091cef63584158ab64704f78462)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen(a)linux.intel.com>
diff --git a/drivers/gpu/drm/i915/display/intel_modeset_setup.c b/drivers/gpu/drm/i915/display/intel_modeset_setup.c
index 7602cb30ebf1..e1213f3d93cc 100644
--- a/drivers/gpu/drm/i915/display/intel_modeset_setup.c
+++ b/drivers/gpu/drm/i915/display/intel_modeset_setup.c
@@ -326,6 +326,8 @@ static void intel_modeset_update_connector_atomic_state(struct drm_i915_private
static void intel_crtc_copy_hw_to_uapi_state(struct intel_crtc_state *crtc_state)
{
+ struct drm_i915_private *i915 = to_i915(crtc_state->uapi.crtc->dev);
+
if (intel_crtc_is_joiner_secondary(crtc_state))
return;
@@ -337,11 +339,30 @@ static void intel_crtc_copy_hw_to_uapi_state(struct intel_crtc_state *crtc_state
crtc_state->uapi.adjusted_mode = crtc_state->hw.adjusted_mode;
crtc_state->uapi.scaling_filter = crtc_state->hw.scaling_filter;
- /* assume 1:1 mapping */
- drm_property_replace_blob(&crtc_state->hw.degamma_lut,
- crtc_state->pre_csc_lut);
- drm_property_replace_blob(&crtc_state->hw.gamma_lut,
- crtc_state->post_csc_lut);
+ if (DISPLAY_INFO(i915)->color.degamma_lut_size) {
+ /* assume 1:1 mapping */
+ drm_property_replace_blob(&crtc_state->hw.degamma_lut,
+ crtc_state->pre_csc_lut);
+ drm_property_replace_blob(&crtc_state->hw.gamma_lut,
+ crtc_state->post_csc_lut);
+ } else {
+ /*
+ * ilk/snb hw may be configured for either pre_csc_lut
+ * or post_csc_lut, but we don't advertise degamma_lut as
+ * being available in the uapi since there is only one
+ * hardware LUT. Always assign the result of the readout
+ * to gamma_lut as that is the only valid source of LUTs
+ * in the uapi.
+ */
+ drm_WARN_ON(&i915->drm, crtc_state->post_csc_lut &&
+ crtc_state->pre_csc_lut);
+
+ drm_property_replace_blob(&crtc_state->hw.degamma_lut,
+ NULL);
+ drm_property_replace_blob(&crtc_state->hw.gamma_lut,
+ crtc_state->post_csc_lut ?:
+ crtc_state->pre_csc_lut);
+ }
drm_property_replace_blob(&crtc_state->uapi.degamma_lut,
crtc_state->hw.degamma_lut);
The patch below does not apply to the 6.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.10.y
git checkout FETCH_HEAD
git cherry-pick -x e8705632435ae2f2253b65d3786da389982e8813
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090844-result-caucasian-a9e5@gregkh' --subject-prefix 'PATCH 6.10.y' HEAD^..
Possible dependencies:
e8705632435a ("drm/i915: Fix readout degamma_lut mismatch on ilk/snb")
da8c3cdb016c ("drm/i915: Rename bigjoiner master/slave to bigjoiner primary/secondary")
fb4943574f92 ("drm/i915: Rename all bigjoiner to joiner")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e8705632435ae2f2253b65d3786da389982e8813 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala(a)linux.intel.com>
Date: Wed, 10 Jul 2024 15:41:37 +0300
Subject: [PATCH] drm/i915: Fix readout degamma_lut mismatch on ilk/snb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
On ilk/snb the pipe may be configured to place the LUT before or
after the CSC depending on various factors, but as there is only
one LUT (no split mode like on IVB+) we only advertise a gamma_lut
and no degamma_lut in the uapi to avoid confusing userspace.
This can cause a problem during readout if the VBIOS/GOP enabled
the LUT in the pre CSC configuration. The current code blindly
assigns the results of the readout to the degamma_lut, which will
cause a failure during the next atomic_check() as we aren't expecting
anything to be in degamma_lut since it's not visible to userspace.
Fix the problem by assigning whatever LUT we read out from the
hardware into gamma_lut.
Cc: stable(a)vger.kernel.org
Fixes: d2559299d339 ("drm/i915: Make ilk_read_luts() capable of degamma readout")
Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11608
Signed-off-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240710124137.16773-1-ville.…
Reviewed-by: Uma Shankar <uma.shankar(a)intel.com>
(cherry picked from commit 33eca84db6e31091cef63584158ab64704f78462)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen(a)linux.intel.com>
diff --git a/drivers/gpu/drm/i915/display/intel_modeset_setup.c b/drivers/gpu/drm/i915/display/intel_modeset_setup.c
index 7602cb30ebf1..e1213f3d93cc 100644
--- a/drivers/gpu/drm/i915/display/intel_modeset_setup.c
+++ b/drivers/gpu/drm/i915/display/intel_modeset_setup.c
@@ -326,6 +326,8 @@ static void intel_modeset_update_connector_atomic_state(struct drm_i915_private
static void intel_crtc_copy_hw_to_uapi_state(struct intel_crtc_state *crtc_state)
{
+ struct drm_i915_private *i915 = to_i915(crtc_state->uapi.crtc->dev);
+
if (intel_crtc_is_joiner_secondary(crtc_state))
return;
@@ -337,11 +339,30 @@ static void intel_crtc_copy_hw_to_uapi_state(struct intel_crtc_state *crtc_state
crtc_state->uapi.adjusted_mode = crtc_state->hw.adjusted_mode;
crtc_state->uapi.scaling_filter = crtc_state->hw.scaling_filter;
- /* assume 1:1 mapping */
- drm_property_replace_blob(&crtc_state->hw.degamma_lut,
- crtc_state->pre_csc_lut);
- drm_property_replace_blob(&crtc_state->hw.gamma_lut,
- crtc_state->post_csc_lut);
+ if (DISPLAY_INFO(i915)->color.degamma_lut_size) {
+ /* assume 1:1 mapping */
+ drm_property_replace_blob(&crtc_state->hw.degamma_lut,
+ crtc_state->pre_csc_lut);
+ drm_property_replace_blob(&crtc_state->hw.gamma_lut,
+ crtc_state->post_csc_lut);
+ } else {
+ /*
+ * ilk/snb hw may be configured for either pre_csc_lut
+ * or post_csc_lut, but we don't advertise degamma_lut as
+ * being available in the uapi since there is only one
+ * hardware LUT. Always assign the result of the readout
+ * to gamma_lut as that is the only valid source of LUTs
+ * in the uapi.
+ */
+ drm_WARN_ON(&i915->drm, crtc_state->post_csc_lut &&
+ crtc_state->pre_csc_lut);
+
+ drm_property_replace_blob(&crtc_state->hw.degamma_lut,
+ NULL);
+ drm_property_replace_blob(&crtc_state->hw.gamma_lut,
+ crtc_state->post_csc_lut ?:
+ crtc_state->pre_csc_lut);
+ }
drm_property_replace_blob(&crtc_state->uapi.degamma_lut,
crtc_state->hw.degamma_lut);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 72a6e22c604c95ddb3b10b5d3bb85b6ff4dbc34f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090838-thus-fiftieth-f4f7@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
72a6e22c604c ("fscache: delete fscache_cookie_lru_timer when fscache exits to avoid UAF")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 72a6e22c604c95ddb3b10b5d3bb85b6ff4dbc34f Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1(a)huawei.com>
Date: Mon, 26 Aug 2024 19:20:56 +0800
Subject: [PATCH] fscache: delete fscache_cookie_lru_timer when fscache exits
to avoid UAF
The fscache_cookie_lru_timer is initialized when the fscache module
is inserted, but is not deleted when the fscache module is removed.
If timer_reduce() is called before removing the fscache module,
the fscache_cookie_lru_timer will be added to the timer list of
the current cpu. Afterwards, a use-after-free will be triggered
in the softIRQ after removing the fscache module, as follows:
==================================================================
BUG: unable to handle page fault for address: fffffbfff803c9e9
PF: supervisor read access in kernel mode
PF: error_code(0x0000) - not-present page
PGD 21ffea067 P4D 21ffea067 PUD 21ffe6067 PMD 110a7c067 PTE 0
Oops: Oops: 0000 [#1] PREEMPT SMP KASAN PTI
CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Tainted: G W 6.11.0-rc3 #855
Tainted: [W]=WARN
RIP: 0010:__run_timer_base.part.0+0x254/0x8a0
Call Trace:
<IRQ>
tmigr_handle_remote_up+0x627/0x810
__walk_groups.isra.0+0x47/0x140
tmigr_handle_remote+0x1fa/0x2f0
handle_softirqs+0x180/0x590
irq_exit_rcu+0x84/0xb0
sysvec_apic_timer_interrupt+0x6e/0x90
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x1a/0x20
RIP: 0010:default_idle+0xf/0x20
default_idle_call+0x38/0x60
do_idle+0x2b5/0x300
cpu_startup_entry+0x54/0x60
start_secondary+0x20d/0x280
common_startup_64+0x13e/0x148
</TASK>
Modules linked in: [last unloaded: netfs]
==================================================================
Therefore delete fscache_cookie_lru_timer when removing the fscahe module.
Fixes: 12bb21a29c19 ("fscache: Implement cookie user counting and resource pinning")
Cc: stable(a)kernel.org
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Link: https://lore.kernel.org/r/20240826112056.2458299-1-libaokun@huaweicloud.com
Acked-by: David Howells <dhowells(a)redhat.com>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c
index 42e98bb523e3..49849005eb7c 100644
--- a/fs/netfs/fscache_main.c
+++ b/fs/netfs/fscache_main.c
@@ -103,6 +103,7 @@ void __exit fscache_exit(void)
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
+ timer_shutdown_sync(&fscache_cookie_lru_timer);
destroy_workqueue(fscache_wq);
pr_notice("FS-Cache unloaded\n");
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x cd9253c23aedd61eb5ff11f37a36247cd46faf86
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090853-untagged-gravy-ccd3@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
cd9253c23aed ("btrfs: fix race between direct IO write and fsync when using same fd")
939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
9aa29a20b700 ("btrfs: move the direct IO code into its own file")
04ef7631bfa5 ("btrfs: cleanup duplicated parameters related to btrfs_create_dio_extent()")
9fec848b3a33 ("btrfs: cleanup duplicated parameters related to create_io_em()")
e9ea31fb5c1f ("btrfs: cleanup duplicated parameters related to btrfs_alloc_ordered_extent")
cdc627e65c7e ("btrfs: cleanup duplicated parameters related to can_nocow_file_extent_args")
c77a8c61002e ("btrfs: remove extent_map::block_start member")
e28b851ed9b2 ("btrfs: remove extent_map::block_len member")
4aa7b5d1784f ("btrfs: remove extent_map::orig_start member")
3f255ece2f1e ("btrfs: introduce extra sanity checks for extent maps")
3d2ac9922465 ("btrfs: introduce new members for extent_map")
87a6962f73b1 ("btrfs: export the expected file extent through can_nocow_extent()")
e8fe524da027 ("btrfs: rename extent_map::orig_block_len to disk_num_bytes")
8996f61ab9ff ("btrfs: move fiemap code into its own file")
56b7169f691c ("btrfs: use a btrfs_inode local variable at btrfs_sync_file()")
e641e323abb3 ("btrfs: pass a btrfs_inode to btrfs_wait_ordered_range()")
cef2daba4268 ("btrfs: pass a btrfs_inode to btrfs_fdatawrite_range()")
4e660ca3a98d ("btrfs: use a regular rb_root instead of cached rb_root for extent_map_tree")
7f5830bc964d ("btrfs: rename rb_root member of extent_map_tree from map to root")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Thu, 29 Aug 2024 18:25:49 +0100
Subject: [PATCH] btrfs: fix race between direct IO write and fsync when using
same fd
If we have 2 threads that are using the same file descriptor and one of
them is doing direct IO writes while the other is doing fsync, we have a
race where we can end up either:
1) Attempt a fsync without holding the inode's lock, triggering an
assertion failures when assertions are enabled;
2) Do an invalid memory access from the fsync task because the file private
points to memory allocated on stack by the direct IO task and it may be
used by the fsync task after the stack was destroyed.
The race happens like this:
1) A user space program opens a file descriptor with O_DIRECT;
2) The program spawns 2 threads using libpthread for example;
3) One of the threads uses the file descriptor to do direct IO writes,
while the other calls fsync using the same file descriptor.
4) Call task A the thread doing direct IO writes and task B the thread
doing fsyncs;
5) Task A does a direct IO write, and at btrfs_direct_write() sets the
file's private to an on stack allocated private with the member
'fsync_skip_inode_lock' set to true;
6) Task B enters btrfs_sync_file() and sees that there's a private
structure associated to the file which has 'fsync_skip_inode_lock' set
to true, so it skips locking the inode's VFS lock;
7) Task A completes the direct IO write, and resets the file's private to
NULL since it had no prior private and our private was stack allocated.
Then it unlocks the inode's VFS lock;
8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
assertion that checks the inode's VFS lock is held fails, since task B
never locked it and task A has already unlocked it.
The stack trace produced is the following:
assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
------------[ cut here ]------------
kernel BUG at fs/btrfs/ordered-data.c:983!
Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
Code: 50 d6 86 c0 e8 (...)
RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
Call Trace:
<TASK>
? __die_body.cold+0x14/0x24
? die+0x2e/0x50
? do_trap+0xca/0x110
? do_error_trap+0x6a/0x90
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? exc_invalid_op+0x50/0x70
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? asm_exc_invalid_op+0x1a/0x20
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? __seccomp_filter+0x31d/0x4f0
__x64_sys_fdatasync+0x4f/0x90
do_syscall_64+0x82/0x160
? do_futex+0xcb/0x190
? __x64_sys_futex+0x10e/0x1d0
? switch_fpu_return+0x4f/0xd0
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Another problem here is if task B grabs the private pointer and then uses
it after task A has finished, since the private was allocated in the stack
of task A, it results in some invalid memory access with a hard to predict
result.
This issue, triggering the assertion, was observed with QEMU workloads by
two users in the Link tags below.
Fix this by not relying on a file's private to pass information to fsync
that it should skip locking the inode and instead pass this information
through a special value stored in current->journal_info. This is safe
because in the relevant section of the direct IO write path we are not
holding a transaction handle, so current->journal_info is NULL.
The following C program triggers the issue:
$ cat repro.c
/* Get the O_DIRECT definition. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
static int fd;
static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
{
while (count > 0) {
ssize_t ret;
ret = pwrite(fd, buf, count, offset);
if (ret < 0) {
if (errno == EINTR)
continue;
return ret;
}
count -= ret;
buf += ret;
}
return 0;
}
static void *fsync_loop(void *arg)
{
while (1) {
int ret;
ret = fsync(fd);
if (ret != 0) {
perror("Fsync failed");
exit(6);
}
}
}
int main(int argc, char *argv[])
{
long pagesize;
void *write_buf;
pthread_t fsyncer;
int ret;
if (argc != 2) {
fprintf(stderr, "Use: %s <file path>\n", argv[0]);
return 1;
}
fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
if (fd == -1) {
perror("Failed to open/create file");
return 1;
}
pagesize = sysconf(_SC_PAGE_SIZE);
if (pagesize == -1) {
perror("Failed to get page size");
return 2;
}
ret = posix_memalign(&write_buf, pagesize, pagesize);
if (ret) {
perror("Failed to allocate buffer");
return 3;
}
ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
if (ret != 0) {
fprintf(stderr, "Failed to create writer thread: %d\n", ret);
return 4;
}
while (1) {
ret = do_write(fd, write_buf, pagesize, 0);
if (ret != 0) {
perror("Write failed");
exit(5);
}
}
return 0;
}
$ mkfs.btrfs -f /dev/sdi
$ mount /dev/sdi /mnt/sdi
$ timeout 10 ./repro /mnt/sdi/foo
Usually the race is triggered within less than 1 second. A test case for
fstests will follow soon.
Reported-by: Paulo Dias <paulo.miguel.dias(a)gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
Reported-by: Andreas Jahn <jahn-andi(a)web.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
Reported-by: syzbot+4704b3cc972bd76024f1(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 75fa563e4cac..c8568b1a61c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -459,7 +459,6 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
- bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 67adbe9d294a..364bce34f034 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -864,13 +864,6 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
} else {
- struct btrfs_file_private stack_private = { 0 };
- struct btrfs_file_private *private;
- const bool have_private = (file->private_data != NULL);
-
- if (!have_private)
- file->private_data = &stack_private;
-
/*
* If we have a synchronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
@@ -879,13 +872,10 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
- private = file->private_data;
- private->fsync_skip_inode_lock = true;
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
ret = iomap_dio_complete(dio);
- private->fsync_skip_inode_lock = false;
-
- if (!have_private)
- file->private_data = NULL;
+ current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9914419f3b7d..2aeb8116549c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1603,7 +1603,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
@@ -1613,7 +1612,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
+ bool skip_ilock = false;
+
+ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
+ skip_ilock = true;
+ current->journal_info = NULL;
+ lockdep_assert_held(&inode->vfs_inode.i_rwsem);
+ }
trace_btrfs_sync_file(file, datasync);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 98c03ddc760b..dd9ce9b9f69e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,6 +27,12 @@ struct btrfs_root_item;
struct btrfs_root;
struct btrfs_path;
+/*
+ * Signal that a direct IO write is in progress, to avoid deadlock for sync
+ * direct IO writes when fsync is called during the direct IO write path.
+ */
+#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
+
/* Radix-tree tag for roots that are part of the trasaction. */
#define BTRFS_ROOT_TRANS_TAG 0
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x cd9253c23aedd61eb5ff11f37a36247cd46faf86
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090850-naturist-deafness-b924@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
cd9253c23aed ("btrfs: fix race between direct IO write and fsync when using same fd")
939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
9aa29a20b700 ("btrfs: move the direct IO code into its own file")
04ef7631bfa5 ("btrfs: cleanup duplicated parameters related to btrfs_create_dio_extent()")
9fec848b3a33 ("btrfs: cleanup duplicated parameters related to create_io_em()")
e9ea31fb5c1f ("btrfs: cleanup duplicated parameters related to btrfs_alloc_ordered_extent")
cdc627e65c7e ("btrfs: cleanup duplicated parameters related to can_nocow_file_extent_args")
c77a8c61002e ("btrfs: remove extent_map::block_start member")
e28b851ed9b2 ("btrfs: remove extent_map::block_len member")
4aa7b5d1784f ("btrfs: remove extent_map::orig_start member")
3f255ece2f1e ("btrfs: introduce extra sanity checks for extent maps")
3d2ac9922465 ("btrfs: introduce new members for extent_map")
87a6962f73b1 ("btrfs: export the expected file extent through can_nocow_extent()")
e8fe524da027 ("btrfs: rename extent_map::orig_block_len to disk_num_bytes")
8996f61ab9ff ("btrfs: move fiemap code into its own file")
56b7169f691c ("btrfs: use a btrfs_inode local variable at btrfs_sync_file()")
e641e323abb3 ("btrfs: pass a btrfs_inode to btrfs_wait_ordered_range()")
cef2daba4268 ("btrfs: pass a btrfs_inode to btrfs_fdatawrite_range()")
4e660ca3a98d ("btrfs: use a regular rb_root instead of cached rb_root for extent_map_tree")
7f5830bc964d ("btrfs: rename rb_root member of extent_map_tree from map to root")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Thu, 29 Aug 2024 18:25:49 +0100
Subject: [PATCH] btrfs: fix race between direct IO write and fsync when using
same fd
If we have 2 threads that are using the same file descriptor and one of
them is doing direct IO writes while the other is doing fsync, we have a
race where we can end up either:
1) Attempt a fsync without holding the inode's lock, triggering an
assertion failures when assertions are enabled;
2) Do an invalid memory access from the fsync task because the file private
points to memory allocated on stack by the direct IO task and it may be
used by the fsync task after the stack was destroyed.
The race happens like this:
1) A user space program opens a file descriptor with O_DIRECT;
2) The program spawns 2 threads using libpthread for example;
3) One of the threads uses the file descriptor to do direct IO writes,
while the other calls fsync using the same file descriptor.
4) Call task A the thread doing direct IO writes and task B the thread
doing fsyncs;
5) Task A does a direct IO write, and at btrfs_direct_write() sets the
file's private to an on stack allocated private with the member
'fsync_skip_inode_lock' set to true;
6) Task B enters btrfs_sync_file() and sees that there's a private
structure associated to the file which has 'fsync_skip_inode_lock' set
to true, so it skips locking the inode's VFS lock;
7) Task A completes the direct IO write, and resets the file's private to
NULL since it had no prior private and our private was stack allocated.
Then it unlocks the inode's VFS lock;
8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
assertion that checks the inode's VFS lock is held fails, since task B
never locked it and task A has already unlocked it.
The stack trace produced is the following:
assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
------------[ cut here ]------------
kernel BUG at fs/btrfs/ordered-data.c:983!
Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
Code: 50 d6 86 c0 e8 (...)
RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
Call Trace:
<TASK>
? __die_body.cold+0x14/0x24
? die+0x2e/0x50
? do_trap+0xca/0x110
? do_error_trap+0x6a/0x90
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? exc_invalid_op+0x50/0x70
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? asm_exc_invalid_op+0x1a/0x20
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? __seccomp_filter+0x31d/0x4f0
__x64_sys_fdatasync+0x4f/0x90
do_syscall_64+0x82/0x160
? do_futex+0xcb/0x190
? __x64_sys_futex+0x10e/0x1d0
? switch_fpu_return+0x4f/0xd0
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Another problem here is if task B grabs the private pointer and then uses
it after task A has finished, since the private was allocated in the stack
of task A, it results in some invalid memory access with a hard to predict
result.
This issue, triggering the assertion, was observed with QEMU workloads by
two users in the Link tags below.
Fix this by not relying on a file's private to pass information to fsync
that it should skip locking the inode and instead pass this information
through a special value stored in current->journal_info. This is safe
because in the relevant section of the direct IO write path we are not
holding a transaction handle, so current->journal_info is NULL.
The following C program triggers the issue:
$ cat repro.c
/* Get the O_DIRECT definition. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
static int fd;
static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
{
while (count > 0) {
ssize_t ret;
ret = pwrite(fd, buf, count, offset);
if (ret < 0) {
if (errno == EINTR)
continue;
return ret;
}
count -= ret;
buf += ret;
}
return 0;
}
static void *fsync_loop(void *arg)
{
while (1) {
int ret;
ret = fsync(fd);
if (ret != 0) {
perror("Fsync failed");
exit(6);
}
}
}
int main(int argc, char *argv[])
{
long pagesize;
void *write_buf;
pthread_t fsyncer;
int ret;
if (argc != 2) {
fprintf(stderr, "Use: %s <file path>\n", argv[0]);
return 1;
}
fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
if (fd == -1) {
perror("Failed to open/create file");
return 1;
}
pagesize = sysconf(_SC_PAGE_SIZE);
if (pagesize == -1) {
perror("Failed to get page size");
return 2;
}
ret = posix_memalign(&write_buf, pagesize, pagesize);
if (ret) {
perror("Failed to allocate buffer");
return 3;
}
ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
if (ret != 0) {
fprintf(stderr, "Failed to create writer thread: %d\n", ret);
return 4;
}
while (1) {
ret = do_write(fd, write_buf, pagesize, 0);
if (ret != 0) {
perror("Write failed");
exit(5);
}
}
return 0;
}
$ mkfs.btrfs -f /dev/sdi
$ mount /dev/sdi /mnt/sdi
$ timeout 10 ./repro /mnt/sdi/foo
Usually the race is triggered within less than 1 second. A test case for
fstests will follow soon.
Reported-by: Paulo Dias <paulo.miguel.dias(a)gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
Reported-by: Andreas Jahn <jahn-andi(a)web.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
Reported-by: syzbot+4704b3cc972bd76024f1(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 75fa563e4cac..c8568b1a61c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -459,7 +459,6 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
- bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 67adbe9d294a..364bce34f034 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -864,13 +864,6 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
} else {
- struct btrfs_file_private stack_private = { 0 };
- struct btrfs_file_private *private;
- const bool have_private = (file->private_data != NULL);
-
- if (!have_private)
- file->private_data = &stack_private;
-
/*
* If we have a synchronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
@@ -879,13 +872,10 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
- private = file->private_data;
- private->fsync_skip_inode_lock = true;
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
ret = iomap_dio_complete(dio);
- private->fsync_skip_inode_lock = false;
-
- if (!have_private)
- file->private_data = NULL;
+ current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9914419f3b7d..2aeb8116549c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1603,7 +1603,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
@@ -1613,7 +1612,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
+ bool skip_ilock = false;
+
+ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
+ skip_ilock = true;
+ current->journal_info = NULL;
+ lockdep_assert_held(&inode->vfs_inode.i_rwsem);
+ }
trace_btrfs_sync_file(file, datasync);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 98c03ddc760b..dd9ce9b9f69e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,6 +27,12 @@ struct btrfs_root_item;
struct btrfs_root;
struct btrfs_path;
+/*
+ * Signal that a direct IO write is in progress, to avoid deadlock for sync
+ * direct IO writes when fsync is called during the direct IO write path.
+ */
+#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
+
/* Radix-tree tag for roots that are part of the trasaction. */
#define BTRFS_ROOT_TRANS_TAG 0
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x cd9253c23aedd61eb5ff11f37a36247cd46faf86
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090847-flashing-dimmed-2c7f@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
cd9253c23aed ("btrfs: fix race between direct IO write and fsync when using same fd")
939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
9aa29a20b700 ("btrfs: move the direct IO code into its own file")
04ef7631bfa5 ("btrfs: cleanup duplicated parameters related to btrfs_create_dio_extent()")
9fec848b3a33 ("btrfs: cleanup duplicated parameters related to create_io_em()")
e9ea31fb5c1f ("btrfs: cleanup duplicated parameters related to btrfs_alloc_ordered_extent")
cdc627e65c7e ("btrfs: cleanup duplicated parameters related to can_nocow_file_extent_args")
c77a8c61002e ("btrfs: remove extent_map::block_start member")
e28b851ed9b2 ("btrfs: remove extent_map::block_len member")
4aa7b5d1784f ("btrfs: remove extent_map::orig_start member")
3f255ece2f1e ("btrfs: introduce extra sanity checks for extent maps")
3d2ac9922465 ("btrfs: introduce new members for extent_map")
87a6962f73b1 ("btrfs: export the expected file extent through can_nocow_extent()")
e8fe524da027 ("btrfs: rename extent_map::orig_block_len to disk_num_bytes")
8996f61ab9ff ("btrfs: move fiemap code into its own file")
56b7169f691c ("btrfs: use a btrfs_inode local variable at btrfs_sync_file()")
e641e323abb3 ("btrfs: pass a btrfs_inode to btrfs_wait_ordered_range()")
cef2daba4268 ("btrfs: pass a btrfs_inode to btrfs_fdatawrite_range()")
4e660ca3a98d ("btrfs: use a regular rb_root instead of cached rb_root for extent_map_tree")
7f5830bc964d ("btrfs: rename rb_root member of extent_map_tree from map to root")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Thu, 29 Aug 2024 18:25:49 +0100
Subject: [PATCH] btrfs: fix race between direct IO write and fsync when using
same fd
If we have 2 threads that are using the same file descriptor and one of
them is doing direct IO writes while the other is doing fsync, we have a
race where we can end up either:
1) Attempt a fsync without holding the inode's lock, triggering an
assertion failures when assertions are enabled;
2) Do an invalid memory access from the fsync task because the file private
points to memory allocated on stack by the direct IO task and it may be
used by the fsync task after the stack was destroyed.
The race happens like this:
1) A user space program opens a file descriptor with O_DIRECT;
2) The program spawns 2 threads using libpthread for example;
3) One of the threads uses the file descriptor to do direct IO writes,
while the other calls fsync using the same file descriptor.
4) Call task A the thread doing direct IO writes and task B the thread
doing fsyncs;
5) Task A does a direct IO write, and at btrfs_direct_write() sets the
file's private to an on stack allocated private with the member
'fsync_skip_inode_lock' set to true;
6) Task B enters btrfs_sync_file() and sees that there's a private
structure associated to the file which has 'fsync_skip_inode_lock' set
to true, so it skips locking the inode's VFS lock;
7) Task A completes the direct IO write, and resets the file's private to
NULL since it had no prior private and our private was stack allocated.
Then it unlocks the inode's VFS lock;
8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
assertion that checks the inode's VFS lock is held fails, since task B
never locked it and task A has already unlocked it.
The stack trace produced is the following:
assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
------------[ cut here ]------------
kernel BUG at fs/btrfs/ordered-data.c:983!
Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
Code: 50 d6 86 c0 e8 (...)
RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
Call Trace:
<TASK>
? __die_body.cold+0x14/0x24
? die+0x2e/0x50
? do_trap+0xca/0x110
? do_error_trap+0x6a/0x90
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? exc_invalid_op+0x50/0x70
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? asm_exc_invalid_op+0x1a/0x20
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? __seccomp_filter+0x31d/0x4f0
__x64_sys_fdatasync+0x4f/0x90
do_syscall_64+0x82/0x160
? do_futex+0xcb/0x190
? __x64_sys_futex+0x10e/0x1d0
? switch_fpu_return+0x4f/0xd0
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Another problem here is if task B grabs the private pointer and then uses
it after task A has finished, since the private was allocated in the stack
of task A, it results in some invalid memory access with a hard to predict
result.
This issue, triggering the assertion, was observed with QEMU workloads by
two users in the Link tags below.
Fix this by not relying on a file's private to pass information to fsync
that it should skip locking the inode and instead pass this information
through a special value stored in current->journal_info. This is safe
because in the relevant section of the direct IO write path we are not
holding a transaction handle, so current->journal_info is NULL.
The following C program triggers the issue:
$ cat repro.c
/* Get the O_DIRECT definition. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
static int fd;
static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
{
while (count > 0) {
ssize_t ret;
ret = pwrite(fd, buf, count, offset);
if (ret < 0) {
if (errno == EINTR)
continue;
return ret;
}
count -= ret;
buf += ret;
}
return 0;
}
static void *fsync_loop(void *arg)
{
while (1) {
int ret;
ret = fsync(fd);
if (ret != 0) {
perror("Fsync failed");
exit(6);
}
}
}
int main(int argc, char *argv[])
{
long pagesize;
void *write_buf;
pthread_t fsyncer;
int ret;
if (argc != 2) {
fprintf(stderr, "Use: %s <file path>\n", argv[0]);
return 1;
}
fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
if (fd == -1) {
perror("Failed to open/create file");
return 1;
}
pagesize = sysconf(_SC_PAGE_SIZE);
if (pagesize == -1) {
perror("Failed to get page size");
return 2;
}
ret = posix_memalign(&write_buf, pagesize, pagesize);
if (ret) {
perror("Failed to allocate buffer");
return 3;
}
ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
if (ret != 0) {
fprintf(stderr, "Failed to create writer thread: %d\n", ret);
return 4;
}
while (1) {
ret = do_write(fd, write_buf, pagesize, 0);
if (ret != 0) {
perror("Write failed");
exit(5);
}
}
return 0;
}
$ mkfs.btrfs -f /dev/sdi
$ mount /dev/sdi /mnt/sdi
$ timeout 10 ./repro /mnt/sdi/foo
Usually the race is triggered within less than 1 second. A test case for
fstests will follow soon.
Reported-by: Paulo Dias <paulo.miguel.dias(a)gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
Reported-by: Andreas Jahn <jahn-andi(a)web.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
Reported-by: syzbot+4704b3cc972bd76024f1(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 75fa563e4cac..c8568b1a61c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -459,7 +459,6 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
- bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 67adbe9d294a..364bce34f034 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -864,13 +864,6 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
} else {
- struct btrfs_file_private stack_private = { 0 };
- struct btrfs_file_private *private;
- const bool have_private = (file->private_data != NULL);
-
- if (!have_private)
- file->private_data = &stack_private;
-
/*
* If we have a synchronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
@@ -879,13 +872,10 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
- private = file->private_data;
- private->fsync_skip_inode_lock = true;
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
ret = iomap_dio_complete(dio);
- private->fsync_skip_inode_lock = false;
-
- if (!have_private)
- file->private_data = NULL;
+ current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9914419f3b7d..2aeb8116549c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1603,7 +1603,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
@@ -1613,7 +1612,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
+ bool skip_ilock = false;
+
+ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
+ skip_ilock = true;
+ current->journal_info = NULL;
+ lockdep_assert_held(&inode->vfs_inode.i_rwsem);
+ }
trace_btrfs_sync_file(file, datasync);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 98c03ddc760b..dd9ce9b9f69e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,6 +27,12 @@ struct btrfs_root_item;
struct btrfs_root;
struct btrfs_path;
+/*
+ * Signal that a direct IO write is in progress, to avoid deadlock for sync
+ * direct IO writes when fsync is called during the direct IO write path.
+ */
+#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
+
/* Radix-tree tag for roots that are part of the trasaction. */
#define BTRFS_ROOT_TRANS_TAG 0
The patch below does not apply to the 6.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.10.y
git checkout FETCH_HEAD
git cherry-pick -x cd9253c23aedd61eb5ff11f37a36247cd46faf86
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090844-flattered-badass-d13c@gregkh' --subject-prefix 'PATCH 6.10.y' HEAD^..
Possible dependencies:
cd9253c23aed ("btrfs: fix race between direct IO write and fsync when using same fd")
939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
9aa29a20b700 ("btrfs: move the direct IO code into its own file")
04ef7631bfa5 ("btrfs: cleanup duplicated parameters related to btrfs_create_dio_extent()")
9fec848b3a33 ("btrfs: cleanup duplicated parameters related to create_io_em()")
e9ea31fb5c1f ("btrfs: cleanup duplicated parameters related to btrfs_alloc_ordered_extent")
cdc627e65c7e ("btrfs: cleanup duplicated parameters related to can_nocow_file_extent_args")
c77a8c61002e ("btrfs: remove extent_map::block_start member")
e28b851ed9b2 ("btrfs: remove extent_map::block_len member")
4aa7b5d1784f ("btrfs: remove extent_map::orig_start member")
3f255ece2f1e ("btrfs: introduce extra sanity checks for extent maps")
3d2ac9922465 ("btrfs: introduce new members for extent_map")
87a6962f73b1 ("btrfs: export the expected file extent through can_nocow_extent()")
e8fe524da027 ("btrfs: rename extent_map::orig_block_len to disk_num_bytes")
8996f61ab9ff ("btrfs: move fiemap code into its own file")
56b7169f691c ("btrfs: use a btrfs_inode local variable at btrfs_sync_file()")
e641e323abb3 ("btrfs: pass a btrfs_inode to btrfs_wait_ordered_range()")
cef2daba4268 ("btrfs: pass a btrfs_inode to btrfs_fdatawrite_range()")
4e660ca3a98d ("btrfs: use a regular rb_root instead of cached rb_root for extent_map_tree")
7f5830bc964d ("btrfs: rename rb_root member of extent_map_tree from map to root")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cd9253c23aedd61eb5ff11f37a36247cd46faf86 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Thu, 29 Aug 2024 18:25:49 +0100
Subject: [PATCH] btrfs: fix race between direct IO write and fsync when using
same fd
If we have 2 threads that are using the same file descriptor and one of
them is doing direct IO writes while the other is doing fsync, we have a
race where we can end up either:
1) Attempt a fsync without holding the inode's lock, triggering an
assertion failures when assertions are enabled;
2) Do an invalid memory access from the fsync task because the file private
points to memory allocated on stack by the direct IO task and it may be
used by the fsync task after the stack was destroyed.
The race happens like this:
1) A user space program opens a file descriptor with O_DIRECT;
2) The program spawns 2 threads using libpthread for example;
3) One of the threads uses the file descriptor to do direct IO writes,
while the other calls fsync using the same file descriptor.
4) Call task A the thread doing direct IO writes and task B the thread
doing fsyncs;
5) Task A does a direct IO write, and at btrfs_direct_write() sets the
file's private to an on stack allocated private with the member
'fsync_skip_inode_lock' set to true;
6) Task B enters btrfs_sync_file() and sees that there's a private
structure associated to the file which has 'fsync_skip_inode_lock' set
to true, so it skips locking the inode's VFS lock;
7) Task A completes the direct IO write, and resets the file's private to
NULL since it had no prior private and our private was stack allocated.
Then it unlocks the inode's VFS lock;
8) Task B enters btrfs_get_ordered_extents_for_logging(), then the
assertion that checks the inode's VFS lock is held fails, since task B
never locked it and task A has already unlocked it.
The stack trace produced is the following:
assertion failed: inode_is_locked(&inode->vfs_inode), in fs/btrfs/ordered-data.c:983
------------[ cut here ]------------
kernel BUG at fs/btrfs/ordered-data.c:983!
Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 9 PID: 5072 Comm: worker Tainted: G U OE 6.10.5-1-default #1 openSUSE Tumbleweed 69f48d427608e1c09e60ea24c6c55e2ca1b049e8
Hardware name: Acer Predator PH315-52/Covini_CFS, BIOS V1.12 07/28/2020
RIP: 0010:btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs]
Code: 50 d6 86 c0 e8 (...)
RSP: 0018:ffff9e4a03dcfc78 EFLAGS: 00010246
RAX: 0000000000000054 RBX: ffff9078a9868e98 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff907dce4a7800 RDI: ffff907dce4a7800
RBP: ffff907805518800 R08: 0000000000000000 R09: ffff9e4a03dcfb38
R10: ffff9e4a03dcfb30 R11: 0000000000000003 R12: ffff907684ae7800
R13: 0000000000000001 R14: ffff90774646b600 R15: 0000000000000000
FS: 00007f04b96006c0(0000) GS:ffff907dce480000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f32acbfc000 CR3: 00000001fd4fa005 CR4: 00000000003726f0
Call Trace:
<TASK>
? __die_body.cold+0x14/0x24
? die+0x2e/0x50
? do_trap+0xca/0x110
? do_error_trap+0x6a/0x90
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? exc_invalid_op+0x50/0x70
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? asm_exc_invalid_op+0x1a/0x20
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? btrfs_get_ordered_extents_for_logging.cold+0x1f/0x42 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
btrfs_sync_file+0x21a/0x4d0 [btrfs bb26272d49b4cdc847cf3f7faadd459b62caee9a]
? __seccomp_filter+0x31d/0x4f0
__x64_sys_fdatasync+0x4f/0x90
do_syscall_64+0x82/0x160
? do_futex+0xcb/0x190
? __x64_sys_futex+0x10e/0x1d0
? switch_fpu_return+0x4f/0xd0
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
? syscall_exit_to_user_mode+0x72/0x220
? do_syscall_64+0x8e/0x160
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Another problem here is if task B grabs the private pointer and then uses
it after task A has finished, since the private was allocated in the stack
of task A, it results in some invalid memory access with a hard to predict
result.
This issue, triggering the assertion, was observed with QEMU workloads by
two users in the Link tags below.
Fix this by not relying on a file's private to pass information to fsync
that it should skip locking the inode and instead pass this information
through a special value stored in current->journal_info. This is safe
because in the relevant section of the direct IO write path we are not
holding a transaction handle, so current->journal_info is NULL.
The following C program triggers the issue:
$ cat repro.c
/* Get the O_DIRECT definition. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
static int fd;
static ssize_t do_write(int fd, const void *buf, size_t count, off_t offset)
{
while (count > 0) {
ssize_t ret;
ret = pwrite(fd, buf, count, offset);
if (ret < 0) {
if (errno == EINTR)
continue;
return ret;
}
count -= ret;
buf += ret;
}
return 0;
}
static void *fsync_loop(void *arg)
{
while (1) {
int ret;
ret = fsync(fd);
if (ret != 0) {
perror("Fsync failed");
exit(6);
}
}
}
int main(int argc, char *argv[])
{
long pagesize;
void *write_buf;
pthread_t fsyncer;
int ret;
if (argc != 2) {
fprintf(stderr, "Use: %s <file path>\n", argv[0]);
return 1;
}
fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT, 0666);
if (fd == -1) {
perror("Failed to open/create file");
return 1;
}
pagesize = sysconf(_SC_PAGE_SIZE);
if (pagesize == -1) {
perror("Failed to get page size");
return 2;
}
ret = posix_memalign(&write_buf, pagesize, pagesize);
if (ret) {
perror("Failed to allocate buffer");
return 3;
}
ret = pthread_create(&fsyncer, NULL, fsync_loop, NULL);
if (ret != 0) {
fprintf(stderr, "Failed to create writer thread: %d\n", ret);
return 4;
}
while (1) {
ret = do_write(fd, write_buf, pagesize, 0);
if (ret != 0) {
perror("Write failed");
exit(5);
}
}
return 0;
}
$ mkfs.btrfs -f /dev/sdi
$ mount /dev/sdi /mnt/sdi
$ timeout 10 ./repro /mnt/sdi/foo
Usually the race is triggered within less than 1 second. A test case for
fstests will follow soon.
Reported-by: Paulo Dias <paulo.miguel.dias(a)gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219187
Reported-by: Andreas Jahn <jahn-andi(a)web.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219199
Reported-by: syzbot+4704b3cc972bd76024f1(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000044ff540620d7dee2@google.com/
Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write")
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 75fa563e4cac..c8568b1a61c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -459,7 +459,6 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
- bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 67adbe9d294a..364bce34f034 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -864,13 +864,6 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
} else {
- struct btrfs_file_private stack_private = { 0 };
- struct btrfs_file_private *private;
- const bool have_private = (file->private_data != NULL);
-
- if (!have_private)
- file->private_data = &stack_private;
-
/*
* If we have a synchronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
@@ -879,13 +872,10 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
- private = file->private_data;
- private->fsync_skip_inode_lock = true;
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
ret = iomap_dio_complete(dio);
- private->fsync_skip_inode_lock = false;
-
- if (!have_private)
- file->private_data = NULL;
+ current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9914419f3b7d..2aeb8116549c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1603,7 +1603,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
@@ -1613,7 +1612,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
+ bool skip_ilock = false;
+
+ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
+ skip_ilock = true;
+ current->journal_info = NULL;
+ lockdep_assert_held(&inode->vfs_inode.i_rwsem);
+ }
trace_btrfs_sync_file(file, datasync);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 98c03ddc760b..dd9ce9b9f69e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,6 +27,12 @@ struct btrfs_root_item;
struct btrfs_root;
struct btrfs_path;
+/*
+ * Signal that a direct IO write is in progress, to avoid deadlock for sync
+ * direct IO writes when fsync is called during the direct IO write path.
+ */
+#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
+
/* Radix-tree tag for roots that are part of the trasaction. */
#define BTRFS_ROOT_TRANS_TAG 0
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 0ecc5be200c84e67114f3640064ba2bae3ba2f5a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090830-kangaroo-hassle-e959@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
0ecc5be200c8 ("x86/apic: Make x2apic_disable() work correctly")
720a22fd6c1c ("x86/apic: Don't access the APIC when disabling x2APIC")
5a88f354dcd8 ("x86/apic: Split register_apic_address()")
d10a904435fa ("x86/apic: Consolidate boot_cpu_physical_apicid initialization sites")
49062454a3eb ("x86/apic: Rename disable_apic")
bea629d57d00 ("x86/apic: Save the APIC virtual base address")
3adee777ad0d ("x86/smpboot: Remove initial_stack on 64-bit")
94a855111ed9 ("Merge tag 'x86_core_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0ecc5be200c84e67114f3640064ba2bae3ba2f5a Mon Sep 17 00:00:00 2001
From: Yuntao Wang <yuntao.wang(a)linux.dev>
Date: Tue, 13 Aug 2024 09:48:27 +0800
Subject: [PATCH] x86/apic: Make x2apic_disable() work correctly
x2apic_disable() clears x2apic_state and x2apic_mode unconditionally, even
when the state is X2APIC_ON_LOCKED, which prevents the kernel to disable
it thereby creating inconsistent state.
Due to the early state check for X2APIC_ON, the code path which warns about
a locked X2APIC cannot be reached.
Test for state < X2APIC_ON instead and move the clearing of the state and
mode variables to the place which actually disables X2APIC.
[ tglx: Massaged change log. Added Fixes tag. Moved clearing so it's at the
right place for back ports ]
Fixes: a57e456a7b28 ("x86/apic: Fix fallout from x2apic cleanup")
Signed-off-by: Yuntao Wang <yuntao.wang(a)linux.dev>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240813014827.895381-1-yuntao.wang@linux.dev
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 66fd4b2a37a3..373638691cd4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
+ u32 x2apic_id;
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
-
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
/*
* Don't reread the APIC ID as it was already done from
* check_x2apic() and the APIC driver still is a x2APIC variant,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 0ecc5be200c84e67114f3640064ba2bae3ba2f5a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090830-reactive-jokester-6061@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
0ecc5be200c8 ("x86/apic: Make x2apic_disable() work correctly")
720a22fd6c1c ("x86/apic: Don't access the APIC when disabling x2APIC")
5a88f354dcd8 ("x86/apic: Split register_apic_address()")
d10a904435fa ("x86/apic: Consolidate boot_cpu_physical_apicid initialization sites")
49062454a3eb ("x86/apic: Rename disable_apic")
bea629d57d00 ("x86/apic: Save the APIC virtual base address")
3adee777ad0d ("x86/smpboot: Remove initial_stack on 64-bit")
94a855111ed9 ("Merge tag 'x86_core_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0ecc5be200c84e67114f3640064ba2bae3ba2f5a Mon Sep 17 00:00:00 2001
From: Yuntao Wang <yuntao.wang(a)linux.dev>
Date: Tue, 13 Aug 2024 09:48:27 +0800
Subject: [PATCH] x86/apic: Make x2apic_disable() work correctly
x2apic_disable() clears x2apic_state and x2apic_mode unconditionally, even
when the state is X2APIC_ON_LOCKED, which prevents the kernel to disable
it thereby creating inconsistent state.
Due to the early state check for X2APIC_ON, the code path which warns about
a locked X2APIC cannot be reached.
Test for state < X2APIC_ON instead and move the clearing of the state and
mode variables to the place which actually disables X2APIC.
[ tglx: Massaged change log. Added Fixes tag. Moved clearing so it's at the
right place for back ports ]
Fixes: a57e456a7b28 ("x86/apic: Fix fallout from x2apic cleanup")
Signed-off-by: Yuntao Wang <yuntao.wang(a)linux.dev>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240813014827.895381-1-yuntao.wang@linux.dev
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 66fd4b2a37a3..373638691cd4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
+ u32 x2apic_id;
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
-
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
/*
* Don't reread the APIC ID as it was already done from
* check_x2apic() and the APIC driver still is a x2APIC variant,
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 0ecc5be200c84e67114f3640064ba2bae3ba2f5a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090829-clench-kinfolk-03d9@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
0ecc5be200c8 ("x86/apic: Make x2apic_disable() work correctly")
720a22fd6c1c ("x86/apic: Don't access the APIC when disabling x2APIC")
5a88f354dcd8 ("x86/apic: Split register_apic_address()")
d10a904435fa ("x86/apic: Consolidate boot_cpu_physical_apicid initialization sites")
49062454a3eb ("x86/apic: Rename disable_apic")
bea629d57d00 ("x86/apic: Save the APIC virtual base address")
3adee777ad0d ("x86/smpboot: Remove initial_stack on 64-bit")
94a855111ed9 ("Merge tag 'x86_core_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0ecc5be200c84e67114f3640064ba2bae3ba2f5a Mon Sep 17 00:00:00 2001
From: Yuntao Wang <yuntao.wang(a)linux.dev>
Date: Tue, 13 Aug 2024 09:48:27 +0800
Subject: [PATCH] x86/apic: Make x2apic_disable() work correctly
x2apic_disable() clears x2apic_state and x2apic_mode unconditionally, even
when the state is X2APIC_ON_LOCKED, which prevents the kernel to disable
it thereby creating inconsistent state.
Due to the early state check for X2APIC_ON, the code path which warns about
a locked X2APIC cannot be reached.
Test for state < X2APIC_ON instead and move the clearing of the state and
mode variables to the place which actually disables X2APIC.
[ tglx: Massaged change log. Added Fixes tag. Moved clearing so it's at the
right place for back ports ]
Fixes: a57e456a7b28 ("x86/apic: Fix fallout from x2apic cleanup")
Signed-off-by: Yuntao Wang <yuntao.wang(a)linux.dev>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240813014827.895381-1-yuntao.wang@linux.dev
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 66fd4b2a37a3..373638691cd4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
+ u32 x2apic_id;
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
-
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
/*
* Don't reread the APIC ID as it was already done from
* check_x2apic() and the APIC driver still is a x2APIC variant,
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 0ecc5be200c84e67114f3640064ba2bae3ba2f5a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090828-tiny-boggle-b7b4@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
0ecc5be200c8 ("x86/apic: Make x2apic_disable() work correctly")
720a22fd6c1c ("x86/apic: Don't access the APIC when disabling x2APIC")
5a88f354dcd8 ("x86/apic: Split register_apic_address()")
d10a904435fa ("x86/apic: Consolidate boot_cpu_physical_apicid initialization sites")
49062454a3eb ("x86/apic: Rename disable_apic")
bea629d57d00 ("x86/apic: Save the APIC virtual base address")
3adee777ad0d ("x86/smpboot: Remove initial_stack on 64-bit")
94a855111ed9 ("Merge tag 'x86_core_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0ecc5be200c84e67114f3640064ba2bae3ba2f5a Mon Sep 17 00:00:00 2001
From: Yuntao Wang <yuntao.wang(a)linux.dev>
Date: Tue, 13 Aug 2024 09:48:27 +0800
Subject: [PATCH] x86/apic: Make x2apic_disable() work correctly
x2apic_disable() clears x2apic_state and x2apic_mode unconditionally, even
when the state is X2APIC_ON_LOCKED, which prevents the kernel to disable
it thereby creating inconsistent state.
Due to the early state check for X2APIC_ON, the code path which warns about
a locked X2APIC cannot be reached.
Test for state < X2APIC_ON instead and move the clearing of the state and
mode variables to the place which actually disables X2APIC.
[ tglx: Massaged change log. Added Fixes tag. Moved clearing so it's at the
right place for back ports ]
Fixes: a57e456a7b28 ("x86/apic: Fix fallout from x2apic cleanup")
Signed-off-by: Yuntao Wang <yuntao.wang(a)linux.dev>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240813014827.895381-1-yuntao.wang@linux.dev
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 66fd4b2a37a3..373638691cd4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
+ u32 x2apic_id;
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
-
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
/*
* Don't reread the APIC ID as it was already done from
* check_x2apic() and the APIC driver still is a x2APIC variant,
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 0ecc5be200c84e67114f3640064ba2bae3ba2f5a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090827-recount-humbly-e075@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
0ecc5be200c8 ("x86/apic: Make x2apic_disable() work correctly")
720a22fd6c1c ("x86/apic: Don't access the APIC when disabling x2APIC")
5a88f354dcd8 ("x86/apic: Split register_apic_address()")
d10a904435fa ("x86/apic: Consolidate boot_cpu_physical_apicid initialization sites")
49062454a3eb ("x86/apic: Rename disable_apic")
bea629d57d00 ("x86/apic: Save the APIC virtual base address")
3adee777ad0d ("x86/smpboot: Remove initial_stack on 64-bit")
94a855111ed9 ("Merge tag 'x86_core_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0ecc5be200c84e67114f3640064ba2bae3ba2f5a Mon Sep 17 00:00:00 2001
From: Yuntao Wang <yuntao.wang(a)linux.dev>
Date: Tue, 13 Aug 2024 09:48:27 +0800
Subject: [PATCH] x86/apic: Make x2apic_disable() work correctly
x2apic_disable() clears x2apic_state and x2apic_mode unconditionally, even
when the state is X2APIC_ON_LOCKED, which prevents the kernel to disable
it thereby creating inconsistent state.
Due to the early state check for X2APIC_ON, the code path which warns about
a locked X2APIC cannot be reached.
Test for state < X2APIC_ON instead and move the clearing of the state and
mode variables to the place which actually disables X2APIC.
[ tglx: Massaged change log. Added Fixes tag. Moved clearing so it's at the
right place for back ports ]
Fixes: a57e456a7b28 ("x86/apic: Fix fallout from x2apic cleanup")
Signed-off-by: Yuntao Wang <yuntao.wang(a)linux.dev>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240813014827.895381-1-yuntao.wang@linux.dev
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 66fd4b2a37a3..373638691cd4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
+ u32 x2apic_id;
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
-
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
/*
* Don't reread the APIC ID as it was already done from
* check_x2apic() and the APIC driver still is a x2APIC variant,
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 2848ff28d180bd63a95da8e5dcbcdd76c1beeb7b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090810-amply-schedule-a7d0@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
2848ff28d180 ("x86/fpu: Avoid writing LBR bit to IA32_XSS unless supported")
c33f0a81a2cf ("x86/fpu: Add fpu_state_config::legacy_features")
d72c87018d00 ("x86/fpu/xstate: Move remaining xfeature helpers to core")
eda32f4f93b4 ("x86/fpu: Rework restore_regs_from_fpstate()")
daddee247319 ("x86/fpu: Mop up xfeatures_mask_uabi()")
1c253ff2287f ("x86/fpu: Move xstate feature masks to fpu_*_cfg")
2bd264bce238 ("x86/fpu: Move xstate size to fpu_*_cfg")
cd9ae7617449 ("x86/fpu/xstate: Cleanup size calculations")
617473acdfe4 ("x86/fpu: Cleanup fpu__init_system_xstate_size_legacy()")
578971f4e228 ("x86/fpu: Provide struct fpu_config")
5509cc78080d ("x86/fpu/signal: Use fpstate for size and features")
ad6ede407aae ("x86/fpu: Use fpstate in fpu_copy_kvm_uabi_to_fpstate()")
be31dfdfd75b ("x86/fpu: Use fpstate::size")
248452ce21ae ("x86/fpu: Add size and mask information to fpstate")
2dd8eedc80b1 ("x86/process: Move arch_thread_struct_whitelist() out of line")
c20942ce5128 ("x86/fpu/core: Convert to fpstate")
7e049e8b7459 ("x86/fpu/signal: Convert to fpstate")
087df48c298c ("x86/fpu: Replace KVMs xstate component clearing")
18b3fa1ad15f ("x86/fpu: Convert restore_fpregs_from_fpstate() to struct fpstate")
f83ac56acdad ("x86/fpu: Convert fpstate_init() to struct fpstate")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2848ff28d180bd63a95da8e5dcbcdd76c1beeb7b Mon Sep 17 00:00:00 2001
From: Mitchell Levy <levymitchell0(a)gmail.com>
Date: Mon, 12 Aug 2024 13:44:12 -0700
Subject: [PATCH] x86/fpu: Avoid writing LBR bit to IA32_XSS unless supported
There are two distinct CPU features related to the use of XSAVES and LBR:
whether LBR is itself supported and whether XSAVES supports LBR. The LBR
subsystem correctly checks both in intel_pmu_arch_lbr_init(), but the
XSTATE subsystem does not.
The LBR bit is only removed from xfeatures_mask_independent when LBR is not
supported by the CPU, but there is no validation of XSTATE support.
If XSAVES does not support LBR the write to IA32_XSS causes a #GP fault,
leaving the state of IA32_XSS unchanged, i.e. zero. The fault is handled
with a warning and the boot continues.
Consequently the next XRSTORS which tries to restore supervisor state fails
with #GP because the RFBM has zero for all supervisor features, which does
not match the XCOMP_BV field.
As XFEATURE_MASK_FPSTATE includes supervisor features setting up the FPU
causes a #GP, which ends up in fpu_reset_from_exception_fixup(). That fails
due to the same problem resulting in recursive #GPs until the kernel runs
out of stack space and double faults.
Prevent this by storing the supported independent features in
fpu_kernel_cfg during XSTATE initialization and use that cached value for
retrieving the independent feature bits to be written into IA32_XSS.
[ tglx: Massaged change log ]
Fixes: f0dccc9da4c0 ("x86/fpu/xstate: Support dynamic supervisor feature for LBR")
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Mitchell Levy <levymitchell0(a)gmail.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240812-xsave-lbr-fix-v3-1-95bac1bf62f4@gmail.…
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index eb17f31b06d2..de16862bf230 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -591,6 +591,13 @@ struct fpu_state_config {
* even without XSAVE support, i.e. legacy features FP + SSE
*/
u64 legacy_features;
+ /*
+ * @independent_features:
+ *
+ * Features that are supported by XSAVES, but not managed as part of
+ * the FPU core, such as LBR
+ */
+ u64 independent_features;
};
/* FPU state configuration information */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c5a026fee5e0..1339f8328db5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -788,6 +788,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}
+ fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
+ XFEATURE_MASK_INDEPENDENT;
+
/*
* Clear XSAVE features that are disabled in the normal CPUID.
*/
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 2ee0b9c53dcc..afb404cd2059 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -62,9 +62,9 @@ static inline u64 xfeatures_mask_supervisor(void)
static inline u64 xfeatures_mask_independent(void)
{
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
- return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
+ return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR;
- return XFEATURE_MASK_INDEPENDENT;
+ return fpu_kernel_cfg.independent_features;
}
/* XSAVE/XRSTOR wrapper functions */
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090859-delicious-serpent-3635@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
71c186efc1b2 ("userfaultfd: fix checks for huge PMDs")
dab6e717429e ("mm: Rename pmd_read_atomic()")
024d232ae4fc ("mm: Fix pmd_read_atomic()")
fbfdec9989e6 ("x86/mm/pae: Make pmd_t similar to pte_t")
bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap")
ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
ec1c86b25f4b ("mm: multi-gen LRU: groundwork")
f1e1a7be4718 ("mm/vmscan.c: refactor shrink_node()")
d3629af59f41 ("mm/vmscan: make the annotations of refaults code at the right place")
e9c2dbc8bf71 ("mm/vmscan: define macros for refaults in struct lruvec")
507228044236 ("mm/khugepaged: record SCAN_PMD_MAPPED when scan_pmd() finds hugepage")
6614a3c3164a ("Merge tag 'mm-stable-2022-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Tue, 13 Aug 2024 22:25:21 +0200
Subject: [PATCH] userfaultfd: fix checks for huge PMDs
Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2.
The pmd_trans_huge() code in mfill_atomic() is wrong in three different
ways depending on kernel version:
1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit
the right two race windows) - I've tested this in a kernel build with
some extra mdelay() calls. See the commit message for a description
of the race scenario.
On older kernels (before 6.5), I think the same bug can even
theoretically lead to accessing transhuge page contents as a page table
if you hit the right 5 narrow race windows (I haven't tested this case).
2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for
detecting PMDs that don't point to page tables.
On older kernels (before 6.5), you'd just have to win a single fairly
wide race to hit this.
I've tested this on 6.1 stable by racing migration (with a mdelay()
patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86
VM, that causes a kernel oops in ptlock_ptr().
3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed
to yank page tables out from under us (though I haven't tested that),
so I think the BUG_ON() checks in mfill_atomic() are just wrong.
I decided to write two separate fixes for these (one fix for bugs 1+2, one
fix for bug 3), so that the first fix can be backported to kernels
affected by bugs 1+2.
This patch (of 2):
This fixes two issues.
I discovered that the following race can occur:
mfill_atomic other thread
============ ============
<zap PMD>
pmdp_get_lockless() [reads none pmd]
<bail if trans_huge>
<if none:>
<pagefault creates transhuge zeropage>
__pte_alloc [no-op]
<zap PMD>
<bail if pmd_trans_huge(*dst_pmd)>
BUG_ON(pmd_none(*dst_pmd))
I have experimentally verified this in a kernel with extra mdelay() calls;
the BUG_ON(pmd_none(*dst_pmd)) triggers.
On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow
pte_offset_map[_lock]() to fail"), this can't lead to anything worse than
a BUG_ON(), since the page table access helpers are actually designed to
deal with page tables concurrently disappearing; but on older kernels
(<=6.4), I think we could probably theoretically race past the two
BUG_ON() checks and end up treating a hugepage as a page table.
The second issue is that, as Qi Zheng pointed out, there are other types
of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs
(in particular, migration PMDs).
On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a
PMD that contains a migration entry (which just requires winning a single,
fairly wide race), it will pass the PMD to pte_offset_map_lock(), which
assumes that the PMD points to a page table.
Breakage follows: First, the kernel tries to take the PTE lock (which will
crash or maybe worse if there is no "struct page" for the address bits in
the migration entry PMD - I think at least on X86 there usually is no
corresponding "struct page" thanks to the PTE inversion mitigation, amd64
looks different).
If that didn't crash, the kernel would next try to write a PTE into what
it wrongly thinks is a page table.
As part of fixing these issues, get rid of the check for pmd_trans_huge()
before __pte_alloc() - that's redundant, we're going to have to check for
that after the __pte_alloc() anyway.
Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels.
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@goog…
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@goog…
Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
Signed-off-by: Jann Horn <jannh(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Pavel Emelyanov <xemul(a)virtuozzo.com>
Cc: Qi Zheng <zhengqi.arch(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e54e5c8907fa..290b2a0d84ac 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -787,21 +787,23 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
}
dst_pmdval = pmdp_get_lockless(dst_pmd);
- /*
- * If the dst_pmd is mapped as THP don't
- * override it and just be strict.
- */
- if (unlikely(pmd_trans_huge(dst_pmdval))) {
- err = -EEXIST;
- break;
- }
if (unlikely(pmd_none(dst_pmdval)) &&
unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
- /* If an huge pmd materialized from under us fail */
- if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is THP don't override it and just be strict.
+ * (This includes the case where the PMD used to be THP and
+ * changed back to none after __pte_alloc().)
+ */
+ if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
+ pmd_devmap(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_bad(dst_pmdval))) {
err = -EFAULT;
break;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090857-wieldable-pedicure-19b9@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
71c186efc1b2 ("userfaultfd: fix checks for huge PMDs")
dab6e717429e ("mm: Rename pmd_read_atomic()")
024d232ae4fc ("mm: Fix pmd_read_atomic()")
fbfdec9989e6 ("x86/mm/pae: Make pmd_t similar to pte_t")
bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap")
ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
ec1c86b25f4b ("mm: multi-gen LRU: groundwork")
f1e1a7be4718 ("mm/vmscan.c: refactor shrink_node()")
d3629af59f41 ("mm/vmscan: make the annotations of refaults code at the right place")
e9c2dbc8bf71 ("mm/vmscan: define macros for refaults in struct lruvec")
507228044236 ("mm/khugepaged: record SCAN_PMD_MAPPED when scan_pmd() finds hugepage")
6614a3c3164a ("Merge tag 'mm-stable-2022-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Tue, 13 Aug 2024 22:25:21 +0200
Subject: [PATCH] userfaultfd: fix checks for huge PMDs
Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2.
The pmd_trans_huge() code in mfill_atomic() is wrong in three different
ways depending on kernel version:
1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit
the right two race windows) - I've tested this in a kernel build with
some extra mdelay() calls. See the commit message for a description
of the race scenario.
On older kernels (before 6.5), I think the same bug can even
theoretically lead to accessing transhuge page contents as a page table
if you hit the right 5 narrow race windows (I haven't tested this case).
2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for
detecting PMDs that don't point to page tables.
On older kernels (before 6.5), you'd just have to win a single fairly
wide race to hit this.
I've tested this on 6.1 stable by racing migration (with a mdelay()
patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86
VM, that causes a kernel oops in ptlock_ptr().
3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed
to yank page tables out from under us (though I haven't tested that),
so I think the BUG_ON() checks in mfill_atomic() are just wrong.
I decided to write two separate fixes for these (one fix for bugs 1+2, one
fix for bug 3), so that the first fix can be backported to kernels
affected by bugs 1+2.
This patch (of 2):
This fixes two issues.
I discovered that the following race can occur:
mfill_atomic other thread
============ ============
<zap PMD>
pmdp_get_lockless() [reads none pmd]
<bail if trans_huge>
<if none:>
<pagefault creates transhuge zeropage>
__pte_alloc [no-op]
<zap PMD>
<bail if pmd_trans_huge(*dst_pmd)>
BUG_ON(pmd_none(*dst_pmd))
I have experimentally verified this in a kernel with extra mdelay() calls;
the BUG_ON(pmd_none(*dst_pmd)) triggers.
On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow
pte_offset_map[_lock]() to fail"), this can't lead to anything worse than
a BUG_ON(), since the page table access helpers are actually designed to
deal with page tables concurrently disappearing; but on older kernels
(<=6.4), I think we could probably theoretically race past the two
BUG_ON() checks and end up treating a hugepage as a page table.
The second issue is that, as Qi Zheng pointed out, there are other types
of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs
(in particular, migration PMDs).
On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a
PMD that contains a migration entry (which just requires winning a single,
fairly wide race), it will pass the PMD to pte_offset_map_lock(), which
assumes that the PMD points to a page table.
Breakage follows: First, the kernel tries to take the PTE lock (which will
crash or maybe worse if there is no "struct page" for the address bits in
the migration entry PMD - I think at least on X86 there usually is no
corresponding "struct page" thanks to the PTE inversion mitigation, amd64
looks different).
If that didn't crash, the kernel would next try to write a PTE into what
it wrongly thinks is a page table.
As part of fixing these issues, get rid of the check for pmd_trans_huge()
before __pte_alloc() - that's redundant, we're going to have to check for
that after the __pte_alloc() anyway.
Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels.
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@goog…
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@goog…
Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
Signed-off-by: Jann Horn <jannh(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Pavel Emelyanov <xemul(a)virtuozzo.com>
Cc: Qi Zheng <zhengqi.arch(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e54e5c8907fa..290b2a0d84ac 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -787,21 +787,23 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
}
dst_pmdval = pmdp_get_lockless(dst_pmd);
- /*
- * If the dst_pmd is mapped as THP don't
- * override it and just be strict.
- */
- if (unlikely(pmd_trans_huge(dst_pmdval))) {
- err = -EEXIST;
- break;
- }
if (unlikely(pmd_none(dst_pmdval)) &&
unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
- /* If an huge pmd materialized from under us fail */
- if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is THP don't override it and just be strict.
+ * (This includes the case where the PMD used to be THP and
+ * changed back to none after __pte_alloc().)
+ */
+ if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
+ pmd_devmap(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_bad(dst_pmdval))) {
err = -EFAULT;
break;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090855-kiln-shrank-fea8@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
71c186efc1b2 ("userfaultfd: fix checks for huge PMDs")
dab6e717429e ("mm: Rename pmd_read_atomic()")
024d232ae4fc ("mm: Fix pmd_read_atomic()")
fbfdec9989e6 ("x86/mm/pae: Make pmd_t similar to pte_t")
bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap")
ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
ec1c86b25f4b ("mm: multi-gen LRU: groundwork")
f1e1a7be4718 ("mm/vmscan.c: refactor shrink_node()")
d3629af59f41 ("mm/vmscan: make the annotations of refaults code at the right place")
e9c2dbc8bf71 ("mm/vmscan: define macros for refaults in struct lruvec")
507228044236 ("mm/khugepaged: record SCAN_PMD_MAPPED when scan_pmd() finds hugepage")
6614a3c3164a ("Merge tag 'mm-stable-2022-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Tue, 13 Aug 2024 22:25:21 +0200
Subject: [PATCH] userfaultfd: fix checks for huge PMDs
Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2.
The pmd_trans_huge() code in mfill_atomic() is wrong in three different
ways depending on kernel version:
1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit
the right two race windows) - I've tested this in a kernel build with
some extra mdelay() calls. See the commit message for a description
of the race scenario.
On older kernels (before 6.5), I think the same bug can even
theoretically lead to accessing transhuge page contents as a page table
if you hit the right 5 narrow race windows (I haven't tested this case).
2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for
detecting PMDs that don't point to page tables.
On older kernels (before 6.5), you'd just have to win a single fairly
wide race to hit this.
I've tested this on 6.1 stable by racing migration (with a mdelay()
patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86
VM, that causes a kernel oops in ptlock_ptr().
3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed
to yank page tables out from under us (though I haven't tested that),
so I think the BUG_ON() checks in mfill_atomic() are just wrong.
I decided to write two separate fixes for these (one fix for bugs 1+2, one
fix for bug 3), so that the first fix can be backported to kernels
affected by bugs 1+2.
This patch (of 2):
This fixes two issues.
I discovered that the following race can occur:
mfill_atomic other thread
============ ============
<zap PMD>
pmdp_get_lockless() [reads none pmd]
<bail if trans_huge>
<if none:>
<pagefault creates transhuge zeropage>
__pte_alloc [no-op]
<zap PMD>
<bail if pmd_trans_huge(*dst_pmd)>
BUG_ON(pmd_none(*dst_pmd))
I have experimentally verified this in a kernel with extra mdelay() calls;
the BUG_ON(pmd_none(*dst_pmd)) triggers.
On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow
pte_offset_map[_lock]() to fail"), this can't lead to anything worse than
a BUG_ON(), since the page table access helpers are actually designed to
deal with page tables concurrently disappearing; but on older kernels
(<=6.4), I think we could probably theoretically race past the two
BUG_ON() checks and end up treating a hugepage as a page table.
The second issue is that, as Qi Zheng pointed out, there are other types
of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs
(in particular, migration PMDs).
On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a
PMD that contains a migration entry (which just requires winning a single,
fairly wide race), it will pass the PMD to pte_offset_map_lock(), which
assumes that the PMD points to a page table.
Breakage follows: First, the kernel tries to take the PTE lock (which will
crash or maybe worse if there is no "struct page" for the address bits in
the migration entry PMD - I think at least on X86 there usually is no
corresponding "struct page" thanks to the PTE inversion mitigation, amd64
looks different).
If that didn't crash, the kernel would next try to write a PTE into what
it wrongly thinks is a page table.
As part of fixing these issues, get rid of the check for pmd_trans_huge()
before __pte_alloc() - that's redundant, we're going to have to check for
that after the __pte_alloc() anyway.
Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels.
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@goog…
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@goog…
Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
Signed-off-by: Jann Horn <jannh(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Pavel Emelyanov <xemul(a)virtuozzo.com>
Cc: Qi Zheng <zhengqi.arch(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e54e5c8907fa..290b2a0d84ac 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -787,21 +787,23 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
}
dst_pmdval = pmdp_get_lockless(dst_pmd);
- /*
- * If the dst_pmd is mapped as THP don't
- * override it and just be strict.
- */
- if (unlikely(pmd_trans_huge(dst_pmdval))) {
- err = -EEXIST;
- break;
- }
if (unlikely(pmd_none(dst_pmdval)) &&
unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
- /* If an huge pmd materialized from under us fail */
- if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is THP don't override it and just be strict.
+ * (This includes the case where the PMD used to be THP and
+ * changed back to none after __pte_alloc().)
+ */
+ if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
+ pmd_devmap(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_bad(dst_pmdval))) {
err = -EFAULT;
break;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090853-unworldly-aerobics-75a0@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
71c186efc1b2 ("userfaultfd: fix checks for huge PMDs")
dab6e717429e ("mm: Rename pmd_read_atomic()")
024d232ae4fc ("mm: Fix pmd_read_atomic()")
fbfdec9989e6 ("x86/mm/pae: Make pmd_t similar to pte_t")
bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap")
ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
ec1c86b25f4b ("mm: multi-gen LRU: groundwork")
f1e1a7be4718 ("mm/vmscan.c: refactor shrink_node()")
d3629af59f41 ("mm/vmscan: make the annotations of refaults code at the right place")
e9c2dbc8bf71 ("mm/vmscan: define macros for refaults in struct lruvec")
507228044236 ("mm/khugepaged: record SCAN_PMD_MAPPED when scan_pmd() finds hugepage")
6614a3c3164a ("Merge tag 'mm-stable-2022-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Tue, 13 Aug 2024 22:25:21 +0200
Subject: [PATCH] userfaultfd: fix checks for huge PMDs
Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2.
The pmd_trans_huge() code in mfill_atomic() is wrong in three different
ways depending on kernel version:
1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit
the right two race windows) - I've tested this in a kernel build with
some extra mdelay() calls. See the commit message for a description
of the race scenario.
On older kernels (before 6.5), I think the same bug can even
theoretically lead to accessing transhuge page contents as a page table
if you hit the right 5 narrow race windows (I haven't tested this case).
2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for
detecting PMDs that don't point to page tables.
On older kernels (before 6.5), you'd just have to win a single fairly
wide race to hit this.
I've tested this on 6.1 stable by racing migration (with a mdelay()
patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86
VM, that causes a kernel oops in ptlock_ptr().
3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed
to yank page tables out from under us (though I haven't tested that),
so I think the BUG_ON() checks in mfill_atomic() are just wrong.
I decided to write two separate fixes for these (one fix for bugs 1+2, one
fix for bug 3), so that the first fix can be backported to kernels
affected by bugs 1+2.
This patch (of 2):
This fixes two issues.
I discovered that the following race can occur:
mfill_atomic other thread
============ ============
<zap PMD>
pmdp_get_lockless() [reads none pmd]
<bail if trans_huge>
<if none:>
<pagefault creates transhuge zeropage>
__pte_alloc [no-op]
<zap PMD>
<bail if pmd_trans_huge(*dst_pmd)>
BUG_ON(pmd_none(*dst_pmd))
I have experimentally verified this in a kernel with extra mdelay() calls;
the BUG_ON(pmd_none(*dst_pmd)) triggers.
On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow
pte_offset_map[_lock]() to fail"), this can't lead to anything worse than
a BUG_ON(), since the page table access helpers are actually designed to
deal with page tables concurrently disappearing; but on older kernels
(<=6.4), I think we could probably theoretically race past the two
BUG_ON() checks and end up treating a hugepage as a page table.
The second issue is that, as Qi Zheng pointed out, there are other types
of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs
(in particular, migration PMDs).
On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a
PMD that contains a migration entry (which just requires winning a single,
fairly wide race), it will pass the PMD to pte_offset_map_lock(), which
assumes that the PMD points to a page table.
Breakage follows: First, the kernel tries to take the PTE lock (which will
crash or maybe worse if there is no "struct page" for the address bits in
the migration entry PMD - I think at least on X86 there usually is no
corresponding "struct page" thanks to the PTE inversion mitigation, amd64
looks different).
If that didn't crash, the kernel would next try to write a PTE into what
it wrongly thinks is a page table.
As part of fixing these issues, get rid of the check for pmd_trans_huge()
before __pte_alloc() - that's redundant, we're going to have to check for
that after the __pte_alloc() anyway.
Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels.
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@goog…
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@goog…
Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
Signed-off-by: Jann Horn <jannh(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Pavel Emelyanov <xemul(a)virtuozzo.com>
Cc: Qi Zheng <zhengqi.arch(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e54e5c8907fa..290b2a0d84ac 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -787,21 +787,23 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
}
dst_pmdval = pmdp_get_lockless(dst_pmd);
- /*
- * If the dst_pmd is mapped as THP don't
- * override it and just be strict.
- */
- if (unlikely(pmd_trans_huge(dst_pmdval))) {
- err = -EEXIST;
- break;
- }
if (unlikely(pmd_none(dst_pmdval)) &&
unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
- /* If an huge pmd materialized from under us fail */
- if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is THP don't override it and just be strict.
+ * (This includes the case where the PMD used to be THP and
+ * changed back to none after __pte_alloc().)
+ */
+ if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
+ pmd_devmap(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_bad(dst_pmdval))) {
err = -EFAULT;
break;
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090851-swinging-pug-fba8@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
71c186efc1b2 ("userfaultfd: fix checks for huge PMDs")
dab6e717429e ("mm: Rename pmd_read_atomic()")
024d232ae4fc ("mm: Fix pmd_read_atomic()")
fbfdec9989e6 ("x86/mm/pae: Make pmd_t similar to pte_t")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Tue, 13 Aug 2024 22:25:21 +0200
Subject: [PATCH] userfaultfd: fix checks for huge PMDs
Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2.
The pmd_trans_huge() code in mfill_atomic() is wrong in three different
ways depending on kernel version:
1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit
the right two race windows) - I've tested this in a kernel build with
some extra mdelay() calls. See the commit message for a description
of the race scenario.
On older kernels (before 6.5), I think the same bug can even
theoretically lead to accessing transhuge page contents as a page table
if you hit the right 5 narrow race windows (I haven't tested this case).
2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for
detecting PMDs that don't point to page tables.
On older kernels (before 6.5), you'd just have to win a single fairly
wide race to hit this.
I've tested this on 6.1 stable by racing migration (with a mdelay()
patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86
VM, that causes a kernel oops in ptlock_ptr().
3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed
to yank page tables out from under us (though I haven't tested that),
so I think the BUG_ON() checks in mfill_atomic() are just wrong.
I decided to write two separate fixes for these (one fix for bugs 1+2, one
fix for bug 3), so that the first fix can be backported to kernels
affected by bugs 1+2.
This patch (of 2):
This fixes two issues.
I discovered that the following race can occur:
mfill_atomic other thread
============ ============
<zap PMD>
pmdp_get_lockless() [reads none pmd]
<bail if trans_huge>
<if none:>
<pagefault creates transhuge zeropage>
__pte_alloc [no-op]
<zap PMD>
<bail if pmd_trans_huge(*dst_pmd)>
BUG_ON(pmd_none(*dst_pmd))
I have experimentally verified this in a kernel with extra mdelay() calls;
the BUG_ON(pmd_none(*dst_pmd)) triggers.
On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow
pte_offset_map[_lock]() to fail"), this can't lead to anything worse than
a BUG_ON(), since the page table access helpers are actually designed to
deal with page tables concurrently disappearing; but on older kernels
(<=6.4), I think we could probably theoretically race past the two
BUG_ON() checks and end up treating a hugepage as a page table.
The second issue is that, as Qi Zheng pointed out, there are other types
of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs
(in particular, migration PMDs).
On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a
PMD that contains a migration entry (which just requires winning a single,
fairly wide race), it will pass the PMD to pte_offset_map_lock(), which
assumes that the PMD points to a page table.
Breakage follows: First, the kernel tries to take the PTE lock (which will
crash or maybe worse if there is no "struct page" for the address bits in
the migration entry PMD - I think at least on X86 there usually is no
corresponding "struct page" thanks to the PTE inversion mitigation, amd64
looks different).
If that didn't crash, the kernel would next try to write a PTE into what
it wrongly thinks is a page table.
As part of fixing these issues, get rid of the check for pmd_trans_huge()
before __pte_alloc() - that's redundant, we're going to have to check for
that after the __pte_alloc() anyway.
Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels.
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@goog…
Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@goog…
Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
Signed-off-by: Jann Horn <jannh(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Pavel Emelyanov <xemul(a)virtuozzo.com>
Cc: Qi Zheng <zhengqi.arch(a)bytedance.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e54e5c8907fa..290b2a0d84ac 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -787,21 +787,23 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
}
dst_pmdval = pmdp_get_lockless(dst_pmd);
- /*
- * If the dst_pmd is mapped as THP don't
- * override it and just be strict.
- */
- if (unlikely(pmd_trans_huge(dst_pmdval))) {
- err = -EEXIST;
- break;
- }
if (unlikely(pmd_none(dst_pmdval)) &&
unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
- /* If an huge pmd materialized from under us fail */
- if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is THP don't override it and just be strict.
+ * (This includes the case where the PMD used to be THP and
+ * changed back to none after __pte_alloc().)
+ */
+ if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
+ pmd_devmap(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_bad(dst_pmdval))) {
err = -EFAULT;
break;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 49aa8a1f4d6800721c7971ed383078257f12e8f9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090835-matchbox-untreated-40ad@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
49aa8a1f4d68 ("tracing: Avoid possible softlockup in tracing_iter_reset()")
bc1a72afdc4a ("ring-buffer: Rename ring_buffer_read() to read_buffer_iter_advance()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 49aa8a1f4d6800721c7971ed383078257f12e8f9 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian(a)huaweicloud.com>
Date: Tue, 27 Aug 2024 20:46:54 +0800
Subject: [PATCH] tracing: Avoid possible softlockup in tracing_iter_reset()
In __tracing_open(), when max latency tracers took place on the cpu,
the time start of its buffer would be updated, then event entries with
timestamps being earlier than start of the buffer would be skipped
(see tracing_iter_reset()).
Softlockup will occur if the kernel is non-preemptible and too many
entries were skipped in the loop that reset every cpu buffer, so add
cond_resched() to avoid it.
Cc: stable(a)vger.kernel.org
Fixes: 2f26ebd549b9a ("tracing: use timestamp to determine start of latency traces")
Link: https://lore.kernel.org/20240827124654.3817443-1-zhengyejian@huaweicloud.com
Suggested-by: Steven Rostedt <rostedt(a)goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian(a)huaweicloud.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ebe7ce2f5f4a..edf6bc817aa1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3958,6 +3958,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
break;
entries++;
ring_buffer_iter_advance(buf_iter);
+ /* This could be a big loop */
+ cond_resched();
}
per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 49aa8a1f4d6800721c7971ed383078257f12e8f9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090834-cheddar-crust-24f3@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
49aa8a1f4d68 ("tracing: Avoid possible softlockup in tracing_iter_reset()")
bc1a72afdc4a ("ring-buffer: Rename ring_buffer_read() to read_buffer_iter_advance()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 49aa8a1f4d6800721c7971ed383078257f12e8f9 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian(a)huaweicloud.com>
Date: Tue, 27 Aug 2024 20:46:54 +0800
Subject: [PATCH] tracing: Avoid possible softlockup in tracing_iter_reset()
In __tracing_open(), when max latency tracers took place on the cpu,
the time start of its buffer would be updated, then event entries with
timestamps being earlier than start of the buffer would be skipped
(see tracing_iter_reset()).
Softlockup will occur if the kernel is non-preemptible and too many
entries were skipped in the loop that reset every cpu buffer, so add
cond_resched() to avoid it.
Cc: stable(a)vger.kernel.org
Fixes: 2f26ebd549b9a ("tracing: use timestamp to determine start of latency traces")
Link: https://lore.kernel.org/20240827124654.3817443-1-zhengyejian@huaweicloud.com
Suggested-by: Steven Rostedt <rostedt(a)goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian(a)huaweicloud.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ebe7ce2f5f4a..edf6bc817aa1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3958,6 +3958,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
break;
entries++;
ring_buffer_iter_advance(buf_iter);
+ /* This could be a big loop */
+ cond_resched();
}
per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x bfe0857c20c663fcc1592fa4e3a61ca12b07dac9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090830-imaging-symphonic-8783@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
bfe0857c20c6 ("Revert "mm: skip CMA pages when they are not available"")
97144ce008f9 ("mm/vmscan: use folio_migratetype() instead of get_pageblock_migratetype()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bfe0857c20c663fcc1592fa4e3a61ca12b07dac9 Mon Sep 17 00:00:00 2001
From: Usama Arif <usamaarif642(a)gmail.com>
Date: Wed, 21 Aug 2024 20:26:07 +0100
Subject: [PATCH] Revert "mm: skip CMA pages when they are not available"
This reverts commit 5da226dbfce3 ("mm: skip CMA pages when they are not
available") and b7108d66318a ("Multi-gen LRU: skip CMA pages when they are
not eligible").
lruvec->lru_lock is highly contended and is held when calling
isolate_lru_folios. If the lru has a large number of CMA folios
consecutively, while the allocation type requested is not MIGRATE_MOVABLE,
isolate_lru_folios can hold the lock for a very long time while it skips
those. For FIO workload, ~150million order=0 folios were skipped to
isolate a few ZONE_DMA folios [1]. This can cause lockups [1] and high
memory pressure for extended periods of time [2].
Remove skipping CMA for MGLRU as well, as it was introduced in sort_folio
for the same resaon as 5da226dbfce3a2f44978c2c7cf88166e69a6788b.
[1] https://lore.kernel.org/all/CAOUHufbkhMZYz20aM_3rHZ3OcK4m2puji2FGpUpn_-DevG…
[2] https://lore.kernel.org/all/ZrssOrcJIDy8hacI@gmail.com/
[usamaarif642(a)gmail.com: also revert b7108d66318a, per Johannes]
Link: https://lkml.kernel.org/r/9060a32d-b2d7-48c0-8626-1db535653c54@gmail.com
Link: https://lkml.kernel.org/r/357ac325-4c61-497a-92a3-bdbd230d5ec9@gmail.com
Link: https://lkml.kernel.org/r/9060a32d-b2d7-48c0-8626-1db535653c54@gmail.com
Fixes: 5da226dbfce3 ("mm: skip CMA pages when they are not available")
Signed-off-by: Usama Arif <usamaarif642(a)gmail.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Bharata B Rao <bharata(a)amd.com>
Cc: Breno Leitao <leitao(a)debian.org>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Zhaoyang Huang <huangzhaoyang(a)gmail.com>
Cc: Zhaoyang Huang <zhaoyang.huang(a)unisoc.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cfa839284b92..bd489c1af228 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1604,25 +1604,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
-#ifdef CONFIG_CMA
-/*
- * It is waste of effort to scan and reclaim CMA pages if it is not available
- * for current allocation context. Kswapd can not be enrolled as it can not
- * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
- */
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
- return !current_is_kswapd() &&
- gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
- folio_migratetype(folio) == MIGRATE_CMA;
-}
-#else
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
- return false;
-}
-#endif
-
/*
* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
*
@@ -1669,8 +1650,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx ||
- skip_cma(folio, sc)) {
+ if (folio_zonenum(folio) > sc->reclaim_idx) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
goto move;
@@ -4320,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* ineligible */
- if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
+ if (zone > sc->reclaim_idx) {
gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f806de88d8f7f8191afd0fd9b94db4cd058e7d4f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090851-coroner-surname-cd39@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
f806de88d8f7 ("maple_tree: remove rcu_read_lock() from mt_validate()")
9a40d45c1f2c ("maple_tree: remove mas_searchable()")
067311d33e65 ("maple_tree: separate ma_state node from status")
271f61a8b41d ("maple_tree: clean up inlines for some functions")
bf857ddd21d0 ("maple_tree: move debug check to __mas_set_range()")
f7a590189539 ("maple_tree: make mas_erase() more robust")
a2587a7e8d37 ("maple_tree: add test for mtree_dup()")
fd32e4e9b764 ("maple_tree: introduce interfaces __mt_dup() and mtree_dup()")
4f2267b58a22 ("maple_tree: add mt_free_one() and mt_attr() helpers")
a8091f039c1e ("maple_tree: add MAS_UNDERFLOW and MAS_OVERFLOW states")
5c590804b6b0 ("maple_tree: add mas_is_active() to detect in-tree walks")
530f745c7620 ("maple_tree: replace data before marking dead in split and spanning store")
4ffc2ee2cf01 ("maple_tree: introduce mas_tree_parent() definition")
1238f6a226dc ("maple_tree: introduce mas_put_in_tree()")
72bcf4aa86ec ("maple_tree: reorder replacement of nodes to avoid live lock")
fec29364348f ("maple_tree: reduce resets during store setup")
da0892547b10 ("maple_tree: re-introduce entry to mas_preallocate() arguments")
53bee98d004f ("mm: remove re-walk from mmap_region()")
c1297987cc2a ("maple_tree: introduce __mas_set_range()")
a489539e33c2 ("maple_tree: update mt_validate()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f806de88d8f7f8191afd0fd9b94db4cd058e7d4f Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett(a)Oracle.com>
Date: Tue, 20 Aug 2024 13:54:17 -0400
Subject: [PATCH] maple_tree: remove rcu_read_lock() from mt_validate()
The write lock should be held when validating the tree to avoid updates
racing with checks. Holding the rcu read lock during a large tree
validation may also cause a prolonged rcu read window and "rcu_preempt
detected stalls" warnings.
Link: https://lore.kernel.org/all/0000000000001d12d4062005aea1@google.com/
Link: https://lkml.kernel.org/r/20240820175417.2782532-1-Liam.Howlett@oracle.com
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Reported-by: syzbot+036af2f0c7338a33b0cd(a)syzkaller.appspotmail.com
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: "Paul E. McKenney" <paulmck(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index aa3a5df15b8e..6df3a8b95808 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -7566,14 +7566,14 @@ static void mt_validate_nulls(struct maple_tree *mt)
* 2. The gap is correctly set in the parents
*/
void mt_validate(struct maple_tree *mt)
+ __must_hold(mas->tree->ma_lock)
{
unsigned char end;
MA_STATE(mas, mt, 0, 0);
- rcu_read_lock();
mas_start(&mas);
if (!mas_is_active(&mas))
- goto done;
+ return;
while (!mte_is_leaf(mas.node))
mas_descend(&mas);
@@ -7594,9 +7594,6 @@ void mt_validate(struct maple_tree *mt)
mas_dfs_postorder(&mas, ULONG_MAX);
}
mt_validate_nulls(mt);
-done:
- rcu_read_unlock();
-
}
EXPORT_SYMBOL_GPL(mt_validate);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x f806de88d8f7f8191afd0fd9b94db4cd058e7d4f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090850-skirt-unsaved-fc27@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
f806de88d8f7 ("maple_tree: remove rcu_read_lock() from mt_validate()")
9a40d45c1f2c ("maple_tree: remove mas_searchable()")
067311d33e65 ("maple_tree: separate ma_state node from status")
271f61a8b41d ("maple_tree: clean up inlines for some functions")
bf857ddd21d0 ("maple_tree: move debug check to __mas_set_range()")
f7a590189539 ("maple_tree: make mas_erase() more robust")
a2587a7e8d37 ("maple_tree: add test for mtree_dup()")
fd32e4e9b764 ("maple_tree: introduce interfaces __mt_dup() and mtree_dup()")
4f2267b58a22 ("maple_tree: add mt_free_one() and mt_attr() helpers")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f806de88d8f7f8191afd0fd9b94db4cd058e7d4f Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett(a)Oracle.com>
Date: Tue, 20 Aug 2024 13:54:17 -0400
Subject: [PATCH] maple_tree: remove rcu_read_lock() from mt_validate()
The write lock should be held when validating the tree to avoid updates
racing with checks. Holding the rcu read lock during a large tree
validation may also cause a prolonged rcu read window and "rcu_preempt
detected stalls" warnings.
Link: https://lore.kernel.org/all/0000000000001d12d4062005aea1@google.com/
Link: https://lkml.kernel.org/r/20240820175417.2782532-1-Liam.Howlett@oracle.com
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Reported-by: syzbot+036af2f0c7338a33b0cd(a)syzkaller.appspotmail.com
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: "Paul E. McKenney" <paulmck(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index aa3a5df15b8e..6df3a8b95808 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -7566,14 +7566,14 @@ static void mt_validate_nulls(struct maple_tree *mt)
* 2. The gap is correctly set in the parents
*/
void mt_validate(struct maple_tree *mt)
+ __must_hold(mas->tree->ma_lock)
{
unsigned char end;
MA_STATE(mas, mt, 0, 0);
- rcu_read_lock();
mas_start(&mas);
if (!mas_is_active(&mas))
- goto done;
+ return;
while (!mte_is_leaf(mas.node))
mas_descend(&mas);
@@ -7594,9 +7594,6 @@ void mt_validate(struct maple_tree *mt)
mas_dfs_postorder(&mas, ULONG_MAX);
}
mt_validate_nulls(mt);
-done:
- rcu_read_unlock();
-
}
EXPORT_SYMBOL_GPL(mt_validate);
Hi, Greg
Could you please help to cherry-pick the following commit to the 5.4.y branch?
f1edb498bd9f ("clk: hi6220: use CLK_OF_DECLARE_DRIVER")
It's been there since the 5.10 kernel, and this along with the reset
controller patch
are needed for Hikey devices to work with 5.4.y kernels, otherwise it
will get stuck
during the boot.
--
Best Regards,
Yongqin Liu
---------------------------------------------------------------
#mailing list
linaro-android(a)lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-android
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 3002240d16494d798add0575e8ba1f284258ab34
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090804-leverage-floral-9b6b@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
3002240d1649 ("fuse: fix memory leak in fuse_create_open")
15d937d7ca8c ("fuse: add request extension")
153524053bbb ("fuse: allow non-extending parallel direct writes on the same file")
4f8d37020e1f ("fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3002240d16494d798add0575e8ba1f284258ab34 Mon Sep 17 00:00:00 2001
From: yangyun <yangyun50(a)huawei.com>
Date: Fri, 23 Aug 2024 16:51:46 +0800
Subject: [PATCH] fuse: fix memory leak in fuse_create_open
The memory of struct fuse_file is allocated but not freed
when get_create_ext return error.
Fixes: 3e2b6fdbdc9a ("fuse: send security context of inode on file")
Cc: stable(a)vger.kernel.org # v5.17
Signed-off-by: yangyun <yangyun50(a)huawei.com>
Signed-off-by: Miklos Szeredi <mszeredi(a)redhat.com>
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2b0d4781f394..8e96df9fd76c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -670,7 +670,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
err = get_create_ext(&args, dir, entry, mode);
if (err)
- goto out_put_forget_req;
+ goto out_free_ff;
err = fuse_simple_request(fm, &args);
free_ext_value(&args);
The patch below does not apply to the 6.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.10.y
git checkout FETCH_HEAD
git cherry-pick -x 9ec87c5957ea9bf68d36f5e098605b585b2571e4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090809-portion-riptide-f7d9@gregkh' --subject-prefix 'PATCH 6.10.y' HEAD^..
Possible dependencies:
9ec87c5957ea ("OPP: Fix support for required OPPs for multiple PM domains")
0d865221c8b1 ("OPP: Drop a redundant in-parameter to _set_opp_level()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9ec87c5957ea9bf68d36f5e098605b585b2571e4 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson(a)linaro.org>
Date: Fri, 23 Aug 2024 00:45:38 +0200
Subject: [PATCH] OPP: Fix support for required OPPs for multiple PM domains
It has turned out that having _set_required_opps() to recursively call
dev_pm_opp_set_opp() to set the required OPPs, doesn't really work as well
as we expected.
More precisely, at each recursive call to dev_pm_opp_set_opp() we are
changing an OPP for a required_dev that belongs to a required-OPP table.
The problem with this, is that we may have several devices sharing the same
required-OPP table, which leads to an incorrect behaviour in regards to
aggregating the per device votes.
To fix the problem for a required-OPP table belonging to a PM domain, which
is the only existing usecase for now, let's simply replace the call to
dev_pm_opp_set_opp() in _set_required_opps() by a call to _set_opp_level().
Moving forward we may potentially need to add support for other types of
required-OPP tables. In this case, the aggregation needs to be thought of.
Fixes: e37440e7e2c2 ("OPP: Call dev_pm_opp_set_opp() for required OPPs")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
Acked-by: Viresh Kumar <viresh.kumar(a)linaro.org>
Link: https://lore.kernel.org/r/20240822224547.385095-2-ulf.hansson@linaro.org
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 5f4598246a87..494f8860220d 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1061,6 +1061,27 @@ static int _set_opp_bw(const struct opp_table *opp_table,
return 0;
}
+static int _set_opp_level(struct device *dev, struct dev_pm_opp *opp)
+{
+ unsigned int level = 0;
+ int ret = 0;
+
+ if (opp) {
+ if (opp->level == OPP_LEVEL_UNSET)
+ return 0;
+
+ level = opp->level;
+ }
+
+ /* Request a new performance state through the device's PM domain. */
+ ret = dev_pm_domain_set_performance_state(dev, level);
+ if (ret)
+ dev_err(dev, "Failed to set performance state %u (%d)\n", level,
+ ret);
+
+ return ret;
+}
+
/* This is only called for PM domain for now */
static int _set_required_opps(struct device *dev, struct opp_table *opp_table,
struct dev_pm_opp *opp, bool up)
@@ -1091,7 +1112,7 @@ static int _set_required_opps(struct device *dev, struct opp_table *opp_table,
if (devs[index]) {
required_opp = opp ? opp->required_opps[index] : NULL;
- ret = dev_pm_opp_set_opp(devs[index], required_opp);
+ ret = _set_opp_level(devs[index], required_opp);
if (ret)
return ret;
}
@@ -1102,27 +1123,6 @@ static int _set_required_opps(struct device *dev, struct opp_table *opp_table,
return 0;
}
-static int _set_opp_level(struct device *dev, struct dev_pm_opp *opp)
-{
- unsigned int level = 0;
- int ret = 0;
-
- if (opp) {
- if (opp->level == OPP_LEVEL_UNSET)
- return 0;
-
- level = opp->level;
- }
-
- /* Request a new performance state through the device's PM domain. */
- ret = dev_pm_domain_set_performance_state(dev, level);
- if (ret)
- dev_err(dev, "Failed to set performance state %u (%d)\n", level,
- ret);
-
- return ret;
-}
-
static void _find_current_opp(struct device *dev, struct opp_table *opp_table)
{
struct dev_pm_opp *opp = ERR_PTR(-ENODEV);
@@ -2457,18 +2457,6 @@ static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev,
}
}
- /*
- * Add the virtual genpd device as a user of the OPP table, so
- * we can call dev_pm_opp_set_opp() on it directly.
- *
- * This will be automatically removed when the OPP table is
- * removed, don't need to handle that here.
- */
- if (!_add_opp_dev(virt_dev, opp_table->required_opp_tables[index])) {
- ret = -ENOMEM;
- goto err;
- }
-
opp_table->required_devs[index] = virt_dev;
index++;
name++;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x aea62c744a9ae2a8247c54ec42138405216414da
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090853-flogging-deepen-eabe@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
aea62c744a9a ("mmc: cqhci: Fix checking of CQHCI_HALT state")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aea62c744a9ae2a8247c54ec42138405216414da Mon Sep 17 00:00:00 2001
From: Seunghwan Baek <sh8267.baek(a)samsung.com>
Date: Thu, 29 Aug 2024 15:18:22 +0900
Subject: [PATCH] mmc: cqhci: Fix checking of CQHCI_HALT state
To check if mmc cqe is in halt state, need to check set/clear of CQHCI_HALT
bit. At this time, we need to check with &, not &&.
Fixes: a4080225f51d ("mmc: cqhci: support for command queue enabled host")
Cc: stable(a)vger.kernel.org
Signed-off-by: Seunghwan Baek <sh8267.baek(a)samsung.com>
Reviewed-by: Ritesh Harjani <ritesh.list(a)gmail.com>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Link: https://lore.kernel.org/r/20240829061823.3718-2-sh8267.baek@samsung.com
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c
index c14d7251d0bb..a02da26a1efd 100644
--- a/drivers/mmc/host/cqhci-core.c
+++ b/drivers/mmc/host/cqhci-core.c
@@ -617,7 +617,7 @@ static int cqhci_request(struct mmc_host *mmc, struct mmc_request *mrq)
cqhci_writel(cq_host, 0, CQHCI_CTL);
mmc->cqe_on = true;
pr_debug("%s: cqhci: CQE on\n", mmc_hostname(mmc));
- if (cqhci_readl(cq_host, CQHCI_CTL) && CQHCI_HALT) {
+ if (cqhci_readl(cq_host, CQHCI_CTL) & CQHCI_HALT) {
pr_err("%s: cqhci: CQE failed to exit halt state\n",
mmc_hostname(mmc));
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x aea62c744a9ae2a8247c54ec42138405216414da
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090853-unweave-borrowing-7c2a@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
aea62c744a9a ("mmc: cqhci: Fix checking of CQHCI_HALT state")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aea62c744a9ae2a8247c54ec42138405216414da Mon Sep 17 00:00:00 2001
From: Seunghwan Baek <sh8267.baek(a)samsung.com>
Date: Thu, 29 Aug 2024 15:18:22 +0900
Subject: [PATCH] mmc: cqhci: Fix checking of CQHCI_HALT state
To check if mmc cqe is in halt state, need to check set/clear of CQHCI_HALT
bit. At this time, we need to check with &, not &&.
Fixes: a4080225f51d ("mmc: cqhci: support for command queue enabled host")
Cc: stable(a)vger.kernel.org
Signed-off-by: Seunghwan Baek <sh8267.baek(a)samsung.com>
Reviewed-by: Ritesh Harjani <ritesh.list(a)gmail.com>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Link: https://lore.kernel.org/r/20240829061823.3718-2-sh8267.baek@samsung.com
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/mmc/host/cqhci-core.c b/drivers/mmc/host/cqhci-core.c
index c14d7251d0bb..a02da26a1efd 100644
--- a/drivers/mmc/host/cqhci-core.c
+++ b/drivers/mmc/host/cqhci-core.c
@@ -617,7 +617,7 @@ static int cqhci_request(struct mmc_host *mmc, struct mmc_request *mrq)
cqhci_writel(cq_host, 0, CQHCI_CTL);
mmc->cqe_on = true;
pr_debug("%s: cqhci: CQE on\n", mmc_hostname(mmc));
- if (cqhci_readl(cq_host, CQHCI_CTL) && CQHCI_HALT) {
+ if (cqhci_readl(cq_host, CQHCI_CTL) & CQHCI_HALT) {
pr_err("%s: cqhci: CQE failed to exit halt state\n",
mmc_hostname(mmc));
}
This is a note to let you know that I've just added the patch titled
iio: magnetometer: ak8975: Fix reading for ak099xx sensors
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-next branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will also be merged in the next major kernel release
during the merge window.
If you have any questions about this process, please let me know.
From 129464e86c7445a858b790ac2d28d35f58256bbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Barnab=C3=A1s=20Cz=C3=A9m=C3=A1n?=
<barnabas.czeman(a)mainlining.org>
Date: Mon, 19 Aug 2024 00:29:40 +0200
Subject: iio: magnetometer: ak8975: Fix reading for ak099xx sensors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Move ST2 reading with overflow handling after measurement data
reading.
ST2 register read have to be read after read measurment data,
because it means end of the reading and realease the lock on the data.
Remove ST2 read skip on interrupt based waiting because ST2 required to
be read out at and of the axis read.
Fixes: 57e73a423b1e ("iio: ak8975: add ak09911 and ak09912 support")
Signed-off-by: Barnabás Czémán <barnabas.czeman(a)mainlining.org>
Link: https://patch.msgid.link/20240819-ak09918-v4-2-f0734d14cfb9@mainlining.org
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/magnetometer/ak8975.c | 32 +++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/drivers/iio/magnetometer/ak8975.c b/drivers/iio/magnetometer/ak8975.c
index 67d5d1f2402f..1d5f79b7005b 100644
--- a/drivers/iio/magnetometer/ak8975.c
+++ b/drivers/iio/magnetometer/ak8975.c
@@ -696,22 +696,8 @@ static int ak8975_start_read_axis(struct ak8975_data *data,
if (ret < 0)
return ret;
- /* This will be executed only for non-interrupt based waiting case */
- if (ret & data->def->ctrl_masks[ST1_DRDY]) {
- ret = i2c_smbus_read_byte_data(client,
- data->def->ctrl_regs[ST2]);
- if (ret < 0) {
- dev_err(&client->dev, "Error in reading ST2\n");
- return ret;
- }
- if (ret & (data->def->ctrl_masks[ST2_DERR] |
- data->def->ctrl_masks[ST2_HOFL])) {
- dev_err(&client->dev, "ST2 status error 0x%x\n", ret);
- return -EINVAL;
- }
- }
-
- return 0;
+ /* Return with zero if the data is ready. */
+ return !data->def->ctrl_regs[ST1_DRDY];
}
/* Retrieve raw flux value for one of the x, y, or z axis. */
@@ -738,6 +724,20 @@ static int ak8975_read_axis(struct iio_dev *indio_dev, int index, int *val)
if (ret < 0)
goto exit;
+ /* Read out ST2 for release lock on measurment data. */
+ ret = i2c_smbus_read_byte_data(client, data->def->ctrl_regs[ST2]);
+ if (ret < 0) {
+ dev_err(&client->dev, "Error in reading ST2\n");
+ goto exit;
+ }
+
+ if (ret & (data->def->ctrl_masks[ST2_DERR] |
+ data->def->ctrl_masks[ST2_HOFL])) {
+ dev_err(&client->dev, "ST2 status error 0x%x\n", ret);
+ ret = -EINVAL;
+ goto exit;
+ }
+
mutex_unlock(&data->lock);
pm_runtime_mark_last_busy(&data->client->dev);
--
2.46.0
This is a note to let you know that I've just added the patch titled
iio: magnetometer: ak8975: Fix reading for ak099xx sensors
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-testing branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will be merged to the char-misc-next branch sometime soon,
after it passes testing, and the merge window is open.
If you have any questions about this process, please let me know.
From 129464e86c7445a858b790ac2d28d35f58256bbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Barnab=C3=A1s=20Cz=C3=A9m=C3=A1n?=
<barnabas.czeman(a)mainlining.org>
Date: Mon, 19 Aug 2024 00:29:40 +0200
Subject: iio: magnetometer: ak8975: Fix reading for ak099xx sensors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Move ST2 reading with overflow handling after measurement data
reading.
ST2 register read have to be read after read measurment data,
because it means end of the reading and realease the lock on the data.
Remove ST2 read skip on interrupt based waiting because ST2 required to
be read out at and of the axis read.
Fixes: 57e73a423b1e ("iio: ak8975: add ak09911 and ak09912 support")
Signed-off-by: Barnabás Czémán <barnabas.czeman(a)mainlining.org>
Link: https://patch.msgid.link/20240819-ak09918-v4-2-f0734d14cfb9@mainlining.org
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/magnetometer/ak8975.c | 32 +++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/drivers/iio/magnetometer/ak8975.c b/drivers/iio/magnetometer/ak8975.c
index 67d5d1f2402f..1d5f79b7005b 100644
--- a/drivers/iio/magnetometer/ak8975.c
+++ b/drivers/iio/magnetometer/ak8975.c
@@ -696,22 +696,8 @@ static int ak8975_start_read_axis(struct ak8975_data *data,
if (ret < 0)
return ret;
- /* This will be executed only for non-interrupt based waiting case */
- if (ret & data->def->ctrl_masks[ST1_DRDY]) {
- ret = i2c_smbus_read_byte_data(client,
- data->def->ctrl_regs[ST2]);
- if (ret < 0) {
- dev_err(&client->dev, "Error in reading ST2\n");
- return ret;
- }
- if (ret & (data->def->ctrl_masks[ST2_DERR] |
- data->def->ctrl_masks[ST2_HOFL])) {
- dev_err(&client->dev, "ST2 status error 0x%x\n", ret);
- return -EINVAL;
- }
- }
-
- return 0;
+ /* Return with zero if the data is ready. */
+ return !data->def->ctrl_regs[ST1_DRDY];
}
/* Retrieve raw flux value for one of the x, y, or z axis. */
@@ -738,6 +724,20 @@ static int ak8975_read_axis(struct iio_dev *indio_dev, int index, int *val)
if (ret < 0)
goto exit;
+ /* Read out ST2 for release lock on measurment data. */
+ ret = i2c_smbus_read_byte_data(client, data->def->ctrl_regs[ST2]);
+ if (ret < 0) {
+ dev_err(&client->dev, "Error in reading ST2\n");
+ goto exit;
+ }
+
+ if (ret & (data->def->ctrl_masks[ST2_DERR] |
+ data->def->ctrl_masks[ST2_HOFL])) {
+ dev_err(&client->dev, "ST2 status error 0x%x\n", ret);
+ ret = -EINVAL;
+ goto exit;
+ }
+
mutex_unlock(&data->lock);
pm_runtime_mark_last_busy(&data->client->dev);
--
2.46.0
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 4bcdd831d9d01e0fb64faea50732b59b2ee88da1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090850-handrail-battering-849f@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
4bcdd831d9d0 ("KVM: x86: Acquire kvm->srcu when handling KVM_SET_VCPU_EVENTS")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4bcdd831d9d01e0fb64faea50732b59b2ee88da1 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Tue, 23 Jul 2024 16:20:55 -0700
Subject: [PATCH] KVM: x86: Acquire kvm->srcu when handling KVM_SET_VCPU_EVENTS
Grab kvm->srcu when processing KVM_SET_VCPU_EVENTS, as KVM will forcibly
leave nested VMX/SVM if SMM mode is being toggled, and leaving nested VMX
reads guest memory.
Note, kvm_vcpu_ioctl_x86_set_vcpu_events() can also be called from KVM_RUN
via sync_regs(), which already holds SRCU. I.e. trying to precisely use
kvm_vcpu_srcu_read_lock() around the problematic SMM code would cause
problems. Acquiring SRCU isn't all that expensive, so for simplicity,
grab it unconditionally for KVM_SET_VCPU_EVENTS.
=============================
WARNING: suspicious RCU usage
6.10.0-rc7-332d2c1d713e-next-vm #552 Not tainted
-----------------------------
include/linux/kvm_host.h:1027 suspicious rcu_dereference_check() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
1 lock held by repro/1071:
#0: ffff88811e424430 (&vcpu->mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x7d/0x970 [kvm]
stack backtrace:
CPU: 15 PID: 1071 Comm: repro Not tainted 6.10.0-rc7-332d2c1d713e-next-vm #552
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
Call Trace:
<TASK>
dump_stack_lvl+0x7f/0x90
lockdep_rcu_suspicious+0x13f/0x1a0
kvm_vcpu_gfn_to_memslot+0x168/0x190 [kvm]
kvm_vcpu_read_guest+0x3e/0x90 [kvm]
nested_vmx_load_msr+0x6b/0x1d0 [kvm_intel]
load_vmcs12_host_state+0x432/0xb40 [kvm_intel]
vmx_leave_nested+0x30/0x40 [kvm_intel]
kvm_vcpu_ioctl_x86_set_vcpu_events+0x15d/0x2b0 [kvm]
kvm_arch_vcpu_ioctl+0x1107/0x1750 [kvm]
? mark_held_locks+0x49/0x70
? kvm_vcpu_ioctl+0x7d/0x970 [kvm]
? kvm_vcpu_ioctl+0x497/0x970 [kvm]
kvm_vcpu_ioctl+0x497/0x970 [kvm]
? lock_acquire+0xba/0x2d0
? find_held_lock+0x2b/0x80
? do_user_addr_fault+0x40c/0x6f0
? lock_release+0xb7/0x270
__x64_sys_ioctl+0x82/0xb0
do_syscall_64+0x6c/0x170
entry_SYSCALL_64_after_hwframe+0x4b/0x53
RIP: 0033:0x7ff11eb1b539
</TASK>
Fixes: f7e570780efc ("KVM: x86: Forcibly leave nested virt when SMM state is toggled")
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240723232055.3643811-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 70219e406987..2c7327ef0f0d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6040,7 +6040,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
break;
+ kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ kvm_vcpu_srcu_read_unlock(vcpu);
break;
}
case KVM_GET_DEBUGREGS: {
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 4bcdd831d9d01e0fb64faea50732b59b2ee88da1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090849-facsimile-hamstring-a776@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
4bcdd831d9d0 ("KVM: x86: Acquire kvm->srcu when handling KVM_SET_VCPU_EVENTS")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4bcdd831d9d01e0fb64faea50732b59b2ee88da1 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Tue, 23 Jul 2024 16:20:55 -0700
Subject: [PATCH] KVM: x86: Acquire kvm->srcu when handling KVM_SET_VCPU_EVENTS
Grab kvm->srcu when processing KVM_SET_VCPU_EVENTS, as KVM will forcibly
leave nested VMX/SVM if SMM mode is being toggled, and leaving nested VMX
reads guest memory.
Note, kvm_vcpu_ioctl_x86_set_vcpu_events() can also be called from KVM_RUN
via sync_regs(), which already holds SRCU. I.e. trying to precisely use
kvm_vcpu_srcu_read_lock() around the problematic SMM code would cause
problems. Acquiring SRCU isn't all that expensive, so for simplicity,
grab it unconditionally for KVM_SET_VCPU_EVENTS.
=============================
WARNING: suspicious RCU usage
6.10.0-rc7-332d2c1d713e-next-vm #552 Not tainted
-----------------------------
include/linux/kvm_host.h:1027 suspicious rcu_dereference_check() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
1 lock held by repro/1071:
#0: ffff88811e424430 (&vcpu->mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x7d/0x970 [kvm]
stack backtrace:
CPU: 15 PID: 1071 Comm: repro Not tainted 6.10.0-rc7-332d2c1d713e-next-vm #552
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
Call Trace:
<TASK>
dump_stack_lvl+0x7f/0x90
lockdep_rcu_suspicious+0x13f/0x1a0
kvm_vcpu_gfn_to_memslot+0x168/0x190 [kvm]
kvm_vcpu_read_guest+0x3e/0x90 [kvm]
nested_vmx_load_msr+0x6b/0x1d0 [kvm_intel]
load_vmcs12_host_state+0x432/0xb40 [kvm_intel]
vmx_leave_nested+0x30/0x40 [kvm_intel]
kvm_vcpu_ioctl_x86_set_vcpu_events+0x15d/0x2b0 [kvm]
kvm_arch_vcpu_ioctl+0x1107/0x1750 [kvm]
? mark_held_locks+0x49/0x70
? kvm_vcpu_ioctl+0x7d/0x970 [kvm]
? kvm_vcpu_ioctl+0x497/0x970 [kvm]
kvm_vcpu_ioctl+0x497/0x970 [kvm]
? lock_acquire+0xba/0x2d0
? find_held_lock+0x2b/0x80
? do_user_addr_fault+0x40c/0x6f0
? lock_release+0xb7/0x270
__x64_sys_ioctl+0x82/0xb0
do_syscall_64+0x6c/0x170
entry_SYSCALL_64_after_hwframe+0x4b/0x53
RIP: 0033:0x7ff11eb1b539
</TASK>
Fixes: f7e570780efc ("KVM: x86: Forcibly leave nested virt when SMM state is toggled")
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240723232055.3643811-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 70219e406987..2c7327ef0f0d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6040,7 +6040,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
break;
+ kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ kvm_vcpu_srcu_read_unlock(vcpu);
break;
}
case KVM_GET_DEBUGREGS: {
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x d33d26036a0274b472299d7dcdaa5fb34329f91b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090849-android-shield-d1fd@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
d33d26036a02 ("rtmutex: Drop rt_mutex::wait_lock before scheduling")
add461325ec5 ("locking/rtmutex: Extend the rtmutex core to support ww_mutex")
1c143c4b65da ("locking/rtmutex: Provide the spin/rwlock core lock function")
e17ba59b7e8e ("locking/rtmutex: Guard regular sleeping locks specific functions")
7980aa397cc0 ("locking/rtmutex: Use rt_mutex_wake_q_head")
c014ef69b3ac ("locking/rtmutex: Add wake_state to rt_mutex_waiter")
42254105dfe8 ("locking/rwsem: Add rtmutex based R/W semaphore implementation")
ebbdc41e90ff ("locking/rtmutex: Provide rt_mutex_slowlock_locked()")
830e6acc8a1c ("locking/rtmutex: Split out the inner parts of 'struct rtmutex'")
531ae4b06a73 ("locking/rtmutex: Split API from implementation")
785159301bed ("locking/rtmutex: Convert macros to inlines")
b41cda037655 ("locking/rtmutex: Set proper wait context for lockdep")
2f064a59a11f ("sched: Change task_struct::state")
d6c23bb3a2ad ("sched: Add get_current_state()")
b03fbd4ff24c ("sched: Introduce task_is_running()")
a9e906b71f96 ("Merge branch 'sched/urgent' into sched/core, to pick up fixes")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d33d26036a0274b472299d7dcdaa5fb34329f91b Mon Sep 17 00:00:00 2001
From: Roland Xu <mu001999(a)outlook.com>
Date: Thu, 15 Aug 2024 10:58:13 +0800
Subject: [PATCH] rtmutex: Drop rt_mutex::wait_lock before scheduling
rt_mutex_handle_deadlock() is called with rt_mutex::wait_lock held. In the
good case it returns with the lock held and in the deadlock case it emits a
warning and goes into an endless scheduling loop with the lock held, which
triggers the 'scheduling in atomic' warning.
Unlock rt_mutex::wait_lock in the dead lock case before issuing the warning
and dropping into the schedule for ever loop.
[ tglx: Moved unlock before the WARN(), removed the pointless comment,
massaged changelog, added Fixes tag ]
Fixes: 3d5c9340d194 ("rtmutex: Handle deadlock detection smarter")
Signed-off-by: Roland Xu <mu001999(a)outlook.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/ME0P300MB063599BEF0743B8FA339C2CECC802@ME0P300M…
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 88d08eeb8bc0..fba1229f1de6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1644,6 +1644,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
}
static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_base *lock,
struct rt_mutex_waiter *w)
{
/*
@@ -1656,10 +1657,10 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
if (build_ww_mutex() && w->ww_ctx)
return;
- /*
- * Yell loudly and stop the task right here.
- */
+ raw_spin_unlock_irq(&lock->wait_lock);
+
WARN(1, "rtmutex deadlock detected\n");
+
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
rt_mutex_schedule();
@@ -1713,7 +1714,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
} else {
__set_current_state(TASK_RUNNING);
remove_waiter(lock, waiter);
- rt_mutex_handle_deadlock(ret, chwalk, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
}
/*
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x d33d26036a0274b472299d7dcdaa5fb34329f91b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090848-dragster-ahoy-c47e@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
d33d26036a02 ("rtmutex: Drop rt_mutex::wait_lock before scheduling")
add461325ec5 ("locking/rtmutex: Extend the rtmutex core to support ww_mutex")
1c143c4b65da ("locking/rtmutex: Provide the spin/rwlock core lock function")
e17ba59b7e8e ("locking/rtmutex: Guard regular sleeping locks specific functions")
7980aa397cc0 ("locking/rtmutex: Use rt_mutex_wake_q_head")
c014ef69b3ac ("locking/rtmutex: Add wake_state to rt_mutex_waiter")
42254105dfe8 ("locking/rwsem: Add rtmutex based R/W semaphore implementation")
ebbdc41e90ff ("locking/rtmutex: Provide rt_mutex_slowlock_locked()")
830e6acc8a1c ("locking/rtmutex: Split out the inner parts of 'struct rtmutex'")
531ae4b06a73 ("locking/rtmutex: Split API from implementation")
785159301bed ("locking/rtmutex: Convert macros to inlines")
b41cda037655 ("locking/rtmutex: Set proper wait context for lockdep")
2f064a59a11f ("sched: Change task_struct::state")
d6c23bb3a2ad ("sched: Add get_current_state()")
b03fbd4ff24c ("sched: Introduce task_is_running()")
a9e906b71f96 ("Merge branch 'sched/urgent' into sched/core, to pick up fixes")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d33d26036a0274b472299d7dcdaa5fb34329f91b Mon Sep 17 00:00:00 2001
From: Roland Xu <mu001999(a)outlook.com>
Date: Thu, 15 Aug 2024 10:58:13 +0800
Subject: [PATCH] rtmutex: Drop rt_mutex::wait_lock before scheduling
rt_mutex_handle_deadlock() is called with rt_mutex::wait_lock held. In the
good case it returns with the lock held and in the deadlock case it emits a
warning and goes into an endless scheduling loop with the lock held, which
triggers the 'scheduling in atomic' warning.
Unlock rt_mutex::wait_lock in the dead lock case before issuing the warning
and dropping into the schedule for ever loop.
[ tglx: Moved unlock before the WARN(), removed the pointless comment,
massaged changelog, added Fixes tag ]
Fixes: 3d5c9340d194 ("rtmutex: Handle deadlock detection smarter")
Signed-off-by: Roland Xu <mu001999(a)outlook.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/ME0P300MB063599BEF0743B8FA339C2CECC802@ME0P300M…
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 88d08eeb8bc0..fba1229f1de6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1644,6 +1644,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
}
static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_base *lock,
struct rt_mutex_waiter *w)
{
/*
@@ -1656,10 +1657,10 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
if (build_ww_mutex() && w->ww_ctx)
return;
- /*
- * Yell loudly and stop the task right here.
- */
+ raw_spin_unlock_irq(&lock->wait_lock);
+
WARN(1, "rtmutex deadlock detected\n");
+
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
rt_mutex_schedule();
@@ -1713,7 +1714,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
} else {
__set_current_state(TASK_RUNNING);
remove_waiter(lock, waiter);
- rt_mutex_handle_deadlock(ret, chwalk, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
}
/*
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x ea72ce5da22806d5713f3ffb39a6d5ae73841f93
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090830-cough-rewrite-bcc9@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
ea72ce5da228 ("x86/kaslr: Expose and use the end of the physical memory address space")
1a167ddd3c56 ("x86: kmsan: pgtable: reduce vmalloc space")
14b80582c43e ("resource: Introduce alloc_free_mem_region()")
27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount")
dc90f0846df4 ("mm: don't include <linux/memremap.h> in <linux/mm.h>")
895749455f60 ("mm: simplify freeing of devmap managed pages")
75e55d8a107e ("mm: move free_devmap_managed_page to memremap.c")
730ff52194cd ("mm: remove pointless includes from <linux/hmm.h>")
f56caedaf94f ("Merge branch 'akpm' (patches from Andrew)")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea72ce5da22806d5713f3ffb39a6d5ae73841f93 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Wed, 14 Aug 2024 00:29:36 +0200
Subject: [PATCH] x86/kaslr: Expose and use the end of the physical memory
address space
iounmap() on x86 occasionally fails to unmap because the provided valid
ioremap address is not below high_memory. It turned out that this
happens due to KASLR.
KASLR uses the full address space between PAGE_OFFSET and vaddr_end to
randomize the starting points of the direct map, vmalloc and vmemmap
regions. It thereby limits the size of the direct map by using the
installed memory size plus an extra configurable margin for hot-plug
memory. This limitation is done to gain more randomization space
because otherwise only the holes between the direct map, vmalloc,
vmemmap and vaddr_end would be usable for randomizing.
The limited direct map size is not exposed to the rest of the kernel, so
the memory hot-plug and resource management related code paths still
operate under the assumption that the available address space can be
determined with MAX_PHYSMEM_BITS.
request_free_mem_region() allocates from (1 << MAX_PHYSMEM_BITS) - 1
downwards. That means the first allocation happens past the end of the
direct map and if unlucky this address is in the vmalloc space, which
causes high_memory to become greater than VMALLOC_START and consequently
causes iounmap() to fail for valid ioremap addresses.
MAX_PHYSMEM_BITS cannot be changed for that because the randomization
does not align with address bit boundaries and there are other places
which actually require to know the maximum number of address bits. All
remaining usage sites of MAX_PHYSMEM_BITS have been analyzed and found
to be correct.
Cure this by exposing the end of the direct map via PHYSMEM_END and use
that for the memory hot-plug and resource management related places
instead of relying on MAX_PHYSMEM_BITS. In the KASLR case PHYSMEM_END
maps to a variable which is initialized by the KASLR initialization and
otherwise it is based on MAX_PHYSMEM_BITS as before.
To prevent future hickups add a check into add_pages() to catch callers
trying to add memory above PHYSMEM_END.
Fixes: 0483e1fa6e09 ("x86/mm: Implement ASLR for kernel memory regions")
Reported-by: Max Ramanouski <max8rr8(a)gmail.com>
Reported-by: Alistair Popple <apopple(a)nvidia.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-By: Max Ramanouski <max8rr8(a)gmail.com>
Tested-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Kees Cook <kees(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/87ed6soy3z.ffs@tglx
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index af4302d79b59..f3d257c45225 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,6 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
+extern unsigned long physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 9053dfe9fa03..a98e53491a4e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define PHYSMEM_END physmem_end
+#endif
+
/*
* End of the region for which vmalloc page tables are pre-allocated.
* For non-KMSAN builds, this is the same as VMALLOC_END.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d8dbeac8b206..ff253648706f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..230f1dee4f09 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/* The end of the possible address space for physical memory */
+unsigned long physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void)
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
- /*
- * Jump the region and add a minimum padding based on
- * randomization alignment.
- */
+ /* Calculate the end of the region */
vaddr += get_padding(&kaslr_regions[i]);
+ /*
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the physmem_end boundary.
+ * No rounding required as the region starts
+ * PUD aligned and size is in units of TB.
+ */
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
+
+ /* Add a minimum padding based on randomization alignment. */
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4b238a20b76..b3864156eaa4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -97,6 +97,10 @@ extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif
+#ifndef PHYSMEM_END
+# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
+#endif
+
#include <asm/page.h>
#include <asm/processor.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 14777afb0a99..a83040fde236 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1826,8 +1826,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
if (flags & GFR_DESCENDING) {
resource_size_t end;
- end = min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ end = min_t(resource_size_t, base->end, PHYSMEM_END);
return end - size + 1;
}
@@ -1844,8 +1843,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr,
* @size did not wrap 0.
*/
return addr > addr - size &&
- addr <= min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
}
static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 66267c26ca1b..951878ab627a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ const u64 max_phys = PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
diff --git a/mm/sparse.c b/mm/sparse.c
index e4b830091d13..0c3bff882033 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section)
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
- unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+ unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
/*
* Sanity checks - do not allow an architecture to pass
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x ea72ce5da22806d5713f3ffb39a6d5ae73841f93
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090829-swizzle-angelfish-9047@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
ea72ce5da228 ("x86/kaslr: Expose and use the end of the physical memory address space")
1a167ddd3c56 ("x86: kmsan: pgtable: reduce vmalloc space")
14b80582c43e ("resource: Introduce alloc_free_mem_region()")
27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount")
dc90f0846df4 ("mm: don't include <linux/memremap.h> in <linux/mm.h>")
895749455f60 ("mm: simplify freeing of devmap managed pages")
75e55d8a107e ("mm: move free_devmap_managed_page to memremap.c")
730ff52194cd ("mm: remove pointless includes from <linux/hmm.h>")
f56caedaf94f ("Merge branch 'akpm' (patches from Andrew)")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea72ce5da22806d5713f3ffb39a6d5ae73841f93 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Wed, 14 Aug 2024 00:29:36 +0200
Subject: [PATCH] x86/kaslr: Expose and use the end of the physical memory
address space
iounmap() on x86 occasionally fails to unmap because the provided valid
ioremap address is not below high_memory. It turned out that this
happens due to KASLR.
KASLR uses the full address space between PAGE_OFFSET and vaddr_end to
randomize the starting points of the direct map, vmalloc and vmemmap
regions. It thereby limits the size of the direct map by using the
installed memory size plus an extra configurable margin for hot-plug
memory. This limitation is done to gain more randomization space
because otherwise only the holes between the direct map, vmalloc,
vmemmap and vaddr_end would be usable for randomizing.
The limited direct map size is not exposed to the rest of the kernel, so
the memory hot-plug and resource management related code paths still
operate under the assumption that the available address space can be
determined with MAX_PHYSMEM_BITS.
request_free_mem_region() allocates from (1 << MAX_PHYSMEM_BITS) - 1
downwards. That means the first allocation happens past the end of the
direct map and if unlucky this address is in the vmalloc space, which
causes high_memory to become greater than VMALLOC_START and consequently
causes iounmap() to fail for valid ioremap addresses.
MAX_PHYSMEM_BITS cannot be changed for that because the randomization
does not align with address bit boundaries and there are other places
which actually require to know the maximum number of address bits. All
remaining usage sites of MAX_PHYSMEM_BITS have been analyzed and found
to be correct.
Cure this by exposing the end of the direct map via PHYSMEM_END and use
that for the memory hot-plug and resource management related places
instead of relying on MAX_PHYSMEM_BITS. In the KASLR case PHYSMEM_END
maps to a variable which is initialized by the KASLR initialization and
otherwise it is based on MAX_PHYSMEM_BITS as before.
To prevent future hickups add a check into add_pages() to catch callers
trying to add memory above PHYSMEM_END.
Fixes: 0483e1fa6e09 ("x86/mm: Implement ASLR for kernel memory regions")
Reported-by: Max Ramanouski <max8rr8(a)gmail.com>
Reported-by: Alistair Popple <apopple(a)nvidia.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-By: Max Ramanouski <max8rr8(a)gmail.com>
Tested-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Kees Cook <kees(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/87ed6soy3z.ffs@tglx
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index af4302d79b59..f3d257c45225 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,6 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
+extern unsigned long physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 9053dfe9fa03..a98e53491a4e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define PHYSMEM_END physmem_end
+#endif
+
/*
* End of the region for which vmalloc page tables are pre-allocated.
* For non-KMSAN builds, this is the same as VMALLOC_END.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d8dbeac8b206..ff253648706f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..230f1dee4f09 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/* The end of the possible address space for physical memory */
+unsigned long physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void)
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
- /*
- * Jump the region and add a minimum padding based on
- * randomization alignment.
- */
+ /* Calculate the end of the region */
vaddr += get_padding(&kaslr_regions[i]);
+ /*
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the physmem_end boundary.
+ * No rounding required as the region starts
+ * PUD aligned and size is in units of TB.
+ */
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
+
+ /* Add a minimum padding based on randomization alignment. */
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4b238a20b76..b3864156eaa4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -97,6 +97,10 @@ extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif
+#ifndef PHYSMEM_END
+# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
+#endif
+
#include <asm/page.h>
#include <asm/processor.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 14777afb0a99..a83040fde236 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1826,8 +1826,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
if (flags & GFR_DESCENDING) {
resource_size_t end;
- end = min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ end = min_t(resource_size_t, base->end, PHYSMEM_END);
return end - size + 1;
}
@@ -1844,8 +1843,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr,
* @size did not wrap 0.
*/
return addr > addr - size &&
- addr <= min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
}
static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 66267c26ca1b..951878ab627a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ const u64 max_phys = PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
diff --git a/mm/sparse.c b/mm/sparse.c
index e4b830091d13..0c3bff882033 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section)
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
- unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+ unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
/*
* Sanity checks - do not allow an architecture to pass
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x ea72ce5da22806d5713f3ffb39a6d5ae73841f93
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090827-corporate-raider-afc5@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
ea72ce5da228 ("x86/kaslr: Expose and use the end of the physical memory address space")
1a167ddd3c56 ("x86: kmsan: pgtable: reduce vmalloc space")
14b80582c43e ("resource: Introduce alloc_free_mem_region()")
27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount")
dc90f0846df4 ("mm: don't include <linux/memremap.h> in <linux/mm.h>")
895749455f60 ("mm: simplify freeing of devmap managed pages")
75e55d8a107e ("mm: move free_devmap_managed_page to memremap.c")
730ff52194cd ("mm: remove pointless includes from <linux/hmm.h>")
f56caedaf94f ("Merge branch 'akpm' (patches from Andrew)")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea72ce5da22806d5713f3ffb39a6d5ae73841f93 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Wed, 14 Aug 2024 00:29:36 +0200
Subject: [PATCH] x86/kaslr: Expose and use the end of the physical memory
address space
iounmap() on x86 occasionally fails to unmap because the provided valid
ioremap address is not below high_memory. It turned out that this
happens due to KASLR.
KASLR uses the full address space between PAGE_OFFSET and vaddr_end to
randomize the starting points of the direct map, vmalloc and vmemmap
regions. It thereby limits the size of the direct map by using the
installed memory size plus an extra configurable margin for hot-plug
memory. This limitation is done to gain more randomization space
because otherwise only the holes between the direct map, vmalloc,
vmemmap and vaddr_end would be usable for randomizing.
The limited direct map size is not exposed to the rest of the kernel, so
the memory hot-plug and resource management related code paths still
operate under the assumption that the available address space can be
determined with MAX_PHYSMEM_BITS.
request_free_mem_region() allocates from (1 << MAX_PHYSMEM_BITS) - 1
downwards. That means the first allocation happens past the end of the
direct map and if unlucky this address is in the vmalloc space, which
causes high_memory to become greater than VMALLOC_START and consequently
causes iounmap() to fail for valid ioremap addresses.
MAX_PHYSMEM_BITS cannot be changed for that because the randomization
does not align with address bit boundaries and there are other places
which actually require to know the maximum number of address bits. All
remaining usage sites of MAX_PHYSMEM_BITS have been analyzed and found
to be correct.
Cure this by exposing the end of the direct map via PHYSMEM_END and use
that for the memory hot-plug and resource management related places
instead of relying on MAX_PHYSMEM_BITS. In the KASLR case PHYSMEM_END
maps to a variable which is initialized by the KASLR initialization and
otherwise it is based on MAX_PHYSMEM_BITS as before.
To prevent future hickups add a check into add_pages() to catch callers
trying to add memory above PHYSMEM_END.
Fixes: 0483e1fa6e09 ("x86/mm: Implement ASLR for kernel memory regions")
Reported-by: Max Ramanouski <max8rr8(a)gmail.com>
Reported-by: Alistair Popple <apopple(a)nvidia.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-By: Max Ramanouski <max8rr8(a)gmail.com>
Tested-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Kees Cook <kees(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/87ed6soy3z.ffs@tglx
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index af4302d79b59..f3d257c45225 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,6 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
+extern unsigned long physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 9053dfe9fa03..a98e53491a4e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define PHYSMEM_END physmem_end
+#endif
+
/*
* End of the region for which vmalloc page tables are pre-allocated.
* For non-KMSAN builds, this is the same as VMALLOC_END.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d8dbeac8b206..ff253648706f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..230f1dee4f09 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/* The end of the possible address space for physical memory */
+unsigned long physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void)
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
- /*
- * Jump the region and add a minimum padding based on
- * randomization alignment.
- */
+ /* Calculate the end of the region */
vaddr += get_padding(&kaslr_regions[i]);
+ /*
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the physmem_end boundary.
+ * No rounding required as the region starts
+ * PUD aligned and size is in units of TB.
+ */
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
+
+ /* Add a minimum padding based on randomization alignment. */
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4b238a20b76..b3864156eaa4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -97,6 +97,10 @@ extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif
+#ifndef PHYSMEM_END
+# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
+#endif
+
#include <asm/page.h>
#include <asm/processor.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 14777afb0a99..a83040fde236 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1826,8 +1826,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
if (flags & GFR_DESCENDING) {
resource_size_t end;
- end = min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ end = min_t(resource_size_t, base->end, PHYSMEM_END);
return end - size + 1;
}
@@ -1844,8 +1843,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr,
* @size did not wrap 0.
*/
return addr > addr - size &&
- addr <= min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
}
static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 66267c26ca1b..951878ab627a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ const u64 max_phys = PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
diff --git a/mm/sparse.c b/mm/sparse.c
index e4b830091d13..0c3bff882033 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section)
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
- unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+ unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
/*
* Sanity checks - do not allow an architecture to pass
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x ea72ce5da22806d5713f3ffb39a6d5ae73841f93
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090826-expulsion-unkempt-11e3@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
ea72ce5da228 ("x86/kaslr: Expose and use the end of the physical memory address space")
1a167ddd3c56 ("x86: kmsan: pgtable: reduce vmalloc space")
14b80582c43e ("resource: Introduce alloc_free_mem_region()")
27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount")
dc90f0846df4 ("mm: don't include <linux/memremap.h> in <linux/mm.h>")
895749455f60 ("mm: simplify freeing of devmap managed pages")
75e55d8a107e ("mm: move free_devmap_managed_page to memremap.c")
730ff52194cd ("mm: remove pointless includes from <linux/hmm.h>")
f56caedaf94f ("Merge branch 'akpm' (patches from Andrew)")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea72ce5da22806d5713f3ffb39a6d5ae73841f93 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Wed, 14 Aug 2024 00:29:36 +0200
Subject: [PATCH] x86/kaslr: Expose and use the end of the physical memory
address space
iounmap() on x86 occasionally fails to unmap because the provided valid
ioremap address is not below high_memory. It turned out that this
happens due to KASLR.
KASLR uses the full address space between PAGE_OFFSET and vaddr_end to
randomize the starting points of the direct map, vmalloc and vmemmap
regions. It thereby limits the size of the direct map by using the
installed memory size plus an extra configurable margin for hot-plug
memory. This limitation is done to gain more randomization space
because otherwise only the holes between the direct map, vmalloc,
vmemmap and vaddr_end would be usable for randomizing.
The limited direct map size is not exposed to the rest of the kernel, so
the memory hot-plug and resource management related code paths still
operate under the assumption that the available address space can be
determined with MAX_PHYSMEM_BITS.
request_free_mem_region() allocates from (1 << MAX_PHYSMEM_BITS) - 1
downwards. That means the first allocation happens past the end of the
direct map and if unlucky this address is in the vmalloc space, which
causes high_memory to become greater than VMALLOC_START and consequently
causes iounmap() to fail for valid ioremap addresses.
MAX_PHYSMEM_BITS cannot be changed for that because the randomization
does not align with address bit boundaries and there are other places
which actually require to know the maximum number of address bits. All
remaining usage sites of MAX_PHYSMEM_BITS have been analyzed and found
to be correct.
Cure this by exposing the end of the direct map via PHYSMEM_END and use
that for the memory hot-plug and resource management related places
instead of relying on MAX_PHYSMEM_BITS. In the KASLR case PHYSMEM_END
maps to a variable which is initialized by the KASLR initialization and
otherwise it is based on MAX_PHYSMEM_BITS as before.
To prevent future hickups add a check into add_pages() to catch callers
trying to add memory above PHYSMEM_END.
Fixes: 0483e1fa6e09 ("x86/mm: Implement ASLR for kernel memory regions")
Reported-by: Max Ramanouski <max8rr8(a)gmail.com>
Reported-by: Alistair Popple <apopple(a)nvidia.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-By: Max Ramanouski <max8rr8(a)gmail.com>
Tested-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Kees Cook <kees(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/87ed6soy3z.ffs@tglx
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index af4302d79b59..f3d257c45225 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,6 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
+extern unsigned long physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 9053dfe9fa03..a98e53491a4e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define PHYSMEM_END physmem_end
+#endif
+
/*
* End of the region for which vmalloc page tables are pre-allocated.
* For non-KMSAN builds, this is the same as VMALLOC_END.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d8dbeac8b206..ff253648706f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..230f1dee4f09 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/* The end of the possible address space for physical memory */
+unsigned long physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void)
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
- /*
- * Jump the region and add a minimum padding based on
- * randomization alignment.
- */
+ /* Calculate the end of the region */
vaddr += get_padding(&kaslr_regions[i]);
+ /*
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the physmem_end boundary.
+ * No rounding required as the region starts
+ * PUD aligned and size is in units of TB.
+ */
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
+
+ /* Add a minimum padding based on randomization alignment. */
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4b238a20b76..b3864156eaa4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -97,6 +97,10 @@ extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif
+#ifndef PHYSMEM_END
+# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
+#endif
+
#include <asm/page.h>
#include <asm/processor.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 14777afb0a99..a83040fde236 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1826,8 +1826,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
if (flags & GFR_DESCENDING) {
resource_size_t end;
- end = min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ end = min_t(resource_size_t, base->end, PHYSMEM_END);
return end - size + 1;
}
@@ -1844,8 +1843,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr,
* @size did not wrap 0.
*/
return addr > addr - size &&
- addr <= min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
}
static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 66267c26ca1b..951878ab627a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ const u64 max_phys = PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
diff --git a/mm/sparse.c b/mm/sparse.c
index e4b830091d13..0c3bff882033 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section)
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
- unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+ unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
/*
* Sanity checks - do not allow an architecture to pass
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x c5af2c90ba5629f0424a8d315f75fb8d91713c3c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090844-cohesive-abrasion-ef6b@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
c5af2c90ba56 ("irqchip/gic-v2m: Fix refcount leak in gicv2m_of_init()")
90b4c5558615 ("irqchip/gic-v2m: Add support for Amazon Graviton variant of GICv3+GICv2m")
737be74710f3 ("irqchip/gicv2m: Don't map the MSI page in gicv2m_compose_msi_msg()")
d38a71c54525 ("irqchip/gic-v3-its: Change initialization ordering for LPIs")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c5af2c90ba5629f0424a8d315f75fb8d91713c3c Mon Sep 17 00:00:00 2001
From: Ma Ke <make24(a)iscas.ac.cn>
Date: Tue, 20 Aug 2024 17:28:43 +0800
Subject: [PATCH] irqchip/gic-v2m: Fix refcount leak in gicv2m_of_init()
gicv2m_of_init() fails to perform an of_node_put() when
of_address_to_resource() fails, leading to a refcount leak.
Address this by moving the error handling path outside of the loop and
making it common to all failure modes.
Fixes: 4266ab1a8ff5 ("irqchip/gic-v2m: Refactor to prepare for ACPI support")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240820092843.1219933-1-make24@iscas.ac.cn
diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c
index 51af63c046ed..be35c5349986 100644
--- a/drivers/irqchip/irq-gic-v2m.c
+++ b/drivers/irqchip/irq-gic-v2m.c
@@ -407,12 +407,12 @@ static int __init gicv2m_of_init(struct fwnode_handle *parent_handle,
ret = gicv2m_init_one(&child->fwnode, spi_start, nr_spis,
&res, 0);
- if (ret) {
- of_node_put(child);
+ if (ret)
break;
- }
}
+ if (ret && child)
+ of_node_put(child);
if (!ret)
ret = gicv2m_allocate_domains(parent);
if (ret)
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 25dfc9e357af8aed1ca79b318a73f2c59c1f0b2b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090841-erasure-dice-630c@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
25dfc9e357af ("perf/x86/intel: Limit the period on Haswell")
28f0f3c44b5c ("perf/x86: Change x86_pmu::limit_period signature")
706460a96fc6 ("perf/x86/amd/core: Add generic branch record interfaces")
b40d0156f560 ("perf/x86/amd/brs: Move feature-specific functions")
3c27b0c6ea48 ("perf/x86/amd: Fix AMD BRS period adjustment")
9622e67e3980 ("perf/x86/amd/core: Add PerfMonV2 counter control")
21d59e3e2c40 ("perf/x86/amd/core: Detect PerfMonV2 support")
cc37e520a236 ("perf/x86/amd: Make Zen3 branch sampling opt-in")
ba2fe7500845 ("perf/x86/amd: Add AMD branch sampling period adjustment")
ada543459cab ("perf/x86/amd: Add AMD Fam19h Branch Sampling support")
369461ce8fb6 ("x86: perf: Move RDPMC event flag to a common definition")
05485745ad48 ("perf/amd/uncore: Allow the driver to be built as a module")
5471eea5d3bf ("perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task")
f83d2f91d259 ("perf/x86/intel: Add Alder Lake Hybrid support")
58ae30c29a37 ("perf/x86/intel: Add attr_update for Hybrid PMUs")
d9977c43bff8 ("perf/x86: Register hybrid PMUs")
e11c1a7eb302 ("perf/x86: Factor out x86_pmu_show_pmu_cap")
b98567298bad ("perf/x86: Remove temporary pmu assignment in event_init")
34d5b61f29ee ("perf/x86/intel: Factor out intel_pmu_check_extra_regs")
bc14fe1beeec ("perf/x86/intel: Factor out intel_pmu_check_event_constraints")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 25dfc9e357af8aed1ca79b318a73f2c59c1f0b2b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang(a)linux.intel.com>
Date: Mon, 19 Aug 2024 11:30:04 -0700
Subject: [PATCH] perf/x86/intel: Limit the period on Haswell
Running the ltp test cve-2015-3290 concurrently reports the following
warnings.
perfevents: irq loop stuck!
WARNING: CPU: 31 PID: 32438 at arch/x86/events/intel/core.c:3174
intel_pmu_handle_irq+0x285/0x370
Call Trace:
<NMI>
? __warn+0xa4/0x220
? intel_pmu_handle_irq+0x285/0x370
? __report_bug+0x123/0x130
? intel_pmu_handle_irq+0x285/0x370
? __report_bug+0x123/0x130
? intel_pmu_handle_irq+0x285/0x370
? report_bug+0x3e/0xa0
? handle_bug+0x3c/0x70
? exc_invalid_op+0x18/0x50
? asm_exc_invalid_op+0x1a/0x20
? irq_work_claim+0x1e/0x40
? intel_pmu_handle_irq+0x285/0x370
perf_event_nmi_handler+0x3d/0x60
nmi_handle+0x104/0x330
Thanks to Thomas Gleixner's analysis, the issue is caused by the low
initial period (1) of the frequency estimation algorithm, which triggers
the defects of the HW, specifically erratum HSW11 and HSW143. (For the
details, please refer https://lore.kernel.org/lkml/87plq9l5d2.ffs@tglx/)
The HSW11 requires a period larger than 100 for the INST_RETIRED.ALL
event, but the initial period in the freq mode is 1. The erratum is the
same as the BDM11, which has been supported in the kernel. A minimum
period of 128 is enforced as well on HSW.
HSW143 is regarding that the fixed counter 1 may overcount 32 with the
Hyper-Threading is enabled. However, based on the test, the hardware
has more issues than it tells. Besides the fixed counter 1, the message
'interrupt took too long' can be observed on any counter which was armed
with a period < 32 and two events expired in the same NMI. A minimum
period of 32 is enforced for the rest of the events.
The recommended workaround code of the HSW143 is not implemented.
Because it only addresses the issue for the fixed counter. It brings
extra overhead through extra MSR writing. No related overcounting issue
has been reported so far.
Fixes: 3a632cb229bf ("perf/x86/intel: Add simple Haswell PMU support")
Reported-by: Li Huafei <lihuafei1(a)huawei.com>
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240819183004.3132920-1-kan.liang@linux.intel.…
Closes: https://lore.kernel.org/lkml/20240729223328.327835-1-lihuafei1@huawei.com/
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 0c9c2706d4ec..9e519d8a810a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
return HYBRID_INTEL_CORE;
}
+static inline bool erratum_hsw11(struct perf_event *event)
+{
+ return (event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01);
+}
+
+/*
+ * The HSW11 requires a period larger than 100 which is the same as the BDM11.
+ * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL.
+ *
+ * The message 'interrupt took too long' can be observed on any counter which
+ * was armed with a period < 32 and two events expired in the same NMI.
+ * A minimum period of 32 is enforced for the rest of the events.
+ */
+static void hsw_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, erratum_hsw11(event) ? 128 : 32);
+}
+
/*
* Broadwell:
*
@@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
*/
static void bdw_limit_period(struct perf_event *event, s64 *left)
{
- if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
- X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (erratum_hsw11(event)) {
if (*left < 128)
*left = 128;
*left &= ~0x3fULL;
@@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = hsw_limit_period;
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 25dfc9e357af8aed1ca79b318a73f2c59c1f0b2b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090839-issuing-jeep-f477@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
25dfc9e357af ("perf/x86/intel: Limit the period on Haswell")
28f0f3c44b5c ("perf/x86: Change x86_pmu::limit_period signature")
706460a96fc6 ("perf/x86/amd/core: Add generic branch record interfaces")
b40d0156f560 ("perf/x86/amd/brs: Move feature-specific functions")
3c27b0c6ea48 ("perf/x86/amd: Fix AMD BRS period adjustment")
9622e67e3980 ("perf/x86/amd/core: Add PerfMonV2 counter control")
21d59e3e2c40 ("perf/x86/amd/core: Detect PerfMonV2 support")
cc37e520a236 ("perf/x86/amd: Make Zen3 branch sampling opt-in")
ba2fe7500845 ("perf/x86/amd: Add AMD branch sampling period adjustment")
ada543459cab ("perf/x86/amd: Add AMD Fam19h Branch Sampling support")
369461ce8fb6 ("x86: perf: Move RDPMC event flag to a common definition")
05485745ad48 ("perf/amd/uncore: Allow the driver to be built as a module")
5471eea5d3bf ("perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task")
f83d2f91d259 ("perf/x86/intel: Add Alder Lake Hybrid support")
58ae30c29a37 ("perf/x86/intel: Add attr_update for Hybrid PMUs")
d9977c43bff8 ("perf/x86: Register hybrid PMUs")
e11c1a7eb302 ("perf/x86: Factor out x86_pmu_show_pmu_cap")
b98567298bad ("perf/x86: Remove temporary pmu assignment in event_init")
34d5b61f29ee ("perf/x86/intel: Factor out intel_pmu_check_extra_regs")
bc14fe1beeec ("perf/x86/intel: Factor out intel_pmu_check_event_constraints")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 25dfc9e357af8aed1ca79b318a73f2c59c1f0b2b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang(a)linux.intel.com>
Date: Mon, 19 Aug 2024 11:30:04 -0700
Subject: [PATCH] perf/x86/intel: Limit the period on Haswell
Running the ltp test cve-2015-3290 concurrently reports the following
warnings.
perfevents: irq loop stuck!
WARNING: CPU: 31 PID: 32438 at arch/x86/events/intel/core.c:3174
intel_pmu_handle_irq+0x285/0x370
Call Trace:
<NMI>
? __warn+0xa4/0x220
? intel_pmu_handle_irq+0x285/0x370
? __report_bug+0x123/0x130
? intel_pmu_handle_irq+0x285/0x370
? __report_bug+0x123/0x130
? intel_pmu_handle_irq+0x285/0x370
? report_bug+0x3e/0xa0
? handle_bug+0x3c/0x70
? exc_invalid_op+0x18/0x50
? asm_exc_invalid_op+0x1a/0x20
? irq_work_claim+0x1e/0x40
? intel_pmu_handle_irq+0x285/0x370
perf_event_nmi_handler+0x3d/0x60
nmi_handle+0x104/0x330
Thanks to Thomas Gleixner's analysis, the issue is caused by the low
initial period (1) of the frequency estimation algorithm, which triggers
the defects of the HW, specifically erratum HSW11 and HSW143. (For the
details, please refer https://lore.kernel.org/lkml/87plq9l5d2.ffs@tglx/)
The HSW11 requires a period larger than 100 for the INST_RETIRED.ALL
event, but the initial period in the freq mode is 1. The erratum is the
same as the BDM11, which has been supported in the kernel. A minimum
period of 128 is enforced as well on HSW.
HSW143 is regarding that the fixed counter 1 may overcount 32 with the
Hyper-Threading is enabled. However, based on the test, the hardware
has more issues than it tells. Besides the fixed counter 1, the message
'interrupt took too long' can be observed on any counter which was armed
with a period < 32 and two events expired in the same NMI. A minimum
period of 32 is enforced for the rest of the events.
The recommended workaround code of the HSW143 is not implemented.
Because it only addresses the issue for the fixed counter. It brings
extra overhead through extra MSR writing. No related overcounting issue
has been reported so far.
Fixes: 3a632cb229bf ("perf/x86/intel: Add simple Haswell PMU support")
Reported-by: Li Huafei <lihuafei1(a)huawei.com>
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240819183004.3132920-1-kan.liang@linux.intel.…
Closes: https://lore.kernel.org/lkml/20240729223328.327835-1-lihuafei1@huawei.com/
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 0c9c2706d4ec..9e519d8a810a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
return HYBRID_INTEL_CORE;
}
+static inline bool erratum_hsw11(struct perf_event *event)
+{
+ return (event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01);
+}
+
+/*
+ * The HSW11 requires a period larger than 100 which is the same as the BDM11.
+ * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL.
+ *
+ * The message 'interrupt took too long' can be observed on any counter which
+ * was armed with a period < 32 and two events expired in the same NMI.
+ * A minimum period of 32 is enforced for the rest of the events.
+ */
+static void hsw_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, erratum_hsw11(event) ? 128 : 32);
+}
+
/*
* Broadwell:
*
@@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
*/
static void bdw_limit_period(struct perf_event *event, s64 *left)
{
- if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
- X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (erratum_hsw11(event)) {
if (*left < 128)
*left = 128;
*left &= ~0x3fULL;
@@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = hsw_limit_period;
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 25dfc9e357af8aed1ca79b318a73f2c59c1f0b2b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090838-facility-embargo-19d0@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
25dfc9e357af ("perf/x86/intel: Limit the period on Haswell")
28f0f3c44b5c ("perf/x86: Change x86_pmu::limit_period signature")
706460a96fc6 ("perf/x86/amd/core: Add generic branch record interfaces")
b40d0156f560 ("perf/x86/amd/brs: Move feature-specific functions")
3c27b0c6ea48 ("perf/x86/amd: Fix AMD BRS period adjustment")
9622e67e3980 ("perf/x86/amd/core: Add PerfMonV2 counter control")
21d59e3e2c40 ("perf/x86/amd/core: Detect PerfMonV2 support")
cc37e520a236 ("perf/x86/amd: Make Zen3 branch sampling opt-in")
ba2fe7500845 ("perf/x86/amd: Add AMD branch sampling period adjustment")
ada543459cab ("perf/x86/amd: Add AMD Fam19h Branch Sampling support")
369461ce8fb6 ("x86: perf: Move RDPMC event flag to a common definition")
05485745ad48 ("perf/amd/uncore: Allow the driver to be built as a module")
5471eea5d3bf ("perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task")
f83d2f91d259 ("perf/x86/intel: Add Alder Lake Hybrid support")
58ae30c29a37 ("perf/x86/intel: Add attr_update for Hybrid PMUs")
d9977c43bff8 ("perf/x86: Register hybrid PMUs")
e11c1a7eb302 ("perf/x86: Factor out x86_pmu_show_pmu_cap")
b98567298bad ("perf/x86: Remove temporary pmu assignment in event_init")
34d5b61f29ee ("perf/x86/intel: Factor out intel_pmu_check_extra_regs")
bc14fe1beeec ("perf/x86/intel: Factor out intel_pmu_check_event_constraints")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 25dfc9e357af8aed1ca79b318a73f2c59c1f0b2b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang(a)linux.intel.com>
Date: Mon, 19 Aug 2024 11:30:04 -0700
Subject: [PATCH] perf/x86/intel: Limit the period on Haswell
Running the ltp test cve-2015-3290 concurrently reports the following
warnings.
perfevents: irq loop stuck!
WARNING: CPU: 31 PID: 32438 at arch/x86/events/intel/core.c:3174
intel_pmu_handle_irq+0x285/0x370
Call Trace:
<NMI>
? __warn+0xa4/0x220
? intel_pmu_handle_irq+0x285/0x370
? __report_bug+0x123/0x130
? intel_pmu_handle_irq+0x285/0x370
? __report_bug+0x123/0x130
? intel_pmu_handle_irq+0x285/0x370
? report_bug+0x3e/0xa0
? handle_bug+0x3c/0x70
? exc_invalid_op+0x18/0x50
? asm_exc_invalid_op+0x1a/0x20
? irq_work_claim+0x1e/0x40
? intel_pmu_handle_irq+0x285/0x370
perf_event_nmi_handler+0x3d/0x60
nmi_handle+0x104/0x330
Thanks to Thomas Gleixner's analysis, the issue is caused by the low
initial period (1) of the frequency estimation algorithm, which triggers
the defects of the HW, specifically erratum HSW11 and HSW143. (For the
details, please refer https://lore.kernel.org/lkml/87plq9l5d2.ffs@tglx/)
The HSW11 requires a period larger than 100 for the INST_RETIRED.ALL
event, but the initial period in the freq mode is 1. The erratum is the
same as the BDM11, which has been supported in the kernel. A minimum
period of 128 is enforced as well on HSW.
HSW143 is regarding that the fixed counter 1 may overcount 32 with the
Hyper-Threading is enabled. However, based on the test, the hardware
has more issues than it tells. Besides the fixed counter 1, the message
'interrupt took too long' can be observed on any counter which was armed
with a period < 32 and two events expired in the same NMI. A minimum
period of 32 is enforced for the rest of the events.
The recommended workaround code of the HSW143 is not implemented.
Because it only addresses the issue for the fixed counter. It brings
extra overhead through extra MSR writing. No related overcounting issue
has been reported so far.
Fixes: 3a632cb229bf ("perf/x86/intel: Add simple Haswell PMU support")
Reported-by: Li Huafei <lihuafei1(a)huawei.com>
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240819183004.3132920-1-kan.liang@linux.intel.…
Closes: https://lore.kernel.org/lkml/20240729223328.327835-1-lihuafei1@huawei.com/
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 0c9c2706d4ec..9e519d8a810a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
return HYBRID_INTEL_CORE;
}
+static inline bool erratum_hsw11(struct perf_event *event)
+{
+ return (event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01);
+}
+
+/*
+ * The HSW11 requires a period larger than 100 which is the same as the BDM11.
+ * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL.
+ *
+ * The message 'interrupt took too long' can be observed on any counter which
+ * was armed with a period < 32 and two events expired in the same NMI.
+ * A minimum period of 32 is enforced for the rest of the events.
+ */
+static void hsw_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, erratum_hsw11(event) ? 128 : 32);
+}
+
/*
* Broadwell:
*
@@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
*/
static void bdw_limit_period(struct perf_event *event, s64 *left)
{
- if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
- X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (erratum_hsw11(event)) {
if (*left < 128)
*left = 128;
*left &= ~0x3fULL;
@@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = hsw_limit_period;
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 25dfc9e357af8aed1ca79b318a73f2c59c1f0b2b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090837-twisty-deodorant-ee35@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
25dfc9e357af ("perf/x86/intel: Limit the period on Haswell")
28f0f3c44b5c ("perf/x86: Change x86_pmu::limit_period signature")
706460a96fc6 ("perf/x86/amd/core: Add generic branch record interfaces")
b40d0156f560 ("perf/x86/amd/brs: Move feature-specific functions")
3c27b0c6ea48 ("perf/x86/amd: Fix AMD BRS period adjustment")
9622e67e3980 ("perf/x86/amd/core: Add PerfMonV2 counter control")
21d59e3e2c40 ("perf/x86/amd/core: Detect PerfMonV2 support")
cc37e520a236 ("perf/x86/amd: Make Zen3 branch sampling opt-in")
ba2fe7500845 ("perf/x86/amd: Add AMD branch sampling period adjustment")
ada543459cab ("perf/x86/amd: Add AMD Fam19h Branch Sampling support")
369461ce8fb6 ("x86: perf: Move RDPMC event flag to a common definition")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 25dfc9e357af8aed1ca79b318a73f2c59c1f0b2b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang(a)linux.intel.com>
Date: Mon, 19 Aug 2024 11:30:04 -0700
Subject: [PATCH] perf/x86/intel: Limit the period on Haswell
Running the ltp test cve-2015-3290 concurrently reports the following
warnings.
perfevents: irq loop stuck!
WARNING: CPU: 31 PID: 32438 at arch/x86/events/intel/core.c:3174
intel_pmu_handle_irq+0x285/0x370
Call Trace:
<NMI>
? __warn+0xa4/0x220
? intel_pmu_handle_irq+0x285/0x370
? __report_bug+0x123/0x130
? intel_pmu_handle_irq+0x285/0x370
? __report_bug+0x123/0x130
? intel_pmu_handle_irq+0x285/0x370
? report_bug+0x3e/0xa0
? handle_bug+0x3c/0x70
? exc_invalid_op+0x18/0x50
? asm_exc_invalid_op+0x1a/0x20
? irq_work_claim+0x1e/0x40
? intel_pmu_handle_irq+0x285/0x370
perf_event_nmi_handler+0x3d/0x60
nmi_handle+0x104/0x330
Thanks to Thomas Gleixner's analysis, the issue is caused by the low
initial period (1) of the frequency estimation algorithm, which triggers
the defects of the HW, specifically erratum HSW11 and HSW143. (For the
details, please refer https://lore.kernel.org/lkml/87plq9l5d2.ffs@tglx/)
The HSW11 requires a period larger than 100 for the INST_RETIRED.ALL
event, but the initial period in the freq mode is 1. The erratum is the
same as the BDM11, which has been supported in the kernel. A minimum
period of 128 is enforced as well on HSW.
HSW143 is regarding that the fixed counter 1 may overcount 32 with the
Hyper-Threading is enabled. However, based on the test, the hardware
has more issues than it tells. Besides the fixed counter 1, the message
'interrupt took too long' can be observed on any counter which was armed
with a period < 32 and two events expired in the same NMI. A minimum
period of 32 is enforced for the rest of the events.
The recommended workaround code of the HSW143 is not implemented.
Because it only addresses the issue for the fixed counter. It brings
extra overhead through extra MSR writing. No related overcounting issue
has been reported so far.
Fixes: 3a632cb229bf ("perf/x86/intel: Add simple Haswell PMU support")
Reported-by: Li Huafei <lihuafei1(a)huawei.com>
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20240819183004.3132920-1-kan.liang@linux.intel.…
Closes: https://lore.kernel.org/lkml/20240729223328.327835-1-lihuafei1@huawei.com/
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 0c9c2706d4ec..9e519d8a810a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
return HYBRID_INTEL_CORE;
}
+static inline bool erratum_hsw11(struct perf_event *event)
+{
+ return (event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01);
+}
+
+/*
+ * The HSW11 requires a period larger than 100 which is the same as the BDM11.
+ * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL.
+ *
+ * The message 'interrupt took too long' can be observed on any counter which
+ * was armed with a period < 32 and two events expired in the same NMI.
+ * A minimum period of 32 is enforced for the rest of the events.
+ */
+static void hsw_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, erratum_hsw11(event) ? 128 : 32);
+}
+
/*
* Broadwell:
*
@@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
*/
static void bdw_limit_period(struct perf_event *event, s64 *left)
{
- if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
- X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (erratum_hsw11(event)) {
if (*left < 128)
*left = 128;
*left &= ~0x3fULL;
@@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = hsw_limit_period;
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 844436e045ac2ab7895d8b281cb784a24de1d14d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090848-botch-appointee-7cb2@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
844436e045ac ("ksmbd: Unlock on in ksmbd_tcp_set_interfaces()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 844436e045ac2ab7895d8b281cb784a24de1d14d Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter(a)linaro.org>
Date: Thu, 29 Aug 2024 22:22:35 +0300
Subject: [PATCH] ksmbd: Unlock on in ksmbd_tcp_set_interfaces()
Unlock before returning an error code if this allocation fails.
Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers")
Cc: stable(a)vger.kernel.org # v5.15+
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Acked-by: Namjae Jeon <linkinjeon(a)kernel.org>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index a84788396daa..aaed9e293b2e 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -624,8 +624,10 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz)
for_each_netdev(&init_net, netdev) {
if (netif_is_bridge_port(netdev))
continue;
- if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL)))
+ if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) {
+ rtnl_unlock();
return -ENOMEM;
+ }
}
rtnl_unlock();
bind_additional_ifaces = 1;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 78c5a6f1f630172b19af4912e755e1da93ef0ab5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024090838-reentry-fender-aec1@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
78c5a6f1f630 ("ksmbd: unset the binding mark of a reused connection")
38c8a9a52082 ("smb: move client and server files to common directory fs/smb")
f5c779b7ddbd ("ksmbd: fix racy issue from session setup and logoff")
1d9c4172110e ("ksmbd: Implements sess->ksmbd_chann_list as xarray")
62c487b53a7f ("ksmbd: limit pdu length size according to connection status")
abdb1742a312 ("cifs: get rid of mount options string parsing")
9fd29a5bae6e ("cifs: use fs_context for automounts")
5dd8ce24667a ("cifs: missing directory in MAINTAINERS file")
332019e23a51 ("Merge tag '5.20-rc-smb3-client-fixes-part2' of git://git.samba.org/sfrench/cifs-2.6")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 78c5a6f1f630172b19af4912e755e1da93ef0ab5 Mon Sep 17 00:00:00 2001
From: Namjae Jeon <linkinjeon(a)kernel.org>
Date: Tue, 27 Aug 2024 21:44:41 +0900
Subject: [PATCH] ksmbd: unset the binding mark of a reused connection
Steve French reported null pointer dereference error from sha256 lib.
cifs.ko can send session setup requests on reused connection.
If reused connection is used for binding session, conn->binding can
still remain true and generate_preauth_hash() will not set
sess->Preauth_HashValue and it will be NULL.
It is used as a material to create an encryption key in
ksmbd_gen_smb311_encryptionkey. ->Preauth_HashValue cause null pointer
dereference error from crypto_shash_update().
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 8 PID: 429254 Comm: kworker/8:39
Hardware name: LENOVO 20MAS08500/20MAS08500, BIOS N2CET69W (1.52 )
Workqueue: ksmbd-io handle_ksmbd_work [ksmbd]
RIP: 0010:lib_sha256_base_do_update.isra.0+0x11e/0x1d0 [sha256_ssse3]
<TASK>
? show_regs+0x6d/0x80
? __die+0x24/0x80
? page_fault_oops+0x99/0x1b0
? do_user_addr_fault+0x2ee/0x6b0
? exc_page_fault+0x83/0x1b0
? asm_exc_page_fault+0x27/0x30
? __pfx_sha256_transform_rorx+0x10/0x10 [sha256_ssse3]
? lib_sha256_base_do_update.isra.0+0x11e/0x1d0 [sha256_ssse3]
? __pfx_sha256_transform_rorx+0x10/0x10 [sha256_ssse3]
? __pfx_sha256_transform_rorx+0x10/0x10 [sha256_ssse3]
_sha256_update+0x77/0xa0 [sha256_ssse3]
sha256_avx2_update+0x15/0x30 [sha256_ssse3]
crypto_shash_update+0x1e/0x40
hmac_update+0x12/0x20
crypto_shash_update+0x1e/0x40
generate_key+0x234/0x380 [ksmbd]
generate_smb3encryptionkey+0x40/0x1c0 [ksmbd]
ksmbd_gen_smb311_encryptionkey+0x72/0xa0 [ksmbd]
ntlm_authenticate.isra.0+0x423/0x5d0 [ksmbd]
smb2_sess_setup+0x952/0xaa0 [ksmbd]
__process_request+0xa3/0x1d0 [ksmbd]
__handle_ksmbd_work+0x1c4/0x2f0 [ksmbd]
handle_ksmbd_work+0x2d/0xa0 [ksmbd]
process_one_work+0x16c/0x350
worker_thread+0x306/0x440
? __pfx_worker_thread+0x10/0x10
kthread+0xef/0x120
? __pfx_kthread+0x10/0x10
ret_from_fork+0x44/0x70
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1b/0x30
</TASK>
Fixes: f5a544e3bab7 ("ksmbd: add support for SMB3 multichannel")
Cc: stable(a)vger.kernel.org # v5.15+
Signed-off-by: Namjae Jeon <linkinjeon(a)kernel.org>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 20846a4d3031..8bdc59251418 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1690,6 +1690,8 @@ int smb2_sess_setup(struct ksmbd_work *work)
rc = ksmbd_session_register(conn, sess);
if (rc)
goto out_err;
+
+ conn->binding = false;
} else if (conn->dialect >= SMB30_PROT_ID &&
(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) &&
req->Flags & SMB2_SESSION_REQ_FLAG_BINDING) {
@@ -1768,6 +1770,8 @@ int smb2_sess_setup(struct ksmbd_work *work)
sess = NULL;
goto out_err;
}
+
+ conn->binding = false;
}
work->sess = sess;
The patch titled
Subject: mm/codetag: add pgalloc_tag_copy()
has been added to the -mm mm-unstable branch. Its filename is
mm-codetag-add-pgalloc_tag_copy.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yu Zhao <yuzhao(a)google.com>
Subject: mm/codetag: add pgalloc_tag_copy()
Date: Thu, 5 Sep 2024 22:21:08 -0600
Add pgalloc_tag_copy() to transfer the codetag from the old folio to the
new one during migration. This makes original allocation sites persist
cross migration rather than lump into the get_new_folio callbacks passed
into migrate_pages(), e.g., compaction_alloc():
# echo 1 >/proc/sys/vm/compact_memory
# grep compaction_alloc /proc/allocinfo
Before this patch:
132968448 32463 mm/compaction.c:1880 func:compaction_alloc
After this patch:
0 0 mm/compaction.c:1880 func:compaction_alloc
Link: https://lkml.kernel.org/r/20240906042108.1150526-3-yuzhao@google.com
Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Acked-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: Muchun Song <muchun.song(a)linux.dev>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/alloc_tag.h | 24 ++++++++++--------------
include/linux/mm.h | 27 +++++++++++++++++++++++++++
mm/migrate.c | 1 +
3 files changed, 38 insertions(+), 14 deletions(-)
--- a/include/linux/alloc_tag.h~mm-codetag-add-pgalloc_tag_copy
+++ a/include/linux/alloc_tag.h
@@ -137,7 +137,16 @@ static inline void alloc_tag_sub_check(u
/* Caller should verify both ref and tag to be valid */
static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
{
+ alloc_tag_add_check(ref, tag);
+ if (!ref || !tag)
+ return;
+
ref->ct = &tag->ct;
+}
+
+static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+{
+ __alloc_tag_ref_set(ref, tag);
/*
* We need in increment the call counter every time we have a new
* allocation or when we split a large allocation into smaller ones.
@@ -147,22 +156,9 @@ static inline void __alloc_tag_ref_set(u
this_cpu_inc(tag->counters->calls);
}
-static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
-{
- alloc_tag_add_check(ref, tag);
- if (!ref || !tag)
- return;
-
- __alloc_tag_ref_set(ref, tag);
-}
-
static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
{
- alloc_tag_add_check(ref, tag);
- if (!ref || !tag)
- return;
-
- __alloc_tag_ref_set(ref, tag);
+ alloc_tag_ref_set(ref, tag);
this_cpu_add(tag->counters->bytes, bytes);
}
--- a/include/linux/mm.h~mm-codetag-add-pgalloc_tag_copy
+++ a/include/linux/mm.h
@@ -4161,10 +4161,37 @@ static inline void pgalloc_tag_split(str
}
}
}
+
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
+{
+ struct alloc_tag *tag;
+ union codetag_ref *ref;
+
+ tag = pgalloc_tag_get(&old->page);
+ if (!tag)
+ return;
+
+ ref = get_page_tag_ref(&new->page);
+ if (!ref)
+ return;
+
+ /* Clear the old ref to the original allocation tag. */
+ clear_page_tag_ref(&old->page);
+ /* Decrement the counters of the tag on get_new_folio. */
+ alloc_tag_sub(ref, folio_nr_pages(new));
+
+ __alloc_tag_ref_set(ref, tag);
+
+ put_page_tag_ref(ref);
+}
#else /* !CONFIG_MEM_ALLOC_PROFILING */
static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
{
}
+
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
+{
+}
#endif /* CONFIG_MEM_ALLOC_PROFILING */
#endif /* _LINUX_MM_H */
--- a/mm/migrate.c~mm-codetag-add-pgalloc_tag_copy
+++ a/mm/migrate.c
@@ -743,6 +743,7 @@ void folio_migrate_flags(struct folio *n
folio_set_readahead(newfolio);
folio_copy_owner(newfolio, folio);
+ pgalloc_tag_copy(newfolio, folio);
mem_cgroup_migrate(folio, newfolio);
}
_
Patches currently in -mm which might be from yuzhao(a)google.com are
mm-remap-unused-subpages-to-shared-zeropage-when-splitting-isolated-thp.patch
mm-codetag-fix-a-typo.patch
mm-codetag-fix-pgalloc_tag_split.patch
mm-codetag-add-pgalloc_tag_copy.patch
The patch titled
Subject: mm/codetag: fix pgalloc_tag_split()
has been added to the -mm mm-unstable branch. Its filename is
mm-codetag-fix-pgalloc_tag_split.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yu Zhao <yuzhao(a)google.com>
Subject: mm/codetag: fix pgalloc_tag_split()
Date: Thu, 5 Sep 2024 22:21:07 -0600
The current assumption is that a large folio can only be split into
order-0 folios. That is not the case for hugeTLB demotion, nor for THP
split: see commit c010d47f107f ("mm: thp: split huge page to any lower
order pages").
When a large folio is split into ones of a lower non-zero order, only the
new head pages should be tagged. Tagging tail pages can cause imbalanced
"calls" counters, since only head pages are untagged by pgalloc_tag_sub()
and the "calls" counts on tail pages are leaked, e.g.,
# echo 2048kB >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size
# echo 700 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
# time echo 700 >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote
# echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# grep alloc_gigantic_folio /proc/allocinfo
Before this patch:
0 549427200 mm/hugetlb.c:1549 func:alloc_gigantic_folio
real 0m2.057s
user 0m0.000s
sys 0m2.051s
After this patch:
0 0 mm/hugetlb.c:1549 func:alloc_gigantic_folio
real 0m1.711s
user 0m0.000s
sys 0m1.704s
Not tagging tail pages also improves the splitting time, e.g., by about
15% when demoting 1GB hugeTLB folios to 2MB ones, as shown above.
Link: https://lkml.kernel.org/r/20240906042108.1150526-2-yuzhao@google.com
Fixes: be25d1d4e822 ("mm: create new codetag references during page splitting")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Acked-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: Muchun Song <muchun.song(a)linux.dev>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 30 ++++++++++++++++++++++++++++++
include/linux/pgalloc_tag.h | 31 -------------------------------
mm/huge_memory.c | 2 +-
mm/hugetlb.c | 2 +-
mm/page_alloc.c | 4 ++--
5 files changed, 34 insertions(+), 35 deletions(-)
--- a/include/linux/mm.h~mm-codetag-fix-pgalloc_tag_split
+++ a/include/linux/mm.h
@@ -4137,4 +4137,34 @@ void vma_pgtable_walk_end(struct vm_area
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size);
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+ int i;
+ struct alloc_tag *tag;
+ unsigned int nr_pages = 1 << new_order;
+
+ if (!mem_alloc_profiling_enabled())
+ return;
+
+ tag = pgalloc_tag_get(&folio->page);
+ if (!tag)
+ return;
+
+ for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
+ union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i));
+
+ if (ref) {
+ /* Set new reference to point to the original tag */
+ alloc_tag_ref_set(ref, tag);
+ put_page_tag_ref(ref);
+ }
+ }
+}
+#else /* !CONFIG_MEM_ALLOC_PROFILING */
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+}
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
#endif /* _LINUX_MM_H */
--- a/include/linux/pgalloc_tag.h~mm-codetag-fix-pgalloc_tag_split
+++ a/include/linux/pgalloc_tag.h
@@ -80,36 +80,6 @@ static inline void pgalloc_tag_sub(struc
}
}
-static inline void pgalloc_tag_split(struct page *page, unsigned int nr)
-{
- int i;
- struct page_ext *first_page_ext;
- struct page_ext *page_ext;
- union codetag_ref *ref;
- struct alloc_tag *tag;
-
- if (!mem_alloc_profiling_enabled())
- return;
-
- first_page_ext = page_ext = page_ext_get(page);
- if (unlikely(!page_ext))
- return;
-
- ref = codetag_ref_from_page_ext(page_ext);
- if (!ref->ct)
- goto out;
-
- tag = ct_to_alloc_tag(ref->ct);
- page_ext = page_ext_next(page_ext);
- for (i = 1; i < nr; i++) {
- /* Set new reference to point to the original tag */
- alloc_tag_ref_set(codetag_ref_from_page_ext(page_ext), tag);
- page_ext = page_ext_next(page_ext);
- }
-out:
- page_ext_put(first_page_ext);
-}
-
static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
{
struct alloc_tag *tag = NULL;
@@ -142,7 +112,6 @@ static inline void clear_page_tag_ref(st
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {}
static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
--- a/mm/huge_memory.c~mm-codetag-fix-pgalloc_tag_split
+++ a/mm/huge_memory.c
@@ -3242,7 +3242,7 @@ static void __split_huge_page(struct pag
/* Caller disabled irqs, so they are still disabled here */
split_page_owner(head, order, new_order);
- pgalloc_tag_split(head, 1 << order);
+ pgalloc_tag_split(folio, order, new_order);
/* See comment in __split_huge_page_tail() */
if (folio_test_anon(folio)) {
--- a/mm/hugetlb.c~mm-codetag-fix-pgalloc_tag_split
+++ a/mm/hugetlb.c
@@ -3795,7 +3795,7 @@ static long demote_free_hugetlb_folios(s
list_del(&folio->lru);
split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
- pgalloc_tag_split(&folio->page, 1 << huge_page_order(src));
+ pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst));
for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
struct page *page = folio_page(folio, i);
--- a/mm/page_alloc.c~mm-codetag-fix-pgalloc_tag_split
+++ a/mm/page_alloc.c
@@ -2783,7 +2783,7 @@ void split_page(struct page *page, unsig
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, order, 0);
- pgalloc_tag_split(page, 1 << order);
+ pgalloc_tag_split(page_folio(page), order, 0);
split_page_memcg(page, order, 0);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -4981,7 +4981,7 @@ static void *make_alloc_exact(unsigned l
struct page *last = page + nr;
split_page_owner(page, order, 0);
- pgalloc_tag_split(page, 1 << order);
+ pgalloc_tag_split(page_folio(page), order, 0);
split_page_memcg(page, order, 0);
while (page < --last)
set_page_refcounted(last);
_
Patches currently in -mm which might be from yuzhao(a)google.com are
mm-remap-unused-subpages-to-shared-zeropage-when-splitting-isolated-thp.patch
mm-codetag-fix-a-typo.patch
mm-codetag-fix-pgalloc_tag_split.patch
mm-codetag-add-pgalloc_tag_copy.patch
From: Guillaume Stols <gstols(a)baylibre.com>
The current implementation attempts to recover from an eventual glitch
in the clock by checking frstdata state after reading the first
channel's sample: If frstdata is low, it will reset the chip and
return -EIO.
This will only work in parallel mode, where frstdata pin is set low
after the 2nd sample read starts.
For the serial mode, according to the datasheet, "The FRSTDATA output
returns to a logic low following the 16th SCLK falling edge.", thus
after the Xth pulse, X being the number of bits in a sample, the check
will always be true, and the driver will not work at all in serial
mode if frstdata(optional) is defined in the devicetree as it will
reset the chip, and return -EIO every time read_sample is called.
Hence, this check must be removed for serial mode.
Fixes: b9618c0cacd7 ("staging: IIO: ADC: New driver for AD7606/AD7606-6/AD7606-4")
Signed-off-by: Guillaume Stols <gstols(a)baylibre.com>
Reviewed-by: Nuno Sa <nuno.sa(a)analog.com>
Link: https://patch.msgid.link/20240702-cleanup-ad7606-v3-1-18d5ea18770e@baylibre…
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/ad7606.c | 28 ++-------------------
drivers/iio/adc/ad7606.h | 2 ++
drivers/iio/adc/ad7606_par.c | 48 +++++++++++++++++++++++++++++++++---
3 files changed, 49 insertions(+), 29 deletions(-)
diff --git a/drivers/iio/adc/ad7606.c b/drivers/iio/adc/ad7606.c
index 539e4a8621fe..9b457472d49c 100644
--- a/drivers/iio/adc/ad7606.c
+++ b/drivers/iio/adc/ad7606.c
@@ -49,7 +49,7 @@ static const unsigned int ad7616_oversampling_avail[8] = {
1, 2, 4, 8, 16, 32, 64, 128,
};
-static int ad7606_reset(struct ad7606_state *st)
+int ad7606_reset(struct ad7606_state *st)
{
if (st->gpio_reset) {
gpiod_set_value(st->gpio_reset, 1);
@@ -60,6 +60,7 @@ static int ad7606_reset(struct ad7606_state *st)
return -ENODEV;
}
+EXPORT_SYMBOL_NS_GPL(ad7606_reset, IIO_AD7606);
static int ad7606_reg_access(struct iio_dev *indio_dev,
unsigned int reg,
@@ -86,31 +87,6 @@ static int ad7606_read_samples(struct ad7606_state *st)
{
unsigned int num = st->chip_info->num_channels - 1;
u16 *data = st->data;
- int ret;
-
- /*
- * The frstdata signal is set to high while and after reading the sample
- * of the first channel and low for all other channels. This can be used
- * to check that the incoming data is correctly aligned. During normal
- * operation the data should never become unaligned, but some glitch or
- * electrostatic discharge might cause an extra read or clock cycle.
- * Monitoring the frstdata signal allows to recover from such failure
- * situations.
- */
-
- if (st->gpio_frstdata) {
- ret = st->bops->read_block(st->dev, 1, data);
- if (ret)
- return ret;
-
- if (!gpiod_get_value(st->gpio_frstdata)) {
- ad7606_reset(st);
- return -EIO;
- }
-
- data++;
- num--;
- }
return st->bops->read_block(st->dev, num, data);
}
diff --git a/drivers/iio/adc/ad7606.h b/drivers/iio/adc/ad7606.h
index 0c6a88cc4695..6649e84d25de 100644
--- a/drivers/iio/adc/ad7606.h
+++ b/drivers/iio/adc/ad7606.h
@@ -151,6 +151,8 @@ int ad7606_probe(struct device *dev, int irq, void __iomem *base_address,
const char *name, unsigned int id,
const struct ad7606_bus_ops *bops);
+int ad7606_reset(struct ad7606_state *st);
+
enum ad7606_supported_device_ids {
ID_AD7605_4,
ID_AD7606_8,
diff --git a/drivers/iio/adc/ad7606_par.c b/drivers/iio/adc/ad7606_par.c
index b5975bbfcbe0..02d8c309304e 100644
--- a/drivers/iio/adc/ad7606_par.c
+++ b/drivers/iio/adc/ad7606_par.c
@@ -7,6 +7,7 @@
#include <linux/mod_devicetable.h>
#include <linux/module.h>
+#include <linux/gpio/consumer.h>
#include <linux/platform_device.h>
#include <linux/types.h>
#include <linux/err.h>
@@ -21,8 +22,29 @@ static int ad7606_par16_read_block(struct device *dev,
struct iio_dev *indio_dev = dev_get_drvdata(dev);
struct ad7606_state *st = iio_priv(indio_dev);
- insw((unsigned long)st->base_address, buf, count);
+ /*
+ * On the parallel interface, the frstdata signal is set to high while
+ * and after reading the sample of the first channel and low for all
+ * other channels. This can be used to check that the incoming data is
+ * correctly aligned. During normal operation the data should never
+ * become unaligned, but some glitch or electrostatic discharge might
+ * cause an extra read or clock cycle. Monitoring the frstdata signal
+ * allows to recover from such failure situations.
+ */
+ int num = count;
+ u16 *_buf = buf;
+
+ if (st->gpio_frstdata) {
+ insw((unsigned long)st->base_address, _buf, 1);
+ if (!gpiod_get_value(st->gpio_frstdata)) {
+ ad7606_reset(st);
+ return -EIO;
+ }
+ _buf++;
+ num--;
+ }
+ insw((unsigned long)st->base_address, _buf, num);
return 0;
}
@@ -35,8 +57,28 @@ static int ad7606_par8_read_block(struct device *dev,
{
struct iio_dev *indio_dev = dev_get_drvdata(dev);
struct ad7606_state *st = iio_priv(indio_dev);
-
- insb((unsigned long)st->base_address, buf, count * 2);
+ /*
+ * On the parallel interface, the frstdata signal is set to high while
+ * and after reading the sample of the first channel and low for all
+ * other channels. This can be used to check that the incoming data is
+ * correctly aligned. During normal operation the data should never
+ * become unaligned, but some glitch or electrostatic discharge might
+ * cause an extra read or clock cycle. Monitoring the frstdata signal
+ * allows to recover from such failure situations.
+ */
+ int num = count;
+ u16 *_buf = buf;
+
+ if (st->gpio_frstdata) {
+ insb((unsigned long)st->base_address, _buf, 2);
+ if (!gpiod_get_value(st->gpio_frstdata)) {
+ ad7606_reset(st);
+ return -EIO;
+ }
+ _buf++;
+ num--;
+ }
+ insb((unsigned long)st->base_address, _buf, num * 2);
return 0;
}
--
2.46.0
This fixes two problems in the handling of negative times:
• rem is signed, but the rem * c->sb.nsec_per_time_unit operation
produced a bogus unsigned result, because s32 * u32 = u32.
• The timespec was not normalized (it could contain more than a
billion nanoseconds).
For example, { .tv_sec = -14245441, .tv_nsec = 750000000 }, after
being round tripped through timespec_to_bch2_time and then
bch2_time_to_timespec would come back as
{ .tv_sec = -14245440, .tv_nsec = 4044967296 } (more than 4 billion
nanoseconds).
Cc: stable(a)vger.kernel.org
Fixes: 595c1e9bab7f ("bcachefs: Fix time handling")
Closes: https://github.com/koverstreet/bcachefs/issues/743
Co-developed-by: Erin Shepherd <erin.shepherd(a)e43.eu>
Signed-off-by: Erin Shepherd <erin.shepherd(a)e43.eu>
Co-developed-by: Ryan Lahfa <ryan(a)lahfa.xyz>
Signed-off-by: Ryan Lahfa <ryan(a)lahfa.xyz>
Signed-off-by: Alyssa Ross <hi(a)alyssa.is>
---
I've submitted an RFC to fstests to add a regression test for this:
https://lore.kernel.org/fstests/20240907154527.604864-2-hi@alyssa.is/
fs/bcachefs/bcachefs.h | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 0c7086e00d18..81c4d935cca8 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1195,12 +1195,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
+ s64 sec;
s32 rem;
time += c->sb.time_base_lo;
- t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
- t.tv_nsec = rem * c->sb.nsec_per_time_unit;
+ sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
+
+ set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
+
return t;
}
base-commit: 53f6619554fb1edf8d7599b560d44dbea085c730
--
2.45.2
Hi Greg,
Thank you again for your support when we send patches for stable
versions for MPTCP!
Recently, I sent many patches for the stable versions, and I just wanted
to check if what I did was OK for you?
I tried to reply to all the 'FAILED: patch' emails you sent, either with
patches, or with reasons explaining why it is fine not to backport them.
Are you OK with that?
Or do you prefer only receiving the patches, and not the emails with the
reasons not to backport some of them?
About the patches, do you prefer to receive one big series per version
or individual patches sent in reply to the different 'FAILED: patch'
emails like I did?
Do not hesitate if there are things we can improve!
Cheers,
Matt
--
Sponsored by the NGI0 Core fund.
The submit queue polling threads are userland threads that just never
exit to the userland. In case the creating task is part of a cgroup
with the cpuset controller enabled, the poller should also stay within
that cpuset. This also holds, as the poller belongs to the same cgroup
as the task that started it.
With the current implementation, a process can "break out" of the
defined cpuset by creating sq pollers consuming CPU time on other CPUs,
which is especially problematic for realtime applications.
Part of this problem was fixed in a5fc1441 by dropping the
PF_NO_SETAFFINITY flag, but this only becomes effective after the first
modification of the cpuset (i.e. the pollers cpuset is correct after the
first update of the enclosing cgroups cpuset).
By inheriting the cpuset of the creating tasks, we ensure that the
poller is created with a cpumask that is a subset of the cgroups mask.
Inheriting the creators cpumask is reasonable, as other userland tasks
also inherit the mask.
Fixes: 37d1e2e3642e ("io_uring: move SQPOLL thread io-wq forked worker")
Cc: stable(a)vger.kernel.org # 6.1+
Signed-off-by: Felix Moessbauer <felix.moessbauer(a)siemens.com>
---
Changes since v2:
- in v2 I accidentally sent the backport of this patch for v6.1. Will
resend that once this one is accepted. Anyways, now we know that this
also works on a v6.1 kernel.
Changes since v1:
- do not set poller thread cpuset in non-pinning case, as the default is already
correct (the mask is inherited from the parent).
- Remove incorrect term "kernel thread" from the commit message
I tested this without pinning, explicit pinning of the parent task and
non-all cgroup cpusets (and all combinations).
Best regards,
Felix Moessbauer
Siemens AG
io_uring/sqpoll.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 3b50dc9586d1..713be7c29388 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -289,7 +289,6 @@ static int io_sq_thread(void *data)
if (sqd->sq_cpu != -1) {
set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
} else {
- set_cpus_allowed_ptr(current, cpu_online_mask);
sqd->sq_cpu = raw_smp_processor_id();
}
--
2.39.2
The submit queue polling threads are userland threads that just never
exit to the userland. In case the creating task is part of a cgroup
with the cpuset controller enabled, the poller should also stay within
that cpuset. This also holds, as the poller belongs to the same cgroup
as the task that started it.
With the current implementation, a process can "break out" of the
defined cpuset by creating sq pollers consuming CPU time on other CPUs,
which is especially problematic for realtime applications.
Part of this problem was fixed in a5fc1441 by dropping the
PF_NO_SETAFFINITY flag, but this only becomes effective after the first
modification of the cpuset (i.e. the pollers cpuset is correct after the
first update of the enclosing cgroups cpuset).
By inheriting the cpuset of the creating tasks, we ensure that the
poller is created with a cpumask that is a subset of the cgroups mask.
Inheriting the creators cpumask is reasonable, as other userland tasks
also inherit the mask.
Fixes: 37d1e2e3642e ("io_uring: move SQPOLL thread io-wq forked worker")
Cc: stable(a)vger.kernel.org # 6.1+
Signed-off-by: Felix Moessbauer <felix.moessbauer(a)siemens.com>
---
Changes since v1:
- do not set poller thread cpuset in non-pinning case, as the default is already
correct (the mask is inherited from the parent).
- Remove incorrect term "kernel thread" from the commit message
I tested this without pinning, explicit pinning of the parent task and
non-all cgroup cpusets (and all combinations).
Best regards,
Felix Moessbauer
Siemens AG
io_uring/sqpoll.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 6ea21b503113..5a002fa1d953 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -231,8 +231,6 @@ static int io_sq_thread(void *data)
if (sqd->sq_cpu != -1)
set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
- else
- set_cpus_allowed_ptr(current, cpu_online_mask);
/*
* Force audit context to get setup, in case we do prep side async
--
2.39.2
It may be possible for the sum of the values derived from
i915_ggtt_offset() and __get_parent_scratch_offset()/
i915_ggtt_offset() to go over the u32 limit before being assigned
to wq offsets of u64 type.
Mitigate these issues by expanding one of the right operands
to u64 to avoid any overflow issues just in case.
Found by Linux Verification Center (linuxtesting.org) with static
analysis tool SVACE.
Fixes: 2584b3549f4c ("drm/i915/guc: Update to GuC version 70.1.1")
Cc: stable(a)vger.kernel.org
Signed-off-by: Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
---
drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 9400d0eb682b..908ebfa22933 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2842,9 +2842,9 @@ static void prepare_context_registration_info_v70(struct intel_context *ce,
ce->parallel.guc.wqi_tail = 0;
ce->parallel.guc.wqi_head = 0;
- wq_desc_offset = i915_ggtt_offset(ce->state) +
+ wq_desc_offset = (u64)i915_ggtt_offset(ce->state) +
__get_parent_scratch_offset(ce);
- wq_base_offset = i915_ggtt_offset(ce->state) +
+ wq_base_offset = (u64)i915_ggtt_offset(ce->state) +
__get_wq_offset(ce);
info->wq_desc_lo = lower_32_bits(wq_desc_offset);
info->wq_desc_hi = upper_32_bits(wq_desc_offset);
Commit 08d08e2e9f0a ("tpm: ibmvtpm: Call tpm2_sessions_init() to
initialize session support") adds call to tpm2_sessions_init() in ibmvtpm,
which could be built as a module. However, tpm2_sessions_init() wasn't
exported, causing libmvtpm to fail to build as a module:
ERROR: modpost: "tpm2_sessions_init" [drivers/char/tpm/tpm_ibmvtpm.ko] undefined!
Export tpm2_sessions_init() to resolve the issue.
Cc: stable(a)vger.kernel.org # v6.10+
Reported-by: kernel test robot <lkp(a)intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202408051735.ZJkAPQ3b-lkp@intel.com/
Fixes: 08d08e2e9f0a ("tpm: ibmvtpm: Call tpm2_sessions_init() to initialize session support")
Signed-off-by: Kexy Biscuit <kexybiscuit(a)aosc.io>
Signed-off-by: Mingcong Bai <jeffbai(a)aosc.io>
---
V1 -> V2: Added Fixes tag and fixed email format
RESEND: The previous email was sent directly to stable-rc review
drivers/char/tpm/tpm2-sessions.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index d3521aadd43e..44f60730cff4 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -1362,4 +1362,5 @@ int tpm2_sessions_init(struct tpm_chip *chip)
return rc;
}
+EXPORT_SYMBOL(tpm2_sessions_init);
#endif /* CONFIG_TCG_TPM2_HMAC */
--
2.46.0
Hi Greg, Sasha,
A number of commits were identified[1] by syzbot as non-backported
fixes for the fuzzer-detected findings in various Linux LTS trees.
[1] https://syzkaller.appspot.com/upstream/backports
Please consider backporting the following commits to LTS v6.1:
9a8ec9e8ebb5a7c0cfbce2d6b4a6b67b2b78e8f3 "Bluetooth: SCO: Fix possible circular locking dependency on sco_connect_cfm"
(fixes 9a8ec9e) 3dcaa192ac2159193bc6ab57bc5369dcb84edd8e "Bluetooth: SCO: fix sco_conn related locking and validity issues"
3f5424790d4377839093b68c12b130077a4e4510 "ext4: fix inode tree inconsistency caused by ENOMEM"
7b0151caf73a656b75b550e361648430233455a0 "KVM: x86: Remove WARN sanity check on hypervisor timer vs. UNINITIALIZED vCPU"
c2efd13a2ed4f29bf9ef14ac2fbb7474084655f8 "udf: Limit file size to 4TB"
4b827b3f305d1fcf837265f1e12acc22ee84327c "xfs: remove WARN when dquot cache insertion fails"
These were verified to apply cleanly on top of v6.1.107 and to
build/boot.
The following commits to LTS v5.15:
8216776ccff6fcd40e3fdaa109aa4150ebe760b3 "ext4: reject casefold inode flag without casefold feature"
c2efd13a2ed4f29bf9ef14ac2fbb7474084655f8 "udf: Limit file size to 4TB"
These were verified to apply cleanly on top of v5.15.165 and to
build/boot.
The following commits to LTS v5.10:
04e568a3b31cfbd545c04c8bfc35c20e5ccfce0f "ext4: handle redirtying in ext4_bio_write_page()"
2a1fc7dc36260fbe74b6ca29dc6d9088194a2115 "KVM: x86: Suppress MMIO that is triggered during task switch emulation"
2454ad83b90afbc6ed2c22ec1310b624c40bf0d3 "fs: Restrict lock_two_nondirectories() to non-directory inodes"
(fixes 2454ad) 33ab231f83cc12d0157711bbf84e180c3be7d7bc "fs: don't assume arguments are non-NULL"
These were verified to apply cleanly on top of v5.10.224 and to
build/boot.
There are also a lot of syzbot-detected fix commits that did not apply
cleanly, but the conflicts seem to be quite straightforward to resolve
manually. Could you please share what the current process is with
respect to such fix patches? For example, are you sending emails
asking developers to adjust the non-applied patch (if they want), or
is it the other way around -- you expect the authors to be proactive
and send the adjusted patch versions themselves?
Some sample commits, which failed to apply to v6.1.107:
ff91059932401894e6c86341915615c5eb0eca48 "bpf, sockmap: Prevent lock inversion deadlock in map delete elem"
f8f210dc84709804c9f952297f2bfafa6ea6b4bd "btrfs: calculate the right space for delayed refs when updating global reserve"
--
Aleksandr
The submit queue polling threads are "kernel" threads that are started
from the userland. In case the userland task is part of a cgroup with
the cpuset controller enabled, the poller should also stay within that
cpuset. This also holds, as the poller belongs to the same cgroup as
the task that started it.
With the current implementation, a process can "break out" of the
defined cpuset by creating sq pollers consuming CPU time on other CPUs,
which is especially problematic for realtime applications.
Part of this problem was fixed in a5fc1441 by dropping the
PF_NO_SETAFFINITY flag, but this only becomes effective after the first
modification of the cpuset (i.e. the pollers cpuset is correct after the
first update of the enclosing cgroups cpuset).
By inheriting the cpuset of the creating tasks, we ensure that the
poller is created with a cpumask that is a subset of the cgroups mask.
Inheriting the creators cpumask is reasonable, as other userland tasks
also inherit the mask.
Fixes: 37d1e2e3642e ("io_uring: move SQPOLL thread io-wq forked worker")
Cc: stable(a)vger.kernel.org # 6.1+
Signed-off-by: Felix Moessbauer <felix.moessbauer(a)siemens.com>
---
io_uring/sqpoll.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 3b50dc9586d1..4681b2c41a96 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -289,7 +289,7 @@ static int io_sq_thread(void *data)
if (sqd->sq_cpu != -1) {
set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
} else {
- set_cpus_allowed_ptr(current, cpu_online_mask);
+ set_cpus_allowed_ptr(current, sqd->thread->cpus_ptr);
sqd->sq_cpu = raw_smp_processor_id();
}
--
2.39.2
Zero and negative number is not a valid IRQ for in-kernel code and the
irq_of_parse_and_map() function returns zero on error. So this check for
valid IRQs should only accept values > 0.
Cc: stable(a)vger.kernel.org
Fixes: f7578496a671 ("of/irq: Use irq_of_parse_and_map()")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
drivers/i2c/busses/i2c-cpm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c
index 4794ec066eb0..41e3c95c0ef7 100644
--- a/drivers/i2c/busses/i2c-cpm.c
+++ b/drivers/i2c/busses/i2c-cpm.c
@@ -435,7 +435,7 @@ static int cpm_i2c_setup(struct cpm_i2c *cpm)
init_waitqueue_head(&cpm->i2c_wait);
cpm->irq = irq_of_parse_and_map(ofdev->dev.of_node, 0);
- if (!cpm->irq)
+ if (cpm->irq <= 0)
return -EINVAL;
/* Install interrupt handler. */
--
2.25.1
The qcom_geni_serial_poll_bit() can be used to wait for events like
command completion and is supposed to wait for the time it takes to
clear a full fifo before timing out.
As noted by Doug, the current implementation does not account for start,
stop and parity bits when determining the timeout. The helper also does
not currently account for the shift register and the two-word
intermediate transfer register.
Instead of determining the fifo timeout on every call, store the timeout
when updating it in set_termios() and wait for up to 19/16 the time it
takes to clear the 16 word fifo to account for the shift and
intermediate registers. Note that serial core has already added a 20 ms
margin to the fifo timeout.
Also note that the current uart_fifo_timeout() interface does
unnecessary calculations on every call and also did not exists in
earlier kernels so only store its result once. This also facilitates
backports as earlier kernels can derive the timeout from uport->timeout,
which has since been removed.
Fixes: c4f528795d1a ("tty: serial: msm_geni_serial: Add serial driver support for GENI based QUP")
Cc: stable(a)vger.kernel.org # 4.17
Reported-by: Douglas Anderson <dianders(a)chromium.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/tty/serial/qcom_geni_serial.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index 69a632fefc41..e1926124339d 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -124,7 +124,7 @@ struct qcom_geni_serial_port {
dma_addr_t tx_dma_addr;
dma_addr_t rx_dma_addr;
bool setup;
- unsigned int baud;
+ unsigned long fifo_timeout_us;
unsigned long clk_rate;
void *rx_buf;
u32 loopback;
@@ -270,22 +270,21 @@ static bool qcom_geni_serial_poll_bit(struct uart_port *uport,
{
u32 reg;
struct qcom_geni_serial_port *port;
- unsigned int baud;
- unsigned int fifo_bits;
unsigned long timeout_us = 20000;
struct qcom_geni_private_data *private_data = uport->private_data;
if (private_data->drv) {
port = to_dev_port(uport);
- baud = port->baud;
- if (!baud)
- baud = 115200;
- fifo_bits = port->tx_fifo_depth * port->tx_fifo_width;
+
/*
- * Total polling iterations based on FIFO worth of bytes to be
- * sent at current baud. Add a little fluff to the wait.
+ * Wait up to 19/16 the time it would take to clear a full
+ * FIFO, which accounts for the three words in the shift and
+ * intermediate registers.
+ *
+ * Note that fifo_timeout_us already has a 20 ms margin.
*/
- timeout_us = ((fifo_bits * USEC_PER_SEC) / baud) + 500;
+ if (port->fifo_timeout_us)
+ timeout_us = 19 * port->fifo_timeout_us / 16;
}
/*
@@ -1248,7 +1247,6 @@ static void qcom_geni_serial_set_termios(struct uart_port *uport,
qcom_geni_serial_stop_rx(uport);
/* baud rate */
baud = uart_get_baud_rate(uport, termios, old, 300, 4000000);
- port->baud = baud;
sampling_rate = UART_OVERSAMPLING;
/* Sampling rate is halved for IP versions >= 2.5 */
@@ -1326,8 +1324,10 @@ static void qcom_geni_serial_set_termios(struct uart_port *uport,
else
tx_trans_cfg |= UART_CTS_MASK;
- if (baud)
+ if (baud) {
uart_update_timeout(uport, termios->c_cflag, baud);
+ port->fifo_timeout_us = jiffies_to_usecs(uart_fifo_timeout(uport));
+ }
if (!uart_console(uport))
writel(port->loopback,
--
2.44.2
The polled UART operations are used by the kernel debugger (KDB, KGDB),
which can interrupt the kernel at any point in time. The current
Qualcomm GENI implementation does not really work when there is on-going
serial output as it inadvertently "hijacks" the current tx command,
which can result in both the initial debugger output being corrupted as
well as the corruption of any on-going serial output (up to 4k
characters) when execution resumes:
0190: abcdefghijklmnopqrstuvwxyz0123456789 0190: abcdefghijklmnopqrstuvwxyz0123456789
0191: abcdefghijklmnop[ 50.825552] sysrq: DEBUG
qrstuvwxyz0123456789 0191: abcdefghijklmnopqrstuvwxyz0123456789
Entering kdb (current=0xffff53510b4cd280, pid 640) on processor 2 due to Keyboard Entry
[2]kdb> go
omlji3h3h2g2g1f1f0e0ezdzdycycxbxbwawav :t72r2rp
o9n976k5j5j4i4i3h3h2g2g1f1f0e0ezdzdycycxbxbwawavu:t7t8s8s8r2r2q0q0p
o9n9n8ml6k6k5j5j4i4i3h3h2g2g1f1f0e0ezdzdycycxbxbwawav v u:u:t9t0s4s4rq0p
o9n9n8m8m7l7l6k6k5j5j40q0p p o
o9n9n8m8m7l7l6k6k5j5j4i4i3h3h2g2g1f1f0e0ezdzdycycxbxbwawav :t8t9s4s4r4r4q0q0p
Fix this by making sure that the polled output implementation waits for
the tx fifo to drain before cancelling any on-going longer transfers. As
the polled code cannot take any locks, leave the state variables as they
are and instead make sure that the interrupt handler always starts a new
tx command when there is data in the write buffer.
Since the debugger can interrupt the interrupt handler when it is
writing data to the tx fifo, it is currently not possible to fully
prevent losing up to 64 bytes of tty output on resume.
Fixes: c4f528795d1a ("tty: serial: msm_geni_serial: Add serial driver support for GENI based QUP")
Cc: stable(a)vger.kernel.org # 4.17
Reviewed-by: Douglas Anderson <dianders(a)chromium.org>
Tested-by: Nícolas F. R. A. Prado <nfraprado(a)collabora.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/tty/serial/qcom_geni_serial.c | 27 ++++++++++++++++++---------
1 file changed, 18 insertions(+), 9 deletions(-)
diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index f23fd0ac3cfd..6f0db310cf69 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -145,6 +145,7 @@ static const struct uart_ops qcom_geni_uart_pops;
static struct uart_driver qcom_geni_console_driver;
static struct uart_driver qcom_geni_uart_driver;
+static void __qcom_geni_serial_cancel_tx_cmd(struct uart_port *uport);
static void qcom_geni_serial_cancel_tx_cmd(struct uart_port *uport);
static inline struct qcom_geni_serial_port *to_dev_port(struct uart_port *uport)
@@ -384,13 +385,14 @@ static int qcom_geni_serial_get_char(struct uart_port *uport)
static void qcom_geni_serial_poll_put_char(struct uart_port *uport,
unsigned char c)
{
- writel(DEF_TX_WM, uport->membase + SE_GENI_TX_WATERMARK_REG);
+ if (qcom_geni_serial_main_active(uport)) {
+ qcom_geni_serial_poll_tx_done(uport);
+ __qcom_geni_serial_cancel_tx_cmd(uport);
+ }
+
writel(M_CMD_DONE_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
qcom_geni_serial_setup_tx(uport, 1);
- WARN_ON(!qcom_geni_serial_poll_bit(uport, SE_GENI_M_IRQ_STATUS,
- M_TX_FIFO_WATERMARK_EN, true));
writel(c, uport->membase + SE_GENI_TX_FIFOn);
- writel(M_TX_FIFO_WATERMARK_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
qcom_geni_serial_poll_tx_done(uport);
}
#endif
@@ -677,13 +679,10 @@ static void qcom_geni_serial_stop_tx_fifo(struct uart_port *uport)
writel(irq_en, uport->membase + SE_GENI_M_IRQ_EN);
}
-static void qcom_geni_serial_cancel_tx_cmd(struct uart_port *uport)
+static void __qcom_geni_serial_cancel_tx_cmd(struct uart_port *uport)
{
struct qcom_geni_serial_port *port = to_dev_port(uport);
- if (!qcom_geni_serial_main_active(uport))
- return;
-
geni_se_cancel_m_cmd(&port->se);
if (!qcom_geni_serial_poll_bit(uport, SE_GENI_M_IRQ_STATUS,
M_CMD_CANCEL_EN, true)) {
@@ -693,6 +692,16 @@ static void qcom_geni_serial_cancel_tx_cmd(struct uart_port *uport)
writel(M_CMD_ABORT_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
}
writel(M_CMD_CANCEL_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
+}
+
+static void qcom_geni_serial_cancel_tx_cmd(struct uart_port *uport)
+{
+ struct qcom_geni_serial_port *port = to_dev_port(uport);
+
+ if (!qcom_geni_serial_main_active(uport))
+ return;
+
+ __qcom_geni_serial_cancel_tx_cmd(uport);
port->tx_remaining = 0;
port->tx_queued = 0;
@@ -919,7 +928,7 @@ static void qcom_geni_serial_handle_tx_fifo(struct uart_port *uport,
if (!chunk)
goto out_write_wakeup;
- if (!port->tx_remaining) {
+ if (!active) {
qcom_geni_serial_setup_tx(uport, pending);
port->tx_remaining = pending;
port->tx_queued = 0;
--
2.44.2
The Qualcomm serial console implementation is broken and can lose
characters when the serial port is also used for tty output.
Specifically, the console code only waits for the current tx command to
complete when all data has already been written to the fifo. When there
are on-going longer transfers this often means that console output is
lost when the console code inadvertently "hijacks" the current tx
command instead of starting a new one.
This can, for example, be observed during boot when console output that
should have been interspersed with init output is truncated:
[ 9.462317] qcom-snps-eusb2-hsphy fde000.phy: Registered Qcom-eUSB2 phy
[ OK ] Found device KBG50ZNS256G KIOXIA Wi[ 9.471743ndows.
[ 9.539915] xhci-hcd xhci-hcd.0.auto: xHCI Host Controller
Add a new state variable to track how much data has been written to the
fifo and use it to determine when the fifo and shift register are both
empty. This is needed since there is currently no other known way to
determine when the shift register is empty.
This in turn allows the console code to interrupt long transfers without
losing data.
Note that the oops-in-progress case is similarly broken as it does not
cancel any active command and also waits for the wrong status flag when
attempting to drain the fifo (TX_FIFO_NOT_EMPTY_EN is only set when
cancelling a command leaves data in the fifo).
Fixes: c4f528795d1a ("tty: serial: msm_geni_serial: Add serial driver support for GENI based QUP")
Fixes: a1fee899e5be ("tty: serial: qcom_geni_serial: Fix softlock")
Fixes: 9e957a155005 ("serial: qcom-geni: Don't cancel/abort if we can't get the port lock")
Cc: stable(a)vger.kernel.org # 4.17
Reviewed-by: Douglas Anderson <dianders(a)chromium.org>
Tested-by: Nícolas F. R. A. Prado <nfraprado(a)collabora.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/tty/serial/qcom_geni_serial.c | 45 +++++++++++++--------------
1 file changed, 22 insertions(+), 23 deletions(-)
diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index 7bbd70c30620..f8f6e9466b40 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -131,6 +131,7 @@ struct qcom_geni_serial_port {
bool brk;
unsigned int tx_remaining;
+ unsigned int tx_queued;
int wakeup_irq;
bool rx_tx_swap;
bool cts_rts_swap;
@@ -144,6 +145,8 @@ static const struct uart_ops qcom_geni_uart_pops;
static struct uart_driver qcom_geni_console_driver;
static struct uart_driver qcom_geni_uart_driver;
+static void qcom_geni_serial_cancel_tx_cmd(struct uart_port *uport);
+
static inline struct qcom_geni_serial_port *to_dev_port(struct uart_port *uport)
{
return container_of(uport, struct qcom_geni_serial_port, uport);
@@ -393,6 +396,14 @@ static void qcom_geni_serial_poll_put_char(struct uart_port *uport,
#endif
#ifdef CONFIG_SERIAL_QCOM_GENI_CONSOLE
+static void qcom_geni_serial_drain_fifo(struct uart_port *uport)
+{
+ struct qcom_geni_serial_port *port = to_dev_port(uport);
+
+ qcom_geni_serial_poll_bitfield(uport, SE_GENI_M_GP_LENGTH, GP_LENGTH,
+ port->tx_queued);
+}
+
static void qcom_geni_serial_wr_char(struct uart_port *uport, unsigned char ch)
{
struct qcom_geni_private_data *private_data = uport->private_data;
@@ -468,7 +479,6 @@ static void qcom_geni_serial_console_write(struct console *co, const char *s,
struct qcom_geni_serial_port *port;
bool locked = true;
unsigned long flags;
- u32 geni_status;
WARN_ON(co->index < 0 || co->index >= GENI_UART_CONS_PORTS);
@@ -482,34 +492,20 @@ static void qcom_geni_serial_console_write(struct console *co, const char *s,
else
uart_port_lock_irqsave(uport, &flags);
- geni_status = readl(uport->membase + SE_GENI_STATUS);
+ if (qcom_geni_serial_main_active(uport)) {
+ /* Wait for completion or drain FIFO */
+ if (!locked || port->tx_remaining == 0)
+ qcom_geni_serial_poll_tx_done(uport);
+ else
+ qcom_geni_serial_drain_fifo(uport);
- if (!locked) {
- /*
- * We can only get here if an oops is in progress then we were
- * unable to get the lock. This means we can't safely access
- * our state variables like tx_remaining. About the best we
- * can do is wait for the FIFO to be empty before we start our
- * transfer, so we'll do that.
- */
- qcom_geni_serial_poll_bit(uport, SE_GENI_M_IRQ_STATUS,
- M_TX_FIFO_NOT_EMPTY_EN, false);
- } else if ((geni_status & M_GENI_CMD_ACTIVE) && !port->tx_remaining) {
- /*
- * It seems we can't interrupt existing transfers if all data
- * has been sent, in which case we need to look for done first.
- */
- qcom_geni_serial_poll_tx_done(uport);
+ qcom_geni_serial_cancel_tx_cmd(uport);
}
__qcom_geni_serial_console_write(uport, s, count);
-
- if (locked) {
- if (port->tx_remaining)
- qcom_geni_serial_setup_tx(uport, port->tx_remaining);
+ if (locked)
uart_port_unlock_irqrestore(uport, flags);
- }
}
static void handle_rx_console(struct uart_port *uport, u32 bytes, bool drop)
@@ -690,6 +686,7 @@ static void qcom_geni_serial_cancel_tx_cmd(struct uart_port *uport)
writel(M_CMD_CANCEL_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
port->tx_remaining = 0;
+ port->tx_queued = 0;
}
static void qcom_geni_serial_handle_rx_fifo(struct uart_port *uport, bool drop)
@@ -916,6 +913,7 @@ static void qcom_geni_serial_handle_tx_fifo(struct uart_port *uport,
if (!port->tx_remaining) {
qcom_geni_serial_setup_tx(uport, pending);
port->tx_remaining = pending;
+ port->tx_queued = 0;
irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
if (!(irq_en & M_TX_FIFO_WATERMARK_EN))
@@ -924,6 +922,7 @@ static void qcom_geni_serial_handle_tx_fifo(struct uart_port *uport,
}
qcom_geni_serial_send_chunk_fifo(uport, chunk);
+ port->tx_queued += chunk;
/*
* The tx fifo watermark is level triggered and latched. Though we had
--
2.44.2
Hi, Greg
Could you please help to cherry-pick the following commit to the 5.4.y branch?
697fa27dc5fb ("reset: hi6220: Add support for AO reset controller")
It's been there since the 5.10 kernel, and this along with the clk patch
are needed for Hikey devices to work with the 5.4.y kernels, otherwise
it will reboot again and again during the boot.
--
Best Regards,
Yongqin Liu
---------------------------------------------------------------
#mailing list
linaro-android(a)lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-android
If an interrupt occurs in queued_spin_lock_slowpath() after we increment
qnodesp->count and before node->lock is initialized, another CPU might
see stale lock values in get_tail_qnode(). If the stale lock value happens
to match the lock on that CPU, then we write to the "next" pointer of
the wrong qnode. This causes a deadlock as the former CPU, once it becomes
the head of the MCS queue, will spin indefinitely until it's "next" pointer
is set by its successor in the queue.
Running stress-ng on a 16 core (16EC/16VP) shared LPAR, results in
occasional lockups similar to the following:
$ stress-ng --all 128 --vm-bytes 80% --aggressive \
--maximize --oomable --verify --syslog \
--metrics --times --timeout 5m
watchdog: CPU 15 Hard LOCKUP
......
NIP [c0000000000b78f4] queued_spin_lock_slowpath+0x1184/0x1490
LR [c000000001037c5c] _raw_spin_lock+0x6c/0x90
Call Trace:
0xc000002cfffa3bf0 (unreliable)
_raw_spin_lock+0x6c/0x90
raw_spin_rq_lock_nested.part.135+0x4c/0xd0
sched_ttwu_pending+0x60/0x1f0
__flush_smp_call_function_queue+0x1dc/0x670
smp_ipi_demux_relaxed+0xa4/0x100
xive_muxed_ipi_action+0x20/0x40
__handle_irq_event_percpu+0x80/0x240
handle_irq_event_percpu+0x2c/0x80
handle_percpu_irq+0x84/0xd0
generic_handle_irq+0x54/0x80
__do_irq+0xac/0x210
__do_IRQ+0x74/0xd0
0x0
do_IRQ+0x8c/0x170
hardware_interrupt_common_virt+0x29c/0x2a0
--- interrupt: 500 at queued_spin_lock_slowpath+0x4b8/0x1490
......
NIP [c0000000000b6c28] queued_spin_lock_slowpath+0x4b8/0x1490
LR [c000000001037c5c] _raw_spin_lock+0x6c/0x90
--- interrupt: 500
0xc0000029c1a41d00 (unreliable)
_raw_spin_lock+0x6c/0x90
futex_wake+0x100/0x260
do_futex+0x21c/0x2a0
sys_futex+0x98/0x270
system_call_exception+0x14c/0x2f0
system_call_vectored_common+0x15c/0x2ec
The following code flow illustrates how the deadlock occurs.
For the sake of brevity, assume that both locks (A and B) are
contended and we call the queued_spin_lock_slowpath() function.
CPU0 CPU1
---- ----
spin_lock_irqsave(A) |
spin_unlock_irqrestore(A) |
spin_lock(B) |
| |
▼ |
id = qnodesp->count++; |
(Note that nodes[0].lock == A) |
| |
▼ |
Interrupt |
(happens before "nodes[0].lock = B") |
| |
▼ |
spin_lock_irqsave(A) |
| |
▼ |
id = qnodesp->count++ |
nodes[1].lock = A |
| |
▼ |
Tail of MCS queue |
| spin_lock_irqsave(A)
▼ |
Head of MCS queue ▼
| CPU0 is previous tail
▼ |
Spin indefinitely ▼
(until "nodes[1].next != NULL") prev = get_tail_qnode(A, CPU0)
|
▼
prev == &qnodes[CPU0].nodes[0]
(as qnodes[CPU0].nodes[0].lock == A)
|
▼
WRITE_ONCE(prev->next, node)
|
▼
Spin indefinitely
(until nodes[0].locked == 1)
Thanks to Saket Kumar Bhaskar for help with recreating the issue
Fixes: 84990b169557 ("powerpc/qspinlock: add mcs queueing for contended waiters")
Cc: stable(a)vger.kernel.org # v6.2+
Reported-by: Geetika Moolchandani <geetika(a)linux.ibm.com>
Reported-by: Vaishnavi Bhat <vaish123(a)in.ibm.com>
Reported-by: Jijo Varghese <vargjijo(a)in.ibm.com>
Signed-off-by: Nysal Jan K.A. <nysal(a)linux.ibm.com>
Reviewed-by: Nicholas Piggin <npiggin(a)gmail.com>
---
arch/powerpc/lib/qspinlock.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 5de4dd549f6e..bcc7e4dff8c3 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -697,7 +697,15 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
}
release:
- qnodesp->count--; /* release the node */
+ /*
+ * Clear the lock before releasing the node, as another CPU might see stale
+ * values if an interrupt occurs after we increment qnodesp->count
+ * but before node->lock is initialized. The barrier ensures that
+ * there are no further stores to the node after it has been released.
+ */
+ node->lock = NULL;
+ barrier();
+ qnodesp->count--;
}
void queued_spin_lock_slowpath(struct qspinlock *lock)
--
2.46.0