If the state is not idle then resolve_prepare_src() should immediately
fail and no change to global state should happen. However, it
unconditionally overwrites the src_addr trying to build a temporary any
address.
For instance if the state is already RDMA_CM_LISTEN then this will corrupt
the src_addr and would cause the test in cma_cancel_operation():
if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev)
Which would manifest as this trace from syzkaller:
BUG: KASAN: use-after-free in __list_add_valid+0x93/0xa0 lib/list_debug.c:26
Read of size 8 at addr ffff8881546491e0 by task syz-executor.1/32204
CPU: 1 PID: 32204 Comm: syz-executor.1 Not tainted 5.12.0-rc8-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:79 [inline]
dump_stack+0x141/0x1d7 lib/dump_stack.c:120
print_address_description.constprop.0.cold+0x5b/0x2f8 mm/kasan/report.c:232
__kasan_report mm/kasan/report.c:399 [inline]
kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:416
__list_add_valid+0x93/0xa0 lib/list_debug.c:26
__list_add include/linux/list.h:67 [inline]
list_add_tail include/linux/list.h:100 [inline]
cma_listen_on_all drivers/infiniband/core/cma.c:2557 [inline]
rdma_listen+0x787/0xe00 drivers/infiniband/core/cma.c:3751
ucma_listen+0x16a/0x210 drivers/infiniband/core/ucma.c:1102
ucma_write+0x259/0x350 drivers/infiniband/core/ucma.c:1732
vfs_write+0x28e/0xa30 fs/read_write.c:603
ksys_write+0x1ee/0x250 fs/read_write.c:658
do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x44/0xae
This is indicating that an rdma_id_private was destroyed without doing
cma_cancel_listens().
Instead of trying to re-use the src_addr memory to indirectly create an
any address derived from the dst build one explicitly on the stack and
bind to that as any other normal flow would do. rdma_bind_addr() will copy
it over the src_addr once it knows the state is valid.
This is similar to commit bc0bdc5afaa7 ("RDMA/cma: Do not change
route.addr.src_addr.ss_family")
Cc: stable(a)vger.kernel.org
Fixes: 732d41c545bb ("RDMA/cma: Make the locking for automatic state transition more clear")
Reported-by: syzbot+c94a3675a626f6333d74(a)syzkaller.appspotmail.com
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
---
drivers/infiniband/core/cma.c | 38 +++++++++++++++++++++--------------
1 file changed, 23 insertions(+), 15 deletions(-)
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 288ff0735875f4..c803d63f4d2354 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -3368,22 +3368,30 @@ static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
const struct sockaddr *dst_addr)
{
- if (!src_addr || !src_addr->sa_family) {
- src_addr = (struct sockaddr *) &id->route.addr.src_addr;
- src_addr->sa_family = dst_addr->sa_family;
- if (IS_ENABLED(CONFIG_IPV6) &&
- dst_addr->sa_family == AF_INET6) {
- struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
- struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
- src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
- if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
- } else if (dst_addr->sa_family == AF_IB) {
- ((struct sockaddr_ib *) src_addr)->sib_pkey =
- ((struct sockaddr_ib *) dst_addr)->sib_pkey;
- }
+ struct sockaddr_storage zero_sock = {};
+
+ if (src_addr && src_addr->sa_family)
+ return rdma_bind_addr(id, src_addr);
+
+ /*
+ * When the src_addr is not specified, automatically supply an any addr
+ */
+ zero_sock.ss_family = dst_addr->sa_family;
+ if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *src_addr6 =
+ (struct sockaddr_in6 *)&zero_sock;
+ struct sockaddr_in6 *dst_addr6 =
+ (struct sockaddr_in6 *)dst_addr;
+
+ src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+ if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ id->route.addr.dev_addr.bound_dev_if =
+ dst_addr6->sin6_scope_id;
+ } else if (dst_addr->sa_family == AF_IB) {
+ ((struct sockaddr_ib *)&zero_sock)->sib_pkey =
+ ((struct sockaddr_ib *)dst_addr)->sib_pkey;
}
- return rdma_bind_addr(id, src_addr);
+ return rdma_bind_addr(id, (struct sockaddr *)&zero_sock);
}
/*
base-commit: 748663c8ccf6b2e5a800de19127c2cc1c4423fd2
--
2.35.1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fadead80fe4c033b5e514fcbadd20b55c4494112 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller(a)intel.com>
Date: Mon, 7 Feb 2022 10:23:29 -0800
Subject: [PATCH] ice: fix concurrent reset and removal of VFs
Commit c503e63200c6 ("ice: Stop processing VF messages during teardown")
introduced a driver state flag, ICE_VF_DEINIT_IN_PROGRESS, which is
intended to prevent some issues with concurrently handling messages from
VFs while tearing down the VFs.
This change was motivated by crashes caused while tearing down and
bringing up VFs in rapid succession.
It turns out that the fix actually introduces issues with the VF driver
caused because the PF no longer responds to any messages sent by the VF
during its .remove routine. This results in the VF potentially removing
its DMA memory before the PF has shut down the device queues.
Additionally, the fix doesn't actually resolve concurrency issues within
the ice driver. It is possible for a VF to initiate a reset just prior
to the ice driver removing VFs. This can result in the remove task
concurrently operating while the VF is being reset. This results in
similar memory corruption and panics purportedly fixed by that commit.
Fix this concurrency at its root by protecting both the reset and
removal flows using the existing VF cfg_lock. This ensures that we
cannot remove the VF while any outstanding critical tasks such as a
virtchnl message or a reset are occurring.
This locking change also fixes the root cause originally fixed by commit
c503e63200c6 ("ice: Stop processing VF messages during teardown"), so we
can simply revert it.
Note that I kept these two changes together because simply reverting the
original commit alone would leave the driver vulnerable to worse race
conditions.
Fixes: c503e63200c6 ("ice: Stop processing VF messages during teardown")
Signed-off-by: Jacob Keller <jacob.e.keller(a)intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski(a)intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen(a)intel.com>
diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index a9fa701aaa95..473b1f6be9de 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -280,7 +280,6 @@ enum ice_pf_state {
ICE_VFLR_EVENT_PENDING,
ICE_FLTR_OVERFLOW_PROMISC,
ICE_VF_DIS,
- ICE_VF_DEINIT_IN_PROGRESS,
ICE_CFG_BUSY,
ICE_SERVICE_SCHED,
ICE_SERVICE_DIS,
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 17a9bb461dc3..f3c346e13b7a 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -1799,7 +1799,9 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
* reset, so print the event prior to reset.
*/
ice_print_vf_rx_mdd_event(vf);
+ mutex_lock(&pf->vf[i].cfg_lock);
ice_reset_vf(&pf->vf[i], false);
+ mutex_unlock(&pf->vf[i].cfg_lock);
}
}
}
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index 39b80124d282..408f78e3eb13 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -500,8 +500,6 @@ void ice_free_vfs(struct ice_pf *pf)
struct ice_hw *hw = &pf->hw;
unsigned int tmp, i;
- set_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state);
-
if (!pf->vf)
return;
@@ -519,22 +517,26 @@ void ice_free_vfs(struct ice_pf *pf)
else
dev_warn(dev, "VFs are assigned - not disabling SR-IOV\n");
- /* Avoid wait time by stopping all VFs at the same time */
- ice_for_each_vf(pf, i)
- ice_dis_vf_qs(&pf->vf[i]);
-
tmp = pf->num_alloc_vfs;
pf->num_qps_per_vf = 0;
pf->num_alloc_vfs = 0;
for (i = 0; i < tmp; i++) {
- if (test_bit(ICE_VF_STATE_INIT, pf->vf[i].vf_states)) {
+ struct ice_vf *vf = &pf->vf[i];
+
+ mutex_lock(&vf->cfg_lock);
+
+ ice_dis_vf_qs(vf);
+
+ if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) {
/* disable VF qp mappings and set VF disable state */
- ice_dis_vf_mappings(&pf->vf[i]);
- set_bit(ICE_VF_STATE_DIS, pf->vf[i].vf_states);
- ice_free_vf_res(&pf->vf[i]);
+ ice_dis_vf_mappings(vf);
+ set_bit(ICE_VF_STATE_DIS, vf->vf_states);
+ ice_free_vf_res(vf);
}
- mutex_destroy(&pf->vf[i].cfg_lock);
+ mutex_unlock(&vf->cfg_lock);
+
+ mutex_destroy(&vf->cfg_lock);
}
if (ice_sriov_free_msix_res(pf))
@@ -570,7 +572,6 @@ void ice_free_vfs(struct ice_pf *pf)
i);
clear_bit(ICE_VF_DIS, pf->state);
- clear_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state);
clear_bit(ICE_FLAG_SRIOV_ENA, pf->flags);
}
@@ -1498,6 +1499,8 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
ice_for_each_vf(pf, v) {
vf = &pf->vf[v];
+ mutex_lock(&vf->cfg_lock);
+
vf->driver_caps = 0;
ice_vc_set_default_allowlist(vf);
@@ -1512,6 +1515,8 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
ice_vf_pre_vsi_rebuild(vf);
ice_vf_rebuild_vsi(vf);
ice_vf_post_vsi_rebuild(vf);
+
+ mutex_unlock(&vf->cfg_lock);
}
if (ice_is_eswitch_mode_switchdev(pf))
@@ -1562,6 +1567,8 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr)
u32 reg;
int i;
+ lockdep_assert_held(&vf->cfg_lock);
+
dev = ice_pf_to_dev(pf);
if (test_bit(ICE_VF_RESETS_DISABLED, pf->state)) {
@@ -2061,9 +2068,12 @@ void ice_process_vflr_event(struct ice_pf *pf)
bit_idx = (hw->func_caps.vf_base_id + vf_id) % 32;
/* read GLGEN_VFLRSTAT register to find out the flr VFs */
reg = rd32(hw, GLGEN_VFLRSTAT(reg_idx));
- if (reg & BIT(bit_idx))
+ if (reg & BIT(bit_idx)) {
/* GLGEN_VFLRSTAT bit will be cleared in ice_reset_vf */
+ mutex_lock(&vf->cfg_lock);
ice_reset_vf(vf, true);
+ mutex_unlock(&vf->cfg_lock);
+ }
}
}
@@ -2140,7 +2150,9 @@ ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event)
if (!vf)
return;
+ mutex_lock(&vf->cfg_lock);
ice_vc_reset_vf(vf);
+ mutex_unlock(&vf->cfg_lock);
}
/**
@@ -4625,10 +4637,6 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event)
struct device *dev;
int err = 0;
- /* if de-init is underway, don't process messages from VF */
- if (test_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state))
- return;
-
dev = ice_pf_to_dev(pf);
if (ice_validate_vf_id(pf, vf_id)) {
err = -EINVAL;
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
Al Viro brought it to my attention that the dentries may not be filled
when the parse_options() is called, causing the call to set_gid() to
possibly crash. It should only be called if parse_options() succeeds
totally anyway.
He suggested the logical place to do the update is in apply_options().
Cc: stable(a)vger.kernel.org
Reported-by: Al Viro <viro(a)zeniv.linux.org.uk>
Fixes: 48b27b6b5191 ("tracefs: Set all files to the same group ownership as the mount option")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
fs/tracefs/inode.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index bafc02bf8220..3638d330ff5a 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -264,7 +264,6 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
if (!gid_valid(gid))
return -EINVAL;
opts->gid = gid;
- set_gid(tracefs_mount->mnt_root, gid);
break;
case Opt_mode:
if (match_octal(&args[0], &option))
@@ -293,6 +292,9 @@ static int tracefs_apply_options(struct super_block *sb)
inode->i_uid = opts->uid;
inode->i_gid = opts->gid;
+ if (tracefs_mount && tracefs_mount->mnt_root)
+ set_gid(tracefs_mount->mnt_root, opts->gid);
+
return 0;
}
--
2.34.1
From: Daniel Bristot de Oliveira <bristot(a)kernel.org>
The stacktrace event trigger is not dumping the stacktrace to the instance
where it was enabled, but to the global "instance."
Use the private_data, pointing to the trigger file, to figure out the
corresponding trace instance, and use it in the trigger action, like
snapshot_trigger does.
Link: https://lkml.kernel.org/r/afbb0b4f18ba92c276865bc97204d438473f4ebc.16453962…
Cc: stable(a)vger.kernel.org
Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables")
Reviewed-by: Tom Zanussi <zanussi(a)kernel.org>
Tested-by: Tom Zanussi <zanussi(a)kernel.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot(a)kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace_events_trigger.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d00fee705f9c..e0d50c9577f3 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1540,7 +1540,12 @@ stacktrace_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- trace_dump_stack(STACK_SKIP);
+ struct trace_event_file *file = data->private_data;
+
+ if (file)
+ __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+ else
+ trace_dump_stack(STACK_SKIP);
}
static void
--
2.34.1
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fb7e76ea3f3b6238dda2f19a4212052d2caf00aa Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid(a)nvidia.com>
Date: Thu, 3 Feb 2022 09:42:19 +0200
Subject: [PATCH] net/mlx5e: TC, Skip redundant ct clear actions
Offload of ct clear action is just resetting the reg_c register.
It's done by allocating modify hdr resources which is limited.
Doing it multiple times is redundant and wasting modify hdr resources
and if resources depleted the driver will fail offloading the rule.
Ignore redundant ct clear actions after the first one.
Fixes: 806401c20a0f ("net/mlx5e: CT, Fix multiple allocations and memleak of mod acts")
Signed-off-by: Roi Dayan <roid(a)nvidia.com>
Reviewed-by: Ariel Levkovich <lariel(a)nvidia.com>
Reviewed-by: Maor Dickman <maord(a)nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm(a)nvidia.com>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h
index 26efa33de56f..10a40487d536 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h
@@ -16,6 +16,7 @@ struct mlx5e_tc_act_parse_state {
unsigned int num_actions;
struct mlx5e_tc_flow *flow;
struct netlink_ext_ack *extack;
+ bool ct_clear;
bool encap;
bool decap;
bool mpls_push;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c
index 06ec30cdb269..58cc33f1363d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c
@@ -27,8 +27,13 @@ tc_act_parse_ct(struct mlx5e_tc_act_parse_state *parse_state,
struct mlx5e_priv *priv,
struct mlx5_flow_attr *attr)
{
+ bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR;
int err;
+ /* It's redundant to do ct clear more than once. */
+ if (clear_action && parse_state->ct_clear)
+ return 0;
+
err = mlx5_tc_ct_parse_action(parse_state->ct_priv, attr,
&attr->parse_attr->mod_hdr_acts,
act, parse_state->extack);
@@ -40,6 +45,8 @@ tc_act_parse_ct(struct mlx5e_tc_act_parse_state *parse_state,
if (mlx5e_is_eswitch_flow(parse_state->flow))
attr->esw_attr->split_count = attr->esw_attr->out_count;
+ parse_state->ct_clear = clear_action;
+
return 0;
}