On 3/4/25 15:11, Hangbin Liu wrote:
The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers a warning:
BUG: sleeping function called from invalid context at...
Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa(), which is not called with spin_lock_bh() held.
Additionally, delete the entry from the IPsec list in bond_ipsec_del_sa_all() when the XFRM state is DEAD, to prevent xdo_dev_state_free() from being triggered again in bond_ipsec_free_sa().
For bond_ipsec_free_sa(), there are now three conditions:
- if (!slave): When no active device exists.
- if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
- if (xs->xso.real_dev != real_dev): When an xs has already been freed by bond_ipsec_del_sa_all() due to migration, and the active slave has changed to a new device. At the same time, the xs is marked as DEAD because the XFRM entry has been removed, triggering xfrm_state_gc_task() and bond_ipsec_free_sa().
In all three cases, xdo_dev_state_free() should not be called; the xs should only be removed from the bond->ipsec_list.
At the same time, protect bond_ipsec_del_sa_all and bond_ipsec_add_sa_all with x->lock for each xs being processed. This prevents XFRM from concurrently initiating add/delete operations on the managed states.
Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org
Suggested-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
drivers/net/bonding/bond_main.c | 53 +++++++++++++++++++++++----------
1 file changed, 37 insertions(+), 16 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e45bba240cbc..06b060d9b031 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -537,15 +537,22 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) } list_for_each_entry(ipsec, &bond->ipsec_list, list) {
spin_lock_bh(&ipsec->xs->lock);
/* Skip dead xfrm states, they'll be freed later. */
if (ipsec->xs->km.state == XFRM_STATE_DEAD)
goto next;
- /* If new state is added before ipsec_lock acquired */ if (ipsec->xs->xso.real_dev == real_dev)
continue;
goto next;
ipsec->xs->xso.real_dev = real_dev; if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; } +next:
}spin_unlock_bh(&ipsec->xs->lock);
out: mutex_unlock(&bond->ipsec_lock); @@ -560,7 +567,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker;
- struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave;
@@ -592,15 +598,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); out: netdev_put(real_dev, &tracker);
- mutex_lock(&bond->ipsec_lock);
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs == xs) {
list_del(&ipsec->list);
kfree(ipsec);
break;
}
- }
- mutex_unlock(&bond->ipsec_lock);
} static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -617,8 +614,18 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) {
Second time - you should use list_for_each_entry_safe if you're walking and deleting elements from the list.
if (!ipsec->xs->xso.real_dev)spin_lock_bh(&ipsec->xs->lock);
continue;
goto next;
if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
/* already dead no need to delete again */
if (real_dev->xfrmdev_ops->xdo_dev_state_free)
real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
Have you checked if .xdo_dev_state_free can sleep? I see at least one that can: mlx5e_xfrm_free_state().
list_del(&ipsec->list);
kfree(ipsec);
goto next;
}
if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || @@ -631,6 +638,8 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) if (real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); } +next:
} mutex_unlock(&bond->ipsec_lock);spin_unlock_bh(&ipsec->xs->lock);
} @@ -640,6 +649,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker;
- struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave;
@@ -659,11 +669,22 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) if (!xs->xso.real_dev) goto out;
- WARN_ON(xs->xso.real_dev != real_dev);
- mutex_lock(&bond->ipsec_lock);
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs == xs) {
/* do xdo_dev_state_free if real_dev matches,
* otherwise only remove the list
*/
if (real_dev && real_dev->xfrmdev_ops &&
real_dev->xfrmdev_ops->xdo_dev_state_free)
real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
list_del(&ipsec->list);
kfree(ipsec);
break;
}
- }
- mutex_unlock(&bond->ipsec_lock);
- if (real_dev && real_dev->xfrmdev_ops &&
real_dev->xfrmdev_ops->xdo_dev_state_free)
real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
out: netdev_put(real_dev, &tracker); }