The first patch fixes the incorrect locks using in bond driver. The second patch fixes the xfrm offload feature during setup active-backup mode. The third patch add a ipsec offload testing.
v3: move the ipsec deletion to bond_ipsec_free_sa (Cosmin Ratiu) v2: do not turn carrier on if bond change link failed (Nikolay Aleksandrov) move the mutex lock to a work queue (Cosmin Ratiu)
Hangbin Liu (3): bonding: move IPsec deletion to bond_ipsec_free_sa bonding: fix xfrm offload feature setup on active-backup mode selftests: bonding: add ipsec offload test
drivers/net/bonding/bond_main.c | 36 ++-- drivers/net/bonding/bond_netlink.c | 16 +- include/net/bonding.h | 1 + .../selftests/drivers/net/bonding/Makefile | 3 +- .../drivers/net/bonding/bond_ipsec_offload.sh | 155 ++++++++++++++++++ .../selftests/drivers/net/bonding/config | 4 + 6 files changed, 195 insertions(+), 20 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers a warning:
BUG: sleeping function called from invalid context at...
Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa, which is not held by spin_lock_bh().
Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered again in bond_ipsec_free_sa().
For bond_ipsec_free_sa(), there are now three conditions:
1. if (!slave): When no active device exists. 2. if (!xs->xso.real_dev): When xdo_dev_state_add() fails. 3. if (xs->xso.real_dev != real_dev): When an xs has already been freed by bond_ipsec_del_sa_all() due to migration, and the active slave has changed to a new device. At the same time, the xs is marked as DEAD due to the XFRM entry is removed, triggering xfrm_state_gc_task() and bond_ipsec_free_sa().
In all three cases, xdo_dev_state_free() should not be called, only xs should be removed from bond->ipsec list.
Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex") Reported-by: Jakub Kicinski kuba@kernel.org Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org Suggested-by: Cosmin Ratiu cratiu@nvidia.com Signed-off-by: Hangbin Liu liuhangbin@gmail.com --- drivers/net/bonding/bond_main.c | 34 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e45bba240cbc..683bf1221caf 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -537,6 +537,10 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) }
list_for_each_entry(ipsec, &bond->ipsec_list, list) { + /* Skip dead xfrm states, they'll be freed later. */ + if (ipsec->xs->km.state == XFRM_STATE_DEAD) + continue; + /* If new state is added before ipsec_lock acquired */ if (ipsec->xs->xso.real_dev == real_dev) continue; @@ -560,7 +564,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker; - struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave;
@@ -592,15 +595,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); out: netdev_put(real_dev, &tracker); - mutex_lock(&bond->ipsec_lock); - list_for_each_entry(ipsec, &bond->ipsec_list, list) { - if (ipsec->xs == xs) { - list_del(&ipsec->list); - kfree(ipsec); - break; - } - } - mutex_unlock(&bond->ipsec_lock); }
static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { + if (ipsec->xs->km.state == XFRM_STATE_DEAD) { + list_del(&ipsec->list); + kfree(ipsec); + continue; + } + if (!ipsec->xs->xso.real_dev) continue;
@@ -640,6 +640,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker; + struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave;
@@ -659,13 +660,24 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) if (!xs->xso.real_dev) goto out;
- WARN_ON(xs->xso.real_dev != real_dev); + if (xs->xso.real_dev != real_dev) + goto out;
if (real_dev && real_dev->xfrmdev_ops && real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(xs); out: netdev_put(real_dev, &tracker); + + mutex_lock(&bond->ipsec_lock); + list_for_each_entry(ipsec, &bond->ipsec_list, list) { + if (ipsec->xs == xs) { + list_del(&ipsec->list); + kfree(ipsec); + break; + } + } + mutex_unlock(&bond->ipsec_lock); }
/**
On 2/27/25 10:37, Hangbin Liu wrote:
The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers a warning:
BUG: sleeping function called from invalid context at...
Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa, which is not held by spin_lock_bh().
Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered again in bond_ipsec_free_sa().
For bond_ipsec_free_sa(), there are now three conditions:
- if (!slave): When no active device exists.
- if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
- if (xs->xso.real_dev != real_dev): When an xs has already been freed by bond_ipsec_del_sa_all() due to migration, and the active slave has changed to a new device. At the same time, the xs is marked as DEAD due to the XFRM entry is removed, triggering xfrm_state_gc_task() and bond_ipsec_free_sa().
In all three cases, xdo_dev_state_free() should not be called, only xs should be removed from bond->ipsec list.
Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex") Reported-by: Jakub Kicinski kuba@kernel.org Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org Suggested-by: Cosmin Ratiu cratiu@nvidia.com Signed-off-by: Hangbin Liu liuhangbin@gmail.com
drivers/net/bonding/bond_main.c | 34 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e45bba240cbc..683bf1221caf 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -537,6 +537,10 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) } list_for_each_entry(ipsec, &bond->ipsec_list, list) {
/* Skip dead xfrm states, they'll be freed later. */
if (ipsec->xs->km.state == XFRM_STATE_DEAD)
continue;
- /* If new state is added before ipsec_lock acquired */ if (ipsec->xs->xso.real_dev == real_dev) continue;
@@ -560,7 +564,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker;
- struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave;
@@ -592,15 +595,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); out: netdev_put(real_dev, &tracker);
- mutex_lock(&bond->ipsec_lock);
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs == xs) {
list_del(&ipsec->list);
kfree(ipsec);
break;
}
- }
- mutex_unlock(&bond->ipsec_lock);
} static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
list_del(&ipsec->list);
To be able to do this here, you'll have to use list_for_each_entry_safe().
kfree(ipsec);
continue;
}
- if (!ipsec->xs->xso.real_dev) continue;
@@ -640,6 +640,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker;
- struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave;
@@ -659,13 +660,24 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) if (!xs->xso.real_dev) goto out;
- WARN_ON(xs->xso.real_dev != real_dev);
- if (xs->xso.real_dev != real_dev)
goto out;
if (real_dev && real_dev->xfrmdev_ops && real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(xs); out: netdev_put(real_dev, &tracker);
- mutex_lock(&bond->ipsec_lock);
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs == xs) {
list_del(&ipsec->list);
kfree(ipsec);
break;
}
- }
- mutex_unlock(&bond->ipsec_lock);
} /**
On 2/27/25 10:50, Nikolay Aleksandrov wrote:
On 2/27/25 10:37, Hangbin Liu wrote:
The fixed commit placed mutex_lock() inside spin_lock_bh(), which triggers a warning:
BUG: sleeping function called from invalid context at...
Fix this by moving the IPsec deletion operation to bond_ipsec_free_sa, which is not held by spin_lock_bh().
Additionally, delete the IPsec list in bond_ipsec_del_sa_all() when the XFRM state is DEAD to prevent xdo_dev_state_free() from being triggered again in bond_ipsec_free_sa().
For bond_ipsec_free_sa(), there are now three conditions:
- if (!slave): When no active device exists.
- if (!xs->xso.real_dev): When xdo_dev_state_add() fails.
- if (xs->xso.real_dev != real_dev): When an xs has already been freed by bond_ipsec_del_sa_all() due to migration, and the active slave has changed to a new device. At the same time, the xs is marked as DEAD due to the XFRM entry is removed, triggering xfrm_state_gc_task() and bond_ipsec_free_sa().
In all three cases, xdo_dev_state_free() should not be called, only xs should be removed from bond->ipsec list.
Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex") Reported-by: Jakub Kicinski kuba@kernel.org Closes: https://lore.kernel.org/netdev/20241212062734.182a0164@kernel.org Suggested-by: Cosmin Ratiu cratiu@nvidia.com Signed-off-by: Hangbin Liu liuhangbin@gmail.com
drivers/net/bonding/bond_main.c | 34 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e45bba240cbc..683bf1221caf 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -537,6 +537,10 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) } list_for_each_entry(ipsec, &bond->ipsec_list, list) {
/* Skip dead xfrm states, they'll be freed later. */
if (ipsec->xs->km.state == XFRM_STATE_DEAD)
continue;
- /* If new state is added before ipsec_lock acquired */ if (ipsec->xs->xso.real_dev == real_dev) continue;
@@ -560,7 +564,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker;
- struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave;
@@ -592,15 +595,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); out: netdev_put(real_dev, &tracker);
- mutex_lock(&bond->ipsec_lock);
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs == xs) {
list_del(&ipsec->list);
kfree(ipsec);
break;
}
- }
- mutex_unlock(&bond->ipsec_lock);
} static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
list_del(&ipsec->list);
To be able to do this here, you'll have to use list_for_each_entry_safe().
One more thing - note I'm not an xfrm expert by far but it seems to me here you have to also call xdo_dev_state_free() with the old active slave dev otherwise that will never get called with the original real_dev after the switch to a new active slave (or more accurately it might if the GC runs between the switching but it is a race), care must be taken wrt sequence of events because the XFRM GC may be running in parallel which probably means that in bond_ipsec_free_sa() you'll have to take the mutex before calling xdo_dev_state_free() and check if the entry is still linked in the bond's ipsec list before calling the free_sa callback, if it isn't then del_sa_all got to it before the GC and there's nothing to do if it also called the dev's free_sa callback. The check for real_dev doesn't seem enough to protect against this race.
Cheers, Nik
kfree(ipsec);
continue;
}
- if (!ipsec->xs->xso.real_dev) continue;
@@ -640,6 +640,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker;
- struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave;
@@ -659,13 +660,24 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) if (!xs->xso.real_dev) goto out;
- WARN_ON(xs->xso.real_dev != real_dev);
- if (xs->xso.real_dev != real_dev)
goto out;
if (real_dev && real_dev->xfrmdev_ops && real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(xs); out: netdev_put(real_dev, &tracker);
- mutex_lock(&bond->ipsec_lock);
- list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs == xs) {
list_del(&ipsec->list);
kfree(ipsec);
break;
}
- }
- mutex_unlock(&bond->ipsec_lock);
} /**
On Thu, Feb 27, 2025 at 11:21:51AM +0200, Nikolay Aleksandrov wrote:
@@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
list_del(&ipsec->list);
To be able to do this here, you'll have to use list_for_each_entry_safe().
One more thing - note I'm not an xfrm expert by far but it seems to me here you have to also call xdo_dev_state_free() with the old active slave dev otherwise that will never get called with the original real_dev after the switch to a new active slave (or more accurately it might if the GC runs between the switching but it is a race), care must be taken wrt sequence of events because the XFRM
Can we just call xs->xso.real_dev->xfrmdev_ops->xdo_dev_state_free(xs) no matter xs->xso.real_dev == real_dev or not? I'm afraid calling xdo_dev_state_free() every where may make us lot more easily.
GC may be running in parallel which probably means that in bond_ipsec_free_sa() you'll have to take the mutex before calling xdo_dev_state_free() and check if the entry is still linked in the bond's ipsec list before calling the free_sa callback, if it isn't then del_sa_all got to it before the GC and there's nothing to do if it also called the dev's free_sa callback. The check for real_dev doesn't seem enough to protect against this race.
I agree that we need to take the mutex before calling xdo_dev_state_free() in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
Thanks Hangbin
On 2/27/25 15:21, Hangbin Liu wrote:
On Thu, Feb 27, 2025 at 11:21:51AM +0200, Nikolay Aleksandrov wrote:
@@ -617,6 +611,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) {
if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
list_del(&ipsec->list);
To be able to do this here, you'll have to use list_for_each_entry_safe().
One more thing - note I'm not an xfrm expert by far but it seems to me here you have to also call xdo_dev_state_free() with the old active slave dev otherwise that will never get called with the original real_dev after the switch to a new active slave (or more accurately it might if the GC runs between the switching but it is a race), care must be taken wrt sequence of events because the XFRM
Can we just call xs->xso.real_dev->xfrmdev_ops->xdo_dev_state_free(xs) no matter xs->xso.real_dev == real_dev or not? I'm afraid calling xdo_dev_state_free() every where may make us lot more easily.
You'd have to check all drivers that implement the callback to answer that and even then I'd stick to the canonical way of how it's done in xfrm and make the bond just passthrough. Any other games become dangerous and new code will have to be carefully reviewed every time, calling another device's free_sa when it wasn't added before doesn't sound good.
GC may be running in parallel which probably means that in bond_ipsec_free_sa() you'll have to take the mutex before calling xdo_dev_state_free() and check if the entry is still linked in the bond's ipsec list before calling the free_sa callback, if it isn't then del_sa_all got to it before the GC and there's nothing to do if it also called the dev's free_sa callback. The check for real_dev doesn't seem enough to protect against this race.
I agree that we need to take the mutex before calling xdo_dev_state_free() in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
Thanks Hangbin
Well, the race is between the xfrm GC and del_sa_all, in bond's free_sa if you walk the list under the mutex before calling real_dev's free callback and don't find the current element that's being freed in free_sa then it was cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk that list and clean the entries. I think it should be fine as long as free_sa was called once with the proper device.
On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
One more thing - note I'm not an xfrm expert by far but it seems to me here you have to also call xdo_dev_state_free() with the old active slave dev otherwise that will never get called with the original real_dev after the switch to a new active slave (or more accurately it might if the GC runs between the switching but it is a race), care must be taken wrt sequence of events because the XFRM
Can we just call xs->xso.real_dev->xfrmdev_ops->xdo_dev_state_free(xs) no matter xs->xso.real_dev == real_dev or not? I'm afraid calling xdo_dev_state_free() every where may make us lot more easily.
You'd have to check all drivers that implement the callback to answer that and even then I'd stick to the canonical way of how it's done in xfrm and make the bond just passthrough. Any other games become dangerous and new code will have to be carefully reviewed every time, calling another device's free_sa when it wasn't added before doesn't sound good.
GC may be running in parallel which probably means that in bond_ipsec_free_sa() you'll have to take the mutex before calling xdo_dev_state_free() and check if the entry is still linked in the bond's ipsec list before calling the free_sa callback, if it isn't then del_sa_all got to it before the GC and there's nothing to do if it also called the dev's free_sa callback. The check for real_dev doesn't seem enough to protect against this race.
I agree that we need to take the mutex before calling xdo_dev_state_free() in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
Thanks Hangbin
Well, the race is between the xfrm GC and del_sa_all, in bond's free_sa if you walk the list under the mutex before calling real_dev's free callback and don't find the current element that's being freed in free_sa then it was cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk that list and clean the entries. I think it should be fine as long as free_sa was called once with the proper device.
OK, so the free will be called either in del_sa_all() or free_sa(). Something like this?
static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -620,6 +614,16 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) if (!ipsec->xs->xso.real_dev) continue;
+ if (ipsec->xs->km.state == XFRM_STATE_DEAD) { + /* already dead no need to delete again */ + if (real_dev->xfrmdev_ops->xdo_dev_state_free) + real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); + list_del(&ipsec->list); + kfree(ipsec); + continue; + } + if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || netif_is_bond_master(real_dev)) {
@@ -659,11 +664,22 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) if (!xs->xso.real_dev) goto out;
- WARN_ON(xs->xso.real_dev != real_dev); + mutex_lock(&bond->ipsec_lock); + list_for_each_entry(ipsec, &bond->ipsec_list, list) { + if (ipsec->xs == xs) { + if (real_dev && xs->xso.real_dev == real_dev &&
^^ looks we don't need this xs->xso.real_dev == real_dev checking if there is no race, do we? Or just keep the WARN_ON() in case of any race.
+ real_dev->xfrmdev_ops && + real_dev->xfrmdev_ops->xdo_dev_state_free) + real_dev->xfrmdev_ops->xdo_dev_state_free(xs); + list_del(&ipsec->list); + kfree(ipsec); + break; + } + } + mutex_unlock(&bond->ipsec_lock);
- if (real_dev && real_dev->xfrmdev_ops && - real_dev->xfrmdev_ops->xdo_dev_state_free) - real_dev->xfrmdev_ops->xdo_dev_state_free(xs); out: netdev_put(real_dev, &tracker); }
On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
One more thing - note I'm not an xfrm expert by far but it seems to me here you have to also call xdo_dev_state_free() with the old active slave dev otherwise that will never get called with the original real_dev after the switch to a new active slave (or more accurately it might if the GC runs between the switching but it is a race), care must be taken wrt sequence of events because the XFRM
Can we just call xs->xso.real_dev->xfrmdev_ops-
xdo_dev_state_free(xs)
no matter xs->xso.real_dev == real_dev or not? I'm afraid calling xdo_dev_state_free() every where may make us lot more easily.
You'd have to check all drivers that implement the callback to answer that and even then I'd stick to the canonical way of how it's done in xfrm and make the bond just passthrough. Any other games become dangerous and new code will have to be carefully reviewed every time, calling another device's free_sa when it wasn't added before doesn't sound good.
GC may be running in parallel which probably means that in bond_ipsec_free_sa() you'll have to take the mutex before calling xdo_dev_state_free() and check if the entry is still linked in the bond's ipsec list before calling the free_sa callback, if it isn't then del_sa_all got to it before the GC and there's nothing to do if it also called the dev's free_sa callback. The check for real_dev doesn't seem enough to protect against this race.
I agree that we need to take the mutex before calling xdo_dev_state_free() in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
Thanks Hangbin
Well, the race is between the xfrm GC and del_sa_all, in bond's free_sa if you walk the list under the mutex before calling real_dev's free callback and don't find the current element that's being freed in free_sa then it was cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk that list and clean the entries. I think it should be fine as long as free_sa was called once with the proper device.
OK, so the free will be called either in del_sa_all() or free_sa(). Something like this?
[...]
Unfortunately, after applying these changes and reasoning about them for a bit, I don't think this will work. There are still races left. For example: 1. An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all is called in parallel, doesn't call delete on xs (because it's dead), then calls free (incorrect without delete first), then removes the list entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called, and calls delete (incorrect, out of order with free). Finally, bond_ipsec_free_sa is called, which fortunately doesn't do anything silly in the new proposed form because xs is no longer in the list.
2. A more sinister form of the above race can happen when bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and immediately after __xfrm_state_delete marks xs as DEAD and calls bond_ipsec_del_sa() which happily calls delete on real_dev again.
In order to fix these races (and others like it), I think bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
lock for each xs being processed. This would prevent xfrm from
concurrently initiating add/delete operations on the managed states.
Cosmin.
On 2/28/25 12:31, Cosmin Ratiu wrote:
On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
One more thing - note I'm not an xfrm expert by far but it seems to me here you have to also call xdo_dev_state_free() with the old active slave dev otherwise that will never get called with the original real_dev after the switch to a new active slave (or more accurately it might if the GC runs between the switching but it is a race), care must be taken wrt sequence of events because the XFRM
Can we just call xs->xso.real_dev->xfrmdev_ops-
xdo_dev_state_free(xs)
no matter xs->xso.real_dev == real_dev or not? I'm afraid calling xdo_dev_state_free() every where may make us lot more easily.
You'd have to check all drivers that implement the callback to answer that and even then I'd stick to the canonical way of how it's done in xfrm and make the bond just passthrough. Any other games become dangerous and new code will have to be carefully reviewed every time, calling another device's free_sa when it wasn't added before doesn't sound good.
GC may be running in parallel which probably means that in bond_ipsec_free_sa() you'll have to take the mutex before calling xdo_dev_state_free() and check if the entry is still linked in the bond's ipsec list before calling the free_sa callback, if it isn't then del_sa_all got to it before the GC and there's nothing to do if it also called the dev's free_sa callback. The check for real_dev doesn't seem enough to protect against this race.
I agree that we need to take the mutex before calling xdo_dev_state_free() in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
Thanks Hangbin
Well, the race is between the xfrm GC and del_sa_all, in bond's free_sa if you walk the list under the mutex before calling real_dev's free callback and don't find the current element that's being freed in free_sa then it was cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk that list and clean the entries. I think it should be fine as long as free_sa was called once with the proper device.
OK, so the free will be called either in del_sa_all() or free_sa(). Something like this?
[...]
Unfortunately, after applying these changes and reasoning about them for a bit, I don't think this will work. There are still races left. For example:
- An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and
before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all is called in parallel, doesn't call delete on xs (because it's dead), then calls free (incorrect without delete first), then removes the list entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called, and calls delete (incorrect, out of order with free). Finally, bond_ipsec_free_sa is called, which fortunately doesn't do anything silly in the new proposed form because xs is no longer in the list.
- A more sinister form of the above race can happen when
bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and immediately after __xfrm_state_delete marks xs as DEAD and calls bond_ipsec_del_sa() which happily calls delete on real_dev again.
In order to fix these races (and others like it), I think bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
lock for each xs being processed. This would prevent xfrm from
concurrently initiating add/delete operations on the managed states.
Cosmin.
Duh, right you are. The state is protected by x->lock and cannot be trusted outside of it. If you take x->lock inside the list walk with the mutex held you can deadlock.
Cheers, Nik
On 2/28/25 13:07, Nikolay Aleksandrov wrote:
On 2/28/25 12:31, Cosmin Ratiu wrote:
On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
One more thing - note I'm not an xfrm expert by far but it seems to me here you have to also call xdo_dev_state_free() with the old active slave dev otherwise that will never get called with the original real_dev after the switch to a new active slave (or more accurately it might if the GC runs between the switching but it is a race), care must be taken wrt sequence of events because the XFRM
Can we just call xs->xso.real_dev->xfrmdev_ops-
xdo_dev_state_free(xs)
no matter xs->xso.real_dev == real_dev or not? I'm afraid calling xdo_dev_state_free() every where may make us lot more easily.
You'd have to check all drivers that implement the callback to answer that and even then I'd stick to the canonical way of how it's done in xfrm and make the bond just passthrough. Any other games become dangerous and new code will have to be carefully reviewed every time, calling another device's free_sa when it wasn't added before doesn't sound good.
GC may be running in parallel which probably means that in bond_ipsec_free_sa() you'll have to take the mutex before calling xdo_dev_state_free() and check if the entry is still linked in the bond's ipsec list before calling the free_sa callback, if it isn't then del_sa_all got to it before the GC and there's nothing to do if it also called the dev's free_sa callback. The check for real_dev doesn't seem enough to protect against this race.
I agree that we need to take the mutex before calling xdo_dev_state_free() in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
Thanks Hangbin
Well, the race is between the xfrm GC and del_sa_all, in bond's free_sa if you walk the list under the mutex before calling real_dev's free callback and don't find the current element that's being freed in free_sa then it was cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk that list and clean the entries. I think it should be fine as long as free_sa was called once with the proper device.
OK, so the free will be called either in del_sa_all() or free_sa(). Something like this?
[...]
Unfortunately, after applying these changes and reasoning about them for a bit, I don't think this will work. There are still races left. For example:
- An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and
before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all is called in parallel, doesn't call delete on xs (because it's dead), then calls free (incorrect without delete first), then removes the list entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called, and calls delete (incorrect, out of order with free). Finally, bond_ipsec_free_sa is called, which fortunately doesn't do anything silly in the new proposed form because xs is no longer in the list.
- A more sinister form of the above race can happen when
bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and immediately after __xfrm_state_delete marks xs as DEAD and calls bond_ipsec_del_sa() which happily calls delete on real_dev again.
In order to fix these races (and others like it), I think bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
lock for each xs being processed. This would prevent xfrm from
concurrently initiating add/delete operations on the managed states.
Cosmin.
Duh, right you are. The state is protected by x->lock and cannot be trusted outside of it. If you take x->lock inside the list walk with the mutex held you can deadlock.
Cheers, Nik
Correction - actually took a closer look at the xfrm code and it should be fine. The x->lock is taken only in the delete path and if the mutex is not acquired by bond's del_sa callback it should be ok. Though this must be very well documented.
Hi Cosmin, On Fri, Feb 28, 2025 at 10:31:58AM +0000, Cosmin Ratiu wrote:
On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
One more thing - note I'm not an xfrm expert by far but it seems to me here you have to also call xdo_dev_state_free() with the old active slave dev otherwise that will never get called with the original real_dev after the switch to a new active slave (or more accurately it might if the GC runs between the switching but it is a race), care must be taken wrt sequence of events because the XFRM
Can we just call xs->xso.real_dev->xfrmdev_ops-
xdo_dev_state_free(xs)
no matter xs->xso.real_dev == real_dev or not? I'm afraid calling xdo_dev_state_free() every where may make us lot more easily.
You'd have to check all drivers that implement the callback to answer that and even then I'd stick to the canonical way of how it's done in xfrm and make the bond just passthrough. Any other games become dangerous and new code will have to be carefully reviewed every time, calling another device's free_sa when it wasn't added before doesn't sound good.
GC may be running in parallel which probably means that in bond_ipsec_free_sa() you'll have to take the mutex before calling xdo_dev_state_free() and check if the entry is still linked in the bond's ipsec list before calling the free_sa callback, if it isn't then del_sa_all got to it before the GC and there's nothing to do if it also called the dev's free_sa callback. The check for real_dev doesn't seem enough to protect against this race.
I agree that we need to take the mutex before calling xdo_dev_state_free() in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
Thanks Hangbin
Well, the race is between the xfrm GC and del_sa_all, in bond's free_sa if you walk the list under the mutex before calling real_dev's free callback and don't find the current element that's being freed in free_sa then it was cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk that list and clean the entries. I think it should be fine as long as free_sa was called once with the proper device.
OK, so the free will be called either in del_sa_all() or free_sa(). Something like this?
[...]
Unfortunately, after applying these changes and reasoning about them for a bit, I don't think this will work. There are still races left. For example:
- An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and
before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all is called in parallel, doesn't call delete on xs (because it's dead), then calls free (incorrect without delete first), then removes the list entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called, and calls delete (incorrect, out of order with free). Finally, bond_ipsec_free_sa is called, which fortunately doesn't do anything silly in the new proposed form because xs is no longer in the list.
- A more sinister form of the above race can happen when
bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and immediately after __xfrm_state_delete marks xs as DEAD and calls bond_ipsec_del_sa() which happily calls delete on real_dev again.
In order to fix these races (and others like it), I think bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
lock for each xs being processed. This would prevent xfrm from
concurrently initiating add/delete operations on the managed states.
Thanks a lot for the careful checking. I will add the x->lock in del/add_sa_all.
Regards Hangbin
Hi Cosmin, On Fri, Feb 28, 2025 at 10:31:58AM +0000, Cosmin Ratiu wrote:
On Fri, 2025-02-28 at 02:20 +0000, Hangbin Liu wrote:
On Thu, Feb 27, 2025 at 03:31:01PM +0200, Nikolay Aleksandrov wrote:
One more thing - note I'm not an xfrm expert by far but it seems to me here you have to also call xdo_dev_state_free() with the old active slave dev otherwise that will never get called with the original real_dev after the switch to a new active slave (or more accurately it might if the GC runs between the switching but it is a race), care must be taken wrt sequence of events because the XFRM
Can we just call xs->xso.real_dev->xfrmdev_ops-
xdo_dev_state_free(xs)
no matter xs->xso.real_dev == real_dev or not? I'm afraid calling xdo_dev_state_free() every where may make us lot more easily.
You'd have to check all drivers that implement the callback to answer that and even then I'd stick to the canonical way of how it's done in xfrm and make the bond just passthrough. Any other games become dangerous and new code will have to be carefully reviewed every time, calling another device's free_sa when it wasn't added before doesn't sound good.
GC may be running in parallel which probably means that in bond_ipsec_free_sa() you'll have to take the mutex before calling xdo_dev_state_free() and check if the entry is still linked in the bond's ipsec list before calling the free_sa callback, if it isn't then del_sa_all got to it before the GC and there's nothing to do if it also called the dev's free_sa callback. The check for real_dev doesn't seem enough to protect against this race.
I agree that we need to take the mutex before calling xdo_dev_state_free() in bond_ipsec_free_sa(). Do you think if this is enough? I'm a bit lot here.
Thanks Hangbin
Well, the race is between the xfrm GC and del_sa_all, in bond's free_sa if you walk the list under the mutex before calling real_dev's free callback and don't find the current element that's being freed in free_sa then it was cleaned up by del_sa_all, otherwise del_sa_all is waiting to walk that list and clean the entries. I think it should be fine as long as free_sa was called once with the proper device.
OK, so the free will be called either in del_sa_all() or free_sa(). Something like this?
[...]
Unfortunately, after applying these changes and reasoning about them for a bit, I don't think this will work. There are still races left. For example:
- An xs is marked DEAD (in __xfrm_state_delete, with x->lock held) and
before .xdo_dev_state_delete() is called on it, bond_ipsec_del_sa_all is called in parallel, doesn't call delete on xs (because it's dead), then calls free (incorrect without delete first), then removes the list entry. Later, xdo_dev_state_delete( == bond_ipsec_del_sa) is called, and calls delete (incorrect, out of order with free). Finally, bond_ipsec_free_sa is called, which fortunately doesn't do anything silly in the new proposed form because xs is no longer in the list.
- A more sinister form of the above race can happen when
bond_ipsec_del_sa_all() calls delete on real_dev, then in parallel and immediately after __xfrm_state_delete marks xs as DEAD and calls bond_ipsec_del_sa() which happily calls delete on real_dev again.
In order to fix these races (and others like it), I think bond_ipsec_del_sa_all and bond_ipsec_add_sa_all *need* to acquire x-
lock for each xs being processed. This would prevent xfrm from
concurrently initiating add/delete operations on the managed states.
Just to make sure I added the lock in correct place, would you please help confirm.
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e85878b12376..c59ad3a5cf43 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -537,19 +537,25 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) }
list_for_each_entry(ipsec, &bond->ipsec_list, list) { + spin_lock_bh(&ipsec->xs->lock); /* Skip dead xfrm states, they'll be freed later. */ - if (ipsec->xs->km.state == XFRM_STATE_DEAD) + if (ipsec->xs->km.state == XFRM_STATE_DEAD) { + spin_unlock_bh(&ipsec->xs->lock); continue; + }
/* If new state is added before ipsec_lock acquired */ - if (ipsec->xs->xso.real_dev == real_dev) + if (ipsec->xs->xso.real_dev == real_dev) { + spin_unlock_bh(&ipsec->xs->lock); continue; + }
ipsec->xs->xso.real_dev = real_dev; if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; } + spin_unlock_bh(&ipsec->xs->lock); } out: mutex_unlock(&bond->ipsec_lock); @@ -614,6 +620,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) if (!ipsec->xs->xso.real_dev) continue;
+ spin_lock_bh(&ipsec->xs->lock); if (ipsec->xs->km.state == XFRM_STATE_DEAD) { /* already dead no need to delete again */ if (ipsec->xs->xso.real_dev == real_dev && @@ -621,6 +628,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); list_del(&ipsec->list); kfree(ipsec); + spin_unlock_bh(&ipsec->xs->lock); continue; }
@@ -635,6 +643,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) if (real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); } + spin_unlock_bh(&ipsec->xs->lock); } mutex_unlock(&bond->ipsec_lock); }
Thanks Hangbin
On Tue, 2025-03-04 at 09:18 +0000, Hangbin Liu wrote:
Just to make sure I added the lock in correct place, would you please help confirm.
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e85878b12376..c59ad3a5cf43 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -537,19 +537,25 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) } list_for_each_entry(ipsec, &bond->ipsec_list, list) {
spin_lock_bh(&ipsec->xs->lock);
/* Skip dead xfrm states, they'll be freed later. */
if (ipsec->xs->km.state == XFRM_STATE_DEAD)
if (ipsec->xs->km.state == XFRM_STATE_DEAD) {
spin_unlock_bh(&ipsec->xs->lock);
Instead of unlocking on every branch, I recommend adding a "next:" tag before the unlock at the end of the loop and switching the "continue" statements with "goto next".
continue;
}
/* If new state is added before ipsec_lock acquired */
if (ipsec->xs->xso.real_dev == real_dev)
if (ipsec->xs->xso.real_dev == real_dev) {
spin_unlock_bh(&ipsec->xs->lock);
continue;
}
ipsec->xs->xso.real_dev = real_dev; if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec-
xs, NULL)) {
slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; }
Add the "next:" tag here.
spin_unlock_bh(&ipsec->xs->lock);
} out: mutex_unlock(&bond->ipsec_lock); @@ -614,6 +620,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) if (!ipsec->xs->xso.real_dev) continue;
The above if should be in the critical section as well.
spin_lock_bh(&ipsec->xs->lock);
if (ipsec->xs->km.state == XFRM_STATE_DEAD) { /* already dead no need to delete again */ if (ipsec->xs->xso.real_dev == real_dev && @@ -621,6 +628,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) real_dev->xfrmdev_ops-
xdo_dev_state_free(ipsec->xs);
list_del(&ipsec->list); kfree(ipsec);
spin_unlock_bh(&ipsec->xs->lock);
And I recommend the same thing with "goto next" here, jumping at the end of the loop, before the unlock.
continue; } @@ -635,6 +643,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) if (real_dev->xfrmdev_ops-
xdo_dev_state_free)
real_dev->xfrmdev_ops-
xdo_dev_state_free(ipsec->xs);
}
spin_unlock_bh(&ipsec->xs->lock);
} mutex_unlock(&bond->ipsec_lock); }
Thanks Hangbin
The active-backup bonding mode supports XFRM ESP offload. However, when a bond is added using command like `ip link add bond0 type bond mode 1 miimon 100`, the `ethtool -k` command shows that the XFRM ESP offload is disabled. This occurs because, in bond_newlink(), we change bond link first and register bond device later. So the XFRM feature update in bond_option_mode_set() is not called as the bond device is not yet registered, leading to the offload feature not being set successfully.
To resolve this issue, we can modify the code order in bond_newlink() to ensure that the bond device is registered first before changing the bond link parameters. This change will allow the XFRM ESP offload feature to be correctly enabled.
Fixes: 007ab5345545 ("bonding: fix feature flag setting at init time") Signed-off-by: Hangbin Liu liuhangbin@gmail.com --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/bonding/bond_netlink.c | 16 +++++++++------- include/net/bonding.h | 1 + 3 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 683bf1221caf..65e4b5d599e6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4401,7 +4401,7 @@ void bond_work_init_all(struct bonding *bond) INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler); }
-static void bond_work_cancel_all(struct bonding *bond) +void bond_work_cancel_all(struct bonding *bond) { cancel_delayed_work_sync(&bond->mii_work); cancel_delayed_work_sync(&bond->arp_work); diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 2a6a424806aa..ed16af6db557 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -568,18 +568,20 @@ static int bond_newlink(struct net *src_net, struct net_device *bond_dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { + struct bonding *bond = netdev_priv(bond_dev); int err;
- err = bond_changelink(bond_dev, tb, data, extack); - if (err < 0) + err = register_netdevice(bond_dev); + if (err) return err;
- err = register_netdevice(bond_dev); - if (!err) { - struct bonding *bond = netdev_priv(bond_dev); + netif_carrier_off(bond_dev); + bond_work_init_all(bond);
- netif_carrier_off(bond_dev); - bond_work_init_all(bond); + err = bond_changelink(bond_dev, tb, data, extack); + if (err) { + bond_work_cancel_all(bond); + unregister_netdevice(bond_dev); }
return err; diff --git a/include/net/bonding.h b/include/net/bonding.h index 8bb5f016969f..e5e005cd2e17 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -707,6 +707,7 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); void bond_work_init_all(struct bonding *bond); +void bond_work_cancel_all(struct bonding *bond);
#ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond);
This introduces a test for IPSec offload over bonding, utilizing netdevsim for the testing process, as veth interfaces do not support IPSec offload. The test will ensure that the IPSec offload functionality remains operational even after a failover event occurs in the bonding configuration.
Signed-off-by: Hangbin Liu liuhangbin@gmail.com --- .../selftests/drivers/net/bonding/Makefile | 3 +- .../drivers/net/bonding/bond_ipsec_offload.sh | 155 ++++++++++++++++++ .../selftests/drivers/net/bonding/config | 4 + 3 files changed, 161 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 2b10854e4b1e..d5a7de16d33a 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -10,7 +10,8 @@ TEST_PROGS := \ mode-2-recovery-updelay.sh \ bond_options.sh \ bond-eth-type-change.sh \ - bond_macvlan_ipvlan.sh + bond_macvlan_ipvlan.sh \ + bond_ipsec_offload.sh
TEST_FILES := \ lag_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh new file mode 100755 index 000000000000..169866b47a67 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# IPsec over bonding offload test: +# +# +----------------+ +# | bond0 | +# | | | +# | eth0 eth1 | +# +---+-------+----+ +# +# We use netdevsim instead of physical interfaces +#------------------------------------------------------------------- +# Example commands +# ip x s add proto esp src 192.0.2.1 dst 192.0.2.2 \ +# spi 0x07 mode transport reqid 0x07 replay-window 32 \ +# aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \ +# sel src 192.0.2.1/24 dst 192.0.2.2/24 +# offload dev bond0 dir out +# ip x p add dir out src 192.0.2.1/24 dst 192.0.2.2/24 \ +# tmpl proto esp src 192.0.2.1 dst 192.0.2.2 \ +# spi 0x07 mode transport reqid 0x07 +# +#------------------------------------------------------------------- + +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/lib.sh +algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128" +srcip=192.0.2.1 +dstip=192.0.2.2 +ipsec0=/sys/kernel/debug/netdevsim/netdevsim0/ports/0/ipsec +ipsec1=/sys/kernel/debug/netdevsim/netdevsim0/ports/1/ipsec +ret=0 + +cleanup() +{ + modprobe -r netdevsim + cleanup_ns $ns +} + +active_slave_changed() +{ + local old_active_slave=$1 + local new_active_slave=$(ip -n ${ns} -d -j link show bond0 | \ + jq -r ".[].linkinfo.info_data.active_slave") + [ "$new_active_slave" != "$old_active_slave" -a "$new_active_slave" != "null" ] +} + +test_offload() +{ + # use ping to exercise the Tx path + ip netns exec $ns ping -I bond0 -c 3 -W 1 -i 0 $dstip >/dev/null + + active_slave=$(ip -n ${ns} -d -j link show bond0 | \ + jq -r ".[].linkinfo.info_data.active_slave") + + if [ $active_slave = $nic0 ]; then + sysfs=$ipsec0 + elif [ $active_slave = $nic1 ]; then + sysfs=$ipsec1 + else + echo "FAIL: bond_ipsec_offload invalid active_slave $active_slave" + ret=1 + fi + + # The tx/rx order in sysfs may changed after failover + if grep -q "SA count=2 tx=3" $sysfs && grep -q "tx ipaddr=$dstip" $sysfs; then + echo "PASS: bond_ipsec_offload has correct tx count with link ${active_slave}" + else + echo "FAIL: bond_ipsec_offload incorrect tx count with link ${active_slave}" + ret=1 + fi +} + +if ! mount | grep -q debugfs; then + mount -t debugfs none /sys/kernel/debug/ &> /dev/null +fi + +# setup netdevsim since dummy/veth dev doesn't have offload support +if [ ! -w /sys/bus/netdevsim/new_device ] ; then + modprobe -q netdevsim + if [ $? -ne 0 ]; then + echo "SKIP: can't load netdevsim for ipsec offload" + exit $ksft_skip + fi +fi + +trap cleanup EXIT + +setup_ns ns +ip -n $ns link add bond0 type bond mode active-backup miimon 100 +ip -n $ns addr add $srcip/24 dev bond0 +ip -n $ns link set bond0 up + +ifaces=$(ip netns exec $ns bash -c ' + sysfsnet=/sys/bus/netdevsim/devices/netdevsim0/net/ + echo "0 2" > /sys/bus/netdevsim/new_device + while [ ! -d $sysfsnet ] ; do :; done + udevadm settle + ls $sysfsnet +') +nic0=$(echo $ifaces | cut -f1 -d ' ') +nic1=$(echo $ifaces | cut -f2 -d ' ') +ip -n $ns link set $nic0 master bond0 +ip -n $ns link set $nic1 master bond0 + +# create offloaded SAs, both in and out +ip -n $ns x p add dir out src $srcip/24 dst $dstip/24 \ + tmpl proto esp src $srcip dst $dstip spi 9 \ + mode transport reqid 42 + +ip -n $ns x p add dir in src $dstip/24 dst $srcip/24 \ + tmpl proto esp src $dstip dst $srcip spi 9 \ + mode transport reqid 42 + +ip -n $ns x s add proto esp src $srcip dst $dstip spi 9 \ + mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \ + offload dev bond0 dir out + +ip -n $ns x s add proto esp src $dstip dst $srcip spi 9 \ + mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \ + offload dev bond0 dir in + +# does offload show up in ip output +lines=`ip -n $ns x s list | grep -c "crypto offload parameters: dev bond0 dir"` +if [ $lines -ne 2 ] ; then + echo "FAIL: bond_ipsec_offload SA offload missing from list output" + ret=1 +fi + +# we didn't create a peer, make sure we can Tx by adding a permanent neighbour +# this need to be added after enslave +ip -n $ns neigh add $dstip dev bond0 lladdr 00:11:22:33:44:55 + +# start Offload testing +test_offload + +# do failover +ip -n $ns link set $active_slave down +slowwait 5 active_slave_changed $active_slave +test_offload + +# make sure offload get removed from driver +ip -n $ns x s flush +ip -n $ns x p flush +line0=$(grep -c "SA count=0" $ipsec0) +line1=$(grep -c "SA count=0" $ipsec1) +if [ $line0 -ne 1 -o $line1 -ne 1 ] ; then + echo "FAIL: bond_ipsec_offload SA not removed from driver" + ret=1 +else + echo "PASS: bond_ipsec_offload SA removed from driver" +fi + +exit $ret diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index dad4e5fda4db..054fb772846f 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -9,3 +9,7 @@ CONFIG_NET_CLS_FLOWER=y CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y +CONFIG_INET_ESP=y +CONFIG_INET_ESP_OFFLOAD=y +CONFIG_XFRM_USER=m +CONFIG_NETDEVSIM=m
Hangbin Liu liuhangbin@gmail.com writes:
This introduces a test for IPSec offload over bonding, utilizing netdevsim for the testing process, as veth interfaces do not support IPSec offload. The test will ensure that the IPSec offload functionality remains operational even after a failover event occurs in the bonding configuration.
Signed-off-by: Hangbin Liu liuhangbin@gmail.com
.../selftests/drivers/net/bonding/Makefile | 3 +- .../drivers/net/bonding/bond_ipsec_offload.sh | 155 ++++++++++++++++++ .../selftests/drivers/net/bonding/config | 4 + 3 files changed, 161 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh
diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 2b10854e4b1e..d5a7de16d33a 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -10,7 +10,8 @@ TEST_PROGS := \ mode-2-recovery-updelay.sh \ bond_options.sh \ bond-eth-type-change.sh \
- bond_macvlan_ipvlan.sh
- bond_macvlan_ipvlan.sh \
- bond_ipsec_offload.sh
TEST_FILES := \ lag_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh new file mode 100755 index 000000000000..169866b47a67 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_ipsec_offload.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0
+# IPsec over bonding offload test: +# +# +----------------+ +# | bond0 | +# | | | +# | eth0 eth1 | +# +---+-------+----+ +# +# We use netdevsim instead of physical interfaces +#------------------------------------------------------------------- +# Example commands +# ip x s add proto esp src 192.0.2.1 dst 192.0.2.2 \ +# spi 0x07 mode transport reqid 0x07 replay-window 32 \ +# aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \ +# sel src 192.0.2.1/24 dst 192.0.2.2/24 +# offload dev bond0 dir out +# ip x p add dir out src 192.0.2.1/24 dst 192.0.2.2/24 \ +# tmpl proto esp src 192.0.2.1 dst 192.0.2.2 \ +# spi 0x07 mode transport reqid 0x07 +# +#-------------------------------------------------------------------
+lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/lib.sh +algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128" +srcip=192.0.2.1 +dstip=192.0.2.2 +ipsec0=/sys/kernel/debug/netdevsim/netdevsim0/ports/0/ipsec +ipsec1=/sys/kernel/debug/netdevsim/netdevsim0/ports/1/ipsec +ret=0
+cleanup() +{
- modprobe -r netdevsim
- cleanup_ns $ns
+}
+active_slave_changed() +{
local old_active_slave=$1
local new_active_slave=$(ip -n ${ns} -d -j link show bond0 | \
jq -r ".[].linkinfo.info_data.active_slave")
[ "$new_active_slave" != "$old_active_slave" -a "$new_active_slave" != "null" ]
+}
+test_offload() +{
- # use ping to exercise the Tx path
- ip netns exec $ns ping -I bond0 -c 3 -W 1 -i 0 $dstip >/dev/null
- active_slave=$(ip -n ${ns} -d -j link show bond0 | \
jq -r ".[].linkinfo.info_data.active_slave")
- if [ $active_slave = $nic0 ]; then
sysfs=$ipsec0
- elif [ $active_slave = $nic1 ]; then
sysfs=$ipsec1
- else
echo "FAIL: bond_ipsec_offload invalid active_slave $active_slave"
ret=1
- fi
- # The tx/rx order in sysfs may changed after failover
- if grep -q "SA count=2 tx=3" $sysfs && grep -q "tx ipaddr=$dstip" $sysfs; then
echo "PASS: bond_ipsec_offload has correct tx count with link ${active_slave}"
- else
echo "FAIL: bond_ipsec_offload incorrect tx count with link ${active_slave}"
ret=1
- fi
lib.sh got all sorts of logging and checking helpers that were previously in forwarding/, I think it makes sense to use them. Would the following make sense to you?
test_offload() { # use ping to exercise the Tx path ip netns exec $ns ping -I bond0 -c 3 -W 1 -i 0 $dstip >/dev/null
active_slave=$(ip -n ${ns} -d -j link show bond0 | \ jq -r ".[].linkinfo.info_data.active_slave")
RET=0
if [ $active_slave = $nic0 ]; then sysfs=$ipsec0 elif [ $active_slave = $nic1 ]; then sysfs=$ipsec1 else check_err 1 "bond_ipsec_offload invalid active_slave $active_slave" fi
# The tx/rx order in sysfs may changed after failover grep -q "SA count=2 tx=3" $sysfs && grep -q "tx ipaddr=$dstip" $sysfs check_err $? "incorrect tx count with link ${active_slave}"
log_test bond_ipsec_offload }
... etc. below.
+}
+if ! mount | grep -q debugfs; then
- mount -t debugfs none /sys/kernel/debug/ &> /dev/null
Clean this up at exit?
defer umount /sys/kernel/debug/
(But then the cleanup trap needs to be registered sooner, and cleanup() needs to invoke defer_scopes_cleanup.)
+fi
+# setup netdevsim since dummy/veth dev doesn't have offload support +if [ ! -w /sys/bus/netdevsim/new_device ] ; then
- modprobe -q netdevsim
- if [ $? -ne 0 ]; then
echo "SKIP: can't load netdevsim for ipsec offload"
exit $ksft_skip
- fi
And here you can just schedule a cleanup, as above.
defer modprobe -r netdevsim
+fi
+trap cleanup EXIT
+setup_ns ns
defer cleanup_ns $ns
And at that point you can drop cleanup altogether, and just have:
trap defer_scopes_cleanup EXIT
+ip -n $ns link add bond0 type bond mode active-backup miimon 100 +ip -n $ns addr add $srcip/24 dev bond0 +ip -n $ns link set bond0 up
+ifaces=$(ip netns exec $ns bash -c '
- sysfsnet=/sys/bus/netdevsim/devices/netdevsim0/net/
- echo "0 2" > /sys/bus/netdevsim/new_device
- while [ ! -d $sysfsnet ] ; do :; done
- udevadm settle
- ls $sysfsnet
+') +nic0=$(echo $ifaces | cut -f1 -d ' ') +nic1=$(echo $ifaces | cut -f2 -d ' ') +ip -n $ns link set $nic0 master bond0 +ip -n $ns link set $nic1 master bond0
+# create offloaded SAs, both in and out +ip -n $ns x p add dir out src $srcip/24 dst $dstip/24 \
- tmpl proto esp src $srcip dst $dstip spi 9 \
- mode transport reqid 42
+ip -n $ns x p add dir in src $dstip/24 dst $srcip/24 \
- tmpl proto esp src $dstip dst $srcip spi 9 \
- mode transport reqid 42
+ip -n $ns x s add proto esp src $srcip dst $dstip spi 9 \
- mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \
- offload dev bond0 dir out
+ip -n $ns x s add proto esp src $dstip dst $srcip spi 9 \
- mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \
- offload dev bond0 dir in
+# does offload show up in ip output +lines=`ip -n $ns x s list | grep -c "crypto offload parameters: dev bond0 dir"` +if [ $lines -ne 2 ] ; then
- echo "FAIL: bond_ipsec_offload SA offload missing from list output"
- ret=1
+fi
+# we didn't create a peer, make sure we can Tx by adding a permanent neighbour +# this need to be added after enslave +ip -n $ns neigh add $dstip dev bond0 lladdr 00:11:22:33:44:55
+# start Offload testing +test_offload
+# do failover +ip -n $ns link set $active_slave down +slowwait 5 active_slave_changed $active_slave +test_offload
Hm, active_slave being overriden in the function is a bit sneaky. But shifting the assignment out of the function is not great, because then it would just needs to be done twice. Ho hum. This might just be the least annoying way to write it after all.
+# make sure offload get removed from driver +ip -n $ns x s flush +ip -n $ns x p flush +line0=$(grep -c "SA count=0" $ipsec0) +line1=$(grep -c "SA count=0" $ipsec1) +if [ $line0 -ne 1 -o $line1 -ne 1 ] ; then
- echo "FAIL: bond_ipsec_offload SA not removed from driver"
- ret=1
+else
- echo "PASS: bond_ipsec_offload SA removed from driver"
+fi
+exit $ret
With log_test this would be. It merges results from individual tests to get the right exit status.
exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index dad4e5fda4db..054fb772846f 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -9,3 +9,7 @@ CONFIG_NET_CLS_FLOWER=y CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y +CONFIG_INET_ESP=y +CONFIG_INET_ESP_OFFLOAD=y +CONFIG_XFRM_USER=m +CONFIG_NETDEVSIM=m
linux-kselftest-mirror@lists.linaro.org