xfrm_bydst_resize() calls synchronize_rcu() while holding hash_resize_mutex. But then on PREEMPT_RT configurations, xfrm_policy_lookup_bytype() may acquire that mutex while running in an RCU read side critical section. This results in a deadlock.
In fact the scope of hash_resize_mutex is way beyond the purpose of xfrm_policy_lookup_bytype() to just fetch a coherent and stable policy for a given destination/direction, along with other details.
The lower level net->xfrm.xfrm_policy_lock, which among other things protects per destination/direction references to policy entries, is enough to serialize and benefit from priority inheritance against the write side. As a bonus, it makes it officially a per network namespace synchronization business where a policy table resize on namespace A shouldn't block a policy lookup on namespace B.
Fixes: 77cc278f7b20 (xfrm: policy: Use sequence counters with associated lock) Cc: stable@vger.kernel.org Cc: Ahmed S. Darwish a.darwish@linutronix.de Cc: Peter Zijlstra (Intel) peterz@infradead.org Cc: Varad Gautam varad.gautam@suse.com Cc: Steffen Klassert steffen.klassert@secunet.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: David S. Miller davem@davemloft.net Signed-off-by: Frederic Weisbecker frederic@kernel.org --- include/net/netns/xfrm.h | 1 + net/xfrm/xfrm_policy.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index e816b6a3ef2b..9b376b87bd54 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -74,6 +74,7 @@ struct netns_xfrm { #endif spinlock_t xfrm_state_lock; seqcount_spinlock_t xfrm_state_hash_generation; + seqcount_spinlock_t xfrm_policy_hash_generation;
spinlock_t xfrm_policy_lock; struct mutex xfrm_cfg_mutex; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ce500f847b99..46a6d15b66d6 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -155,7 +155,6 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly;
static struct kmem_cache *xfrm_dst_cache __ro_after_init; -static __read_mostly seqcount_mutex_t xfrm_policy_hash_generation;
static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; @@ -585,7 +584,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) return;
spin_lock_bh(&net->xfrm.xfrm_policy_lock); - write_seqcount_begin(&xfrm_policy_hash_generation); + write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); @@ -596,7 +595,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask;
- write_seqcount_end(&xfrm_policy_hash_generation); + write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
synchronize_rcu(); @@ -1245,7 +1244,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
spin_lock_bh(&net->xfrm.xfrm_policy_lock); - write_seqcount_begin(&xfrm_policy_hash_generation); + write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
/* make sure that we can insert the indirect policies again before * we start with destructive action. @@ -1354,7 +1353,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
out_unlock: __xfrm_policy_inexact_flush(net); - write_seqcount_end(&xfrm_policy_hash_generation); + write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
mutex_unlock(&hash_resize_mutex); @@ -2095,9 +2094,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, rcu_read_lock(); retry: do { - sequence = read_seqcount_begin(&xfrm_policy_hash_generation); + sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); chain = policy_hash_direct(net, daddr, saddr, family, dir); - } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); + } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence));
ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { @@ -2128,7 +2127,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, }
skip_inexact: - if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) + if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)) goto retry;
if (ret && !xfrm_pol_hold_rcu(ret)) @@ -4084,6 +4083,7 @@ static int __net_init xfrm_net_init(struct net *net) /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); spin_lock_init(&net->xfrm.xfrm_policy_lock); + seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex);
rv = xfrm_statistics_init(net); @@ -4128,7 +4128,6 @@ void __init xfrm_init(void) { register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); - seqcount_mutex_init(&xfrm_policy_hash_generation, &hash_resize_mutex); xfrm_input_init();
#ifdef CONFIG_XFRM_ESPINTCP
On Mon, Jun 28, 2021 at 03:34:28PM +0200, Frederic Weisbecker wrote:
xfrm_bydst_resize() calls synchronize_rcu() while holding hash_resize_mutex. But then on PREEMPT_RT configurations, xfrm_policy_lookup_bytype() may acquire that mutex while running in an RCU read side critical section. This results in a deadlock.
In fact the scope of hash_resize_mutex is way beyond the purpose of xfrm_policy_lookup_bytype() to just fetch a coherent and stable policy for a given destination/direction, along with other details.
The lower level net->xfrm.xfrm_policy_lock, which among other things protects per destination/direction references to policy entries, is enough to serialize and benefit from priority inheritance against the write side. As a bonus, it makes it officially a per network namespace synchronization business where a policy table resize on namespace A shouldn't block a policy lookup on namespace B.
Fixes: 77cc278f7b20 (xfrm: policy: Use sequence counters with associated lock) Cc: stable@vger.kernel.org Cc: Ahmed S. Darwish a.darwish@linutronix.de Cc: Peter Zijlstra (Intel) peterz@infradead.org Cc: Varad Gautam varad.gautam@suse.com Cc: Steffen Klassert steffen.klassert@secunet.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: David S. Miller davem@davemloft.net Signed-off-by: Frederic Weisbecker frederic@kernel.org
Your patch has a conflicht with ("commit d7b0408934c7 xfrm: policy: Read seqcount outside of rcu-read side in xfrm_policy_lookup_bytype") from Varad. Can you please rebase onto the ipsec tree?
Btw. Varad, your above mentioned patch tried to fix the same issue. Do we still need it, or is it obsolete with the fix from Frederic?
Thanks!
On 6/30/21 8:57 AM, Steffen Klassert wrote:
On Mon, Jun 28, 2021 at 03:34:28PM +0200, Frederic Weisbecker wrote:
xfrm_bydst_resize() calls synchronize_rcu() while holding hash_resize_mutex. But then on PREEMPT_RT configurations, xfrm_policy_lookup_bytype() may acquire that mutex while running in an RCU read side critical section. This results in a deadlock.
In fact the scope of hash_resize_mutex is way beyond the purpose of xfrm_policy_lookup_bytype() to just fetch a coherent and stable policy for a given destination/direction, along with other details.
The lower level net->xfrm.xfrm_policy_lock, which among other things protects per destination/direction references to policy entries, is enough to serialize and benefit from priority inheritance against the write side. As a bonus, it makes it officially a per network namespace synchronization business where a policy table resize on namespace A shouldn't block a policy lookup on namespace B.
Fixes: 77cc278f7b20 (xfrm: policy: Use sequence counters with associated lock) Cc: stable@vger.kernel.org Cc: Ahmed S. Darwish a.darwish@linutronix.de Cc: Peter Zijlstra (Intel) peterz@infradead.org Cc: Varad Gautam varad.gautam@suse.com Cc: Steffen Klassert steffen.klassert@secunet.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: David S. Miller davem@davemloft.net Signed-off-by: Frederic Weisbecker frederic@kernel.org
Your patch has a conflicht with ("commit d7b0408934c7 xfrm: policy: Read seqcount outside of rcu-read side in xfrm_policy_lookup_bytype") from Varad. Can you please rebase onto the ipsec tree?
Btw. Varad, your above mentioned patch tried to fix the same issue. Do we still need it, or is it obsolete with the fix from Frederic?
The patch "xfrm: policy: Read seqcount outside of rcu-read side in xfrm_policy_lookup_bytype" shouldn't be needed after Frederic's fix since the offending mutex is now gone. It can be dropped.
Regards, Varad
Thanks!
On Wed, Jun 30, 2021 at 02:11:24PM +0200, Varad Gautam wrote:
On 6/30/21 8:57 AM, Steffen Klassert wrote:
On Mon, Jun 28, 2021 at 03:34:28PM +0200, Frederic Weisbecker wrote:
xfrm_bydst_resize() calls synchronize_rcu() while holding hash_resize_mutex. But then on PREEMPT_RT configurations, xfrm_policy_lookup_bytype() may acquire that mutex while running in an RCU read side critical section. This results in a deadlock.
In fact the scope of hash_resize_mutex is way beyond the purpose of xfrm_policy_lookup_bytype() to just fetch a coherent and stable policy for a given destination/direction, along with other details.
The lower level net->xfrm.xfrm_policy_lock, which among other things protects per destination/direction references to policy entries, is enough to serialize and benefit from priority inheritance against the write side. As a bonus, it makes it officially a per network namespace synchronization business where a policy table resize on namespace A shouldn't block a policy lookup on namespace B.
Fixes: 77cc278f7b20 (xfrm: policy: Use sequence counters with associated lock) Cc: stable@vger.kernel.org Cc: Ahmed S. Darwish a.darwish@linutronix.de Cc: Peter Zijlstra (Intel) peterz@infradead.org Cc: Varad Gautam varad.gautam@suse.com Cc: Steffen Klassert steffen.klassert@secunet.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: David S. Miller davem@davemloft.net Signed-off-by: Frederic Weisbecker frederic@kernel.org
Your patch has a conflicht with ("commit d7b0408934c7 xfrm: policy: Read seqcount outside of rcu-read side in xfrm_policy_lookup_bytype") from Varad. Can you please rebase onto the ipsec tree?
Btw. Varad, your above mentioned patch tried to fix the same issue. Do we still need it, or is it obsolete with the fix from Frederic?
The patch "xfrm: policy: Read seqcount outside of rcu-read side in xfrm_policy_lookup_bytype" shouldn't be needed after Frederic's fix since the offending mutex is now gone. It can be dropped.
Ok, so I'll revert your patch and apply Frederic's patch on top of that revert.
Thanks a lot everyone!
On Wed, Jun 30, 2021 at 08:57:53AM +0200, Steffen Klassert wrote:
On Mon, Jun 28, 2021 at 03:34:28PM +0200, Frederic Weisbecker wrote:
xfrm_bydst_resize() calls synchronize_rcu() while holding hash_resize_mutex. But then on PREEMPT_RT configurations, xfrm_policy_lookup_bytype() may acquire that mutex while running in an RCU read side critical section. This results in a deadlock.
In fact the scope of hash_resize_mutex is way beyond the purpose of xfrm_policy_lookup_bytype() to just fetch a coherent and stable policy for a given destination/direction, along with other details.
The lower level net->xfrm.xfrm_policy_lock, which among other things protects per destination/direction references to policy entries, is enough to serialize and benefit from priority inheritance against the write side. As a bonus, it makes it officially a per network namespace synchronization business where a policy table resize on namespace A shouldn't block a policy lookup on namespace B.
Fixes: 77cc278f7b20 (xfrm: policy: Use sequence counters with associated lock) Cc: stable@vger.kernel.org Cc: Ahmed S. Darwish a.darwish@linutronix.de Cc: Peter Zijlstra (Intel) peterz@infradead.org Cc: Varad Gautam varad.gautam@suse.com Cc: Steffen Klassert steffen.klassert@secunet.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: David S. Miller davem@davemloft.net Signed-off-by: Frederic Weisbecker frederic@kernel.org
Your patch has a conflicht with ("commit d7b0408934c7 xfrm: policy: Read seqcount outside of rcu-read side in xfrm_policy_lookup_bytype") from Varad. Can you please rebase onto the ipsec tree?
This patch is now applied to the ipsec tree (on top of the revert of commit d7b0408934c7).
Thanks everyone!
On Mon, Jul 05, 2021 at 01:58:50PM +0200, Steffen Klassert wrote:
On Wed, Jun 30, 2021 at 08:57:53AM +0200, Steffen Klassert wrote:
On Mon, Jun 28, 2021 at 03:34:28PM +0200, Frederic Weisbecker wrote:
xfrm_bydst_resize() calls synchronize_rcu() while holding hash_resize_mutex. But then on PREEMPT_RT configurations, xfrm_policy_lookup_bytype() may acquire that mutex while running in an RCU read side critical section. This results in a deadlock.
In fact the scope of hash_resize_mutex is way beyond the purpose of xfrm_policy_lookup_bytype() to just fetch a coherent and stable policy for a given destination/direction, along with other details.
The lower level net->xfrm.xfrm_policy_lock, which among other things protects per destination/direction references to policy entries, is enough to serialize and benefit from priority inheritance against the write side. As a bonus, it makes it officially a per network namespace synchronization business where a policy table resize on namespace A shouldn't block a policy lookup on namespace B.
Fixes: 77cc278f7b20 (xfrm: policy: Use sequence counters with associated lock) Cc: stable@vger.kernel.org Cc: Ahmed S. Darwish a.darwish@linutronix.de Cc: Peter Zijlstra (Intel) peterz@infradead.org Cc: Varad Gautam varad.gautam@suse.com Cc: Steffen Klassert steffen.klassert@secunet.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: David S. Miller davem@davemloft.net Signed-off-by: Frederic Weisbecker frederic@kernel.org
Your patch has a conflicht with ("commit d7b0408934c7 xfrm: policy: Read seqcount outside of rcu-read side in xfrm_policy_lookup_bytype") from Varad. Can you please rebase onto the ipsec tree?
This patch is now applied to the ipsec tree (on top of the revert of commit d7b0408934c7).
Thanks everyone!
Thanks a lot!
linux-stable-mirror@lists.linaro.org