Following commits are pre-requisite for the commit c65f27b9c - 1dbf1d590 (net: Add locking to protect skb->dev access in ip_output) - 719a402cf (net: netdevice: Add operation ndo_sk_get_lower_dev)
Kuniyuki Iwashima (1): tls: Use __sk_dst_get() and dst_dev_rcu() in get_netdev_for_sock().
Sharath Chandra Vurukala (1): net: Add locking to protect skb->dev access in ip_output
Tariq Toukan (1): net: netdevice: Add operation ndo_sk_get_lower_dev
include/linux/netdevice.h | 4 ++++ include/net/dst.h | 12 ++++++++++++ net/core/dev.c | 33 +++++++++++++++++++++++++++++++++ net/ipv4/ip_output.c | 16 +++++++++++----- net/tls/tls_device.c | 18 ++++++++++-------- 5 files changed, 70 insertions(+), 13 deletions(-)
From: Sharath Chandra Vurukala quic_sharathv@quicinc.com
[ Upstream commit 1dbf1d590d10a6d1978e8184f8dfe20af22d680a]
In ip_output() skb->dev is updated from the skb_dst(skb)->dev this can become invalid when the interface is unregistered and freed,
Introduced new skb_dst_dev_rcu() function to be used instead of skb_dst_dev() within rcu_locks in ip_output.This will ensure that all the skb's associated with the dev being deregistered will be transnmitted out first, before freeing the dev.
Given that ip_output() is called within an rcu_read_lock() critical section or from a bottom-half context, it is safe to introduce an RCU read-side critical section within it.
Multiple panic call stacks were observed when UL traffic was run in concurrency with device deregistration from different functions, pasting one sample for reference.
[496733.627565][T13385] Call trace: [496733.627570][T13385] bpf_prog_ce7c9180c3b128ea_cgroupskb_egres+0x24c/0x7f0 [496733.627581][T13385] __cgroup_bpf_run_filter_skb+0x128/0x498 [496733.627595][T13385] ip_finish_output+0xa4/0xf4 [496733.627605][T13385] ip_output+0x100/0x1a0 [496733.627613][T13385] ip_send_skb+0x68/0x100 [496733.627618][T13385] udp_send_skb+0x1c4/0x384 [496733.627625][T13385] udp_sendmsg+0x7b0/0x898 [496733.627631][T13385] inet_sendmsg+0x5c/0x7c [496733.627639][T13385] __sys_sendto+0x174/0x1e4 [496733.627647][T13385] __arm64_sys_sendto+0x28/0x3c [496733.627653][T13385] invoke_syscall+0x58/0x11c [496733.627662][T13385] el0_svc_common+0x88/0xf4 [496733.627669][T13385] do_el0_svc+0x2c/0xb0 [496733.627676][T13385] el0_svc+0x2c/0xa4 [496733.627683][T13385] el0t_64_sync_handler+0x68/0xb4 [496733.627689][T13385] el0t_64_sync+0x1a4/0x1a8
Changes in v3: - Replaced WARN_ON() with WARN_ON_ONCE(), as suggested by Willem de Bruijn. - Dropped legacy lines mistakenly pulled in from an outdated branch.
Changes in v2: - Addressed review comments from Eric Dumazet - Used READ_ONCE() to prevent potential load/store tearing - Added skb_dst_dev_rcu() and used along with rcu_read_lock() in ip_output
Signed-off-by: Sharath Chandra Vurukala quic_sharathv@quicinc.com Reviewed-by: Eric Dumazet edumazet@google.com Link: https://patch.msgid.link/20250730105118.GA26100@hu-sharathv-hyd.qualcomm.com Signed-off-by: Jakub Kicinski kuba@kernel.org [ Keerthana: Backported the patch to v5.10.y ] Signed-off-by: Keerthana K keerthana.kalyanasundaram@broadcom.com --- include/net/dst.h | 12 ++++++++++++ net/ipv4/ip_output.c | 16 +++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/include/net/dst.h b/include/net/dst.h index 9114272f8100..b3522d3de8c8 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -547,6 +547,18 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); }
+static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) +{ + /* In the future, use rcu_dereference(dst->dev) */ + WARN_ON_ONCE(!rcu_read_lock_held()); + return READ_ONCE(dst->dev); +} + +static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) +{ + return dst_dev_rcu(skb_dst(skb)); +} + struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1e430e135aa6..3369d5ab1eff 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -429,17 +429,23 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; + struct net_device *dev, *indev = skb->dev; + int ret_val; + + rcu_read_lock(); + dev = skb_dst_dev_rcu(skb);
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev; skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, - ip_finish_output, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, indev, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); + rcu_read_unlock(); + return ret_val; } EXPORT_SYMBOL(ip_output);
From: Tariq Toukan tariqt@nvidia.com
[ Upstream commit 719a402cf60311b1cdff3f6320abaecdcc5e46b7]
ndo_sk_get_lower_dev returns the lower netdev that corresponds to a given socket. Additionally, we implement a helper netdev_sk_get_lowest_dev() to get the lowest one in chain.
Signed-off-by: Tariq Toukan tariqt@nvidia.com Reviewed-by: Boris Pismenny borisp@nvidia.com Signed-off-by: Jakub Kicinski kuba@kernel.org [ Keerthana: Backported the patch to v5.10.y ] Signed-off-by: Keerthana K keerthana.kalyanasundaram@broadcom.com --- include/linux/netdevice.h | 4 ++++ net/core/dev.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d3a3e77a18df..c9f2a88a6c83 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1435,6 +1435,8 @@ struct net_device_ops { struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, struct sk_buff *skb, bool all_slaves); + struct net_device* (*ndo_sk_get_lower_dev)(struct net_device *dev, + struct sock *sk); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, @@ -2914,6 +2916,8 @@ int init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves); +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index c0dc524548ee..ad2be47b48a9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8169,6 +8169,39 @@ struct net_device *netdev_get_xmit_slave(struct net_device *dev, } EXPORT_SYMBOL(netdev_get_xmit_slave);
+static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, + struct sock *sk) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_sk_get_lower_dev) + return NULL; + return ops->ndo_sk_get_lower_dev(dev, sk); +} + +/** + * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket + * @dev: device + * @sk: the socket + * + * %NULL is returned if no lower device is found. + */ + +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk) +{ + struct net_device *lower; + + lower = netdev_sk_get_lower_dev(dev, sk); + while (lower) { + dev = lower; + lower = netdev_sk_get_lower_dev(dev, sk); + } + + return dev; +} +EXPORT_SYMBOL(netdev_sk_get_lowest_dev); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter;
From: Kuniyuki Iwashima kuniyu@google.com
[ Upstream commit c65f27b9c3be2269918e1cbad6d8884741f835c5 ]
get_netdev_for_sock() is called during setsockopt(), so not under RCU.
Using sk_dst_get(sk)->dev could trigger UAF.
Let's use __sk_dst_get() and dst_dev_rcu().
Note that the only ->ndo_sk_get_lower_dev() user is bond_sk_get_lower_dev(), which uses RCU.
Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Kuniyuki Iwashima kuniyu@google.com Reviewed-by: Eric Dumazet edumazet@google.com Reviewed-by: Sabrina Dubroca sd@queasysnail.net Link: https://patch.msgid.link/20250916214758.650211-6-kuniyu@google.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org [ Keerthana: Backported the patch to v5.10.y ] Signed-off-by: Keerthana K keerthana.kalyanasundaram@broadcom.com --- net/tls/tls_device.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 8e89ff403073..8cf4e1651b0c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -113,17 +113,19 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx) /* We assume that the socket is already connected */ static struct net_device *get_netdev_for_sock(struct sock *sk) { - struct dst_entry *dst = sk_dst_get(sk); - struct net_device *netdev = NULL; + struct net_device *dev, *lowest_dev = NULL; + struct dst_entry *dst;
- if (likely(dst)) { - netdev = dst->dev; - dev_hold(netdev); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (likely(dev)) { + lowest_dev = netdev_sk_get_lowest_dev(dev, sk); + dev_hold(lowest_dev); } + rcu_read_unlock();
- dst_release(dst); - - return netdev; + return lowest_dev; }
static void destroy_record(struct tls_record_info *record)
linux-stable-mirror@lists.linaro.org