3.16.52-rc1 review patch. If anyone has any objections, please let me know.
------------------
From: Paolo Abeni pabeni@redhat.com
commit bc044e8db7962e727a75b591b9851ff2ac5cf846 upstream.
The UDP early demux can leverate the rx dst cache even for multicast unconnected sockets.
In such scenario the ipv4 source address is validated only on the first packet in the given flow. After that, when we fetch the dst entry from the socket rx cache, we stop enforcing the rp_filter and we even start accepting any kind of martian addresses.
Disabling the dst cache for unconnected multicast socket will cause large performace regression, nearly reducing by half the max ingress tput.
Instead we factor out a route helper to completely validate an skb source address for multicast packets and we call it from the UDP early demux for mcast packets landing on unconnected sockets, after successful fetching the related cached dst entry.
This still gives a measurable, but limited performance regression:
rp_filter = 0 rp_filter = 1 edmux disabled: 1182 Kpps 1127 Kpps edmux before: 2238 Kpps 2238 Kpps edmux after: 2037 Kpps 2019 Kpps
The above figures are on top of current net tree. Applying the net-next commit 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") the delta with rp_filter == 0 will decrease even more.
Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux") Signed-off-by: Paolo Abeni pabeni@redhat.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Ben Hutchings ben@decadent.org.uk --- include/net/route.h | 4 +++- net/ipv4/route.c | 46 ++++++++++++++++++++++++++-------------------- net/ipv4/udp.c | 13 ++++++++++++- 3 files changed, 41 insertions(+), 22 deletions(-)
--- a/include/net/route.h +++ b/include/net/route.h @@ -159,7 +159,9 @@ static inline struct rtable *ip_route_ou fl4->fl4_gre_key = gre_key; return ip_route_output_key(net, fl4); } - +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin);
--- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1440,40 +1440,53 @@ static struct rtable *rt_dst_alloc(struc }
/* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - struct rtable *rth; - struct in_device *in_dev = __in_dev_get_rcu(dev); - u32 itag = 0; int err;
/* Primary sanity checks. */ - if (in_dev == NULL) return -EINVAL;
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) - goto e_inval; + return -EINVAL;
if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) if (ipv4_is_loopback(saddr)) - goto e_inval; + return -EINVAL;
if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) - goto e_inval; + return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); + in_dev, itag); if (err < 0) - goto e_err; + return err; } + return 0; +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct rtable *rth; + u32 itag = 0; + int err; + + err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + if (err) + return err; + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) - goto e_nobufs; + return -ENOBUFS;
#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1502,13 +1515,6 @@ static int ip_route_input_mc(struct sk_b
skb_dst_set(skb, &rth->dst); return 0; - -e_nobufs: - return -ENOBUFS; -e_inval: - return -EINVAL; -e_err: - return err; }
--- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1953,6 +1953,7 @@ static struct sock *__udp4_lib_demux_loo int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk; @@ -1969,7 +1970,7 @@ int udp_v4_early_demux(struct sk_buff *s
if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + in_dev = __in_dev_get_rcu(skb->dev);
if (!in_dev) return 0; @@ -2001,6 +2002,8 @@ int udp_v4_early_demux(struct sk_buff *s if (dst) dst = dst_check(dst, 0); if (dst) { + u32 itag = 0; + /* DST_NOCACHE can not be used without taking a reference */ if (dst->flags & DST_NOCACHE) { if (likely(atomic_inc_not_zero(&dst->__refcnt))) @@ -2008,6 +2011,14 @@ int udp_v4_early_demux(struct sk_buff *s } else { skb_dst_set_noref(skb, dst); } + + /* for unconnected multicast sockets we need to validate + * the source on each packet + */ + if (!inet_sk(sk)->inet_daddr && in_dev) + return ip_mc_validate_source(skb, iph->daddr, + iph->saddr, iph->tos, + skb->dev, in_dev, &itag); } return 0; }