From: Peter Oskolkov posk@google.com
commit 22c2ad616b74f3de2256b242572ab449d031d941 upstream.
In some testing scenarios, dst/route cache can fill up so quickly that even an explicit GC call occasionally fails to clean it up. This leads to sporadically failing calls to dst_alloc and "network unreachable" errors to the user, which is confusing.
This patch adds a diagnostic message to make the cause of the failure easier to determine.
Signed-off-by: Peter Oskolkov posk@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Suraj Jitindar Singh surajjs@amazon.com Cc: stable@vger.kernel.org # 4.19.x --- net/core/dst.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/net/core/dst.c b/net/core/dst.c index 81ccf20e2826..a263309df115 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -98,8 +98,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, struct dst_entry *dst;
if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) + if (ops->gc(ops)) { + printk_ratelimited(KERN_NOTICE "Route cache is full: " + "consider increasing sysctl " + "net.ipv[4|6].route.max_size.\n"); return NULL; + } }
dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
From: Eric Dumazet edumazet@google.com
commit cf86a086a18095e33e0637cb78cda1fcf5280852 upstream.
percpu_counter_add() uses a default batch size which is quite big on platforms with 256 cpus. (2*256 -> 512)
This means dst_entries_get_fast() can be off by +/- 2*(nr_cpus^2) (131072 on servers with 256 cpus)
Reduce the batch size to something more reasonable, and add logic to ip6_dst_gc() to call dst_entries_get_slow() before calling the _very_ expensive fib6_run_gc() function.
Signed-off-by: Eric Dumazet edumazet@google.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Suraj Jitindar Singh surajjs@amazon.com Cc: stable@vger.kernel.org # 4.19.x --- include/net/dst_ops.h | 4 +++- net/core/dst.c | 8 ++++---- net/ipv6/route.c | 3 +++ 3 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 443863c7b8da..88ff7bb2bb9b 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -53,9 +53,11 @@ static inline int dst_entries_get_slow(struct dst_ops *dst) return percpu_counter_sum_positive(&dst->pcpuc_entries); }
+#define DST_PERCPU_COUNTER_BATCH 32 static inline void dst_entries_add(struct dst_ops *dst, int val) { - percpu_counter_add(&dst->pcpuc_entries, val); + percpu_counter_add_batch(&dst->pcpuc_entries, val, + DST_PERCPU_COUNTER_BATCH); }
static inline int dst_entries_init(struct dst_ops *dst) diff --git a/net/core/dst.c b/net/core/dst.c index a263309df115..1a9f84f8cde1 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -97,11 +97,11 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, { struct dst_entry *dst;
- if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) { - printk_ratelimited(KERN_NOTICE "Route cache is full: " - "consider increasing sysctl " - "net.ipv[4|6].route.max_size.\n"); + pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); return NULL; } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7b41d5d3575f..d8944ae0171a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2778,6 +2778,9 @@ static int ip6_dst_gc(struct dst_ops *ops) int entries;
entries = dst_entries_get_fast(ops); + if (entries > rt_max_size) + entries = dst_entries_get_slow(ops); + if (time_after(rt_last_gc + rt_min_interval, jiffies) && entries <= rt_max_size) goto out;
From: Eric Dumazet edumazet@google.com
commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db upstream.
Reads and Writes to ip6_rt_gc_expire always have been racy, as syzbot reported lately [1]
There is a possible risk of under-flow, leading to unexpected high value passed to fib6_run_gc(), although I have not observed this in the field.
Hosts hitting ip6_dst_gc() very hard are under pretty bad state anyway.
[1] BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc
read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30
read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30
value changed: 0x00000bb3 -> 0x00000ba9
Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Reviewed-by: David Ahern dsahern@kernel.org Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org [ 4.19: context adjustment in include/net/netns/ipv6.h ] Signed-off-by: Suraj Jitindar Singh surajjs@amazon.com Cc: stable@vger.kernel.org # 4.19.x --- include/net/netns/ipv6.h | 4 ++-- net/ipv6/route.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index f0e396ab9bec..8f4d013fa05f 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -72,8 +72,8 @@ struct netns_ipv6 { struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; bool fib6_has_custom_rules; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d8944ae0171a..0f8d7786e8e8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2775,6 +2775,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries;
entries = dst_entries_get_fast(ops); @@ -2785,13 +2786,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out;
- net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; }
@@ -5343,7 +5344,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
- net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
ret = 0; out:
From: Jon Maxwell jmaxwell37@gmail.com
commit af6d10345ca76670c1b7c37799f0d5576ccef277 upstream.
In ip6_dst_gc() replace:
if (entries > gc_thresh)
With:
if (entries > ops->gc_thresh)
Sending Ipv6 packets in a loop via a raw socket triggers an issue where a route is cloned by ip6_rt_cache_alloc() for each packet sent. This quickly consumes the Ipv6 max_size threshold which defaults to 4096 resulting in these warnings:
[1] 99.187805] dst_alloc: 7728 callbacks suppressed [2] Route cache is full: consider increasing sysctl net.ipv6.route.max_size. . . [300] Route cache is full: consider increasing sysctl net.ipv6.route.max_size.
When this happens the packet is dropped and sendto() gets a network is unreachable error:
remaining pkt 200557 errno 101 remaining pkt 196462 errno 101 . . remaining pkt 126821 errno 101
Implement David Aherns suggestion to remove max_size check seeing that Ipv6 has a GC to manage memory usage. Ipv4 already does not check max_size.
Here are some memory comparisons for Ipv4 vs Ipv6 with the patch:
Test by running 5 instances of a program that sends UDP packets to a raw socket 5000000 times. Compare Ipv4 and Ipv6 performance with a similar program.
Ipv4:
Before test:
MemFree: 29427108 kB Slab: 237612 kB
ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 2881 3990 192 42 2 : tunables 0 0 0
During test:
MemFree: 29417608 kB Slab: 247712 kB
ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 44394 44394 192 42 2 : tunables 0 0 0
After test:
MemFree: 29422308 kB Slab: 238104 kB
ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0
Ipv6 with patch:
Errno 101 errors are not observed anymore with the patch.
Before test:
MemFree: 29422308 kB Slab: 238104 kB
ip6_dst_cache 1912 2528 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0
During Test:
MemFree: 29431516 kB Slab: 240940 kB
ip6_dst_cache 11980 12064 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0
After Test:
MemFree: 29441816 kB Slab: 238132 kB
ip6_dst_cache 1902 2432 256 32 2 : tunables 0 0 0 xfrm_dst_cache 0 0 320 25 2 : tunables 0 0 0 ip_dst_cache 3048 4116 192 42 2 : tunables 0 0 0
Tested-by: Andrea Mayer andrea.mayer@uniroma2.it Signed-off-by: Jon Maxwell jmaxwell37@gmail.com Reviewed-by: David Ahern dsahern@kernel.org Link: https://lore.kernel.org/r/20230112012532.311021-1-jmaxwell37@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Suraj Jitindar Singh surajjs@amazon.com Cc: stable@vger.kernel.org # 4.19.x --- include/net/dst_ops.h | 2 +- net/core/dst.c | 8 ++------ net/ipv6/route.c | 13 +++++-------- 3 files changed, 8 insertions(+), 15 deletions(-)
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 88ff7bb2bb9b..632086b2f644 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -16,7 +16,7 @@ struct dst_ops { unsigned short family; unsigned int gc_thresh;
- int (*gc)(struct dst_ops *ops); + void (*gc)(struct dst_ops *ops); struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); unsigned int (*default_advmss)(const struct dst_entry *); unsigned int (*mtu)(const struct dst_entry *); diff --git a/net/core/dst.c b/net/core/dst.c index 1a9f84f8cde1..1b1677683b97 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -99,12 +99,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
if (ops->gc && !(flags & DST_NOCOUNT) && - dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) { - pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); - return NULL; - } - } + dst_entries_get_fast(ops) > ops->gc_thresh) + ops->gc(ops);
dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0f8d7786e8e8..9dbc9c0cbc5a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -92,7 +92,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev, int how); -static int ip6_dst_gc(struct dst_ops *ops); +static void ip6_dst_gc(struct dst_ops *ops);
static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -2767,11 +2767,10 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, return dst; }
-static int ip6_dst_gc(struct dst_ops *ops) +static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; @@ -2779,11 +2778,10 @@ static int ip6_dst_gc(struct dst_ops *ops) int entries;
entries = dst_entries_get_fast(ops); - if (entries > rt_max_size) + if (entries > ops->gc_thresh) entries = dst_entries_get_slow(ops);
- if (time_after(rt_last_gc + rt_min_interval, jiffies) && - entries <= rt_max_size) + if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out;
fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); @@ -2793,7 +2791,6 @@ static int ip6_dst_gc(struct dst_ops *ops) out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); - return entries > rt_max_size; }
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, @@ -5336,7 +5333,7 @@ static int __net_init ip6_route_net_init(struct net *net) #endif
net->ipv6.sysctl.flush_delay = 0; - net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
On Fri, Jan 12, 2024 at 04:53:05PM -0800, Suraj Jitindar Singh wrote:
From: Peter Oskolkov posk@google.com
commit 22c2ad616b74f3de2256b242572ab449d031d941 upstream.
In some testing scenarios, dst/route cache can fill up so quickly that even an explicit GC call occasionally fails to clean it up. This leads to sporadically failing calls to dst_alloc and "network unreachable" errors to the user, which is confusing.
This patch adds a diagnostic message to make the cause of the failure easier to determine.
Signed-off-by: Peter Oskolkov posk@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Suraj Jitindar Singh surajjs@amazon.com Cc: stable@vger.kernel.org # 4.19.x
net/core/dst.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
All now queued up, thanks.
greg k-h
linux-stable-mirror@lists.linaro.org