[PATCH 5.15 085/691] inet: inet_defrag: prevent sk release while still in use

15 Oct 2024

5.15-stable review patch.  If anyone has any objections, please let me know.
------------------
From: Florian Westphal fw@strlen.de
commit 18685451fc4e546fc0e718580d32df3c0e5c8272 upstream.
ip_local_out() and other functions can pass skb->sk as a function argument.
If the skb is a fragment and reassembly happens before such a function call
returns, the sk must not be released.
This affects skb fragments reassembled via netfilter or similar
modules, e.g. openvswitch or ct_act.c, when run as part of the tx pipeline.
Eric Dumazet made an initial analysis of this bug.  Quoting Eric:
  Calling ip_defrag() in output path is also implying skb_orphan(),
  which is buggy because output path relies on sk not disappearing.
A relevant old patch about the issue was :
  8282f27449bf ("inet: frag: Always orphan skbs inside ip_defrag()")
[..]
net/ipv4/ip_output.c depends on skb->sk being set, and probably to an
  inet socket, not an arbitrary one.
If we orphan the packet in ipvlan, then downstream things like FQ
  packet scheduler will not work properly.
We need to change ip_defrag() to only use skb_orphan() when really
  needed, ie whenever frag_list is going to be used.
Eric suggested to stash sk in fragment queue and made an initial patch.
However there is a problem with this:
If the skb is refragmented again right after, ip_do_fragment() will copy
head->sk to the new fragments and set the destructor to sock_wfree.
IOW, we have no choice but to fix up sk_wmem accounting to reflect the
fully reassembled skb, else wmem will underflow.
This change moves the orphan down into the core, to last possible moment.
As ip_defrag_offset is aliased with sk_buff->sk member, we must move the
offset into the FRAG_CB, else skb->sk gets clobbered.
This allows delaying the orphaning long enough to learn whether the skb has
to be queued or whether the skb is completing the reasm queue.
In the former case, things work as before, skb is orphaned.  This is
safe because skb gets queued/stolen and won't continue past reasm engine.
In the latter case, we will steal the skb->sk reference, reattach it to
the head skb, and fix up wmem accounting when inet_frag inflates truesize.
Fixes: 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn().")
Diagnosed-by: Eric Dumazet edumazet@google.com
Reported-by: xingwei lee xrivendell7@gmail.com
Reported-by: yue sun samsun1006219@gmail.com
Reported-by: syzbot+e5167d7144a62715044c@syzkaller.appspotmail.com
Signed-off-by: Florian Westphal fw@strlen.de
Reviewed-by: Eric Dumazet edumazet@google.com
Link: https://lore.kernel.org/r/20240326101845.30836-1-fw@strlen.de
Signed-off-by: Paolo Abeni pabeni@redhat.com
Signed-off-by: Saeed Mirzamohammadi saeed.mirzamohammadi@oracle.com
Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
---
 include/linux/skbuff.h                  |    7 ---
 net/ipv4/inet_fragment.c                |   70 ++++++++++++++++++++++++++------
 net/ipv4/ip_fragment.c                  |    2 
 net/ipv6/netfilter/nf_conntrack_reasm.c |    2 
 4 files changed, 60 insertions(+), 21 deletions(-)

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -660,8 +660,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@rbnode: RB tree node, alternative to next/prev for netem/tcp
  *	@list: queue head
  *	@sk: Socket we are owned by
- *	@ip_defrag_offset: (aka @sk) alternate use of @sk, used in
- *		fragmentation management
  *	@dev: Device we arrived on/are leaving by
  *	@dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
  *	@cb: Control buffer. Free for use by every layer. Put private vars here
@@ -778,10 +776,7 @@ struct sk_buff {
    	struct list_head	list;
    };
-	union {
-		struct sock		*sk;
-		int			ip_defrag_offset;
-	};
+	struct sock		*sk;
union {
    	ktime_t		tstamp;
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -24,6 +24,8 @@
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include "../core/sock_destructor.h"
+
 /* Use skb->cb to track consecutive/adjacent fragments coming at
  * the end of the queue. Nodes in the rb-tree queue will
  * contain "runs" of one or more adjacent fragments.
@@ -39,6 +41,7 @@ struct ipfrag_skb_cb {
    };
    struct sk_buff		*next_frag;
    int			frag_run_len;
+	int			ip_defrag_offset;
 };
#define FRAG_CB(skb)		((struct ipfrag_skb_cb *)((skb)->cb))
@@ -390,12 +393,12 @@ int inet_frag_queue_insert(struct inet_f
     */
    if (!last)
    	fragrun_create(q, skb);  /* First fragment. */
-	else if (last->ip_defrag_offset + last->len < end) {
+	else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) {
    	/* This is the common case: skb goes to the end. */
    	/* Detect and discard overlaps. */
-		if (offset < last->ip_defrag_offset + last->len)
+		if (offset < FRAG_CB(last)->ip_defrag_offset + last->len)
    		return IPFRAG_OVERLAP;
-		if (offset == last->ip_defrag_offset + last->len)
+		if (offset == FRAG_CB(last)->ip_defrag_offset + last->len)
    		fragrun_append_to_last(q, skb);
    	else
    		fragrun_create(q, skb);
@@ -412,13 +415,13 @@ int inet_frag_queue_insert(struct inet_f
parent = *rbn;
    		curr = rb_to_skb(parent);
-			curr_run_end = curr->ip_defrag_offset +
+			curr_run_end = FRAG_CB(curr)->ip_defrag_offset +
    				FRAG_CB(curr)->frag_run_len;
-			if (end <= curr->ip_defrag_offset)
+			if (end <= FRAG_CB(curr)->ip_defrag_offset)
    			rbn = &parent->rb_left;
    		else if (offset >= curr_run_end)
    			rbn = &parent->rb_right;
-			else if (offset >= curr->ip_defrag_offset &&
+			else if (offset >= FRAG_CB(curr)->ip_defrag_offset &&
    			 end <= curr_run_end)
    			return IPFRAG_DUP;
    		else
@@ -432,7 +435,7 @@ int inet_frag_queue_insert(struct inet_f
    	rb_insert_color(&skb->rbnode, &q->rb_fragments);
    }
-	skb->ip_defrag_offset = offset;
+	FRAG_CB(skb)->ip_defrag_offset = offset;
return IPFRAG_OK;
 }
@@ -442,13 +445,28 @@ void *inet_frag_reasm_prepare(struct ine
    		      struct sk_buff *parent)
 {
    struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
-	struct sk_buff **nextp;
+	void (*destructor)(struct sk_buff *);
+	unsigned int orig_truesize = 0;
+	struct sk_buff **nextp = NULL;
+	struct sock *sk = skb->sk;
    int delta;
+	if (sk && is_skb_wmem(skb)) {
+		/* TX: skb->sk might have been passed as argument to
+		 * dst->output and must remain valid until tx completes.
+		 *
+		 * Move sk to reassembled skb and fix up wmem accounting.
+		 */
+		orig_truesize = skb->truesize;
+		destructor = skb->destructor;
+	}
+
    if (head != skb) {
    	fp = skb_clone(skb, GFP_ATOMIC);
-		if (!fp)
-			return NULL;
+		if (!fp) {
+			head = skb;
+			goto out_restore_sk;
+		}
    	FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
    	if (RB_EMPTY_NODE(&skb->rbnode))
    		FRAG_CB(parent)->next_frag = fp;
@@ -457,6 +475,12 @@ void *inet_frag_reasm_prepare(struct ine
    				&q->rb_fragments);
    	if (q->fragments_tail == skb)
    		q->fragments_tail = fp;
+
+		if (orig_truesize) {
+			/* prevent skb_morph from releasing sk */
+			skb->sk = NULL;
+			skb->destructor = NULL;
+		}
    	skb_morph(skb, head);
    	FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
    	rb_replace_node(&head->rbnode, &skb->rbnode,
@@ -464,13 +488,13 @@ void *inet_frag_reasm_prepare(struct ine
    	consume_skb(head);
    	head = skb;
    }
-	WARN_ON(head->ip_defrag_offset != 0);
+	WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0);
delta = -head->truesize;
/* Head of list must not be cloned. */
    if (skb_unclone(head, GFP_ATOMIC))
-		return NULL;
+		goto out_restore_sk;
delta += head->truesize;
    if (delta)
@@ -486,7 +510,7 @@ void *inet_frag_reasm_prepare(struct ine
clone = alloc_skb(0, GFP_ATOMIC);
    	if (!clone)
-			return NULL;
+			goto out_restore_sk;
    	skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
    	skb_frag_list_init(head);
    	for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
@@ -503,6 +527,21 @@ void *inet_frag_reasm_prepare(struct ine
    	nextp = &skb_shinfo(head)->frag_list;
    }
+out_restore_sk:
+	if (orig_truesize) {
+		int ts_delta = head->truesize - orig_truesize;
+
+		/* if this reassembled skb is fragmented later,
+		 * fraglist skbs will get skb->sk assigned from head->sk,
+		 * and each frag skb will be released via sock_wfree.
+		 *
+		 * Update sk_wmem_alloc.
+		 */
+		head->sk = sk;
+		head->destructor = destructor;
+		refcount_add(ts_delta, &sk->sk_wmem_alloc);
+	}
+
    return nextp;
 }
 EXPORT_SYMBOL(inet_frag_reasm_prepare);
@@ -510,6 +549,8 @@ EXPORT_SYMBOL(inet_frag_reasm_prepare);
 void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
    		    void *reasm_data, bool try_coalesce)
 {
+	struct sock *sk = is_skb_wmem(head) ? head->sk : NULL;
+	const unsigned int head_truesize = head->truesize;
    struct sk_buff **nextp = (struct sk_buff **)reasm_data;
    struct rb_node *rbn;
    struct sk_buff *fp;
@@ -572,6 +613,9 @@ void inet_frag_reasm_finish(struct inet_
    skb_mark_not_on_list(head);
    head->prev = NULL;
    head->tstamp = q->stamp;
+
+	if (sk)
+		refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc);
 }
 EXPORT_SYMBOL(inet_frag_reasm_finish);
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,6 +377,7 @@ static int ip_frag_queue(struct ipq *qp,
    }
skb_dst_drop(skb);
+	skb_orphan(skb);
    return -EINPROGRESS;
insert_error:
@@ -479,7 +480,6 @@ int ip_defrag(struct net *net, struct sk
    struct ipq *qp;
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
-	skb_orphan(skb);
/* Lookup (or create) queue header */
    qp = ip_find(net, ip_hdr(skb), user, vif);
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -297,6 +297,7 @@ static int nf_ct_frag6_queue(struct frag
    }
skb_dst_drop(skb);
+	skb_orphan(skb);
    return -EINPROGRESS;
insert_error:
@@ -472,7 +473,6 @@ int nf_ct_frag6_gather(struct net *net,
    hdr = ipv6_hdr(skb);
    fhdr = (struct frag_hdr *)skb_transport_header(skb);
-	skb_orphan(skb);
    fq = fq_find(net, fhdr->identification, user, hdr,
    	     skb->dev ? skb->dev->ifindex : 0);
    if (fq == NULL) {

    

2025

2024

2023

2022

2021

2020

2019

2018

2017

[PATCH 5.15 085/691] inet: inet_defrag: prevent sk release while still in use