On Fri, Jun 28, 2024 at 2:33 AM Mina Almasry <almasrymina@google.com> wrote:
For device memory TCP, we expect the skb headers to be available in host memory for access, and we expect the skb frags to be in device memory and inaccessible to the host. We expect there to be no mixing and matching of device memory frags (inaccessible) with host memory frags (accessible) in the same skb.
Add an skb->unreadable flag which indicates whether the frags in this skb are device memory frags or not.
__skb_fill_netmem_desc() now checks frags added to skbs for net_iov, and sets skb->unreadable accordingly.
Add checks through the network stack to avoid accessing the frags of devmem skbs and to avoid coalescing devmem skbs with non-devmem skbs.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
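An illustrative sketch (not from this patch) of what the flag means for code handling these skbs: the linear area is always host memory, so only frag access needs the guard:

	/* Header bytes live in the linear area, which is always host
	 * memory, so header parsing stays safe even for devmem skbs.
	 */
	struct tcphdr *th = tcp_hdr(skb);

	/* Frags of a devmem skb live in device memory and must not be
	 * touched unless skb_frags_readable() returns true.
	 */
	if (skb_frags_readable(skb)) {
		int i;

		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			/* kmap/copy of skb_shinfo(skb)->frags[i] is safe */
		}
	}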
v11:
- drop excessive checks for frag 0 pull (Paolo)
v9: https://lore.kernel.org/netdev/20240403002053.2376017-11-almasrymina@google....
- change skb->readable to skb->unreadable (Pavel/David).
skb->readable was error-prone: since skbs are readable by default, the flag had to be set to true in every code path that created or cloned an skb, and forgetting to set skb->readable=true in some paths caused crashes.
Flip it to skb->unreadable, so that the default value of 0 is correct and the flag only needs to be set when unreadable frags are added (see the sketch below).
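For illustration (a sketch, not code from the patch), the zeroed default falls out of skb allocation itself:

	/* __alloc_skb() memsets the struct up to &skb->tail, so a fresh
	 * skb has unreadable == 0, i.e. readable, with no extra work.
	 */
	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);

	/* Only __skb_fill_netmem_desc() ever sets skb->unreadable = true,
	 * and only when the frag being added is a net_iov.
	 */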
v6:
- skb->dmabuf -> skb->readable (Pavel). Pavel's original suggestion was to remove the skb->dmabuf flag entirely. When I looked into it closely, I found this issue: if we remove the flag, we have to dereference the shinfo(skb) pointer to obtain the first frag, which can cause a performance regression by dirtying the cache line when shinfo(skb) was not otherwise needed (a sketch of the two approaches follows below). Instead, I converted the skb->dmabuf flag into a generic skb->readable flag which can be re-used by io_uring.
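Roughly, the rejected flag-less variant versus what was merged (an illustrative sketch; skb_frags_readable_noflag() is hypothetical, and the sketch assumes a skb_frag_netmem() accessor):

	/* Hypothetical flag-less check: must dereference skb_shinfo(skb)
	 * to inspect frag 0, touching a cache line the caller may not
	 * have needed otherwise.
	 */
	static inline bool skb_frags_readable_noflag(const struct sk_buff *skb)
	{
		return !skb_shinfo(skb)->nr_frags ||
		       !netmem_is_net_iov(skb_frag_netmem(&skb_shinfo(skb)->frags[0]));
	}

	/* The merged approach: one bit in struct sk_buff, no shinfo
	 * access at all.
	 */
	static inline bool skb_frags_readable(const struct sk_buff *skb)
	{
		return !skb->unreadable;
	}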
Changes in v1:
- Rename devmem -> dmabuf (David).
- Flip skb_frags_not_readable (Jakub).
 include/linux/skbuff.h | 19 +++++++++++++++--
 include/net/tcp.h      |  5 +++--
 net/core/datagram.c    |  6 ++++++
 net/core/skbuff.c      | 48 ++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp.c         |  3 +++
 net/ipv4/tcp_input.c   | 13 +++++++++---
 net/ipv4/tcp_output.c  |  5 ++++-
 net/packet/af_packet.c |  4 ++--
 8 files changed, 91 insertions(+), 12 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3cd06eb3a44da..5438434b61300 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -827,6 +827,8 @@ enum skb_tstamp_type {
  *	@csum_level: indicates the number of consecutive checksums found in
  *		the packet minus one that have been verified as
  *		CHECKSUM_UNNECESSARY (max 3)
+ *	@unreadable: indicates that at least 1 of the fragments in this skb is
+ *		unreadable.
  *	@dst_pending_confirm: need to confirm neighbour
  *	@decrypted: Decrypted SKB
  *	@slow_gro: state present at GRO time, slower prepare step required
@@ -1008,7 +1010,7 @@ struct sk_buff {
 #if IS_ENABLED(CONFIG_IP_SCTP)
 	__u8			csum_not_inet:1;
 #endif
-
+	__u8			unreadable:1;
 #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
 	__u16			tc_index;	/* traffic control index */
 #endif
@@ -1820,6 +1822,12 @@ static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
 		__skb_zcopy_downgrade_managed(skb);
 }
 
+/* Return true if frags in this skb are readable by the host. */
+static inline bool skb_frags_readable(const struct sk_buff *skb)
+{
+	return !skb->unreadable;
+}
+
 static inline void skb_mark_not_on_list(struct sk_buff *skb)
 {
 	skb->next = NULL;
@@ -2536,10 +2544,17 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
 static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
 					  netmem_ref netmem, int off, int size)
 {
-	struct page *page = netmem_to_page(netmem);
+	struct page *page;
 
 	__skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);
 
+	if (netmem_is_net_iov(netmem)) {
+		skb->unreadable = true;
+		return;
+	}
+
+	page = netmem_to_page(netmem);
+
 	/* Propagate page pfmemalloc to the skb if we can. The problem is
 	 * that not all callers have unique ownership of the page but rely
 	 * on page_is_pfmemalloc doing the right thing(tm).
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2aac11e7e1cc5..e8f6e602c2ad4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1060,7 +1060,7 @@ static inline int tcp_skb_mss(const struct sk_buff *skb)
 
 static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
 {
-	return likely(!TCP_SKB_CB(skb)->eor);
+	return likely(!TCP_SKB_CB(skb)->eor && skb_frags_readable(skb));
 }
 
 static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
@@ -1069,7 +1069,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
 	/* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */
 	return likely(tcp_skb_can_collapse_to(to) &&
 		      mptcp_skb_can_collapse(to, from) &&
-		      skb_pure_zcopy_same(to, from));
+		      skb_pure_zcopy_same(to, from) &&
+		      skb_frags_readable(to) == skb_frags_readable(from));
 }
 
 static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 95f242591fd23..e1d12f55236df 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -407,6 +407,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
 			return 0;
 	}
 
+	if (!skb_frags_readable(skb))
+		goto short_copy;
+
 	/* Copy paged appendix. Hmm... why does this look so complicated? */
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		int end;
@@ -619,6 +622,9 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 	if (msg && msg->msg_ubuf && msg->sg_from_iter)
 		return msg->sg_from_iter(sk, skb, from, length);
 
+	if (!skb_frags_readable(skb))
+		return -EFAULT;
+
 	frag = skb_shinfo(skb)->nr_frags;
 
 	while (length && iov_iter_count(from)) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cc47774bbeb98..1e82222d0a6dd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1968,6 +1968,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
 		return -EINVAL;
 
+	if (!skb_frags_readable(skb))
+		return -EFAULT;
+
 	if (!num_frags)
 		goto release;
@@ -2141,6 +2144,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 	unsigned int size;
 	int headerlen;
 
+	if (!skb_frags_readable(skb))
+		return NULL;
+
 	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
 		return NULL;
@@ -2479,6 +2485,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	struct sk_buff *n;
 	int oldheadroom;
 
+	if (!skb_frags_readable(skb))
+		return NULL;
+
 	if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
 		return NULL;
@@ -2823,6 +2832,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
 	 */
 	int i, k, eat = (skb->tail + delta) - skb->end;
 
+	if (!skb_frags_readable(skb))
+		return NULL;
+
 	if (eat > 0 || skb_cloned(skb)) {
 		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, GFP_ATOMIC))
@@ -2976,6 +2988,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
 		to     += copy;
 	}
 
+	if (!skb_frags_readable(skb))
+		goto fault;
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		int end;
 		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
@@ -3164,6 +3179,9 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 	/*
 	 * then map the fragments
 	 */
+	if (!skb_frags_readable(skb))
+		return false;
+
 	for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
 		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
@@ -3387,6 +3405,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
 		from += copy;
 	}
 
+	if (!skb_frags_readable(skb))
+		goto fault;
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 		int end;
@@ -3466,6 +3487,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
 		pos	= copy;
 	}
 
+	if (!skb_frags_readable(skb))
+		return 0;
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		int end;
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
@@ -3566,6 +3590,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
 		pos	= copy;
 	}
 
+	if (!skb_frags_readable(skb))
+		return 0;
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		int end;
@@ -4057,6 +4084,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
 		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
 
 	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+	skb1->unreadable = skb->unreadable;
 	skb_shinfo(skb)->nr_frags  = 0;
 	skb1->data_len		   = skb->data_len;
 	skb1->len		   += skb1->data_len;
@@ -4071,6 +4099,7 @@ static inline void skb_split_no_header(struct sk_buff *skb,
 {
 	int i, k = 0;
 	const int nfrags = skb_shinfo(skb)->nr_frags;
+	const int unreadable = skb->unreadable;
 
 	skb_shinfo(skb)->nr_frags = 0;
 	skb1->len		  = skb1->data_len = skb->len - len;
@@ -4104,6 +4133,12 @@ static inline void skb_split_no_header(struct sk_buff *skb,
 		pos += size;
 	}
 	skb_shinfo(skb1)->nr_frags = k;
Minor point: skb->unreadable can be left as is here? (It is never modified between the save and this restore.)

+
+	if (skb_shinfo(skb)->nr_frags)
+		skb->unreadable = unreadable;

Minor point: skb_shinfo(skb1)->nr_frags can't be zero at this point.

+
+	if (skb_shinfo(skb1)->nr_frags)
+		skb1->unreadable = unreadable;
 }
This means we probably could remove the unreadable local variable and simply do:

	skb1->unreadable = skb->unreadable;
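i.e. something like this (untested sketch of the suggested follow-up):

 	int i, k = 0;
 	const int nfrags = skb_shinfo(skb)->nr_frags;
-	const int unreadable = skb->unreadable;
[...]
 	skb_shinfo(skb1)->nr_frags = k;
-
-	if (skb_shinfo(skb)->nr_frags)
-		skb->unreadable = unreadable;
-
-	if (skb_shinfo(skb1)->nr_frags)
-		skb1->unreadable = unreadable;
+	skb1->unreadable = skb->unreadable;
 }

skb->unreadable is never written between those two points, so dropping the local keeps behavior identical.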
No need to send a new version, this can be incrementally changed later.
Reviewed-by: Eric Dumazet <edumazet@google.com>