As I noticed in my previous patch to remove the 'timespec' usage in the packet socket, the timestamps in the packet socket are slightly inefficient as they convert a nanosecond value into seconds/nanoseconds or seconds/microseconds.
This adds two new socket options for the timestamp to resolve that:
PACKET_SKIPTIMESTAMP sets a flag to indicate whether to generate timestamps at all. When this is set, all timestamps are hardcoded to zero, which saves a few cycles for the conversion and the access of the hardware clocksource. The idea was taken from pktgen, which has an F_NO_TIMESTAMP option for the same purpose.
PACKET_TIMESTAMP_NS64 changes the interpretation of the timestamp fields: instead of 32 bits of seconds plus 32 bits of nanoseconds or microseconds, the two fields together carry a single 64-bit nanosecond value whenever this flag is set, with the upper 32 bits in the seconds field and the lower 32 bits in the nanoseconds/microseconds field.
Link: https://patchwork.kernel.org/patch/10077199/ Suggested-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> --- I still have not done any runtime testing on this patch, only implemented the suggestions from the previous versions.
While I don't think anyone is actively looking for this feature, I don't think there are any reasons left against merging it either, and it might come in handy for someone.
v4: address minor comments from Willem v3: rework to use setsockopt v2: use new tstamp flags instead of a new version v1: original implementation using TPACKET_V4 --- include/uapi/linux/if_packet.h | 2 + net/packet/af_packet.c | 160 ++++++++++++++++++++++++++++------------- net/packet/internal.h | 2 + 3 files changed, 116 insertions(+), 48 deletions(-)
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 67b61d91d89b..2eba54770e6b 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -57,6 +57,8 @@ struct sockaddr_ll { #define PACKET_QDISC_BYPASS 20 #define PACKET_ROLLOVER_STATS 21 #define PACKET_FANOUT_DATA 22 +#define PACKET_SKIPTIMESTAMP 23 +#define PACKET_TIMESTAMP_NS64 24
#define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7432c6699818..f55f330ab547 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -200,7 +200,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *, struct packet_sock *, unsigned int status); static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, - struct tpacket_block_desc *); + struct tpacket_block_desc *, struct packet_sock *); static void prb_retire_rx_blk_timer_expired(struct timer_list *); static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); @@ -439,52 +439,91 @@ static int __packet_get_status(struct packet_sock *po, void *frame) } }
-static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, - unsigned int flags) +static __u32 tpacket_get_timestamp(struct sk_buff *skb, __u32 *hi, __u32 *lo) { + struct packet_sock *po = pkt_sk(skb->sk); struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + ktime_t stamp; + u32 type; + + if (po->tp_skiptstamp) + return 0;
if (shhwtstamps && - (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) - return TP_STATUS_TS_RAW_HARDWARE; + (po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) && + shhwtstamps->hwtstamp) { + stamp = shhwtstamps->hwtstamp; + type = TP_STATUS_TS_RAW_HARDWARE; + } else if (skb->tstamp) { + stamp = skb->tstamp; + type = TP_STATUS_TS_SOFTWARE; + } else { + return 0; + }
- if (ktime_to_timespec64_cond(skb->tstamp, ts)) - return TP_STATUS_TS_SOFTWARE; + if (po->tp_tstamp_ns64) { + __u64 ns = ktime_to_ns(stamp);
- return 0; + *hi = upper_32_bits(ns); + *lo = lower_32_bits(ns); + } else { + struct timespec64 ts = ktime_to_timespec64(stamp); + + *hi = ts.tv_sec; + if (po->tp_version > TPACKET_V1) + *lo = ts.tv_nsec; + else + *lo = ts.tv_nsec / NSEC_PER_USEC; + } + + return type; +} + +static void packet_get_time(struct packet_sock *po, __u32 *hi, __u32 *lo) +{ + if (po->tp_skiptstamp) { + *hi = 0; + *lo = 0; + } else if (po->tp_tstamp_ns64) { + __u64 ns = ktime_get_real_ns(); + + *hi = upper_32_bits(ns); + *lo = lower_32_bits(ns); + } else { + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + /* unsigned seconds overflow in y2106 here */ + *hi = ts.tv_sec; + if (po->tp_version > TPACKET_V1) + *lo = ts.tv_nsec; + else + *lo = ts.tv_nsec / NSEC_PER_USEC; + } }
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; - struct timespec64 ts; - __u32 ts_status; + __u32 ts_status, hi, lo;
- if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) + if (!(ts_status = tpacket_get_timestamp(skb, &hi, &lo))) return 0;
h.raw = frame; - /* - * versions 1 through 3 overflow the timestamps in y2106, since they - * all store the seconds in a 32-bit unsigned integer. - * If we create a version 4, that should have a 64-bit timestamp, - * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit - * nanoseconds. - */ switch (po->tp_version) { case TPACKET_V1: - h.h1->tp_sec = ts.tv_sec; - h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; + h.h1->tp_sec = hi; + h.h1->tp_usec = lo; break; case TPACKET_V2: - h.h2->tp_sec = ts.tv_sec; - h.h2->tp_nsec = ts.tv_nsec; + h.h2->tp_sec = hi; + h.h2->tp_nsec = lo; break; case TPACKET_V3: - h.h3->tp_sec = ts.tv_sec; - h.h3->tp_nsec = ts.tv_nsec; + h.h3->tp_sec = hi; + h.h3->tp_nsec = lo; break; default: WARN(1, "TPACKET version not supported.\n"); @@ -633,7 +672,7 @@ static void init_prb_bdqc(struct packet_sock *po, p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po); - prb_open_block(p1, pbd); + prb_open_block(p1, pbd, po); }
/* Do NOT update the last_blk_num first. @@ -730,7 +769,7 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect. */ - prb_open_block(pkc, pbd); + prb_open_block(pkc, pbd, po); goto out; } } @@ -812,10 +851,8 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1, * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ - struct timespec64 ts; - ktime_get_real_ts64(&ts); - h1->ts_last_pkt.ts_sec = ts.tv_sec; - h1->ts_last_pkt.ts_nsec = ts.tv_nsec; + packet_get_time(po, &h1->ts_last_pkt.ts_sec, + &h1->ts_last_pkt.ts_nsec); }
smp_wmb(); @@ -841,9 +878,8 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) * */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, - struct tpacket_block_desc *pbd1) + struct tpacket_block_desc *pbd1, struct packet_sock *po) { - struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
smp_rmb(); @@ -856,10 +892,8 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1, BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
- ktime_get_real_ts64(&ts); - - h1->ts_first_pkt.ts_sec = ts.tv_sec; - h1->ts_first_pkt.ts_nsec = ts.tv_nsec; + packet_get_time(po, &h1->ts_first_pkt.ts_sec, + &h1->ts_first_pkt.ts_nsec);
pkc1->pkblk_start = (char *)pbd1; pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); @@ -936,7 +970,7 @@ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, * open this block and return the offset where the first packet * needs to get stored. */ - prb_open_block(pkc, pbd); + prb_open_block(pkc, pbd, po); return (void *)pkc->nxt_offset; }
@@ -1068,7 +1102,7 @@ static void *__packet_lookup_frame_in_block(struct packet_sock *po, * opening a block also thaws the queue. * Thawing is a side effect. */ - prb_open_block(pkc, pbd); + prb_open_block(pkc, pbd, po); } }
@@ -2191,8 +2225,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, unsigned long status = TP_STATUS_USER; unsigned short macoff, netoff, hdrlen; struct sk_buff *copy_skb = NULL; - struct timespec64 ts; __u32 ts_status; + __u32 tstamp_hi, tstamp_lo; bool is_drop_n_account = false; bool do_vnet = false;
@@ -2318,8 +2352,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
- if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) - ktime_get_real_ts64(&ts); + if (!(ts_status = tpacket_get_timestamp(skb, &tstamp_hi, &tstamp_lo))) + packet_get_time(po, &tstamp_hi, &tstamp_lo);
status |= ts_status;
@@ -2329,8 +2363,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; - h.h1->tp_sec = ts.tv_sec; - h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; + h.h1->tp_sec = tstamp_hi; + h.h1->tp_usec = tstamp_lo; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: @@ -2338,8 +2372,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; - h.h2->tp_sec = ts.tv_sec; - h.h2->tp_nsec = ts.tv_nsec; + h.h2->tp_sec = tstamp_hi; + h.h2->tp_nsec = tstamp_lo; if (skb_vlan_tag_present(skb)) { h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); @@ -2360,8 +2394,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; - h.h3->tp_sec = ts.tv_sec; - h.h3->tp_nsec = ts.tv_nsec; + h.h3->tp_sec = tstamp_hi; + h.h3->tp_nsec = tstamp_lo; memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; @@ -3792,6 +3826,30 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv po->tp_tstamp = val; return 0; } + case PACKET_SKIPTIMESTAMP: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->tp_skiptstamp = val; + return 0; + } + case PACKET_TIMESTAMP_NS64: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->tp_tstamp_ns64 = val; + return 0; + } case PACKET_FANOUT: { int val; @@ -3921,6 +3979,12 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, case PACKET_TIMESTAMP: val = po->tp_tstamp; break; + case PACKET_SKIPTIMESTAMP: + val = po->tp_skiptstamp; + break; + case PACKET_TIMESTAMP_NS64: + val = po->tp_tstamp_ns64; + break; case PACKET_FANOUT: val = 
(po->fanout ? ((u32)po->fanout->id | diff --git a/net/packet/internal.h b/net/packet/internal.h index 562fbc155006..20b69512210f 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -128,6 +128,8 @@ struct packet_sock { unsigned int tp_reserve; unsigned int tp_loss:1; unsigned int tp_tx_has_off:1; + unsigned int tp_skiptstamp:1; + unsigned int tp_tstamp_ns64:1; unsigned int tp_tstamp; struct net_device __rcu *cached_dev; int (*xmit)(struct sk_buff *skb);