On Fri, Jun 28, 2024 at 9:38 AM Mina Almasry almasrymina@google.com wrote:
Hi Mina, Thank you so much for this work!
ncdevmem is a devmem TCP netcat. It works similarly to netcat, but it sends and receives data using the devmem TCP APIs. It uses udmabuf as the dmabuf provider. It is compatible with a regular netcat running on a peer, or a ncdevmem running on a peer.
In addition to normal netcat support, ncdevmem has a validation mode, where it sends a specific pattern and validates this pattern on the receiver side to ensure data integrity.
Suggested-by: Stanislav Fomichev sdf@google.com Signed-off-by: Mina Almasry almasrymina@google.com
v15:
- Fix linking against libynl. (Jakub)
v9: https://lore.kernel.org/netdev/20240403002053.2376017-15-almasrymina@google....
- Remove unused nic_pci_addr entry (Cong).
v6:
- Updated to bind 8 queues.
- Added RSS configuration.
- Added some more tests for the netlink API.
Changes in v1:
- Many more general cleanups (Willem).
- Removed driver reset (Jakub).
- Removed hardcoded if index (Paolo).
RFC v2:
- General cleanups (Willem).
tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 9 + tools/testing/selftests/net/ncdevmem.c | 542 +++++++++++++++++++++++++ 3 files changed, 552 insertions(+) create mode 100644 tools/testing/selftests/net/ncdevmem.c
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 666ab7d9390b1..fe770903118c5 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -17,6 +17,7 @@ ipv6_flowlabel ipv6_flowlabel_mgr log.txt msg_zerocopy +ncdevmem nettest psock_fanout psock_snd diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index bc3925200637c..39420a6e86b7f 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -95,6 +95,11 @@ TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh TEST_PROGS += bpf_offload.py
+# YNL files, must be before "include ..lib.mk" +EXTRA_CLEAN += $(OUTPUT)/libynl.a +YNL_GEN_FILES := ncdevmem +TEST_GEN_FILES += $(YNL_GEN_FILES)
TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
@@ -104,6 +109,10 @@ TEST_INCLUDES := forwarding/lib.sh
include ../lib.mk
+# YNL build +YNL_GENS := netdev +include ynl.mk
$(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c new file mode 100644 index 0000000000000..e00255e54f77b --- /dev/null +++ b/tools/testing/selftests/net/ncdevmem.c @@ -0,0 +1,542 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__
+#include <linux/uio.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <errno.h> +#define __iovec_defined +#include <fcntl.h> +#include <malloc.h> +#include <error.h>
+#include <arpa/inet.h> +#include <sys/socket.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/syscall.h>
+#include <linux/memfd.h> +#include <linux/if.h> +#include <linux/dma-buf.h> +#include <linux/udmabuf.h> +#include <libmnl/libmnl.h> +#include <linux/types.h> +#include <linux/netlink.h> +#include <linux/genetlink.h> +#include <linux/netdev.h> +#include <time.h>
+#include "netdev-user.h" +#include <ynl.h>
+#define PAGE_SHIFT 12 +#define TEST_PREFIX "ncdevmem" +#define NUM_PAGES 16000
+#ifndef MSG_SOCK_DEVMEM +#define MSG_SOCK_DEVMEM 0x2000000 +#endif
+/*
- tcpdevmem netcat. Works similarly to netcat but does device memory TCP
- instead of regular TCP. Uses udmabuf to mock a dmabuf provider.
- Usage:
- On server:
- ncdevmem -s <server IP> -c <client IP> -f eth1 -d 3 -n 0000:06:00.0 -l \
- -p 5201 -v 7
The 'n' option disappeared, please remove it.
- On client:
- yes $(echo -e \x01\x02\x03\x04\x05\x06) | \
- tr \n \0 | \
- head -c 5G | \
- nc <server IP> 5201 -p 5201
- Note this is compatible with regular netcat. i.e. the sender or receiver can
- be replaced with regular netcat to test the RX or TX path in isolation.
- */
+static char *server_ip = "192.168.1.4"; +static char *client_ip = "192.168.1.2"; +static char *port = "5201"; +static size_t do_validation; +static int start_queue = 8; +static int num_queues = 8; +static char *ifname = "eth1"; +static unsigned int ifindex = 3; +static unsigned int iterations; +static unsigned int dmabuf_id;
+void print_bytes(void *ptr, size_t size) +{
- unsigned char *p = ptr;
- int i;
- for (i = 0; i < size; i++)
- printf("%02hhX ", p[i]);
- printf("\n");
+}
+void print_nonzero_bytes(void *ptr, size_t size) +{
- unsigned char *p = ptr;
- unsigned int i;
- for (i = 0; i < size; i++)
- putchar(p[i]);
- printf("\n");
+}
+void validate_buffer(void *line, size_t size) +{
- static unsigned char seed = 1;
- unsigned char *ptr = line;
- int errors = 0;
- size_t i;
- for (i = 0; i < size; i++) {
- if (ptr[i] != seed) {
- fprintf(stderr,
- "Failed validation: expected=%u, actual=%u, index=%lu\n",
- seed, ptr[i], i);
- errors++;
- if (errors > 20)
- error(1, 0, "validation failed.");
- }
- seed++;
- if (seed == do_validation)
- seed = 0;
- }
- fprintf(stdout, "Validated buffer\n");
+}
+static void reset_flow_steering(void) +{
- char command[256];
- memset(command, 0, sizeof(command));
- snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple off",
- "eth1");
I think we use ifname instead of "eth1".
- system(command);
- memset(command, 0, sizeof(command));
- snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple on",
- "eth1");
Please use ifname instead of "eth1" too.
- system(command);
+}
+static void configure_rss(void) +{
- char command[256];
- memset(command, 0, sizeof(command));
- snprintf(command, sizeof(command), "sudo ethtool -X %s equal %d",
- ifname, start_queue);
- system(command);
+}
+static void configure_flow_steering(void) +{
- char command[256];
- memset(command, 0, sizeof(command));
- snprintf(command, sizeof(command),
- "sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d",
- ifname, client_ip, server_ip, port, port, start_queue);
- system(command);
+}
+static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd,
- struct netdev_queue_dmabuf *queues,
- unsigned int n_queue_index, struct ynl_sock **ys)
+{
- struct netdev_bind_rx_req *req = NULL;
- struct netdev_bind_rx_rsp *rsp = NULL;
- struct ynl_error yerr;
- *ys = ynl_sock_create(&ynl_netdev_family, &yerr);
- if (!*ys) {
- fprintf(stderr, "YNL: %s\n", yerr.msg);
- return -1;
- }
- req = netdev_bind_rx_req_alloc();
- netdev_bind_rx_req_set_ifindex(req, ifindex);
- netdev_bind_rx_req_set_dmabuf_fd(req, dmabuf_fd);
- __netdev_bind_rx_req_set_queues(req, queues, n_queue_index);
- rsp = netdev_bind_rx(*ys, req);
- if (!rsp) {
- perror("netdev_bind_rx");
- goto err_close;
- }
- if (!rsp->_present.dmabuf_id) {
- perror("dmabuf_id not present");
- goto err_close;
- }
- printf("got dmabuf id=%d\n", rsp->dmabuf_id);
- dmabuf_id = rsp->dmabuf_id;
- netdev_bind_rx_req_free(req);
- netdev_bind_rx_rsp_free(rsp);
- return 0;
+err_close:
- fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg);
- netdev_bind_rx_req_free(req);
- ynl_sock_destroy(*ys);
- return -1;
+}
+static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size) +{
- struct udmabuf_create create;
- int ret;
- *devfd = open("/dev/udmabuf", O_RDWR);
- if (*devfd < 0) {
- error(70, 0,
- "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
- TEST_PREFIX);
- }
- *memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
- if (*memfd < 0)
- error(70, 0, "%s: [skip,no-memfd]\n", TEST_PREFIX);
- /* Required for udmabuf */
- ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK);
- if (ret < 0)
- error(73, 0, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
- ret = ftruncate(*memfd, dmabuf_size);
- if (ret == -1)
- error(74, 0, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
- memset(&create, 0, sizeof(create));
- create.memfd = *memfd;
- create.offset = 0;
- create.size = dmabuf_size;
- *buf = ioctl(*devfd, UDMABUF_CREATE, &create);
- if (*buf < 0)
- error(75, 0, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX);
+}
+int do_server(void) +{
- char ctrl_data[sizeof(int) * 20000];
- struct netdev_queue_dmabuf *queues;
- size_t non_page_aligned_frags = 0;
- struct sockaddr_in client_addr;
- struct sockaddr_in server_sin;
- size_t page_aligned_frags = 0;
- int devfd, memfd, buf, ret;
- size_t total_received = 0;
- socklen_t client_addr_len;
- bool is_devmem = false;
- char *buf_mem = NULL;
- struct ynl_sock *ys;
- size_t dmabuf_size;
- char iobuf[819200];
- char buffer[256];
- int socket_fd;
- int client_fd;
- size_t i = 0;
- int opt = 1;
- dmabuf_size = getpagesize() * NUM_PAGES;
- create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
- reset_flow_steering();
- /* Configure RSS to divert all traffic from our devmem queues */
- configure_rss();
- /* Flow steer our devmem flows to start_queue */
- configure_flow_steering();
- sleep(1);
- queues = malloc(sizeof(*queues) * num_queues);
- for (i = 0; i < num_queues; i++) {
- queues[i]._present.type = 1;
- queues[i]._present.idx = 1;
- queues[i].type = NETDEV_QUEUE_TYPE_RX;
- queues[i].idx = start_queue + i;
- }
- if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
- error(1, 0, "Failed to bind\n");
- buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED,
- buf, 0);
- if (buf_mem == MAP_FAILED)
- error(1, 0, "mmap()");
- server_sin.sin_family = AF_INET;
- server_sin.sin_port = htons(atoi(port));
- ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr);
- if (socket < 0)
- error(79, 0, "%s: [FAIL, create socket]\n", TEST_PREFIX);
- socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0);
- if (socket < 0)
- error(errno, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
- ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt,
- sizeof(opt));
- if (ret)
- error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
- ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt,
- sizeof(opt));
- if (ret)
- error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
- printf("binding to address %s:%d\n", server_ip,
- ntohs(server_sin.sin_port));
- ret = bind(socket_fd, &server_sin, sizeof(server_sin));
- if (ret)
- error(errno, errno, "%s: [FAIL, bind]\n", TEST_PREFIX);
- ret = listen(socket_fd, 1);
- if (ret)
- error(errno, errno, "%s: [FAIL, listen]\n", TEST_PREFIX);
- client_addr_len = sizeof(client_addr);
- inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer,
- sizeof(buffer));
- printf("Waiting or connection on %s:%d\n", buffer,
- ntohs(server_sin.sin_port));
- client_fd = accept(socket_fd, &client_addr, &client_addr_len);
- inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer,
- sizeof(buffer));
- printf("Got connection from %s:%d\n", buffer,
- ntohs(client_addr.sin_port));
- while (1) {
- struct iovec iov = { .iov_base = iobuf,
- .iov_len = sizeof(iobuf) };
- struct dmabuf_cmsg *dmabuf_cmsg = NULL;
- struct dma_buf_sync sync = { 0 };
- struct cmsghdr *cm = NULL;
- struct msghdr msg = { 0 };
- struct dmabuf_token token;
- ssize_t ret;
- is_devmem = false;
- printf("\n\n");
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
- msg.msg_control = ctrl_data;
- msg.msg_controllen = sizeof(ctrl_data);
- ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM);
- printf("recvmsg ret=%ld\n", ret);
- if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
- continue;
- if (ret < 0) {
- perror("recvmsg");
- continue;
- }
- if (ret == 0) {
- printf("client exited\n");
- goto cleanup;
- }
- i++;
- for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
- if (cm->cmsg_level != SOL_SOCKET ||
- (cm->cmsg_type != SCM_DEVMEM_DMABUF &&
- cm->cmsg_type != SCM_DEVMEM_LINEAR)) {
- fprintf(stdout, "skipping non-devmem cmsg\n");
- continue;
- }
- dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
- is_devmem = true;
- if (cm->cmsg_type == SCM_DEVMEM_LINEAR) {
- /* TODO: process data copied from skb's linear
- buffer.
- */
- fprintf(stdout,
- "SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n",
- dmabuf_cmsg->frag_size);
- continue;
- }
- token.token_start = dmabuf_cmsg->frag_token;
- token.token_count = 1;
- total_received += dmabuf_cmsg->frag_size;
- printf("received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n",
- dmabuf_cmsg->frag_offset >> PAGE_SHIFT,
- dmabuf_cmsg->frag_offset % getpagesize(),
- dmabuf_cmsg->frag_offset, dmabuf_cmsg->frag_size,
- dmabuf_cmsg->frag_token, total_received,
- dmabuf_cmsg->dmabuf_id);
- if (dmabuf_cmsg->dmabuf_id != dmabuf_id)
- error(1, 0,
- "received on wrong dmabuf_id: flow steering error\n");
- if (dmabuf_cmsg->frag_size % getpagesize())
- non_page_aligned_frags++;
- else
- page_aligned_frags++;
- sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START;
- ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
- if (do_validation)
- validate_buffer(
- ((unsigned char *)buf_mem) +
- dmabuf_cmsg->frag_offset,
- dmabuf_cmsg->frag_size);
- else
- print_nonzero_bytes(
- ((unsigned char *)buf_mem) +
- dmabuf_cmsg->frag_offset,
- dmabuf_cmsg->frag_size);
- sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END;
- ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
- ret = setsockopt(client_fd, SOL_SOCKET,
- SO_DEVMEM_DONTNEED, &token,
- sizeof(token));
- if (ret != 1)
- error(1, 0,
- "SO_DEVMEM_DONTNEED not enough tokens");
- }
- if (!is_devmem)
- error(1, 0, "flow steering error\n");
- printf("total_received=%lu\n", total_received);
- }
- fprintf(stdout, "%s: ok\n", TEST_PREFIX);
- fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
- page_aligned_frags, non_page_aligned_frags);
- fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
- page_aligned_frags, non_page_aligned_frags);
+cleanup:
- munmap(buf_mem, dmabuf_size);
- close(client_fd);
- close(socket_fd);
- close(buf);
- close(memfd);
- close(devfd);
- ynl_sock_destroy(ys);
- return 0;
+}
+void run_devmem_tests(void) +{
- struct netdev_queue_dmabuf *queues;
- int devfd, memfd, buf;
- struct ynl_sock *ys;
- size_t dmabuf_size;
- size_t i = 0;
- dmabuf_size = getpagesize() * NUM_PAGES;
- create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
- /* Configure RSS to divert all traffic from our devmem queues */
- configure_rss();
- sleep(1);
- queues = malloc(sizeof(*queues) * num_queues);
- for (i = 0; i < num_queues; i++) {
- queues[i]._present.type = 1;
- queues[i]._present.idx = 1;
- queues[i].type = NETDEV_QUEUE_TYPE_RX;
- queues[i].idx = start_queue + i;
- }
- if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
- error(1, 0, "Failed to bind\n");
- /* Closing the netlink socket does an implicit unbind */
- ynl_sock_destroy(ys);
+}
+int main(int argc, char *argv[]) +{
- int is_server = 0, opt;
- while ((opt = getopt(argc, argv, "ls:c:p:v:q:f:n:i:d:")) != -1) {
I think 't' option should be added here.
- switch (opt) {
- case 'l':
- is_server = 1;
- break;
- case 's':
- server_ip = optarg;
- break;
- case 'c':
- client_ip = optarg;
- break;
- case 'p':
- port = optarg;
- break;
- case 'v':
- do_validation = atoll(optarg);
- break;
- case 'q':
- num_queues = atoi(optarg);
- break;
- case 't':
- start_queue = atoi(optarg);
- break;
- case 'f':
- ifname = optarg;
- break;
- case 'd':
- ifindex = atoi(optarg);
How about using if_nametoindex() instead of 'd' option?
- break;
- case 'i':
- iterations = atoll(optarg);
I couldn't find a use of this variable.
- break;
- case '?':
- printf("unknown option: %c\n", optopt);
- break;
- }
- }
- for (; optind < argc; optind++)
- printf("extra arguments: %s\n", argv[optind]);
- run_devmem_tests();
- if (is_server)
- return do_server();
- return 0;
+}
2.45.2.803.g4e1b14247a-goog
Thanks a lot! Taehee Yoo