Hi, Maxim!
The test fails to build for me with CONFIG_NF_CONNTRACK=m because BPF_F_CURRENT_NETNS is missing from vmlinux.h.
On Wed, Jun 15, 2022 at 4:49 PM Maxim Mikityanskiy maximmi@nvidia.com wrote:
This commit adds selftests for the new BPF helpers: bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6}.
xdp_synproxy_kern.c is a BPF program that generates SYN cookies on allowed TCP ports and sends SYNACKs to clients, accelerating the iptables SYNPROXY module.
xdp_synproxy.c is a userspace control application that allows configuring the following options at runtime: list of allowed ports, MSS, window scale, TTL.
A selftest is added to prog_tests that leverages the above programs to test the functionality of the new helpers.
Signed-off-by: Maxim Mikityanskiy maximmi@nvidia.com Reviewed-by: Tariq Toukan tariqt@nvidia.com
tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/xdp_synproxy.c | 146 ++++ .../selftests/bpf/progs/xdp_synproxy_kern.c | 763 ++++++++++++++++++ tools/testing/selftests/bpf/xdp_synproxy.c | 418 ++++++++++ 5 files changed, 1330 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c create mode 100644 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c create mode 100644 tools/testing/selftests/bpf/xdp_synproxy.c
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 595565eb68c0..ca2f47f45670 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -43,3 +43,4 @@ test_cpp *.tmp xdpxceiver xdp_redirect_multi +xdp_synproxy diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8ad7a733a505..e4ce21f4a474 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -82,7 +82,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
xdpxceiver xdp_redirect_multi
xdpxceiver xdp_redirect_multi xdp_synproxyTEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
@@ -502,6 +502,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ cap_helpers.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \
$(OUTPUT)/xdp_synproxy \ ima_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c)TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c new file mode 100644 index 000000000000..d9ee884c2a2b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+#include <test_progs.h> +#include <network_helpers.h> +#include <ctype.h>
/* Size of the buffer that receives the output of the control application. */
#define CMD_OUT_BUF_SIZE 1023

/*
 * Run a shell command through system(); when ASSERT_OK() rejects its result,
 * jump to the caller's `out` label.
 */
#define SYS(cmd) ({						\
	if (!ASSERT_OK(system(cmd), (cmd)))			\
		goto out;					\
})

/*
 * Start a shell command with popen() for reading and evaluate to the resulting
 * stream; when ASSERT_OK_PTR() rejects the stream, jump to the caller's `out`
 * label.
 */
#define SYS_OUT(cmd) ({						\
	FILE *stream = popen((cmd), "r");			\
	if (!ASSERT_OK_PTR(stream, (cmd)))			\
		goto out;					\
	stream;							\
})
/*
 * Render the first `size` bytes of `in` into `out` as a NUL-terminated string
 * in which every byte that is not printable, as well as '\\' and '\'', is
 * replaced by a four-character "\xHH" escape (upper-case hex).
 *
 * out must be at least `size * 4 + 1` bytes long.
 */
static void escape_str(char *out, const char *in, size_t size)
{
	static const char *hex = "0123456789ABCDEF";
	size_t i;

	for (i = 0; i < size; i++) {
		/*
		 * <ctype.h> functions are only defined for values representable
		 * as unsigned char (or EOF), so never pass a plain char that
		 * may be negative.
		 */
		unsigned char c = (unsigned char)in[i];

		if (isprint(c) && c != '\\' && c != '\'') {
			*out++ = (char)c;
		} else {
			*out++ = '\\';
			*out++ = 'x';
			*out++ = hex[c >> 4];
			*out++ = hex[c & 0xf];
		}
	}
	*out++ = '\0';
}
+static bool expect_str(char *buf, size_t size, const char *str, const char *name) +{
static char escbuf_expected[CMD_OUT_BUF_SIZE * 4];static char escbuf_actual[CMD_OUT_BUF_SIZE * 4];static int duration = 0;bool ok;ok = size == strlen(str) && !memcmp(buf, str, size);if (!ok) {escape_str(escbuf_expected, str, strlen(str));escape_str(escbuf_actual, buf, size);}CHECK(!ok, name, "unexpected %s: actual '%s' != expected '%s'\n",name, escbuf_actual, escbuf_expected);return ok;+}
/*
 * End-to-end selftest. It creates a veth pair (tmp0/tmp1), moves tmp1 into the
 * "synproxy" network namespace, installs iptables CT/SYNPROXY rules there and
 * runs ./xdp_synproxy on tmp1. It then opens one TCP connection to
 * 198.18.0.2:8080 from outside the namespace and checks that the
 * "Total SYNACKs generated" value printed by ./xdp_synproxy --single goes
 * from 0 to 1. Every failure jumps to `out`, which closes the sockets, leaves
 * the namespace and deletes tmp0 and the namespace.
 */
+void test_xdp_synproxy(void) +{
int server_fd = -1, client_fd = -1, accept_fd = -1;struct nstoken *ns = NULL;FILE *ctrl_file = NULL;char buf[CMD_OUT_BUF_SIZE];size_t size;SYS("ip netns add synproxy");SYS("ip link add tmp0 type veth peer name tmp1");SYS("ip link set tmp1 netns synproxy");SYS("ip link set tmp0 up");SYS("ip addr replace 198.18.0.1/24 dev tmp0");/* When checksum offload is enabled, the XDP program sees wrong* checksums and drops packets.*/SYS("ethtool -K tmp0 tx off");/* Workaround required for veth. */SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null");ns = open_netns("synproxy");if (!ASSERT_OK_PTR(ns, "setns"))goto out;SYS("ip link set lo up");SYS("ip link set tmp1 up");SYS("ip addr replace 198.18.0.2/24 dev tmp1");SYS("sysctl -w net.ipv4.tcp_syncookies=2");SYS("sysctl -w net.ipv4.tcp_timestamps=1");SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0");SYS("iptables -t raw -I PREROUTING \-i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack");SYS("iptables -t filter -A INPUT \-i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \-j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460");SYS("iptables -t filter -A INPUT \-i tmp1 -m state --state INVALID -j DROP");ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \--mss4 1460 --mss6 1440 --wscale 7 --ttl 64");size = fread(buf, 1, sizeof(buf), ctrl_file);pclose(ctrl_file);if (!expect_str(buf, size, "Total SYNACKs generated: 0\n","initial SYNACKs"))goto out;server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0);if (!ASSERT_GE(server_fd, 0, "start_server"))goto out;close_netns(ns);ns = NULL;client_fd = connect_to_fd(server_fd, 10000);if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))goto out;accept_fd = accept(server_fd, NULL, NULL);if (!ASSERT_GE(accept_fd, 0, "accept"))goto out;ns = open_netns("synproxy");if (!ASSERT_OK_PTR(ns, "setns"))goto out;ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single");size = fread(buf, 1, sizeof(buf), 
ctrl_file);pclose(ctrl_file);if (!expect_str(buf, size, "Total SYNACKs generated: 1\n","SYNACKs after connection"))goto out;+out:
if (accept_fd >= 0)close(accept_fd);if (client_fd >= 0)close(client_fd);if (server_fd >= 0)close(server_fd);if (ns)close_netns(ns);system("ip link del tmp0");system("ip netns del synproxy");+} diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c new file mode 100644 index 000000000000..53b9865276a4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -0,0 +1,763 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <asm/errno.h>
#define NSEC_PER_SEC 1000000000L

/* Ethernet address length and the EtherTypes dissected by this program. */
#define ETH_ALEN 6
#define ETH_P_IP 0x0800
#define ETH_P_IPV6 0x86DD

/* Fourth 32-bit word of the TCP header, accessed as a whole. */
#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])

/* Masks applied to the IPv4 frag_off field. */
#define IP_DF 0x4000
#define IP_MF 0x2000
#define IP_OFFSET 0x1fff

#define NEXTHDR_TCP 6

/* TCP option kinds. */
#define TCPOPT_NOP 1
#define TCPOPT_EOL 0
#define TCPOPT_MSS 2
#define TCPOPT_WINDOW 3
#define TCPOPT_SACK_PERM 4
#define TCPOPT_TIMESTAMP 8

/* TCP option lengths. */
#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10

/* Layout of the low TSBITS bits of the timestamp cookie. */
#define TCP_TS_HZ 1000
#define TS_OPT_WSCALE_MASK 0xf
#define TS_OPT_SACK (1 << 4)
#define TS_OPT_ECN (1 << 5)
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
#define TCP_MAX_WSCALE 14U

/* Upper bounds used for packet bounds checks. */
#define IPV4_MAXLEN 60
#define TCP_MAXLEN 60

/* Fallbacks used when no options are configured, and the port table size. */
#define DEFAULT_MSS4 1460
#define DEFAULT_MSS6 1440
#define DEFAULT_WSCALE 7
#define DEFAULT_TTL 64
#define MAX_ALLOWED_PORTS 8
/* Exchange the values of two lvalues of the same type. */
#define swap(a, b)				\
	do {					\
		__typeof__(a) __tmp = (a);	\
		(a) = (b);			\
		(b) = __tmp;			\
	} while (0)

/*
 * Read a `type` from `ptr` through a packed struct, so that the compiler
 * makes no alignment assumption about the address.
 */
#define __get_unaligned_t(type, ptr) ({					\
	const struct {							\
		type x;							\
	} __attribute__((__packed__)) *__pptr = (__typeof__(__pptr))(ptr); \
	__pptr->x;							\
})

/* Unaligned read of the object `ptr` points to. */
#define get_unaligned(ptr) __get_unaligned_t(__typeof__(*(ptr)), (ptr))
+struct {
__uint(type, BPF_MAP_TYPE_ARRAY);__type(key, __u32);__type(value, __u64);__uint(max_entries, 2);+} values SEC(".maps");
+struct {
__uint(type, BPF_MAP_TYPE_ARRAY);__type(key, __u32);__type(value, __u16);__uint(max_entries, MAX_ALLOWED_PORTS);+} allowed_ports SEC(".maps");
+extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
struct bpf_sock_tuple *bpf_tuple,__u32 len_tuple,struct bpf_ct_opts *opts,__u32 len_opts) __ksym;+extern void bpf_ct_release(struct nf_conn *ct) __ksym;
+static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) +{
__u8 tmp[ETH_ALEN];__builtin_memcpy(tmp, a, ETH_ALEN);__builtin_memcpy(a, b, ETH_ALEN);__builtin_memcpy(b, tmp, ETH_ALEN);+}
+static __always_inline __u16 csum_fold(__u32 csum) +{
csum = (csum & 0xffff) + (csum >> 16);csum = (csum & 0xffff) + (csum >> 16);return (__u16)~csum;+}
+static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
__u32 len, __u8 proto,__u32 csum)+{
__u64 s = csum;s += (__u32)saddr;s += (__u32)daddr;+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
s += proto + len;+#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
s += (proto + len) << 8;+#else +#error Unknown endian +#endif
s = (s & 0xffffffff) + (s >> 32);s = (s & 0xffffffff) + (s >> 32);return csum_fold((__u32)s);+}
+static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,__u32 len, __u8 proto, __u32 csum)+{
__u64 sum = csum;int i;+#pragma unroll
for (i = 0; i < 4; i++)sum += (__u32)saddr->in6_u.u6_addr32[i];+#pragma unroll
for (i = 0; i < 4; i++)sum += (__u32)daddr->in6_u.u6_addr32[i];/* Don't combine additions to avoid 32-bit overflow. */sum += bpf_htonl(len);sum += bpf_htonl(proto);sum = (sum & 0xffffffff) + (sum >> 32);sum = (sum & 0xffffffff) + (sum >> 32);return csum_fold((__u32)sum);+}
+static __always_inline __u64 tcp_clock_ns(void) +{
return bpf_ktime_get_ns();+}
+static __always_inline __u32 tcp_ns_to_ts(__u64 ns) +{
return ns / (NSEC_PER_SEC / TCP_TS_HZ);+}
+static __always_inline __u32 tcp_time_stamp_raw(void) +{
return tcp_ns_to_ts(tcp_clock_ns());+}
+struct tcpopt_context {
__u8 *ptr;__u8 *end;void *data_end;__be32 *tsecr;__u8 wscale;bool option_timestamp;bool option_sack;+};
+static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) +{
__u8 opcode, opsize;if (ctx->ptr >= ctx->end)return 1;if (ctx->ptr >= ctx->data_end)return 1;opcode = ctx->ptr[0];if (opcode == TCPOPT_EOL)return 1;if (opcode == TCPOPT_NOP) {++ctx->ptr;return 0;}if (ctx->ptr + 1 >= ctx->end)return 1;if (ctx->ptr + 1 >= ctx->data_end)return 1;opsize = ctx->ptr[1];if (opsize < 2)return 1;if (ctx->ptr + opsize > ctx->end)return 1;switch (opcode) {case TCPOPT_WINDOW:if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end)ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE;break;case TCPOPT_TIMESTAMP:if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) {ctx->option_timestamp = true;/* Client's tsval becomes our tsecr. */*ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2));}break;case TCPOPT_SACK_PERM:if (opsize == TCPOLEN_SACK_PERM)ctx->option_sack = true;break;}ctx->ptr += opsize;return 0;+}
+static int tscookie_tcpopt_parse_batch(__u32 index, void *context) +{
int i;for (i = 0; i < 7; i++)if (tscookie_tcpopt_parse(context))return 1;return 0;+}
+static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
__u16 tcp_len, __be32 *tsval,__be32 *tsecr, void *data_end)+{
struct tcpopt_context loop_ctx = {.ptr = (__u8 *)(tcp_header + 1),.end = (__u8 *)tcp_header + tcp_len,.data_end = data_end,.tsecr = tsecr,.wscale = TS_OPT_WSCALE_MASK,.option_timestamp = false,.option_sack = false,};u32 cookie;bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);if (!loop_ctx.option_timestamp)return false;cookie = tcp_time_stamp_raw() & ~TSMASK;cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;if (loop_ctx.option_sack)cookie |= TS_OPT_SACK;if (tcp_header->ece && tcp_header->cwr)cookie |= TS_OPT_ECN;*tsval = bpf_htonl(cookie);return true;+}
+static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
__u8 *ttl, bool ipv6)+{
__u32 key = 0;__u64 *value;value = bpf_map_lookup_elem(&values, &key);if (value && *value != 0) {if (ipv6)*mss = (*value >> 32) & 0xffff;else*mss = *value & 0xffff;*wscale = (*value >> 16) & 0xf;*ttl = (*value >> 24) & 0xff;return;}*mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;*wscale = DEFAULT_WSCALE;*ttl = DEFAULT_TTL;+}
+static __always_inline void values_inc_synacks(void) +{
__u32 key = 1;__u32 *value;value = bpf_map_lookup_elem(&values, &key);if (value)__sync_fetch_and_add(value, 1);+}
+static __always_inline bool check_port_allowed(__u16 port) +{
__u32 i;for (i = 0; i < MAX_ALLOWED_PORTS; i++) {__u32 key = i;__u16 *value;value = bpf_map_lookup_elem(&allowed_ports, &key);if (!value)break;/* 0 is a terminator value. Check it first to avoid matching on* a forbidden port == 0 and returning true.*/if (*value == 0)break;if (*value == port)return true;}return false;+}
+struct header_pointers {
struct ethhdr *eth;struct iphdr *ipv4;struct ipv6hdr *ipv6;struct tcphdr *tcp;__u16 tcp_len;+};
+static __always_inline int tcp_dissect(void *data, void *data_end,
struct header_pointers *hdr)+{
hdr->eth = data;if (hdr->eth + 1 > data_end)return XDP_DROP;switch (bpf_ntohs(hdr->eth->h_proto)) {case ETH_P_IP:hdr->ipv6 = NULL;hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);if (hdr->ipv4 + 1 > data_end)return XDP_DROP;if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))return XDP_DROP;if (hdr->ipv4->version != 4)return XDP_DROP;if (hdr->ipv4->protocol != IPPROTO_TCP)return XDP_PASS;hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;break;case ETH_P_IPV6:hdr->ipv4 = NULL;hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);if (hdr->ipv6 + 1 > data_end)return XDP_DROP;if (hdr->ipv6->version != 6)return XDP_DROP;/* XXX: Extension headers are not supported and could circumvent* XDP SYN flood protection.*/if (hdr->ipv6->nexthdr != NEXTHDR_TCP)return XDP_PASS;hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);break;default:/* XXX: VLANs will circumvent XDP SYN flood protection. */return XDP_PASS;}if (hdr->tcp + 1 > data_end)return XDP_DROP;hdr->tcp_len = hdr->tcp->doff * 4;if (hdr->tcp_len < sizeof(*hdr->tcp))return XDP_DROP;return XDP_TX;+}
+static __always_inline int tcp_lookup(struct xdp_md *ctx, struct header_pointers *hdr) +{
struct bpf_ct_opts ct_lookup_opts = {.netns_id = BPF_F_CURRENT_NETNS,.l4proto = IPPROTO_TCP,};struct bpf_sock_tuple tup = {};struct nf_conn *ct;__u32 tup_size;if (hdr->ipv4) {/* TCP doesn't normally use fragments, and XDP can't reassemble* them.*/if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))return XDP_DROP;tup.ipv4.saddr = hdr->ipv4->saddr;tup.ipv4.daddr = hdr->ipv4->daddr;tup.ipv4.sport = hdr->tcp->source;tup.ipv4.dport = hdr->tcp->dest;tup_size = sizeof(tup.ipv4);} else if (hdr->ipv6) {__builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));__builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));tup.ipv6.sport = hdr->tcp->source;tup.ipv6.dport = hdr->tcp->dest;tup_size = sizeof(tup.ipv6);} else {/* The verifier can't track that either ipv4 or ipv6 is not* NULL.*/return XDP_ABORTED;}ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));if (ct) {unsigned long status = ct->status;bpf_ct_release(ct);if (status & IPS_CONFIRMED_BIT)return XDP_PASS;} else if (ct_lookup_opts.error != -ENOENT) {return XDP_ABORTED;}/* error == -ENOENT || !(status & IPS_CONFIRMED_BIT) */return XDP_TX;+}
+static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
__u8 wscale)+{
__be32 *start = buf;*buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);if (!tsopt)return buf - start;if (tsopt[0] & bpf_htonl(1 << 4))*buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |(TCPOLEN_SACK_PERM << 16) |(TCPOPT_TIMESTAMP << 8) |TCPOLEN_TIMESTAMP);else*buf++ = bpf_htonl((TCPOPT_NOP << 24) |(TCPOPT_NOP << 16) |(TCPOPT_TIMESTAMP << 8) |TCPOLEN_TIMESTAMP);*buf++ = tsopt[0];*buf++ = tsopt[1];if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))*buf++ = bpf_htonl((TCPOPT_NOP << 24) |(TCPOPT_WINDOW << 16) |(TCPOLEN_WINDOW << 8) |wscale);return buf - start;+}
+static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
__u32 cookie, __be32 *tsopt,__u16 mss, __u8 wscale)+{
void *tcp_options;tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;tcp_header->doff = 5; /* doff is part of tcp_flag_word. */swap(tcp_header->source, tcp_header->dest);tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);tcp_header->seq = bpf_htonl(cookie);tcp_header->window = 0;tcp_header->urg_ptr = 0;tcp_header->check = 0; /* Calculate checksum later. */tcp_options = (void *)(tcp_header + 1);tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);+}
+static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
__u32 cookie, __be32 *tsopt)+{
__u8 wscale;__u16 mss;__u8 ttl;values_get_tcpipopts(&mss, &wscale, &ttl, false);swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);swap(hdr->ipv4->saddr, hdr->ipv4->daddr);hdr->ipv4->check = 0; /* Calculate checksum later. */hdr->ipv4->tos = 0;hdr->ipv4->id = 0;hdr->ipv4->ttl = ttl;tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);hdr->tcp_len = hdr->tcp->doff * 4;hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);+}
+static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
__u32 cookie, __be32 *tsopt)+{
__u8 wscale;__u16 mss;__u8 ttl;values_get_tcpipopts(&mss, &wscale, &ttl, true);swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);swap(hdr->ipv6->saddr, hdr->ipv6->daddr);*(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);hdr->ipv6->hop_limit = ttl;tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);hdr->tcp_len = hdr->tcp->doff * 4;hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);+}
+static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
struct xdp_md *ctx,void *data, void *data_end)+{
__u32 old_pkt_size, new_pkt_size;/* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the* BPF verifier if tsopt is not volatile. Volatile forces it to store* the pointer value and use it directly, otherwise tcp_mkoptions is* (mis)compiled like this:* if (!tsopt)* return buf - start;* reg = stored_return_value_of_tscookie_init;* if (reg)* tsopt = tsopt_buf;* else* tsopt = NULL;* ...* *buf++ = tsopt[1];* It creates a dead branch where tsopt is assigned NULL, but the* verifier can't prove it's dead and blocks the program.*/__be32 * volatile tsopt = NULL;__be32 tsopt_buf[2] = {};__u16 ip_len;__u32 cookie;__s64 value;/* Checksum is not yet verified, but both checksum failure and TCP* header checks return XDP_DROP, so the order doesn't matter.*/if (hdr->tcp->fin || hdr->tcp->rst)return XDP_DROP;/* Issue SYN cookies on allowed ports, drop SYN packets on blocked* ports.*/if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))return XDP_DROP;if (hdr->ipv4) {/* Check the IPv4 and TCP checksums before creating a SYNACK. */value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);if (value < 0)return XDP_ABORTED;if (csum_fold(value) != 0)return XDP_DROP; /* Bad IPv4 checksum. */value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);if (value < 0)return XDP_ABORTED;if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,hdr->tcp_len, IPPROTO_TCP, value) != 0)return XDP_DROP; /* Bad TCP checksum. */ip_len = sizeof(*hdr->ipv4);value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,hdr->tcp_len);} else if (hdr->ipv6) {/* Check the TCP checksum before creating a SYNACK. */value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);if (value < 0)return XDP_ABORTED;if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,hdr->tcp_len, IPPROTO_TCP, value) != 0)return XDP_DROP; /* Bad TCP checksum. 
*/ip_len = sizeof(*hdr->ipv6);value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,hdr->tcp_len);} else {return XDP_ABORTED;}if (value < 0)return XDP_ABORTED;cookie = (__u32)value;if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,&tsopt_buf[0], &tsopt_buf[1], data_end))tsopt = tsopt_buf;/* Check that there is enough space for a SYNACK. It also covers* the check that the destination of the __builtin_memmove below* doesn't overflow.*/if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)return XDP_ABORTED;if (hdr->ipv4) {if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {struct tcphdr *new_tcp_header;new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);__builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));hdr->tcp = new_tcp_header;hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;}tcpv4_gen_synack(hdr, cookie, tsopt);} else if (hdr->ipv6) {tcpv6_gen_synack(hdr, cookie, tsopt);} else {return XDP_ABORTED;}/* Recalculate checksums. */hdr->tcp->check = 0;value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);if (value < 0)return XDP_ABORTED;if (hdr->ipv4) {hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,hdr->ipv4->daddr,hdr->tcp_len,IPPROTO_TCP,value);hdr->ipv4->check = 0;value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);if (value < 0)return XDP_ABORTED;hdr->ipv4->check = csum_fold(value);} else if (hdr->ipv6) {hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,&hdr->ipv6->daddr,hdr->tcp_len,IPPROTO_TCP,value);} else {return XDP_ABORTED;}/* Set the new packet size. */old_pkt_size = data_end - data;new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))return XDP_ABORTED;values_inc_synacks();return XDP_TX;+}
+static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) +{
int err;if (hdr->tcp->rst)return XDP_DROP;if (hdr->ipv4)err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);else if (hdr->ipv6)err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);elsereturn XDP_ABORTED;if (err)return XDP_DROP;return XDP_PASS;+}
+SEC("xdp") +int syncookie_xdp(struct xdp_md *ctx) +{
void *data_end = (void *)(long)ctx->data_end;void *data = (void *)(long)ctx->data;struct header_pointers hdr;__s64 value;int ret;struct bpf_ct_opts ct_lookup_opts = {.netns_id = BPF_F_CURRENT_NETNS,.l4proto = IPPROTO_TCP,};ret = tcp_dissect(data, data_end, &hdr);if (ret != XDP_TX)return ret;ret = tcp_lookup(ctx, &hdr);if (ret != XDP_TX)return ret;/* Packet is TCP and doesn't belong to an established connection. */if ((hdr.tcp->syn ^ hdr.tcp->ack) != 1)return XDP_DROP;/* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr.tcp_len* to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.*/if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr.tcp_len))return XDP_ABORTED;data_end = (void *)(long)ctx->data_end;data = (void *)(long)ctx->data;if (hdr.ipv4) {hdr.eth = data;hdr.ipv4 = (void *)hdr.eth + sizeof(*hdr.eth);/* IPV4_MAXLEN is needed when calculating checksum.* At least sizeof(struct iphdr) is needed here to access ihl.*/if ((void *)hdr.ipv4 + IPV4_MAXLEN > data_end)return XDP_ABORTED;hdr.tcp = (void *)hdr.ipv4 + hdr.ipv4->ihl * 4;} else if (hdr.ipv6) {hdr.eth = data;hdr.ipv6 = (void *)hdr.eth + sizeof(*hdr.eth);hdr.tcp = (void *)hdr.ipv6 + sizeof(*hdr.ipv6);} else {return XDP_ABORTED;}if ((void *)hdr.tcp + TCP_MAXLEN > data_end)return XDP_ABORTED;/* We run out of registers, tcp_len gets spilled to the stack, and the* verifier forgets its min and max values checked above in tcp_dissect.*/hdr.tcp_len = hdr.tcp->doff * 4;if (hdr.tcp_len < sizeof(*hdr.tcp))return XDP_ABORTED;return hdr.tcp->syn ? syncookie_handle_syn(&hdr, ctx, data, data_end) :syncookie_handle_ack(&hdr);+}
+char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/xdp_synproxy.c b/tools/testing/selftests/bpf/xdp_synproxy.c new file mode 100644 index 000000000000..4653d4655b5f --- /dev/null +++ b/tools/testing/selftests/bpf/xdp_synproxy.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+#include <stdnoreturn.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <getopt.h> +#include <signal.h> +#include <sys/types.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <net/if.h> +#include <linux/if_link.h> +#include <linux/limits.h>
/*
 * State shared with the cleanup() signal handler: the interface index passed
 * to bpf_xdp_attach(), and the ID of the XDP program attached by
 * syncookie_attach() (0 while this process has not attached anything).
 */
+static unsigned int ifindex; +static __u32 attached_prog_id;
+static void noreturn cleanup(int sig) +{
DECLARE_LIBBPF_OPTS(bpf_xdp_attach_opts, opts);int prog_fd;int err;if (attached_prog_id == 0)exit(0);prog_fd = bpf_prog_get_fd_by_id(attached_prog_id);if (prog_fd < 0) {fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd));err = bpf_xdp_attach(ifindex, -1, 0, NULL);if (err < 0) {fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err));fprintf(stderr, "Failed to detach XDP program\n");exit(1);}} else {opts.old_prog_fd = prog_fd;err = bpf_xdp_attach(ifindex, -1, XDP_FLAGS_REPLACE, &opts);close(prog_fd);if (err < 0) {fprintf(stderr, "Error: bpf_set_link_xdp_fd_opts: %s\n", strerror(-err));/* Not an error if already replaced by someone else. */if (err != -EEXIST) {fprintf(stderr, "Failed to detach XDP program\n");exit(1);}}}exit(0);+}
/* Print the command-line synopsis to stderr and exit with status 1. */
static noreturn void usage(const char *progname)
{
	fprintf(stderr,
		"Usage: %s [--iface <iface>|--prog <prog_id>] "
		"[--mss4 <mss ipv4> --mss6 <mss ipv6> --wscale <wscale> --ttl <ttl>] "
		"[--ports <port1>,<port2>,...] [--single]\n",
		progname);
	exit(1);
}
/*
 * Parse `arg` as a base-10 unsigned long no greater than `limit`. Any
 * conversion error, trailing garbage, empty string or out-of-range value
 * ends the program through usage().
 */
static unsigned long parse_arg_ul(const char *progname, const char *arg, unsigned long limit)
{
	unsigned long parsed;
	char *tail;
	bool bad;

	errno = 0;
	parsed = strtoul(arg, &tail, 10);

	bad = errno != 0 || *tail != '\0' || arg[0] == '\0' || parsed > limit;
	if (bad)
		usage(progname);

	return parsed;
}
+static void parse_options(int argc, char *argv[], unsigned int *ifindex, __u32 *prog_id,
__u64 *tcpipopts, char **ports, bool *single)+{
static struct option long_options[] = {{ "help", no_argument, NULL, 'h' },{ "iface", required_argument, NULL, 'i' },{ "prog", required_argument, NULL, 'x' },{ "mss4", required_argument, NULL, 4 },{ "mss6", required_argument, NULL, 6 },{ "wscale", required_argument, NULL, 'w' },{ "ttl", required_argument, NULL, 't' },{ "ports", required_argument, NULL, 'p' },{ "single", no_argument, NULL, 's' },{ NULL, 0, NULL, 0 },};unsigned long mss4, mss6, wscale, ttl;unsigned int tcpipopts_mask = 0;if (argc < 2)usage(argv[0]);*ifindex = 0;*prog_id = 0;*tcpipopts = 0;*ports = NULL;*single = false;while (true) {int opt;opt = getopt_long(argc, argv, "", long_options, NULL);if (opt == -1)break;switch (opt) {case 'h':usage(argv[0]);break;case 'i':*ifindex = if_nametoindex(optarg);if (*ifindex == 0)usage(argv[0]);break;case 'x':*prog_id = parse_arg_ul(argv[0], optarg, UINT32_MAX);if (*prog_id == 0)usage(argv[0]);break;case 4:mss4 = parse_arg_ul(argv[0], optarg, UINT16_MAX);tcpipopts_mask |= 1 << 0;break;case 6:mss6 = parse_arg_ul(argv[0], optarg, UINT16_MAX);tcpipopts_mask |= 1 << 1;break;case 'w':wscale = parse_arg_ul(argv[0], optarg, 14);tcpipopts_mask |= 1 << 2;break;case 't':ttl = parse_arg_ul(argv[0], optarg, UINT8_MAX);tcpipopts_mask |= 1 << 3;break;case 'p':*ports = optarg;break;case 's':*single = true;break;default:usage(argv[0]);}}if (optind < argc)usage(argv[0]);if (tcpipopts_mask == 0xf) {if (mss4 == 0 || mss6 == 0 || wscale == 0 || ttl == 0)usage(argv[0]);*tcpipopts = (mss6 << 32) | (ttl << 24) | (wscale << 16) | mss4;} else if (tcpipopts_mask != 0) {usage(argv[0]);}if (*ifindex != 0 && *prog_id != 0)usage(argv[0]);if (*ifindex == 0 && *prog_id == 0)usage(argv[0]);+}
+static int syncookie_attach(const char *argv0, unsigned int ifindex) +{
struct bpf_prog_info info = {};__u32 info_len = sizeof(info);char xdp_filename[PATH_MAX];struct bpf_program *prog;struct bpf_object *obj;int prog_fd;int err;snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv0);obj = bpf_object__open_file(xdp_filename, NULL);err = libbpf_get_error(obj);if (err < 0) {fprintf(stderr, "Error: bpf_object__open_file: %s\n", strerror(-err));return err;}err = bpf_object__load(obj);if (err < 0) {fprintf(stderr, "Error: bpf_object__open_file: %s\n", strerror(-err));return err;}prog = bpf_object__find_program_by_name(obj, "syncookie_xdp");if (!prog) {fprintf(stderr, "Error: bpf_object__find_program_by_name: program syncookie_xdp was not found\n");return -ENOENT;}prog_fd = bpf_program__fd(prog);err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);if (err < 0) {fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err));goto out;}attached_prog_id = info.id;signal(SIGINT, cleanup);signal(SIGTERM, cleanup);err = bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST, NULL);if (err < 0) {fprintf(stderr, "Error: bpf_set_link_xdp_fd: %s\n", strerror(-err));signal(SIGINT, SIG_DFL);signal(SIGTERM, SIG_DFL);attached_prog_id = 0;goto out;}err = 0;+out:
bpf_object__close(obj);return err;+}
+static int syncookie_open_bpf_maps(__u32 prog_id, int *values_map_fd, int *ports_map_fd) +{
struct bpf_prog_info prog_info;__u32 map_ids[8];__u32 info_len;int prog_fd;int err;int i;*values_map_fd = -1;*ports_map_fd = -1;prog_fd = bpf_prog_get_fd_by_id(prog_id);if (prog_fd < 0) {fprintf(stderr, "Error: bpf_prog_get_fd_by_id: %s\n", strerror(-prog_fd));return prog_fd;}prog_info = (struct bpf_prog_info) {.nr_map_ids = 8,.map_ids = (__u64)map_ids,};info_len = sizeof(prog_info);err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);if (err != 0) {fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err));goto out;}if (prog_info.type != BPF_PROG_TYPE_XDP) {fprintf(stderr, "Error: BPF prog type is not BPF_PROG_TYPE_XDP\n");err = -ENOENT;goto out;}if (prog_info.nr_map_ids < 2) {fprintf(stderr, "Error: Found %u BPF maps, expected at least 2\n",prog_info.nr_map_ids);err = -ENOENT;goto out;}for (i = 0; i < prog_info.nr_map_ids; i++) {struct bpf_map_info map_info = {};int map_fd;err = bpf_map_get_fd_by_id(map_ids[i]);if (err < 0) {fprintf(stderr, "Error: bpf_map_get_fd_by_id: %s\n", strerror(-err));goto err_close_map_fds;}map_fd = err;info_len = sizeof(map_info);err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);if (err != 0) {fprintf(stderr, "Error: bpf_obj_get_info_by_fd: %s\n", strerror(-err));close(map_fd);goto err_close_map_fds;}if (strcmp(map_info.name, "values") == 0) {*values_map_fd = map_fd;continue;}if (strcmp(map_info.name, "allowed_ports") == 0) {*ports_map_fd = map_fd;continue;}close(map_fd);}if (*values_map_fd != -1 && *ports_map_fd != -1) {err = 0;goto out;}err = -ENOENT;+err_close_map_fds:
if (*values_map_fd != -1)close(*values_map_fd);if (*ports_map_fd != -1)close(*ports_map_fd);*values_map_fd = -1;*ports_map_fd = -1;+out:
close(prog_fd);return err;+}
+int main(int argc, char *argv[]) +{
int values_map_fd, ports_map_fd;__u64 tcpipopts;bool firstiter;__u64 prevcnt;__u32 prog_id;char *ports;bool single;int err = 0;parse_options(argc, argv, &ifindex, &prog_id, &tcpipopts, &ports, &single);if (prog_id == 0) {err = bpf_xdp_query_id(ifindex, 0, &prog_id);if (err < 0) {fprintf(stderr, "Error: bpf_get_link_xdp_id: %s\n", strerror(-err));goto out;}if (prog_id == 0) {err = syncookie_attach(argv[0], ifindex);if (err < 0)goto out;prog_id = attached_prog_id;}}err = syncookie_open_bpf_maps(prog_id, &values_map_fd, &ports_map_fd);if (err < 0)goto out;if (ports) {__u16 port_last = 0;__u32 port_idx = 0;char *p = ports;fprintf(stderr, "Replacing allowed ports\n");while (p && *p != '\0') {char *token = strsep(&p, ",");__u16 port;port = parse_arg_ul(argv[0], token, UINT16_MAX);err = bpf_map_update_elem(ports_map_fd, &port_idx, &port, BPF_ANY);if (err != 0) {fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err));fprintf(stderr, "Failed to add port %u (index %u)\n",port, port_idx);goto out_close_maps;}fprintf(stderr, "Added port %u\n", port);port_idx++;}err = bpf_map_update_elem(ports_map_fd, &port_idx, &port_last, BPF_ANY);if (err != 0) {fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err));fprintf(stderr, "Failed to add the terminator value 0 (index %u)\n",port_idx);goto out_close_maps;}}if (tcpipopts) {__u32 key = 0;fprintf(stderr, "Replacing TCP/IP options\n");err = bpf_map_update_elem(values_map_fd, &key, &tcpipopts, BPF_ANY);if (err != 0) {fprintf(stderr, "Error: bpf_map_update_elem: %s\n", strerror(-err));goto out_close_maps;}}if ((ports || tcpipopts) && attached_prog_id == 0 && !single)goto out_close_maps;prevcnt = 0;firstiter = true;while (true) {__u32 key = 1;__u64 value;err = bpf_map_lookup_elem(values_map_fd, &key, &value);if (err != 0) {fprintf(stderr, "Error: bpf_map_lookup_elem: %s\n", strerror(-err));goto out_close_maps;}if (firstiter) {prevcnt = value;firstiter = false;}if (single) {printf("Total SYNACKs generated: %llu\n", 
value);break;}printf("SYNACKs generated: %llu (total %llu)\n", value - prevcnt, value);prevcnt = value;sleep(1);}+out_close_maps:
close(values_map_fd);close(ports_map_fd);+out:
return err == 0 ? 0 : 1;+}
2.30.2