When calling socket lookup from L2 (tc, xdp), VRF boundaries aren't respected. This patchset fixes this by regarding the incoming device's VRF attachment when performing the socket lookups from tc/xdp.
The first two patches are coding changes which facilitate this fix by factoring out the tc helper's logic which was shared with cg/sk_skb (which operate correctly).
The third patch contains the actual bugfix.
The fourth patch adds bpf tests for these lookup functions. --- v2: Fixed uninitialized var in test patch (4).
Gilad Sever (4): bpf: factor out socket lookup functions for the TC hookpoint. bpf: Call __bpf_sk_lookup()/__bpf_skc_lookup() directly via TC hookpoint bpf: fix bpf socket lookup from tc/xdp to respect socket VRF bindings selftests/bpf: Add tc_socket_lookup tests
net/core/filter.c | 132 +++++-- .../bpf/prog_tests/tc_socket_lookup.c | 341 ++++++++++++++++++ .../selftests/bpf/progs/tc_socket_lookup.c | 73 ++++ 3 files changed, 525 insertions(+), 21 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c create mode 100644 tools/testing/selftests/bpf/progs/tc_socket_lookup.c
Change BPF helper socket lookup functions to use TC specific variants: bpf_tc_sk_lookup_tcp() / bpf_tc_sk_lookup_udp() / bpf_tc_skc_lookup_tcp() instead of sharing implementation with the cg / sk_skb hooking points. This allows introducing a separate logic for the TC flow.
The tc functions are identical to the original code.
Signed-off-by: Gilad Sever gilad9366@gmail.com --- net/core/filter.c | 63 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 3 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c index 1d6f165923bf..5910956f4e0d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6701,6 +6701,63 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, };
+BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { + .func = bpf_tc_skc_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { + .func = bpf_tc_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, + netns_id, flags); +} + +static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { + .func = bpf_tc_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_sk_release, struct sock *, sk) { if (sk && sk_is_refcounted(sk)) @@ -7954,9 +8011,9 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: - return &bpf_sk_lookup_tcp_proto; + return &bpf_tc_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: - return &bpf_sk_lookup_udp_proto; + return &bpf_tc_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: @@ -7964,7 +8021,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: - return &bpf_skc_lookup_tcp_proto; + return &bpf_tc_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce:
skb->dev always exists in the tc flow. There is no need to use bpf_skc_lookup(), bpf_sk_lookup() from this code path.
This change facilitates fixing the tc flow to be VRF aware.
Signed-off-by: Gilad Sever gilad9366@gmail.com --- net/core/filter.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c index 5910956f4e0d..f43f86fc1235 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6704,8 +6704,12 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, - netns_id, flags); + struct net *caller_net = dev_net(skb->dev); + int ifindex = skb->dev->ifindex; + + return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags); }
static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { @@ -6723,8 +6727,12 @@ static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, - netns_id, flags); + struct net *caller_net = dev_net(skb->dev); + int ifindex = skb->dev->ifindex; + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_TCP, netns_id, + flags); }
static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { @@ -6742,8 +6750,12 @@ static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { - return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, - netns_id, flags); + struct net *caller_net = dev_net(skb->dev); + int ifindex = skb->dev->ifindex; + + return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, + ifindex, IPPROTO_UDP, netns_id, + flags); }
static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
When calling bpf_sk_lookup_tcp(), bpf_sk_lookup_udp() or bpf_skc_lookup_tcp() from tc/xdp ingress, VRF socket bindings aren't respoected, i.e. unbound sockets are returned, and bound sockets aren't found.
VRF binding is determined by the sdif argument to sk_lookup(), however when called from tc the IP SKB control block isn't initialized and thus inet{,6}_sdif() always returns 0.
Fix by calculating sdif for the tc/xdp flows by observing the device's l3 enslaved state.
The cg/sk_skb hooking points which are expected to support inet{,6}_sdif() pass sdif=-1 which makes __bpf_skc_lookup() use the existing logic.
Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Signed-off-by: Gilad Sever gilad9366@gmail.com --- net/core/filter.c | 63 +++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 21 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c index f43f86fc1235..d68e14ff1e55 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6529,12 +6529,11 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, static struct sock * __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) + u64 flags, int sdif) { struct sock *sk = NULL; struct net *net; u8 family; - int sdif;
if (len == sizeof(tuple->ipv4)) family = AF_INET; @@ -6546,10 +6545,12 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out;
- if (family == AF_INET) - sdif = inet_sdif(skb); - else - sdif = inet6_sdif(skb); + if (sdif < 0) { + if (family == AF_INET) + sdif = inet_sdif(skb); + else + sdif = inet6_sdif(skb); + }
if ((s32)netns_id < 0) { net = caller_net; @@ -6569,10 +6570,11 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, static struct sock * __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, - u64 flags) + u64 flags, int sdif) { struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, - ifindex, proto, netns_id, flags); + ifindex, proto, netns_id, flags, + sdif);
if (sk) { struct sock *sk2 = sk_to_full_sk(sk); @@ -6612,7 +6614,7 @@ bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, }
return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, - netns_id, flags); + netns_id, flags, -1); }
static struct sock * @@ -6701,15 +6703,25 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, };
+static int bpf_l2_sdif(const struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (netif_is_l3_slave(dev)) + return dev->ifindex; +#endif + return 0; +} + BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net *caller_net = dev_net(skb->dev); + int sdif = bpf_l2_sdif(skb->dev); int ifindex = skb->dev->ifindex;
return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, - flags); + flags, sdif); }
static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { @@ -6728,11 +6740,12 @@ BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net *caller_net = dev_net(skb->dev); + int sdif = bpf_l2_sdif(skb->dev); int ifindex = skb->dev->ifindex;
return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, - flags); + flags, sdif); }
static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { @@ -6751,11 +6764,12 @@ BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net *caller_net = dev_net(skb->dev); + int sdif = bpf_l2_sdif(skb->dev); int ifindex = skb->dev->ifindex;
return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, - flags); + flags, sdif); }
static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { @@ -6788,11 +6802,13 @@ BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; + struct net_device *dev = ctx->rxq->dev; + int sdif = bpf_l2_sdif(dev); + int ifindex = dev->ifindex;
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, - flags); + flags, sdif); }
static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { @@ -6811,11 +6827,13 @@ BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; + struct net_device *dev = ctx->rxq->dev; + int sdif = bpf_l2_sdif(dev); + int ifindex = dev->ifindex;
return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, - flags); + flags, sdif); }
static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { @@ -6834,11 +6852,13 @@ BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net *caller_net = dev_net(ctx->rxq->dev); - int ifindex = ctx->rxq->dev->ifindex; + struct net_device *dev = ctx->rxq->dev; + int sdif = bpf_l2_sdif(dev); + int ifindex = dev->ifindex;
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, - flags); + flags, sdif); }
static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { @@ -6858,7 +6878,8 @@ BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, { return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, - IPPROTO_TCP, netns_id, flags); + IPPROTO_TCP, netns_id, flags, + -1); }
static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { @@ -6877,7 +6898,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, - netns_id, flags); + netns_id, flags, -1); }
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { @@ -6896,7 +6917,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_UDP, - netns_id, flags); + netns_id, flags, -1); }
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
Verify that socket lookup via TC with all BPF APIs is VRF aware.
Signed-off-by: Gilad Sever gilad9366@gmail.com --- v2: Fix build by initializing vars with -1 --- .../bpf/prog_tests/tc_socket_lookup.c | 341 ++++++++++++++++++ .../selftests/bpf/progs/tc_socket_lookup.c | 73 ++++ 2 files changed, 414 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c create mode 100644 tools/testing/selftests/bpf/progs/tc_socket_lookup.c
diff --git a/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c b/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c new file mode 100644 index 000000000000..5dcaf0ea3f8c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c @@ -0,0 +1,341 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* + * Topology: + * --------- + * NS1 namespace | NS2 namespace + * | + * +--------------+ | +--------------+ + * | veth01 |----------| veth10 | + * | 172.16.1.100 | | | 172.16.1.200 | + * | bpf | | +--------------+ + * +--------------+ | + * server(UDP/TCP) | + * +-------------------+ | + * | vrf1 | | + * | +--------------+ | | +--------------+ + * | | veth02 |----------| veth20 | + * | | 172.16.2.100 | | | | 172.16.2.200 | + * | | bpf | | | +--------------+ + * | +--------------+ | | + * | server(UDP/TCP) | | + * +-------------------+ | + * + * Test flow + * ----------- + * The tests verifies that socket lookup via TC is VRF aware: + * 1) Creates two veth pairs between NS1 and NS2: + * a) veth01 <-> veth10 outside the VRF + * b) veth02 <-> veth20 in the VRF + * 2) Attaches to veth01 and veth02 a program that calls: + * a) bpf_skc_lookup_tcp() with TCP and tcp_skc is true + * b) bpf_sk_lookup_tcp() with TCP and tcp_skc is false + * c) bpf_sk_lookup_udp() with UDP + * The program stores the lookup result in bss->lookup_status. + * 3) Creates a socket TCP/UDP server in/outside the VRF. + * 4) The test expects lookup_status to be: + * a) 0 from device in VRF to server outside VRF + * b) 0 from device outside VRF to server in VRF + * c) 1 from device in VRF to server in VRF + * d) 1 from device outside VRF to server outside VRF + */ + +#include <net/if.h> + +#include "test_progs.h" +#include "network_helpers.h" +#include "tc_socket_lookup.skel.h" + +#define NS1 "tc_socket_lookup_1" +#define NS2 "tc_socket_lookup_2" + +#define IP4_ADDR_VETH01 "172.16.1.100" +#define IP4_ADDR_VETH10 "172.16.1.200" +#define IP4_ADDR_VETH02 "172.16.2.100" +#define IP4_ADDR_VETH20 "172.16.2.200" + +#define NON_VRF_PORT 5000 +#define IN_VRF_PORT 5001 + +#define IO_TIMEOUT_SEC 3 + +#define SYS(fmt, ...) \ + ({ \ + char cmd[1024]; \ + snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \ + if (!ASSERT_OK(system(cmd), cmd)) \ + goto fail; \ + }) + +#define SYS_NOFAIL(fmt, ...) \ + ({ \ + char cmd[1024]; \ + snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \ + system(cmd); \ + }) + +static int make_socket(int sotype, const char *ip, int port, + struct sockaddr_storage *addr) +{ + struct timeval timeo = { .tv_sec = IO_TIMEOUT_SEC }; + int err, fd; + + err = make_sockaddr(AF_INET, ip, port, addr, NULL); + if (!ASSERT_OK(err, "make_address")) + return -1; + + fd = socket(AF_INET, sotype, 0); + if (!ASSERT_OK(fd < 0, "socket")) + return -1; + + err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); + if (!ASSERT_OK(err, "setsockopt(SO_SNDTIMEO)")) + goto fail; + + err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); + if (!ASSERT_OK(err, "setsockopt(SO_RCVTIMEO)")) + goto fail; + + return fd; +fail: + close(fd); + return -1; +} + +static int make_server(int sotype, const char *ip, int port, const char *ifname) +{ + struct sockaddr_storage addr = {}; + const int one = 1; + int err, fd = -1; + + fd = make_socket(sotype, ip, port, &addr); + if (fd < 0) + return -1; + + if (sotype == SOCK_STREAM) { + err = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, + sizeof(one)); + if (!ASSERT_OK(err, "setsockopt(SO_REUSEADDR)")) + goto fail; + } + + if (ifname) { + err = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, + ifname, strlen(ifname) + 1); + if (!ASSERT_OK(err, "setsockopt(SO_BINDTODEVICE)")) + goto fail; + } + + err = bind(fd, (void *)&addr, sizeof(struct sockaddr_in)); + if (!ASSERT_OK(err, "bind")) + goto fail; + + if (sotype == SOCK_STREAM) { + err = listen(fd, SOMAXCONN); + if (!ASSERT_OK(err, "listen")) + goto fail; + } + + return fd; +fail: + close(fd); + return -1; +} + +static int attach_tc_prog(struct bpf_tc_hook *hook, int prog_fd) +{ + LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1, .priority = 1, + .prog_fd = prog_fd); + int ret; + + ret = bpf_tc_hook_create(hook); + if (!ASSERT_OK(ret, "create tc hook")) + return ret; + + if (prog_fd >= 0) { + hook->attach_point = BPF_TC_INGRESS; + ret = bpf_tc_attach(hook, &opts1); + if (!ASSERT_OK(ret, "bpf_tc_attach")) { + bpf_tc_hook_destroy(hook); + return ret; + } + } + return 0; +} + +static void cleanup(void) +{ + SYS_NOFAIL("test -f /var/run/netns/" NS1 " && ip netns delete " + NS1); + SYS_NOFAIL("test -f /var/run/netns/" NS2 " && ip netns delete " + NS2); +} + +static int setup(struct tc_socket_lookup *skel) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); + struct nstoken *nstoken = NULL; + int ifindex, prog_fd, ret = 0; + + SYS("ip netns add " NS1); + SYS("ip netns add " NS2); + + /* NS1 <-> NS2 [veth01 <-> veth10] */ + SYS("ip link add veth01 netns " NS1 " type veth peer name veth10 netns " + NS2); + SYS("ip -net " NS1 " addr add " IP4_ADDR_VETH01 "/24 dev veth01"); + SYS("ip -net " NS1 " link set dev veth01 up"); + SYS("ip -net " NS2 " addr add " IP4_ADDR_VETH10 "/24 dev veth10"); + SYS("ip -net " NS2 " link set dev veth10 up"); + + /* NS1 <-> NS2 [veth02 <-> veth20] */ + SYS("ip link add veth02 netns " NS1 " type veth peer name veth20 netns " + NS2); + SYS("ip -net " NS1 " addr add " IP4_ADDR_VETH02 "/24 dev veth02"); + SYS("ip -net " NS1 " link set dev veth02 up"); + SYS("ip -net " NS2 " addr add " IP4_ADDR_VETH20 "/24 dev veth20"); + SYS("ip -net " NS2 " link set dev veth20 up"); + + /* veth02 -> vrf1 */ + SYS("ip -net " NS1 " link add vrf1 type vrf table 11"); + SYS("ip -net " NS1 " route add vrf vrf1 unreachable default metric " + "4278198272"); + SYS("ip -net " NS1 " link set vrf1 alias vrf"); + SYS("ip -net " NS1 " link set vrf1 up"); + SYS("ip -net " NS1 " link set veth02 master vrf1"); + + /* Attach prog to veth devices in NS1 */ + nstoken = open_netns(NS1); + if (!ASSERT_OK_PTR(nstoken, "setns " NS1)) + goto fail; + prog_fd = bpf_program__fd(skel->progs.test_socket_lookup); + if (!ASSERT_GE(prog_fd, 0, "bpf_program__fd")) + goto fail; + + ifindex = if_nametoindex("veth01"); + if (!ASSERT_NEQ(ifindex, 0, "veth01 ifindex")) + goto fail; + tc_hook.ifindex = ifindex; + if (attach_tc_prog(&tc_hook, prog_fd)) + goto fail; + + ifindex = if_nametoindex("veth02"); + if (!ASSERT_NEQ(ifindex, 0, "veth02 ifindex")) + goto fail; + tc_hook.ifindex = ifindex; + if (attach_tc_prog(&tc_hook, prog_fd)) + goto fail; + goto close; +fail: + ret = -1; +close: + if (nstoken) + close_netns(nstoken); + return ret; +} + +static int test_lookup(struct tc_socket_lookup *skel, int sotype, + const char *ip, int port, bool tcp_skc, + int lookup_status_exp) +{ + static const char msg[] = "Hello Server"; + struct sockaddr_storage addr = {}; + int fd, ret = 0; + + fd = make_socket(sotype, ip, port, &addr); + if (fd < 0) + return -1; + + skel->bss->tcp_skc = tcp_skc; + skel->bss->lookup_status = -1; + + if (sotype == SOCK_STREAM) + connect(fd, (void *)&addr, sizeof(struct sockaddr_in)); + else + sendto(fd, msg, sizeof(msg), 0, (void *)&addr, + sizeof(struct sockaddr_in)); + + if (!ASSERT_EQ(skel->bss->lookup_status, lookup_status_exp, + "lookup_status")) + goto fail; + + goto close; + +fail: + ret = -1; +close: + close(fd); + return ret; +} + +static void _test_tc_socket_lookup(struct tc_socket_lookup *skel, int sotype, + bool tcp_skc) +{ + int in_vrf_server = -1, non_vrf_server = -1; + struct nstoken *nstoken = NULL; + + nstoken = open_netns(NS1); + if (!ASSERT_OK_PTR(nstoken, "setns " NS1)) + goto done; + + /* Open sockets in and outside VRF */ + non_vrf_server = make_server(sotype, "0.0.0.0", NON_VRF_PORT, NULL); + if (!ASSERT_GE(non_vrf_server, 0, "make_server__outside_vrf_fd")) + goto done; + + in_vrf_server = make_server(sotype, "0.0.0.0", IN_VRF_PORT, "veth02"); + if (!ASSERT_GE(in_vrf_server, 0, "make_server__in_vrf_fd")) + goto done; + + /* Perform test from NS2 */ + close_netns(nstoken); + nstoken = open_netns(NS2); + if (!ASSERT_OK_PTR(nstoken, "setns " NS2)) + goto done; + + if (!ASSERT_OK(test_lookup(skel, sotype, IP4_ADDR_VETH02, NON_VRF_PORT, + tcp_skc, 0), "in_to_out")) + goto done; + if (!ASSERT_OK(test_lookup(skel, sotype, IP4_ADDR_VETH02, IN_VRF_PORT, + tcp_skc, 1), "in_to_in")) + goto done; + if (!ASSERT_OK(test_lookup(skel, sotype, IP4_ADDR_VETH01, NON_VRF_PORT, + tcp_skc, 1), "out_to_out")) + goto done; + if (!ASSERT_OK(test_lookup(skel, sotype, IP4_ADDR_VETH01, IN_VRF_PORT, + tcp_skc, 0), "out_to_in")) + goto done; + +done: + if (non_vrf_server >= 0) + close(non_vrf_server); + if (in_vrf_server >= 0) + close(in_vrf_server); + if (nstoken) + close_netns(nstoken); +} + +void test_tc_socket_lookup(void) +{ + struct tc_socket_lookup *skel; + + cleanup(); + + skel = tc_socket_lookup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tc_socket_lookup__open_and_load")) + return; + + if (!ASSERT_OK(setup(skel), "setup")) + goto done; + + if (test__start_subtest("tc_socket_lookup_tcp")) + _test_tc_socket_lookup(skel, SOCK_STREAM, false); + if (test__start_subtest("tc_socket_lookup_tcp_skc")) + _test_tc_socket_lookup(skel, SOCK_STREAM, true); + if (test__start_subtest("tc_socket_lookup_udp")) + _test_tc_socket_lookup(skel, SOCK_DGRAM, false); + +done: + tc_socket_lookup__destroy(skel); + cleanup(); +} diff --git a/tools/testing/selftests/bpf/progs/tc_socket_lookup.c b/tools/testing/selftests/bpf/progs/tc_socket_lookup.c new file mode 100644 index 000000000000..06601eb17807 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tc_socket_lookup.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <linux/ip.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/pkt_cls.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <stdbool.h> + +int lookup_status; +bool tcp_skc; + +#define CUR_NS BPF_F_CURRENT_NETNS + +SEC("tc") +int test_socket_lookup(struct __sk_buff *skb) +{ + struct bpf_sock_tuple *tp; + void *data_end, *data; + struct bpf_sock *sk; + struct ethhdr *eth; + struct iphdr *iph; + int tplen; + + if (skb->protocol != bpf_htons(ETH_P_IP)) + return TC_ACT_UNSPEC; + + tplen = sizeof(tp->ipv4); + + if (bpf_skb_pull_data(skb, sizeof(*eth) + sizeof(*iph) + tplen)) + return TC_ACT_SHOT; + + data_end = (void *)(long)skb->data_end; + data = (void *)(long)skb->data; + + eth = data; + if (eth + 1 > data_end) + return TC_ACT_SHOT; + + iph = (struct iphdr *)(eth + 1); + if (iph + 1 > data_end) + return TC_ACT_SHOT; + + tp = (struct bpf_sock_tuple *)&iph->saddr; + if ((void *)tp + tplen > data_end) + return TC_ACT_SHOT; + + switch (iph->protocol) { + case IPPROTO_TCP: + if (tcp_skc) + sk = bpf_skc_lookup_tcp(skb, tp, tplen, CUR_NS, 0); + else + sk = bpf_sk_lookup_tcp(skb, tp, tplen, CUR_NS, 0); + break; + case IPPROTO_UDP: + sk = bpf_sk_lookup_udp(skb, tp, tplen, CUR_NS, 0); + break; + default: + return TC_ACT_SHOT; + } + + lookup_status = 0; + + if (sk) { + bpf_sk_release(sk); + lookup_status = 1; + } + + return TC_ACT_UNSPEC; +} + +char _license[] SEC("license") = "GPL";
On 04/20, Gilad Sever wrote:
Verify that socket lookup via TC with all BPF APIs is VRF aware.
Signed-off-by: Gilad Sever gilad9366@gmail.com
v2: Fix build by initializing vars with -1
.../bpf/prog_tests/tc_socket_lookup.c | 341 ++++++++++++++++++ .../selftests/bpf/progs/tc_socket_lookup.c | 73 ++++ 2 files changed, 414 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c create mode 100644 tools/testing/selftests/bpf/progs/tc_socket_lookup.c
diff --git a/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c b/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c new file mode 100644 index 000000000000..5dcaf0ea3f8c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c @@ -0,0 +1,341 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
- Topology:
NS1 namespace | NS2 namespace
|
+--------------+ | +--------------+
| veth01 |----------| veth10 |
| 172.16.1.100 | | | 172.16.1.200 |
| bpf | | +--------------+
+--------------+ |
server(UDP/TCP) |
- +-------------------+ |
- | vrf1 | |
- | +--------------+ | | +--------------+
- | | veth02 |----------| veth20 |
- | | 172.16.2.100 | | | | 172.16.2.200 |
- | | bpf | | | +--------------+
- | +--------------+ | |
- | server(UDP/TCP) | |
- +-------------------+ |
- Test flow
- The tests verifies that socket lookup via TC is VRF aware:
- Creates two veth pairs between NS1 and NS2:
a) veth01 <-> veth10 outside the VRF
b) veth02 <-> veth20 in the VRF
- Attaches to veth01 and veth02 a program that calls:
a) bpf_skc_lookup_tcp() with TCP and tcp_skc is true
b) bpf_sk_lookup_tcp() with TCP and tcp_skc is false
c) bpf_sk_lookup_udp() with UDP
The program stores the lookup result in bss->lookup_status.
- Creates a socket TCP/UDP server in/outside the VRF.
- The test expects lookup_status to be:
a) 0 from device in VRF to server outside VRF
b) 0 from device outside VRF to server in VRF
c) 1 from device in VRF to server in VRF
d) 1 from device outside VRF to server outside VRF
- */
+#include <net/if.h>
+#include "test_progs.h" +#include "network_helpers.h" +#include "tc_socket_lookup.skel.h"
+#define NS1 "tc_socket_lookup_1" +#define NS2 "tc_socket_lookup_2"
+#define IP4_ADDR_VETH01 "172.16.1.100" +#define IP4_ADDR_VETH10 "172.16.1.200" +#define IP4_ADDR_VETH02 "172.16.2.100" +#define IP4_ADDR_VETH20 "172.16.2.200"
+#define NON_VRF_PORT 5000 +#define IN_VRF_PORT 5001
+#define IO_TIMEOUT_SEC 3
+#define SYS(fmt, ...) \
- ({ \
char cmd[1024]; \
snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
if (!ASSERT_OK(system(cmd), cmd)) \
goto fail; \
- })
+#define SYS_NOFAIL(fmt, ...) \
- ({ \
char cmd[1024]; \
snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
system(cmd); \
- })
[..]
+static int make_socket(int sotype, const char *ip, int port,
struct sockaddr_storage *addr)
+{
- struct timeval timeo = { .tv_sec = IO_TIMEOUT_SEC };
- int err, fd;
- err = make_sockaddr(AF_INET, ip, port, addr, NULL);
- if (!ASSERT_OK(err, "make_address"))
return -1;
- fd = socket(AF_INET, sotype, 0);
- if (!ASSERT_OK(fd < 0, "socket"))
return -1;
- err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
- if (!ASSERT_OK(err, "setsockopt(SO_SNDTIMEO)"))
goto fail;
- err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
- if (!ASSERT_OK(err, "setsockopt(SO_RCVTIMEO)"))
goto fail;
- return fd;
+fail:
- close(fd);
- return -1;
+}
+static int make_server(int sotype, const char *ip, int port, const char *ifname) +{
- struct sockaddr_storage addr = {};
- const int one = 1;
- int err, fd = -1;
- fd = make_socket(sotype, ip, port, &addr);
- if (fd < 0)
return -1;
- if (sotype == SOCK_STREAM) {
err = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one,
sizeof(one));
if (!ASSERT_OK(err, "setsockopt(SO_REUSEADDR)"))
goto fail;
- }
- if (ifname) {
err = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
ifname, strlen(ifname) + 1);
if (!ASSERT_OK(err, "setsockopt(SO_BINDTODEVICE)"))
goto fail;
- }
- err = bind(fd, (void *)&addr, sizeof(struct sockaddr_in));
- if (!ASSERT_OK(err, "bind"))
goto fail;
- if (sotype == SOCK_STREAM) {
err = listen(fd, SOMAXCONN);
if (!ASSERT_OK(err, "listen"))
goto fail;
- }
- return fd;
+fail:
- close(fd);
- return -1;
+}
Any reason you're not using start_server from network_helpers.h?
On 4/20/23 6:44 PM, Stanislav Fomichev wrote:
On 04/20, Gilad Sever wrote:
Verify that socket lookup via TC with all BPF APIs is VRF aware.
Signed-off-by: Gilad Sever gilad9366@gmail.com
v2: Fix build by initializing vars with -1
Agree with Stan on all the refactoring changes and if we can simplify the set without them. The other item on top of what has been commented is that it would be great to also add test case for XDP as well to ensure that the lookup behavior is consistent for both cases.
Thanks, Daniel
On 20/04/2023 19:44, Stanislav Fomichev wrote:
On 04/20, Gilad Sever wrote:
Verify that socket lookup via TC with all BPF APIs is VRF aware.
Signed-off-by: Gilad Sever gilad9366@gmail.com
v2: Fix build by initializing vars with -1
.../bpf/prog_tests/tc_socket_lookup.c | 341 ++++++++++++++++++ .../selftests/bpf/progs/tc_socket_lookup.c | 73 ++++ 2 files changed, 414 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c create mode 100644 tools/testing/selftests/bpf/progs/tc_socket_lookup.c
diff --git a/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c b/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c new file mode 100644 index 000000000000..5dcaf0ea3f8c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c @@ -0,0 +1,341 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
- Topology:
NS1 namespace | NS2 namespace
|
+--------------+ | +--------------+
| veth01 |----------| veth10 |
| 172.16.1.100 | | | 172.16.1.200 |
| bpf | | +--------------+
+--------------+ |
server(UDP/TCP) |
- +-------------------+ |
- | vrf1 | |
- | +--------------+ | | +--------------+
- | | veth02 |----------| veth20 |
- | | 172.16.2.100 | | | | 172.16.2.200 |
- | | bpf | | | +--------------+
- | +--------------+ | |
- | server(UDP/TCP) | |
- +-------------------+ |
- Test flow
- The tests verifies that socket lookup via TC is VRF aware:
- Creates two veth pairs between NS1 and NS2:
a) veth01 <-> veth10 outside the VRF
b) veth02 <-> veth20 in the VRF
- Attaches to veth01 and veth02 a program that calls:
a) bpf_skc_lookup_tcp() with TCP and tcp_skc is true
b) bpf_sk_lookup_tcp() with TCP and tcp_skc is false
c) bpf_sk_lookup_udp() with UDP
The program stores the lookup result in bss->lookup_status.
- Creates a socket TCP/UDP server in/outside the VRF.
- The test expects lookup_status to be:
a) 0 from device in VRF to server outside VRF
b) 0 from device outside VRF to server in VRF
c) 1 from device in VRF to server in VRF
d) 1 from device outside VRF to server outside VRF
- */
+#include <net/if.h>
+#include "test_progs.h" +#include "network_helpers.h" +#include "tc_socket_lookup.skel.h"
+#define NS1 "tc_socket_lookup_1" +#define NS2 "tc_socket_lookup_2"
+#define IP4_ADDR_VETH01 "172.16.1.100" +#define IP4_ADDR_VETH10 "172.16.1.200" +#define IP4_ADDR_VETH02 "172.16.2.100" +#define IP4_ADDR_VETH20 "172.16.2.200"
+#define NON_VRF_PORT 5000 +#define IN_VRF_PORT 5001
+#define IO_TIMEOUT_SEC 3
+#define SYS(fmt, ...) \
- ({ \
char cmd[1024]; \
snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
if (!ASSERT_OK(system(cmd), cmd)) \
goto fail; \
- })
+#define SYS_NOFAIL(fmt, ...) \
- ({ \
char cmd[1024]; \
snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
system(cmd); \
- })
[..]
+static int make_socket(int sotype, const char *ip, int port,
struct sockaddr_storage *addr)
+{
- struct timeval timeo = { .tv_sec = IO_TIMEOUT_SEC };
- int err, fd;
- err = make_sockaddr(AF_INET, ip, port, addr, NULL);
- if (!ASSERT_OK(err, "make_address"))
return -1;
- fd = socket(AF_INET, sotype, 0);
- if (!ASSERT_OK(fd < 0, "socket"))
return -1;
- err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
- if (!ASSERT_OK(err, "setsockopt(SO_SNDTIMEO)"))
goto fail;
- err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
- if (!ASSERT_OK(err, "setsockopt(SO_RCVTIMEO)"))
goto fail;
- return fd;
+fail:
- close(fd);
- return -1;
+}
+static int make_server(int sotype, const char *ip, int port, const char *ifname) +{
- struct sockaddr_storage addr = {};
- const int one = 1;
- int err, fd = -1;
- fd = make_socket(sotype, ip, port, &addr);
- if (fd < 0)
return -1;
- if (sotype == SOCK_STREAM) {
err = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one,
sizeof(one));
if (!ASSERT_OK(err, "setsockopt(SO_REUSEADDR)"))
goto fail;
- }
- if (ifname) {
err = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
ifname, strlen(ifname) + 1);
if (!ASSERT_OK(err, "setsockopt(SO_BINDTODEVICE)"))
goto fail;
- }
- err = bind(fd, (void *)&addr, sizeof(struct sockaddr_in));
- if (!ASSERT_OK(err, "bind"))
goto fail;
- if (sotype == SOCK_STREAM) {
err = listen(fd, SOMAXCONN);
if (!ASSERT_OK(err, "listen"))
goto fail;
- }
- return fd;
+fail:
- close(fd);
- return -1;
+}
Any reason you're not using start_server from network_helpers.h? It's because I need to bind the server socket to the VRF device.
On Sun, Apr 23, 2023 at 2:31 AM Gilad Sever gilad9366@gmail.com wrote:
On 20/04/2023 19:44, Stanislav Fomichev wrote:
On 04/20, Gilad Sever wrote:
Verify that socket lookup via TC with all BPF APIs is VRF aware.
Signed-off-by: Gilad Sever gilad9366@gmail.com
v2: Fix build by initializing vars with -1
.../bpf/prog_tests/tc_socket_lookup.c | 341 ++++++++++++++++++ .../selftests/bpf/progs/tc_socket_lookup.c | 73 ++++ 2 files changed, 414 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c create mode 100644 tools/testing/selftests/bpf/progs/tc_socket_lookup.c
diff --git a/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c b/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c new file mode 100644 index 000000000000..5dcaf0ea3f8c --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c @@ -0,0 +1,341 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
- Topology:
NS1 namespace | NS2 namespace
|
+--------------+ | +--------------+
| veth01 |----------| veth10 |
| 172.16.1.100 | | | 172.16.1.200 |
| bpf | | +--------------+
+--------------+ |
server(UDP/TCP) |
- +-------------------+ |
- | vrf1 | |
- | +--------------+ | | +--------------+
- | | veth02 |----------| veth20 |
- | | 172.16.2.100 | | | | 172.16.2.200 |
- | | bpf | | | +--------------+
- | +--------------+ | |
- | server(UDP/TCP) | |
- +-------------------+ |
- Test flow
- The tests verifies that socket lookup via TC is VRF aware:
- Creates two veth pairs between NS1 and NS2:
a) veth01 <-> veth10 outside the VRF
b) veth02 <-> veth20 in the VRF
- Attaches to veth01 and veth02 a program that calls:
a) bpf_skc_lookup_tcp() with TCP and tcp_skc is true
b) bpf_sk_lookup_tcp() with TCP and tcp_skc is false
c) bpf_sk_lookup_udp() with UDP
The program stores the lookup result in bss->lookup_status.
- Creates a socket TCP/UDP server in/outside the VRF.
- The test expects lookup_status to be:
a) 0 from device in VRF to server outside VRF
b) 0 from device outside VRF to server in VRF
c) 1 from device in VRF to server in VRF
d) 1 from device outside VRF to server outside VRF
- */
+#include <net/if.h>
+#include "test_progs.h" +#include "network_helpers.h" +#include "tc_socket_lookup.skel.h"
+#define NS1 "tc_socket_lookup_1" +#define NS2 "tc_socket_lookup_2"
+#define IP4_ADDR_VETH01 "172.16.1.100" +#define IP4_ADDR_VETH10 "172.16.1.200" +#define IP4_ADDR_VETH02 "172.16.2.100" +#define IP4_ADDR_VETH20 "172.16.2.200"
+#define NON_VRF_PORT 5000 +#define IN_VRF_PORT 5001
+#define IO_TIMEOUT_SEC 3
+#define SYS(fmt, ...) \
- ({ \
char cmd[1024]; \
snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
if (!ASSERT_OK(system(cmd), cmd)) \
goto fail; \
- })
+#define SYS_NOFAIL(fmt, ...) \
- ({ \
char cmd[1024]; \
snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
system(cmd); \
- })
[..]
+static int make_socket(int sotype, const char *ip, int port,
struct sockaddr_storage *addr)
+{
- struct timeval timeo = { .tv_sec = IO_TIMEOUT_SEC };
- int err, fd;
- err = make_sockaddr(AF_INET, ip, port, addr, NULL);
- if (!ASSERT_OK(err, "make_address"))
return -1;
- fd = socket(AF_INET, sotype, 0);
- if (!ASSERT_OK(fd < 0, "socket"))
return -1;
- err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
- if (!ASSERT_OK(err, "setsockopt(SO_SNDTIMEO)"))
goto fail;
- err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
- if (!ASSERT_OK(err, "setsockopt(SO_RCVTIMEO)"))
goto fail;
- return fd;
+fail:
- close(fd);
- return -1;
+}
+static int make_server(int sotype, const char *ip, int port, const char *ifname) +{
- struct sockaddr_storage addr = {};
- const int one = 1;
- int err, fd = -1;
- fd = make_socket(sotype, ip, port, &addr);
- if (fd < 0)
return -1;
- if (sotype == SOCK_STREAM) {
err = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one,
sizeof(one));
if (!ASSERT_OK(err, "setsockopt(SO_REUSEADDR)"))
goto fail;
- }
- if (ifname) {
err = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
ifname, strlen(ifname) + 1);
if (!ASSERT_OK(err, "setsockopt(SO_BINDTODEVICE)"))
goto fail;
- }
- err = bind(fd, (void *)&addr, sizeof(struct sockaddr_in));
- if (!ASSERT_OK(err, "bind"))
goto fail;
- if (sotype == SOCK_STREAM) {
err = listen(fd, SOMAXCONN);
if (!ASSERT_OK(err, "listen"))
goto fail;
- }
- return fd;
+fail:
- close(fd);
- return -1;
+}
Any reason you're not using start_server from network_helpers.h? It's because I need to bind the server socket to the VRF device.
I see, thanks, so it's the SO_BINDTODEVICE part. Looks generic enough to belong to network_helpers.h. WDYT? Does it make sense to extend __start_server to support it? Or have a new separate network_helper for this?
On 04/20, Gilad Sever wrote:
When calling socket lookup from L2 (tc, xdp), VRF boundaries aren't respected. This patchset fixes this by regarding the incoming device's VRF attachment when performing the socket lookups from tc/xdp.
The first two patches are coding changes which facilitate this fix by factoring out the tc helper's logic which was shared with cg/sk_skb (which operate correctly).
Why is not relevant for cgroup/egress? Is it already running with the correct device?
Also, do we really need all this refactoring and separate paths? Can we just add that bpf_l2_sdif part to the existing code? It will trigger for tc, but I'm assuming it will be a no-op for cgroup path?
And regarding bpf_l2_sdif: seems like it's really generic and should probably be called something like dev_sdif?
The third patch contains the actual bugfix.
The fourth patch adds bpf tests for these lookup functions.
v2: Fixed uninitialized var in test patch (4).
Gilad Sever (4): bpf: factor out socket lookup functions for the TC hookpoint. bpf: Call __bpf_sk_lookup()/__bpf_skc_lookup() directly via TC hookpoint bpf: fix bpf socket lookup from tc/xdp to respect socket VRF bindings selftests/bpf: Add tc_socket_lookup tests
net/core/filter.c | 132 +++++-- .../bpf/prog_tests/tc_socket_lookup.c | 341 ++++++++++++++++++ .../selftests/bpf/progs/tc_socket_lookup.c | 73 ++++ 3 files changed, 525 insertions(+), 21 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c create mode 100644 tools/testing/selftests/bpf/progs/tc_socket_lookup.c
-- 2.34.1
On 20/04/2023 19:42, Stanislav Fomichev wrote:
On 04/20, Gilad Sever wrote:
When calling socket lookup from L2 (tc, xdp), VRF boundaries aren't respected. This patchset fixes this by regarding the incoming device's VRF attachment when performing the socket lookups from tc/xdp.
The first two patches are coding changes which facilitate this fix by factoring out the tc helper's logic which was shared with cg/sk_skb (which operate correctly).
Why is not relevant for cgroup/egress? Is it already running with the correct device?
Yes.
Also, do we really need all this refactoring and separate paths? Can we just add that bpf_l2_sdif part to the existing code? It will trigger for tc, but I'm assuming it will be a no-op for cgroup path?
The reason we preferred the refactoring is to avoid triggering `inet_sdif()` from tc/xdp. This is because in our understanding, the IPCB is undefined before IP processing so it seems incorrect to use `inet_sdif()` from tc/xdp.
We did come up with a different option which could spare most of the refactoring and still partially separate the two paths:
Pass sdif to __bpf_skc_lookup() but instead of using different functions for tc, calculate sdif by calling `dev_sdif()` in bpf_skc_lookup() only when netif_is_l3_master() is false. In other words:
- xdp callers would check the device's l3 enslaved state using the new `dev_sdif()` - sock_addr callers would use inet{,6}_sdif() as they did before - cg/tc share the same code path, so when netif_is_l3_master() is true use inet{,6}_sdif() and when it is false use dev_sdif(). this relies on the following assumptions: - tc programs don't run on l3 master devices - cgroup callers never see l3 enslaved devices - inet{,6}_sdif() isn't relevant for non l3 master devices
In our opinion, it's safer to factor out the tc flow as in the patchset, similar to XDP which has its own functions.
What do you think?
And regarding bpf_l2_sdif: seems like it's really generic and should probably be called something like dev_sdif?
Agreed. I'll rename in the next patch.
The third patch contains the actual bugfix.
The fourth patch adds bpf tests for these lookup functions.
v2: Fixed uninitialized var in test patch (4).
Gilad Sever (4): bpf: factor out socket lookup functions for the TC hookpoint. bpf: Call __bpf_sk_lookup()/__bpf_skc_lookup() directly via TC hookpoint bpf: fix bpf socket lookup from tc/xdp to respect socket VRF bindings selftests/bpf: Add tc_socket_lookup tests
net/core/filter.c | 132 +++++-- .../bpf/prog_tests/tc_socket_lookup.c | 341 ++++++++++++++++++ .../selftests/bpf/progs/tc_socket_lookup.c | 73 ++++ 3 files changed, 525 insertions(+), 21 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/tc_socket_lookup.c create mode 100644 tools/testing/selftests/bpf/progs/tc_socket_lookup.c
-- 2.34.1
On Sun, Apr 23, 2023 at 4:41 AM Gilad Sever gilad9366@gmail.com wrote:
On 20/04/2023 19:42, Stanislav Fomichev wrote:
On 04/20, Gilad Sever wrote:
When calling socket lookup from L2 (tc, xdp), VRF boundaries aren't respected. This patchset fixes this by regarding the incoming device's VRF attachment when performing the socket lookups from tc/xdp.
The first two patches are coding changes which facilitate this fix by factoring out the tc helper's logic which was shared with cg/sk_skb (which operate correctly).
Why is not relevant for cgroup/egress? Is it already running with the correct device?
Yes.
Also, do we really need all this refactoring and separate paths? Can we just add that bpf_l2_sdif part to the existing code? It will trigger for tc, but I'm assuming it will be a no-op for cgroup path?
The reason we preferred the refactoring is to avoid triggering `inet_sdif()` from tc/xdp. This is because in our understanding, the IPCB is undefined before IP processing so it seems incorrect to use `inet_sdif()` from tc/xdp.
We did come up with a different option which could spare most of the refactoring and still partially separate the two paths:
Pass sdif to __bpf_skc_lookup() but instead of using different functions for tc, calculate sdif by calling `dev_sdif()` in bpf_skc_lookup() only when netif_is_l3_master() is false. In other words:
[..]
- xdp callers would check the device's l3 enslaved state using the new
`dev_sdif()`
- sock_addr callers would use inet{,6}_sdif() as they did before
- cg/tc share the same code path, so when netif_is_l3_master() is true use inet{,6}_sdif() and when it is false use dev_sdif(). this relies
on the following assumptions:
- tc programs don't run on l3 master devices
- cgroup callers never see l3 enslaved devices
- inet{,6}_sdif() isn't relevant for non l3 master devices
Yeah, that's what I was assuming we should be able to do.. But we probably need somebody who understands this part better than me to say whether the above are safe..
If nobody comments, ignore me and do a v2 with your original approach.
On 4/24/23 11:06 AM, Stanislav Fomichev wrote:
- xdp callers would check the device's l3 enslaved state using the new
`dev_sdif()`
- sock_addr callers would use inet{,6}_sdif() as they did before
- cg/tc share the same code path, so when netif_is_l3_master() is true use inet{,6}_sdif() and when it is false use dev_sdif(). this relies
on the following assumptions:
- tc programs don't run on l3 master devices
this can happen, but I am not sure how prevalent a use case.
- cgroup callers never see l3 enslaved devices
egress definitely, not sure on ingress. The code resets the skb->dev back to the original device in a lot of places in the ip/ipv6 code now. And ipv6 brings up LLAs and those did not get the device switch so it could be fairly common.
- inet{,6}_sdif() isn't relevant for non l3 master devices
sdif should be 0 and not matched if a netdev is not a l3mdev port.
BTW, in skimming the patches, I noticed patch 3 has bpf_l2_sdif which seems an odd name to me. It returns a layer 3 device index, not a layer 2 which would be a bridge port. I would stick to the l3 naming for consistency.
Yeah, that's what I was assuming we should be able to do.. But we probably need somebody who understands this part better than me to say whether the above are safe..
If nobody comments, ignore me and do a v2 with your original approach.
linux-kselftest-mirror@lists.linaro.org