If a packet needs to be encapsulated towards a local destination IP and a VXLAN device that matches the destination port and VNI exists, then the packet will be injected into the Rx path as if it was received by the target VXLAN device without undergoing encapsulation. If such a device does not exist, the packet will be dropped.
There are scenarios where we do not want to drop such packets and instead want to let them be encapsulated and locally received by a user space program that post-processes these VXLAN packets.
To that end, add a new VXLAN device attribute that controls whether such packets are dropped or not. When set ("localbypass") these packets are dropped and when unset ("nolocalbypass") the packets are encapsulated and locally delivered to the listening user space application. Default to "localbypass" to maintain existing behavior.
Signed-off-by: Vladimir Nikishkin vladimir@nikishkin.pw --- drivers/net/vxlan/vxlan_core.c | 24 +++++++++++++++++++++--- include/net/vxlan.h | 4 +++- include/uapi/linux/if_link.h | 1 + 3 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 561fe1b314f5..ede98b879257 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2355,11 +2355,13 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { struct vxlan_dev *dst_vxlan;
- dst_release(dst); dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, daddr->sa.sa_family, dst_port, vxlan->cfg.flags); if (!dst_vxlan) { + if (!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS)) + return 0; + dst_release(dst); dev->stats.tx_errors++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); @@ -2367,6 +2369,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
return -ENOENT; } + dst_release(dst); vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); return 1; } @@ -2568,7 +2571,6 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
if (!info) { u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; - err = encap_bypass_if_local(skb, dev, vxlan, dst, dst_port, ifindex, vni, ndst, rt6i_flags); @@ -3172,6 +3174,7 @@ static void vxlan_raw_setup(struct net_device *dev) }
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { + [IFLA_VXLAN_UNSPEC] = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS }, [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, @@ -3202,6 +3205,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, + [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), };
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4011,6 +4015,17 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; }
+ if (data[IFLA_VXLAN_LOCALBYPASS]) { + err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS, + VXLAN_F_LOCALBYPASS, changelink, + true, extack); + if (err) + return err; + } else if (!changelink) { + /* default to local bypass on a new device */ + conf->flags |= VXLAN_F_LOCALBYPASS; + } + if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, @@ -4232,6 +4247,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ 0; }
@@ -4308,7 +4324,9 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, - !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX))) + !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || + nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, + !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) goto nla_put_failure;
if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 20bd7d893e10..0be91ca78d3a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -328,6 +328,7 @@ struct vxlan_dev { #define VXLAN_F_TTL_INHERIT 0x10000 #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 +#define VXLAN_F_LOCALBYPASS 0x80000
/* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -348,7 +349,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ - VXLAN_F_VNIFILTER) + VXLAN_F_VNIFILTER | \ + VXLAN_F_LOCALBYPASS)
struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4ac1000b0ef2..0f6a0fe09bdb 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -828,6 +828,7 @@ enum { IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ + IFLA_VXLAN_LOCALBYPASS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
Add test to make sure that the localbypass option is on by default.
Add test to change vxlan localbypass to nolocalbypass and check that packets are delivered to userspace.
Signed-off-by: Vladimir Nikishkin vladimir@nikishkin.pw --- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/test_vxlan_nolocalbypass.sh | 234 ++++++++++++++++++ 2 files changed, 235 insertions(+) create mode 100755 tools/testing/selftests/net/test_vxlan_nolocalbypass.sh
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index c12df57d5539..7f3ab2a93ed6 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -84,6 +84,7 @@ TEST_GEN_FILES += ip_local_port_range TEST_GEN_FILES += bind_wildcard TEST_PROGS += test_vxlan_mdb.sh TEST_PROGS += test_bridge_neigh_suppress.sh +TEST_PROGS += test_vxlan_nolocalbypass.sh
TEST_FILES := settings
diff --git a/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh new file mode 100755 index 000000000000..d8e48ab1e7e0 --- /dev/null +++ b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh @@ -0,0 +1,234 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This file is testing that the [no]localbypass option for a vxlan device is +# working. With the nolocalbypass option, packets to a local destination, which +# have no corresponding vxlan in the kernel, will be delivered to userspace, for +# any userspace process to process. In this test tcpdump plays the role of such a +# process. This is what the test 1 is checking. +# The test 2 checks that without the nolocalbypass (which is equivalent to the +# localbypass option), the packets do not reach userspace. + +EXIT_SUCCESS=0 +EXIT_FAIL=1 +ksft_skip=4 +nsuccess=0 +nfail=0 + +ret=0 + +TESTS=" +changelink_nolocalbypass_simple +" +VERBOSE=0 +PAUSE_ON_FAIL=no +PAUSE=no + + +NETNS_NAME=vxlan_nolocalbypass_test + +################################################################################ +# Utilities + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "$VERBOSE" = "1" ]; then + echo " rc=$rc, expected $expected" + fi + + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi + + if [ "${PAUSE}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + + [ "$VERBOSE" = "1" ] && echo +} + +run_cmd() +{ + local cmd="$1" + local out + local stderr="2>/dev/null" + + if [ "$VERBOSE" = "1" ]; then + printf "COMMAND: $cmd\n" + stderr= + fi + + out=$(eval $cmd $stderr) + rc=$? + if [ "$VERBOSE" = "1" -a -n "$out" ]; then + echo " $out" + fi + + return $rc +} + +socat_check_packets() +{ + echo TODO + exit 1 +} + +################################################################################ +# Setup + +setup() +{ + ip netns add "$NETNS_NAME" + ip -n "$NETNS_NAME" link set up lo + ip -n "$NETNS_NAME" addr add 127.0.0.1 dev lo +} + +cleanup() +{ + ip netns del "$NETNS_NAME" +} + + +################################################################################ +# Tests + +changelink_nolocalbypass_simple() +{ + # test 1: by default, packets are dropped + + run_cmd "ip -n $NETNS_NAME link add testvxlan0 type vxlan \ + id 100 \ + dstport 4789 \ + srcport 4789 4790 \ + nolearning noproxy" + log_test $? 0 "Create vxlan with localbypass by default" + run_cmd "ip -n $NETNS_NAME link set up dev testvxlan0" + log_test $? 0 "Bring up vxlan device" + run_cmd "bridge -n $NETNS_NAME fdb add 00:00:00:00:00:00 dev testvxlan0 dst 127.0.0.1 port 4792" + log_test $? 0 "Add the most general fdb entry" + run_cmd "ip -n $NETNS_NAME address add 172.16.100.1/24 dev testvxlan0" + + local tmp_file="$(mktemp)" + ip netns exec $NETNS_NAME socat UDP4-LISTEN:4792,fork "$tmp_file" & + + run_cmd "ip netns exec $NETNS_NAME timeout 3 ping 172.16.100.2" + + l_size=$(stat -c '%s' "$tmp_file" | tr -d '\n') + log_test $l_size 0 " Packets dropped by default." + + { kill %% && wait %%; } 2>/dev/null + rm -rf "$tmp_file" + touch "$tmp_file" + # test 2: nolocalbypass works + + run_cmd "ip -n $NETNS_NAME link set testvxlan0 type vxlan nolocalbypass" + + ip netns exec $NETNS_NAME socat UDP4-LISTEN:4792,fork "$tmp_file" & + sleep 1 + run_cmd "ip netns exec $NETNS_NAME timeout 3 ping 172.16.100.2" + + l_size=$(stat -c '%s' "$tmp_file" | tr -d '\n') + if [[ "$l_size" != 0 ]] ; then + log_test 1 1 " Packets dropped by default." + else + log_test 0 1 " Packets dropped by default." + fi + + run_cmd "ip -n $NETNS_NAME link del dev testvxlan0 1>/dev/null 2>&1" + + { kill %% && wait %%; } 2>/dev/null + rm -rf "$tmp_file" + +} + +################################################################################ +# Usage + +usage() +{ + cat <<EOF +usage: ${0##*/} OPTS + + -t <test> Test(s) to run (default: all) + (options: $TESTS) + -p Pause on fail + -P Pause after each test before cleanup + -v Verbose mode (show commands and output) +EOF +} + +################################################################################ +# Main + +trap cleanup EXIT + +while getopts ":t:pPvh" opt; do + case $opt in + t) TESTS=$OPTARG ;; + p) PAUSE_ON_FAIL=yes;; + P) PAUSE=yes;; + v) VERBOSE=$(($VERBOSE + 1));; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +# Make sure we don't pause twice. +[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip; +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +if [ ! -x "$(command -v bridge)" ]; then + echo "SKIP: Could not run test without bridge tool" + exit $ksft_skip +fi +if [ ! -x "$(command -v socat)" ]; then + echo "socat command not found. Skipping test" + return 1 +fi + +ip link help vxlan 2>&1 | grep -q "localbypass" +if [ $? -ne 0 ]; then + echo "SKIP: iproute2 ip too old, missing VXLAN nolocalbypass support" + exit $ksft_skip +fi + +cleanup + +for t in $TESTS +do + setup; $t; cleanup; +done + +if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} +fi + +exit $ret
On Tue, 2023-05-02 at 00:25 +0800, Vladimir Nikishkin wrote:
Add test to make sure that the localbypass option is on by default.
Add test to change vxlan localbypass to nolocalbypass and check that packets are delivered to userspace.
Signed-off-by: Vladimir Nikishkin vladimir@nikishkin.pw
tools/testing/selftests/net/Makefile | 1 + .../selftests/net/test_vxlan_nolocalbypass.sh | 234 ++++++++++++++++++ 2 files changed, 235 insertions(+) create mode 100755 tools/testing/selftests/net/test_vxlan_nolocalbypass.sh
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index c12df57d5539..7f3ab2a93ed6 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -84,6 +84,7 @@ TEST_GEN_FILES += ip_local_port_range TEST_GEN_FILES += bind_wildcard TEST_PROGS += test_vxlan_mdb.sh TEST_PROGS += test_bridge_neigh_suppress.sh +TEST_PROGS += test_vxlan_nolocalbypass.sh TEST_FILES := settings diff --git a/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh new file mode 100755 index 000000000000..d8e48ab1e7e0 --- /dev/null +++ b/tools/testing/selftests/net/test_vxlan_nolocalbypass.sh @@ -0,0 +1,234 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0
+# This file is testing that the [no]localbypass option for a vxlan device is +# working. With the nolocalbypass option, packets to a local destination, which +# have no corresponding vxlan in the kernel, will be delivered to userspace, for +# any userspace process to process. In this test tcpdump plays the role of such a +# process. This is what the test 1 is checking. +# The test 2 checks that without the nolocalbypass (which is equivalent to the +# localbypass option), the packets do not reach userspace.
+EXIT_SUCCESS=0 +EXIT_FAIL=1 +ksft_skip=4 +nsuccess=0 +nfail=0
+ret=0
+TESTS=" +changelink_nolocalbypass_simple +" +VERBOSE=0 +PAUSE_ON_FAIL=no +PAUSE=no
+NETNS_NAME=vxlan_nolocalbypass_test
+################################################################################ +# Utilities
+log_test() +{
- local rc=$1
- local expected=$2
- local msg="$3"
- if [ ${rc} -eq ${expected} ]; then
printf "TEST: %-60s [ OK ]\n" "${msg}"
nsuccess=$((nsuccess+1))
- else
ret=1
nfail=$((nfail+1))
printf "TEST: %-60s [FAIL]\n" "${msg}"
if [ "$VERBOSE" = "1" ]; then
echo " rc=$rc, expected $expected"
fi
if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
echo
echo "hit enter to continue, 'q' to quit"
read a
[ "$a" = "q" ] && exit 1
fi
- fi
- if [ "${PAUSE}" = "yes" ]; then
echo
echo "hit enter to continue, 'q' to quit"
read a
[ "$a" = "q" ] && exit 1
- fi
- [ "$VERBOSE" = "1" ] && echo
+}
+run_cmd() +{
- local cmd="$1"
- local out
- local stderr="2>/dev/null"
- if [ "$VERBOSE" = "1" ]; then
printf "COMMAND: $cmd\n"
stderr=
- fi
- out=$(eval $cmd $stderr)
- rc=$?
- if [ "$VERBOSE" = "1" -a -n "$out" ]; then
echo " $out"
- fi
- return $rc
+}
+socat_check_packets() +{
- echo TODO
- exit 1
Minor nit: please use a consistent number of spaces to indent e.g. 4
Note that net-next is currently close, you should submit the next revision when net-next reopens after May 8th.
Cheers,
Paolo
On Tue, May 02, 2023 at 12:25:30AM +0800, Vladimir Nikishkin wrote:
Add test to make sure that the localbypass option is on by default.
Add test to change vxlan localbypass to nolocalbypass and check that packets are delivered to userspace.
What do you think about this version [1]? I ended up removing the socat usage because it was unnecessarily complicated (sorry). Note that this test does not pass without the diff I posted earlier [2].
Without the diff, "nolocalbypass" basically means "Perform a bypass only if there is a matching local VXLAN device, otherwise encapsulate the packet and deliver it locally".
With the diff, "nolocalbypass" means "Never perform a bypass, encapsulate the packet and deliver it locally".
I think my definition better suits the "nolocalbypass" name. It also means that user space see consistent behavior: Encapsulated packets are always visible on the loopback device, regardless if there is a matching local VXLAN device.
It is true that with or without the diff packets will end up in the local VXLAN device, assuming one exists.
[1] #!/bin/bash # SPDX-License-Identifier: GPL-2.0
# This test is for checking the [no]localbypass VXLAN device option. The test # configures two VXLAN devices in the same network namespace and a tc filter on # the loopback device that drops encapsulated packets. The test sends packets # from the first VXLAN device and verifies that by default these packets are # received by the second VXLAN device. The test then enables the nolocalbypass # option and verifies that packets are no longer received by the second VXLAN # device.
ret=0 # Kselftest framework requirement - SKIP code is 4. ksft_skip=4
TESTS=" nolocalbypass " VERBOSE=0 PAUSE_ON_FAIL=no PAUSE=no
################################################################################ # Utilities
log_test() { local rc=$1 local expected=$2 local msg="$3"
if [ ${rc} -eq ${expected} ]; then printf "TEST: %-60s [ OK ]\n" "${msg}" nsuccess=$((nsuccess+1)) else ret=1 nfail=$((nfail+1)) printf "TEST: %-60s [FAIL]\n" "${msg}" if [ "$VERBOSE" = "1" ]; then echo " rc=$rc, expected $expected" fi
if [ "${PAUSE_ON_FAIL}" = "yes" ]; then echo echo "hit enter to continue, 'q' to quit" read a [ "$a" = "q" ] && exit 1 fi fi
if [ "${PAUSE}" = "yes" ]; then echo echo "hit enter to continue, 'q' to quit" read a [ "$a" = "q" ] && exit 1 fi
[ "$VERBOSE" = "1" ] && echo }
run_cmd() { local cmd="$1" local out local stderr="2>/dev/null"
if [ "$VERBOSE" = "1" ]; then printf "COMMAND: $cmd\n" stderr= fi
out=$(eval $cmd $stderr) rc=$? if [ "$VERBOSE" = "1" -a -n "$out" ]; then echo " $out" fi
return $rc }
tc_check_packets() { local ns=$1; shift local id=$1; shift local handle=$1; shift local count=$1; shift local pkts
sleep 0.1 pkts=$(tc -n $ns -j -s filter show $id \ | jq ".[] | select(.options.handle == $handle) | \ .options.actions[0].stats.packets") [[ $pkts == $count ]] }
################################################################################ # Setup
setup() { ip netns add ns1
ip -n ns1 link set dev lo up ip -n ns1 address add 192.0.2.1/32 dev lo ip -n ns1 address add 198.51.100.1/32 dev lo
ip -n ns1 link add name vx0 up type vxlan id 100 local 198.51.100.1 \ dstport 4789 nolearning ip -n ns1 link add name vx1 up type vxlan id 100 dstport 4790 }
cleanup() { ip netns del ns1 &> /dev/null }
################################################################################ # Tests
nolocalbypass() { local smac=00:01:02:03:04:05 local dmac=00:0a:0b:0c:0d:0e
run_cmd "bridge -n ns1 fdb add $dmac dev vx0 self static dst 192.0.2.1 port 4790"
run_cmd "tc -n ns1 qdisc add dev vx1 clsact" run_cmd "tc -n ns1 filter add dev vx1 ingress pref 1 handle 101 proto all flower src_mac $smac dst_mac $dmac action pass"
run_cmd "tc -n ns1 qdisc add dev lo clsact" run_cmd "tc -n ns1 filter add dev lo ingress pref 1 handle 101 proto ip flower ip_proto udp dst_port 4790 action drop"
run_cmd "ip -n ns1 -d link show dev vx0 | grep ' localbypass'" log_test $? 0 "localbypass enabled"
run_cmd "ip netns exec ns1 mausezahn vx0 -a $smac -b $dmac -c 1 -p 100 -q"
tc_check_packets "ns1" "dev vx1 ingress" 101 1 log_test $? 0 "Packet received by local VXLAN device - localbypass"
run_cmd "ip -n ns1 link set dev vx0 type vxlan nolocalbypass"
run_cmd "ip -n ns1 -d link show dev vx0 | grep 'nolocalbypass'" log_test $? 0 "localbypass disabled"
run_cmd "ip netns exec ns1 mausezahn vx0 -a $smac -b $dmac -c 1 -p 100 -q"
tc_check_packets "ns1" "dev vx1 ingress" 101 1 log_test $? 0 "Packet not received by local VXLAN device - nolocalbypass"
run_cmd "ip -n ns1 link set dev vx0 type vxlan localbypass"
run_cmd "ip -n ns1 -d link show dev vx0 | grep ' localbypass'" log_test $? 0 "localbypass enabled"
run_cmd "ip netns exec ns1 mausezahn vx0 -a $smac -b $dmac -c 1 -p 100 -q"
tc_check_packets "ns1" "dev vx1 ingress" 101 2 log_test $? 0 "Packet received by local VXLAN device - localbypass" }
################################################################################ # Usage
usage() { cat <<EOF usage: ${0##*/} OPTS
-t <test> Test(s) to run (default: all) (options: $TESTS) -p Pause on fail -P Pause after each test before cleanup -v Verbose mode (show commands and output) EOF }
################################################################################ # Main
trap cleanup EXIT
while getopts ":t:pPvh" opt; do case $opt in t) TESTS=$OPTARG ;; p) PAUSE_ON_FAIL=yes;; P) PAUSE=yes;; v) VERBOSE=$(($VERBOSE + 1));; h) usage; exit 0;; *) usage; exit 1;; esac done
# Make sure we don't pause twice. [ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no
if [ "$(id -u)" -ne 0 ];then echo "SKIP: Need root privileges" exit $ksft_skip; fi
if [ ! -x "$(command -v ip)" ]; then echo "SKIP: Could not run test without ip tool" exit $ksft_skip fi
if [ ! -x "$(command -v bridge)" ]; then echo "SKIP: Could not run test without bridge tool" exit $ksft_skip fi
if [ ! -x "$(command -v mausezahn)" ]; then echo "SKIP: Could not run test without mausezahn tool" exit $ksft_skip fi
if [ ! -x "$(command -v jq)" ]; then echo "SKIP: Could not run test without jq tool" exit $ksft_skip fi
ip link help vxlan 2>&1 | grep -q "localbypass" if [ $? -ne 0 ]; then echo "SKIP: iproute2 ip too old, missing VXLAN nolocalbypass support" exit $ksft_skip fi
cleanup
for t in $TESTS do setup; $t; cleanup; done
if [ "$TESTS" != "none" ]; then printf "\nTests passed: %3d\n" ${nsuccess} printf "Tests failed: %3d\n" ${nfail} fi
exit $ret
[2] https://lore.kernel.org/netdev/ZFOthnnqvElorCM8@shredder/
Ido Schimmel idosch@idosch.org writes:
On Tue, May 02, 2023 at 12:25:30AM +0800, Vladimir Nikishkin wrote:
Add test to make sure that the localbypass option is on by default.
Add test to change vxlan localbypass to nolocalbypass and check that packets are delivered to userspace.
What do you think about this version [1]?
I think that your idea of making "nolocalbypass" applicable universally is more clear and more straightforward than doing an exception to a special case, as the original patch does. I had actually considered such a change myself, and only decided against it because I wanted a patch that changes the existing behaviour in a minimal way.
If you are happy with a more radical behaviour for "nolocalbypass", I am all for it.
I can even imagine a line in the documentation, something along the lines of "The use of the nolocalbypass flag makes debugging easier, because the packets are seen on widely available userspace network debugging tools, such as tcpdump. You might want to debug and tweak your system using this flag, and when you are convinced that the system is working as expected, turn it off for a performance gain."
I will re-submit this series of patches on after the 8th of May 2023.
On Fri, May 05, 2023 at 09:33:03AM +0800, Vladimir Nikishkin wrote:
Ido Schimmel idosch@idosch.org writes:
On Tue, May 02, 2023 at 12:25:30AM +0800, Vladimir Nikishkin wrote:
Add test to make sure that the localbypass option is on by default.
Add test to change vxlan localbypass to nolocalbypass and check that packets are delivered to userspace.
What do you think about this version [1]?
I think that your idea of making "nolocalbypass" applicable universally is more clear and more straightforward than doing an exception to a special case, as the original patch does. I had actually considered such a change myself, and only decided against it because I wanted a patch that changes the existing behaviour in a minimal way.
If you are happy with a more radical behaviour for "nolocalbypass", I am all for it.
I'm fine with it. We are not changing the default behavior, but instead gating the new functionality behind a new option whose name fits the implementation and also - I believe - users' expectations.
I can even imagine a line in the documentation, something along the lines of "The use of the nolocalbypass flag makes debugging easier, because the packets are seen on widely available userspace network debugging tools, such as tcpdump. You might want to debug and tweak your system using this flag, and when you are convinced that the system is working as expected, turn it off for a performance gain."
I will re-submit this series of patches on after the 8th of May 2023.
Cool. Keep in mind that net-next does not automatically open when rc1 is released. You can monitor the list for the "net-next is OPEN" mail or follow the suggestions mentioned here: https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html?highli...
On Tue, 2 May 2023 00:25:29 +0800 Vladimir Nikishkin vladimir@nikishkin.pw wrote:
If a packet needs to be encapsulated towards a local destination IP and a VXLAN device that matches the destination port and VNI exists, then the packet will be injected into the Rx path as if it was received by the target VXLAN device without undergoing encapsulation. If such a device does not exist, the packet will be dropped.
There are scenarios where we do not want to drop such packets and instead want to let them be encapsulated and locally received by a user space program that post-processes these VXLAN packets.
To that end, add a new VXLAN device attribute that controls whether such packets are dropped or not. When set ("localbypass") these packets are dropped and when unset ("nolocalbypass") the packets are encapsulated and locally delivered to the listening user space application. Default to "localbypass" to maintain existing behavior.
Signed-off-by: Vladimir Nikishkin vladimir@nikishkin.pw
Is there some way to use BPF for this. Rather than a special case for some userspace program?
Stephen Hemminger stephen@networkplumber.org writes:
On Tue, 2 May 2023 00:25:29 +0800 Vladimir Nikishkin vladimir@nikishkin.pw wrote:
If a packet needs to be encapsulated towards a local destination IP and a VXLAN device that matches the destination port and VNI exists, then the packet will be injected into the Rx path as if it was received by the target VXLAN device without undergoing encapsulation. If such a device does not exist, the packet will be dropped.
There are scenarios where we do not want to drop such packets and instead want to let them be encapsulated and locally received by a user space program that post-processes these VXLAN packets.
To that end, add a new VXLAN device attribute that controls whether such packets are dropped or not. When set ("localbypass") these packets are dropped and when unset ("nolocalbypass") the packets are encapsulated and locally delivered to the listening user space application. Default to "localbypass" to maintain existing behavior.
Signed-off-by: Vladimir Nikishkin vladimir@nikishkin.pw
Is there some way to use BPF for this. Rather than a special case for some userspace program?
Well, in the first patch this was not a special case, but rather change to the default behaviour. (Which, I guess has been a little too audacious.)
I am not sure about BPF, but the concrete use-case I have is solvable by dedicating a packet to a bogus IP, and doing an nftables double-NAT (source and destination) to 127.0.0.1, which is the way I am solving this problem now, and I suspect, what most sysadmins who need this feature would be doing this without this patch.
In fact, among all the people I have talked to about this issue (on #networking@libera.chat, and elsewhere), nobody considered dropping packets to be an intuitive thing. The "intuitive logic" here is the following:
1) I am sending packets to an ip and a port, 2) I have a process listening to packets on this IP and port, 3) Why on Earth are packets not arriving? 4) Even further, why does local behaviour differ from remote behaviour?
So the "special case" is already there by design. The new option is turning off the special case.
I am aware of the fact that heavy-duty network processing people have a different perspective on this issue, and that in high-load environments every tiny bit of performance is of crucial importance, hence "local bypass" is seen not as a dirty heuristic, but rather as an essential feature which vastly increases performance, but for "kitchen sink" sysadmins the current (not documented) behaviour is just baffling.
So I would argue that having an option that, even though it might not be the most frequently used one, is clearly documented as enabling the most straightforward behaviour, would be worth it.
And although having a userspace process listening to a vxlan "for processing" might not be the most frequently used thing (although I do need it), at least being able to see the packets being sent to local ports, with, say, tcpdump, in exactly the same way as the packets being sent to remote addresses, would help sysadmins debug their setups better even when only the most basic tools available.
I hope that this is convincing enough.
P.S. A apologise for not adding the vxlan: and testing/selftests/net: prefixes to the patches. I will add them to the next attempt, in addition to fixing the other issues that might be discovered.
On Tue, May 02, 2023 at 12:25:29AM +0800, Vladimir Nikishkin wrote:
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 561fe1b314f5..ede98b879257 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2355,11 +2355,13 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { struct vxlan_dev *dst_vxlan;
dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, daddr->sa.sa_family, dst_port, vxlan->cfg.flags); if (!dst_vxlan) {dst_release(dst);
if (!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))
return 0;
dst_release(dst);
Thinking about it again, now that we have a new flag to signal the desired behavior, why do we care if there is a local VXLAN device listening or not? If 'VXLAN_F_LOCALBYPASS' is not set we don't want to deliver the packet to a local VXLAN device even if one exists.
IOW, can't the diff simply be:
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 561fe1b314f5..1a1dfe6be92d 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2352,7 +2352,8 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, #endif /* Bypass encapsulation if the destination is local */ if (rt_flags & RTCF_LOCAL && - !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && + vxlan->cfg.flags & VXLAN_F_LOCALBYPASS) { struct vxlan_dev *dst_vxlan;
dst_release(dst);
?
dev->stats.tx_errors++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0);
@@ -2367,6 +2369,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, return -ENOENT; }
vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); return 1; }dst_release(dst);
@@ -2568,7 +2571,6 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (!info) { u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
Unnecessary change. Please remove. Probably a leftover from previous versions
err = encap_bypass_if_local(skb, dev, vxlan, dst, dst_port, ifindex, vni, ndst, rt6i_flags);
@@ -3172,6 +3174,7 @@ static void vxlan_raw_setup(struct net_device *dev) } static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
- [IFLA_VXLAN_UNSPEC] = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS }, [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) },
@@ -3202,6 +3205,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 },
- [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1),
};
The rest looks fine to me
linux-kselftest-mirror@lists.linaro.org