Lower the expected level of traffic in the pp_alloc_fail test
and calculate failure counter thresholds based on the traffic
rather than using a fixed constant.
We only have "QEMU HW" in NIPA right now, and the test (due to
debug dependencies) only works on debug kernels in the first place.
We need some place for it to pass otherwise it seems to be bit
rotting. So lower the traffic threshold so that it passes on QEMU
and with a debug kernel...
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
---
CC: shuah(a)kernel.org
CC: johndale(a)cisco.com
CC: linux-kselftest(a)vger.kernel.org
---
.../selftests/drivers/net/hw/pp_alloc_fail.py | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
index fc66b7a7b149..a4521a912d61 100755
--- a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
+++ b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
@@ -7,6 +7,7 @@ Test driver resilience vs page pool allocation failures.
import errno
import time
+import math
import os
from lib.py import ksft_run, ksft_exit, ksft_pr
from lib.py import KsftSkipEx, KsftFailEx
@@ -62,7 +63,7 @@ from lib.py import cmd, tool, GenerateTraffic
stat1 = get_stats()
time.sleep(1)
stat2 = get_stats()
- if stat2['rx-packets'] - stat1['rx-packets'] < 15000:
+ if stat2['rx-packets'] - stat1['rx-packets'] < 4000:
raise KsftFailEx("Traffic seems low:", stat2['rx-packets'] - stat1['rx-packets'])
@@ -91,9 +92,14 @@ from lib.py import cmd, tool, GenerateTraffic
if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 1:
raise KsftSkipEx("Allocation failures not increasing")
- if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 100:
- raise KsftSkipEx("Allocation increasing too slowly", s2['rx-alloc-fail'] - s1['rx-alloc-fail'],
- "packets:", s2['rx-packets'] - s1['rx-packets'])
+ pkts = s2['rx-packets'] - s1['rx-packets']
+ # Expecting one failure per 512 buffers, 3.1x safety margin
+ want_fails = math.floor(pkts / 512 / 3.1)
+ seen_fails = s2['rx-alloc-fail'] - s1['rx-alloc-fail']
+ if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < want_fails:
+ raise KsftSkipEx("Allocation increasing too slowly", seen_fails,
+ "packets:", pkts)
+ ksft_pr(f"Seen: pkts:{pkts} fails:{seen_fails} (pass thrs:{want_fails})")
# Basic failures are fine, try to wobble some settings to catch extra failures
check_traffic_flowing()
--
2.51.0
From: Rong Tao <rongtao(a)cestc.cn>
Since commit 1b8abbb12128 ("bpf...d_path(): constify path argument"),
the first parameter of the bpf_d_path() has been changed to a const
constant. We need to modify the header file and bpf_doc.py.
The two error messages are as follows:
linux/tools/testing/selftests/bpf$ make
CLNG-BPF [test_progs] bpf_iter_task_vmas.bpf.o
progs/bpf_iter_task_vmas.c:52:14: error: passing 'const struct path *'
to parameter of type 'struct path *' discards qualifiers
[-Werror,-Wincompatible-pointer-types-discards-qualifiers]
52 | bpf_d_path(&file->f_path, d_path_buf, D_PATH_BUF_SIZE);
| ^~~~~~~~~~~~~
1 error generated.
....
progs/verifier_vfs_accept.c:80:7: error: assigning to 'struct path *'
from 'const struct path *' discards qualifiers
[-Werror,-Wincompatible-pointer-types-discards-qualifiers]
80 | path = &file->f_path;
| ^ ~~~~~~~~~~~~~
1 error generated.
Fixes: 1b8abbb12128 ("bpf...d_path(): constify path argument")
Signed-off-by: Rong Tao <rongtao(a)cestc.cn>
---
include/uapi/linux/bpf.h | 2 +-
scripts/bpf_doc.py | 1 +
tools/include/uapi/linux/bpf.h | 2 +-
tools/testing/selftests/bpf/progs/verifier_vfs_accept.c | 2 +-
4 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ae83d8649ef1..6829936d33f5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4891,7 +4891,7 @@ union bpf_attr {
*
* **-ENOENT** if the bpf_local_storage cannot be found.
*
- * long bpf_d_path(struct path *path, char *buf, u32 sz)
+ * long bpf_d_path(const struct path *path, char *buf, u32 sz)
* Description
* Return full path for given **struct path** object, which
* needs to be the kernel BTF *path* object. The path is
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index c77dc40f7689..15d113a1bc1d 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -788,6 +788,7 @@ class PrinterHelpersHeader(Printer):
'struct task_struct',
'struct cgroup',
'struct path',
+ 'const struct path',
'struct btf_ptr',
'struct inode',
'struct socket',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ae83d8649ef1..6829936d33f5 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4891,7 +4891,7 @@ union bpf_attr {
*
* **-ENOENT** if the bpf_local_storage cannot be found.
*
- * long bpf_d_path(struct path *path, char *buf, u32 sz)
+ * long bpf_d_path(const struct path *path, char *buf, u32 sz)
* Description
* Return full path for given **struct path** object, which
* needs to be the kernel BTF *path* object. The path is
diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c
index 3e2d76ee8050..55398c04290a 100644
--- a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c
+++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c
@@ -70,7 +70,7 @@ __success
int BPF_PROG(path_d_path_from_file_argument, struct file *file)
{
int ret;
- struct path *path;
+ const struct path *path;
/* The f_path member is a path which is embedded directly within a
* file. Therefore, a pointer to such embedded members are still
--
2.51.0
From: Rong Tao <rongtao(a)cestc.cn>
Add kfuncs bpf_strcasestr and bpf_strncasestr, which are extensions of
bpf_strstr and bpf_strnstr, suitable for more scenarios.
Rong Tao (2):
bpf: add bpf_strcasestr,bpf_strncasestr kfuncs
selftests/bpf: Test bpf_strcasestr,bpf_strncasestr kfuncs
kernel/bpf/helpers.c | 96 +++++++++++++++----
.../selftests/bpf/prog_tests/string_kfuncs.c | 2 +
.../bpf/progs/string_kfuncs_failure1.c | 12 +++
.../bpf/progs/string_kfuncs_failure2.c | 2 +
.../bpf/progs/string_kfuncs_success.c | 13 +++
5 files changed, 104 insertions(+), 21 deletions(-)
--
v2: remove extra __bpf_kfunc and fix comment of bpf_strncasestr().
v1: https://lore.kernel.org/all/tencent_8AF4D15B4475031E2185ACDE4B1495995707@qq…
--
2.51.0
From: Rong Tao <rongtao(a)cestc.cn>
bpf_strcasestr() and bpf_strncasestr() functions perform same like
bpf_strstr() and bpf_strnstr() except ignoring the case of the
characters.
Signed-off-by: Rong Tao <rongtao(a)cestc.cn>
---
kernel/bpf/helpers.c | 96 ++++++++++++++++++++++++++++++++++----------
1 file changed, 75 insertions(+), 21 deletions(-)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c9fab9a356df..c4a0070d1c71 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3675,34 +3675,21 @@ __bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign)
return -EFAULT;
}
-/**
- * bpf_strnstr - Find the first substring in a length-limited string
- * @s1__ign: The string to be searched
- * @s2__ign: The string to search for
- * @len: the maximum number of characters to search
- *
- * Return:
- * * >=0 - Index of the first character of the first occurrence of @s2__ign
- * within the first @len characters of @s1__ign
- * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
- * * %-EFAULT - Cannot read one of the strings
- * * %-E2BIG - One of the strings is too large
- * * %-ERANGE - One of the strings is outside of kernel address space
- */
-__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len)
+__bpf_kfunc int __bpf_strnstr(const char *s1, const char *s2, size_t len,
+ bool ignore_case)
{
char c1, c2;
int i, j;
- if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
- !copy_from_kernel_nofault_allowed(s2__ign, 1)) {
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
return -ERANGE;
}
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
- __get_kernel_nofault(&c2, s2__ign + j, char, err_out);
+ __get_kernel_nofault(&c2, s2 + j, char, err_out);
if (c2 == '\0')
return i;
/*
@@ -3712,7 +3699,13 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
*/
if (i + j == len)
break;
- __get_kernel_nofault(&c1, s1__ign + j, char, err_out);
+ __get_kernel_nofault(&c1, s1 + j, char, err_out);
+
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
+
if (c1 == '\0')
return -ENOENT;
if (c1 != c2)
@@ -3722,7 +3715,7 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
return -E2BIG;
if (i + j == len)
return -ENOENT;
- s1__ign++;
+ s1++;
}
return -E2BIG;
err_out:
@@ -3744,8 +3737,67 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
*/
__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
{
- return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false);
+}
+
+/**
+ * bpf_strcasestr - Find the first substring in a string, ignoring the case of
+ * the characters
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within @s1__ign
+ * * %-ENOENT - @s2__ign is not a substring of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true);
}
+
+/**
+ * bpf_strnstr - Find the first substring in a length-limited string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, false);
+}
+
+/**
+ * bpf_strnstr - Find the first substring in a length-limited string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
+ size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, true);
+}
+
#ifdef CONFIG_KEYS
/**
* bpf_lookup_user_key - lookup a key by its serial
@@ -4367,7 +4419,9 @@ BTF_ID_FLAGS(func, bpf_strnlen);
BTF_ID_FLAGS(func, bpf_strspn);
BTF_ID_FLAGS(func, bpf_strcspn);
BTF_ID_FLAGS(func, bpf_strstr);
+BTF_ID_FLAGS(func, bpf_strcasestr);
BTF_ID_FLAGS(func, bpf_strnstr);
+BTF_ID_FLAGS(func, bpf_strncasestr);
#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
--
2.51.0
From: Rong Tao <rongtao(a)cestc.cn>
Add kfuncs bpf_strcasestr and bpf_strncasestr, which are extensions of
bpf_strstr and bpf_strnstr, suitable for more scenarios.
Rong Tao (2):
bpf: add bpf_strcasestr,bpf_strncasestr kfuncs
selftests/bpf: Test bpf_strcasestr,bpf_strncasestr kfuncs
kernel/bpf/helpers.c | 96 +++++++++++++++----
.../selftests/bpf/prog_tests/string_kfuncs.c | 2 +
.../bpf/progs/string_kfuncs_failure1.c | 12 +++
.../bpf/progs/string_kfuncs_failure2.c | 2 +
.../bpf/progs/string_kfuncs_success.c | 13 +++
5 files changed, 104 insertions(+), 21 deletions(-)
--
2.51.0
Add a script to test various scenarios where a bridge is involved
in the fastpath. It runs tests in the forward path, and also in
a bridged path.
The setup is similar to a basic home router with multiple lan ports.
It uses 3 pairs of veth-devices. Each or all pairs can be
replaced by a pair of real interfaces, interconnected by wire.
This is necessary to test the behavior when dealing with
dsa ports, foreign (dsa) ports and switchdev userports that support
SWITCHDEV_OBJ_ID_PORT_VLAN.
See the head of the script for a detailed description.
Run without arguments to perform all tests on veth-devices.
Signed-off-by: Eric Woudstra <ericwouds(a)gmail.com>
---
This test script is written first for the proposed bridge-fastpath
patch-sets, but it's use is more general and can easily be expanded.
Changes in v3:
- Removed all warnings reported by shellcheck -x -e SC2317
- Improved del_pppoe(), check if interfaces are removed
- Added is_known_issue() to warn instead of error for known issues
- Link down and (hardware) interfaces to default netns at end of script
- Removed matching ip(v6) address
Changes in v2:
- Moved test-series to functions
- Moved code to set_pair_link() up/down
- Added conntrack zone to bridged traffic
- Test bridge chain prerouting in test without fastpath
and bridge chain forward in tests with fastpath
Some example outputs of this last version of patches from different
hardware, without and with patches:
ALL VETH:
=========
./bridge_fastpath.sh -t
Setup:
CLIENT 0
veth0cl
|
veth0rt
WAN
ROUTER
LAN1 LAN2
veth1rt veth2rt
| |
veth1cl veth2cl
CLIENT 1 CLIENT 2
Without patches:
PASS: unaware bridge, without encaps, without fastpath
PASS: unaware bridge, with single vlan encap, without fastpath
WARN: unaware bridge, with double q vlan encaps, without fastpath: ipv4/6: established bytes 0 < 4194304
WARN: unaware bridge, with 802.1ad vlan encaps, without fastpath: ipv4/6: established bytes 0 < 4194304
WARN: unaware bridge, with pppoe encap, without fastpath: ipv4/6: established bytes 0 < 4194304
WARN: unaware bridge, with pppoe-in-q encaps, without fastpath: ipv4/6: established bytes 0 < 4194304
PASS: aware bridge, without/without vlan encap, without fastpath
PASS: aware bridge, with/without vlan encap, without fastpath
PASS: aware bridge, with/with vlan encap, without fastpath
PASS: aware bridge, without/with vlan encap, without fastpath
PASS: forward, without vlan-device, without vlan encap, client1, without fastpath
PASS: forward, without vlan-device, without vlan encap, client1, with fastpath
PASS: forward, without vlan-device, with vlan encap, client1, without fastpath
WARN: forward, without vlan-device, with vlan encap, client1, with fastpath: ipv4/6: tcp broken
PASS: forward, with vlan-device, without vlan encap, client1, without fastpath
PASS: forward, with vlan-device, without vlan encap, client1, with fastpath
PASS: forward, with vlan-device, with vlan encap, client1, without fastpath
PASS: forward, with vlan-device, with vlan encap, client1, with fastpath
PASS: all tests passed
With patches:
PASS: unaware bridge, without encaps, without fastpath
PASS: unaware bridge, without encaps, with fastpath
PASS: unaware bridge, with single vlan encap, without fastpath
PASS: unaware bridge, with single vlan encap, with fastpath
PASS: unaware bridge, with double q vlan encaps, without fastpath
PASS: unaware bridge, with double q vlan encaps, with fastpath
PASS: unaware bridge, with 802.1ad vlan encaps, without fastpath
PASS: unaware bridge, with 802.1ad vlan encaps, with fastpath
PASS: unaware bridge, with pppoe encap, without fastpath
PASS: unaware bridge, with pppoe encap, with fastpath
PASS: unaware bridge, with pppoe-in-q encaps, without fastpath
PASS: unaware bridge, with pppoe-in-q encaps, with fastpath
PASS: aware bridge, without/without vlan encap, without fastpath
PASS: aware bridge, without/without vlan encap, with fastpath
PASS: aware bridge, with/without vlan encap, without fastpath
PASS: aware bridge, with/without vlan encap, with fastpath
PASS: aware bridge, with/with vlan encap, without fastpath
PASS: aware bridge, with/with vlan encap, with fastpath
PASS: aware bridge, without/with vlan encap, without fastpath
PASS: aware bridge, without/with vlan encap, with fastpath
PASS: forward, without vlan-device, without vlan encap, client1, without fastpath
PASS: forward, without vlan-device, without vlan encap, client1, with fastpath
PASS: forward, without vlan-device, with vlan encap, client1, without fastpath
PASS: forward, without vlan-device, with vlan encap, client1, with fastpath
PASS: forward, with vlan-device, without vlan encap, client1, without fastpath
PASS: forward, with vlan-device, without vlan encap, client1, with fastpath
PASS: forward, with vlan-device, with vlan encap, client1, without fastpath
PASS: forward, with vlan-device, with vlan encap, client1, with fastpath
PASS: all tests passed
BANANAPI-R3 (lan1 & lan2 are dsa):
============
Without patches:
./bridge_fastpath.sh -t -0 enu1u2,lan2 -1 enu1u1,lan1 -2 lan4,eth1
Setup:
CLIENT 0
enu1u2
|
lan2
WAN
ROUTER
LAN1 LAN2
lan1 eth1
| |
enu1u1 lan4
CLIENT 1 CLIENT 2
PASS: unaware bridge, without encaps, without fastpath
PASS: unaware bridge, with single vlan encap, without fastpath
WARN: unaware bridge, with pppoe encap, without fastpath: ipv4/6: established bytes 0 < 4194304
WARN: unaware bridge, with pppoe-in-q encaps, without fastpath: ipv4/6: established bytes 0 < 4194304
PASS: aware bridge, without/without vlan encap, without fastpath
PASS: aware bridge, with/without vlan encap, without fastpath
PASS: aware bridge, with/with vlan encap, without fastpath
PASS: aware bridge, without/with vlan encap, without fastpath
PASS: forward, without vlan-device, without vlan encap, client1, without fastpath
WARN: forward, without vlan-device, without vlan encap, client1, with fastpath: ipv4: counted bytes 2110480 > 2097152
WARN: forward, without vlan-device, without vlan encap, client1, with fastpath: ipv6: counted bytes 2116104 > 2097152
PASS: forward, without vlan-device, without vlan encap, client1, with hw_fastpath
PASS: forward, without vlan-device, without vlan encap, client2, without fastpath
PASS: forward, without vlan-device, without vlan encap, client2, with fastpath
PASS: forward, without vlan-device, without vlan encap, client2, with hw_fastpath
PASS: forward, without vlan-device, with vlan encap, client1, without fastpath
WARN: forward, without vlan-device, with vlan encap, client1, with fastpath: ipv4/6: tcp broken
WARN: forward, without vlan-device, with vlan encap, client1, with hw_fastpath: ipv4/6: tcp broken
PASS: forward, without vlan-device, with vlan encap, client2, without fastpath
WARN: forward, without vlan-device, with vlan encap, client2, with fastpath: ipv4/6: tcp broken
WARN: forward, without vlan-device, with vlan encap, client2, with hw_fastpath: ipv4/6: tcp broken
PASS: forward, with vlan-device, without vlan encap, client1, without fastpath
PASS: forward, with vlan-device, without vlan encap, client1, with fastpath
PASS: forward, with vlan-device, without vlan encap, client1, with hw_fastpath
PASS: forward, with vlan-device, without vlan encap, client2, without fastpath
WARN: forward, with vlan-device, without vlan encap, client2, with fastpath: ipv4: counted bytes 2122388 > 2097152
WARN: forward, with vlan-device, without vlan encap, client2, with fastpath: ipv6: counted bytes 2129280 > 2097152
WARN: forward, with vlan-device, without vlan encap, client2, with hw_fastpath: ipv4: counted bytes 2110428 > 2097152
WARN: forward, with vlan-device, without vlan encap, client2, with hw_fastpath: ipv6: counted bytes 2140144 > 2097152
PASS: forward, with vlan-device, with vlan encap, client1, without fastpath
PASS: forward, with vlan-device, with vlan encap, client1, with fastpath
PASS: forward, with vlan-device, with vlan encap, client1, with hw_fastpath
PASS: forward, with vlan-device, with vlan encap, client2, without fastpath
PASS: forward, with vlan-device, with vlan encap, client2, with fastpath
PASS: forward, with vlan-device, with vlan encap, client2, with hw_fastpath
PASS: all tests passed
With patches:
PASS: unaware bridge, without encaps, without fastpath
PASS: unaware bridge, without encaps, with fastpath
PASS: unaware bridge, without encaps, with hw_fastpath
PASS: unaware bridge, with single vlan encap, without fastpath
PASS: unaware bridge, with single vlan encap, with fastpath
PASS: unaware bridge, with single vlan encap, with hw_fastpath
PASS: unaware bridge, with pppoe encap, without fastpath
PASS: unaware bridge, with pppoe encap, with fastpath
PASS: unaware bridge, with pppoe encap, with hw_fastpath
PASS: unaware bridge, with pppoe-in-q encaps, without fastpath
PASS: unaware bridge, with pppoe-in-q encaps, with fastpath
PASS: unaware bridge, with pppoe-in-q encaps, with hw_fastpath
PASS: aware bridge, without/without vlan encap, without fastpath
PASS: aware bridge, without/without vlan encap, with fastpath
PASS: aware bridge, without/without vlan encap, with hw_fastpath
PASS: aware bridge, with/without vlan encap, without fastpath
PASS: aware bridge, with/without vlan encap, with fastpath
PASS: aware bridge, with/without vlan encap, with hw_fastpath
PASS: aware bridge, with/with vlan encap, without fastpath
PASS: aware bridge, with/with vlan encap, with fastpath
PASS: aware bridge, with/with vlan encap, with hw_fastpath
PASS: aware bridge, without/with vlan encap, without fastpath
PASS: aware bridge, without/with vlan encap, with fastpath
PASS: aware bridge, without/with vlan encap, with hw_fastpath
PASS: forward, without vlan-device, without vlan encap, client1, without fastpath
PASS: forward, without vlan-device, without vlan encap, client1, with fastpath
PASS: forward, without vlan-device, without vlan encap, client1, with hw_fastpath
PASS: forward, without vlan-device, without vlan encap, client2, without fastpath
PASS: forward, without vlan-device, without vlan encap, client2, with fastpath
PASS: forward, without vlan-device, without vlan encap, client2, with hw_fastpath
PASS: forward, without vlan-device, with vlan encap, client1, without fastpath
PASS: forward, without vlan-device, with vlan encap, client1, with fastpath
PASS: forward, without vlan-device, with vlan encap, client1, with hw_fastpath
PASS: forward, without vlan-device, with vlan encap, client2, without fastpath
PASS: forward, without vlan-device, with vlan encap, client2, with fastpath
PASS: forward, without vlan-device, with vlan encap, client2, with hw_fastpath
PASS: forward, with vlan-device, without vlan encap, client1, without fastpath
PASS: forward, with vlan-device, without vlan encap, client1, with fastpath
PASS: forward, with vlan-device, without vlan encap, client1, with hw_fastpath
PASS: forward, with vlan-device, without vlan encap, client2, without fastpath
PASS: forward, with vlan-device, without vlan encap, client2, with fastpath
PASS: forward, with vlan-device, without vlan encap, client2, with hw_fastpath
PASS: forward, with vlan-device, with vlan encap, client1, without fastpath
PASS: forward, with vlan-device, with vlan encap, client1, with fastpath
PASS: forward, with vlan-device, with vlan encap, client1, with hw_fastpath
PASS: forward, with vlan-device, with vlan encap, client2, without fastpath
PASS: forward, with vlan-device, with vlan encap, client2, with fastpath
PASS: forward, with vlan-device, with vlan encap, client2, with hw_fastpath
PASS: all tests passed
.../testing/selftests/net/netfilter/Makefile | 1 +
.../net/netfilter/bridge_fastpath.sh | 1055 +++++++++++++++++
2 files changed, 1056 insertions(+)
create mode 100755 tools/testing/selftests/net/netfilter/bridge_fastpath.sh
diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
index a98ed892f55f..e0de04333a3f 100644
--- a/tools/testing/selftests/net/netfilter/Makefile
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -8,6 +8,7 @@ MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl)
TEST_PROGS := br_netfilter.sh bridge_brouter.sh
TEST_PROGS += br_netfilter_queue.sh
+TEST_PROGS += bridge_fastpath.sh
TEST_PROGS += conntrack_dump_flush.sh
TEST_PROGS += conntrack_icmp_related.sh
TEST_PROGS += conntrack_ipip_mtu.sh
diff --git a/tools/testing/selftests/net/netfilter/bridge_fastpath.sh b/tools/testing/selftests/net/netfilter/bridge_fastpath.sh
new file mode 100755
index 000000000000..614497489edb
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/bridge_fastpath.sh
@@ -0,0 +1,1055 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check if conntrack, nft chain and fastpath is functional in setups
+# where a bridge is in the fastpath.
+#
+# Commandline options make it possible to use real ethernet pairs
+# instead of veth-device pairs. Any, or all, pairs can be tested using
+# real hardware pairs. This is can be useful to test dsa-ports,
+# switchdev (dsa) foreign ports and switchdev ports supporting
+# SWITCHDEV_OBJ_ID_PORT_VLAN.
+#
+# First tcp is tested. Conntrack and nft chain are tested using a counter.
+# When there is a fastpath possible between the interfaces then the
+# fastpath is also tested.
+# When there is a hardware offloaded fastpath possible between the
+# interfaces then the hardware offloaded path is also tested.
+#
+# Setup is as a typical router:
+#
+# nsclientwan
+# |
+# nsrt
+# | |
+# nsclient1 nsclient2
+#
+# Masquerading for ipv4 only.
+#
+# First check if a bridge table forward chain can be setup, skip
+# these tests if this is not possible.
+# Then check if a inet table forward chain can be setup, skip
+# these tests if this is not possible.
+#
+# Different setups of paths are tested that involve a bridge in the
+# fastpath. This can be in the forward-fastpath or in the bridge-fastpath.
+#
+# The first series, in the bridge-fastpath, using a vlan-unaware bridge.
+# Traffic with the following vlan-tags is checked:
+# a. without vlan
+# b. single vlan
+# c. double q vlan (only on veth-devices)
+# d. 802.1ad vlan (only on veth-devices)
+# e. pppoe (when available)
+# f. pppoe-in-q (when available)
+#
+# (for items c to f fastpath can only work when a conntrack zone is set)
+# (double tag testing results in broken tcp traffic on most hardware,
+# in this test setup, use '-a' argument to test it anyway)
+# (pppoe testing takes place if pppd and pppoe-server are installed)
+#
+# The second series, in the bridge-fastpath, using a vlan-aware bridge.
+# Here we test all combinations of ingress/egress with or without single
+# vlan encaps.
+#
+# The third series, in the forward-fastpath, using a vlan-aware bridge,
+# without a vlan-device linked to the master port. We test the same combinations
+# of ingress/egress with or without single vlan encaps.
+#
+# The fourth series, in the forward-fastpath, using a vlan-aware bridge,
+# with a vlan-device linked to the master port. We test the same combinations
+# of ingress/egress with or without single vlan encaps.
+#
+# Note 1: Using dsa userports on both sides of eth-pairs client1 or client2
+# gives erratic and unpredictable results. Use, for example, an usb-eth device
+# on the client side to test a dsa-userport.
+#
+# Note 2: Testing the hardware offloaded fastpath, it is not checked if the
+# packets do not follow the software fastpath instead. A universal way to
+# check this should be added at some point.
+#
+# Note 3: Some interfaces to test on the router side, are netns immutable.
+# Use the -d or --defaultnsrouter option so that the interfaces of the router
+# do not have to change netns. The router is build up in the default netns.
+#
+
+source lib.sh
+
+checktool "nft --version" "run test without nft"
+checktool "socat -h" "run test without socat"
+checktool "bridge -V" "run test without bridge"
+
+NR_OF_TESTS=4
+VID1=100
+VID2=101
+BRWAN=brwan
+BRLAN=brlan
+BRCL=brcl
+LINKUP_TIMEOUT=10
+PING_TIMEOUT=10
+SOCAT_TIMEOUT=10
+filesize=$((2 * 1024 * 1024))
+
+filein=$(mktemp)
+file1out=$(mktemp)
+file2out=$(mktemp)
+pppoeserveroptions=$(mktemp)
+pppoeserverpid=$(mktemp)
+
+setup_ns nsclientwan nsclientlan1 nsclientlan2
+
+ WAN=0 ; LAN1=1 ; LAN2=2 ; ADWAN=3 ; ADLAN=4
+nsa=( "$nsclientwan" "$nsclientlan1" "$nsclientlan2" ) # $nsrt $nsrt
+AD4=( '192.168.1.1' '192.168.2.101' '192.168.2.102' '192.168.1.2' '192.168.2.1' )
+AD6=( 'dead:1::1' 'dead:2::101' 'dead:2::102' 'dead:1::2' 'dead:2::1' )
+
+tests_string=$(seq 1 $NR_OF_TESTS)
+
+while [ "${1:-}" != '' ]; do
+ case "$1" in
+ '-0' | '--pairwan')
+ shift
+ vethcl[WAN]="${1%,*}"
+ vethrt[WAN]="${1#*,}"
+ ;;
+ '-1' | '--pairlan1')
+ shift
+ vethcl[LAN1]="${1%,*}"
+ vethrt[LAN1]="${1#*,}"
+ ;;
+ '-2' | '--pairlan2')
+ shift
+ vethcl[LAN2]="${1%,*}"
+ vethrt[LAN2]="${1#*,}"
+ ;;
+ '-s' | '--filesize')
+ shift
+ filesize=$1
+ ;;
+ '-p' | '--parts')
+ shift
+ tests_string=$1
+ ;;
+ '-4' | '--ipv4')
+ do_ipv4=1
+ ;;
+ '-6' | '--ipv6')
+ do_ipv6=1
+ ;;
+ '-n' | '--noskip')
+ noskip=1
+ ;;
+ '-d' | '--defaultnsrouter')
+ defaultnsrouter=1
+ ;;
+ '-f' | '--fixmac')
+ fixmac=1
+ ;;
+ '-t' | '--showtree')
+ showtree=1
+ ;;
+ *)
+ cat <<-EOF
+ Usage: $(basename "$0") [OPTION]...
+ -0 --pairwan eth0cl,eth0rt pair of real interfaces to use on wan side
+ -1 --pairlan1 eth1cl,eth1rt pair of real interfaces to use on lan1 side
+ -2 --pairlan2 eth2cl,eth2rt pair of real interfaces to use on lan2 side
+ -s --filesize filesize to use for testing in bytes
+ -p --parts partnumbers of tests to run, comma separated
+ -4|-6 --ipv4|--ipv6 test ipv4/6 only
+ -d --defaultnsrouter router in default network namespace, caution!
+ -f --fixmac change mac address when conflict found
+ -n --noskip also perform the normally skipped tests
+ -t --showtree show the tree of used interfaces
+ EOF
+ exit "$ksft_skip"
+ ;;
+ esac
+ shift
+done
+
+for i in ${tests_string//','/' '}; do
+ tests[i]="yes"
+done
+
+if [ -n "$defaultnsrouter" ]; then
+ nsrt="nsrt-$(mktemp -u XXXXXX)"
+ touch "/var/run/netns/$nsrt"
+ mount --bind /proc/1/ns/net "/var/run/netns/$nsrt"
+else
+ setup_ns nsrt
+fi
+nsa+=("$nsrt" "$nsrt")
+
+cleanup() {
+ if [ -n "$defaultnsrouter" ]; then
+ umount "/var/run/netns/$nsrt"
+ rm -f "/var/run/netns/$nsrt"
+ fi
+ cleanup_all_ns
+ rm -f "$filein" "$file1out" "$file2out" "$pppoeserveroptions" "$pppoeserverpid"
+}
+
+trap cleanup EXIT
+
+head -c "$filesize" < /dev/urandom > "$filein"
+
+check_mac()
+{
+ local ns=$1
+ local dev=$2
+ local othermacs=$3
+ local mac
+
+ mac=$(ip -net "$ns" -br link show dev "$dev" | \
+ grep -o -E '([[:xdigit:]]{1,2}:){5}[[:xdigit:]]{1,2}')
+
+ if [[ ! "$othermacs" =~ $mac ]]; then
+ echo "$mac"
+ return 0
+ fi
+ echo "WARN: Conflicting mac address $dev $mac" 1>&2
+
+ [ -z "$fixmac" ] && return 1
+
+ for (( j = 0 ; j < 10 ; j++ )); do
+ mac="${mac::6}$(printf %02x:%02x:%02x:%02x $((RANDOM%256)) \
+ $((RANDOM%256)) $((RANDOM%256)) $((RANDOM%256)))"
+ [[ "$othermacs" =~ $mac ]] && continue
+ echo "$mac"
+ ip -net "$ns" link set dev "$dev" address "$mac" 1>&2
+ return $?
+ done
+ return 1
+}
+
+is_link()
+{
+ local updown=$1
+ local ns=$2
+ local dev=$3
+
+ if ip -net "$ns" link show dev "$dev" "${updown,,}" 2>/dev/null | \
+ grep -q "state ${updown^^}"
+ then
+ return 0
+ fi
+ return 1
+}
+
+set_pair_link()
+{
+ local updown=$1
+ local all="${*:2}"
+ local lret=0
+ local i j
+
+ for i in $all; do
+ ns="${nsa[$i]}"
+ ip -net "$ns" link set "${vethcl[$i]}" "$updown"
+ lret=$((lret | $?))
+ ip -net "$nsrt" link set "${vethrt[$i]}" "$updown"
+ lret=$((lret | $?))
+ done
+ [ $lret -ne 0 ] && return 1
+
+ for j in $(seq 1 $((LINKUP_TIMEOUT * 5 ))); do
+ lret=0
+ for i in $all; do
+ ns="${nsa[$i]}"
+ is_link "$updown" "$ns" "${vethcl[$i]}"
+ lret=$((lret | $?))
+ is_link "$updown" "$nsrt" "${vethrt[$i]}"
+ lret=$((lret | $?))
+ done
+ [ $lret -eq 0 ] && break
+ sleep 0.2
+ done
+ return $lret
+}
+
+wait_ping()
+{
+ local i1=$1
+ local i2=$2
+ local ns1=${nsa[$i1]}
+ local j
+ local lret
+
+ for j in $(seq 1 $((PING_TIMEOUT * 5 ))); do
+ ip netns exec "$ns1" ping -c 1 -w $PING_TIMEOUT -i 0.2 \
+ -q "${AD4[$i2]}" >/dev/null 2>&1
+ lret=$?
+ [ $lret -le 1 ] && return $lret
+ sleep 0.2
+ done
+ return 1
+}
+
+add_addr()
+{
+ local i=$1
+ local dev=$2
+ local ns=${nsa[$i]}
+ local ad4=${AD4[$i]}
+ local ad6=${AD6[$i]}
+
+ ip -net "$ns" addr add "${ad4}/24" dev "$dev"
+ ip -net "$ns" addr add "${ad6}/64" dev "$dev" nodad
+ if [[ "$ns" == "nsclientlan"* ]]; then
+ ip -net "$ns" route add default via "${AD4[$ADLAN]}"
+ ip -net "$ns" route add default via "${AD6[$ADLAN]}"
+ elif [[ "$ns" == "nsclientwan"* ]]; then
+ ip -net "$ns" route add default via "${AD6[$ADWAN]}"
+ fi
+
+}
+
+del_addr()
+{
+ local i=$1
+ local dev=$2
+ local ns=${nsa[$i]}
+ local ad4=${AD4[$i]}
+ local ad6=${AD6[$i]}
+
+ if [[ "$ns" == "nsclientlan"* ]]; then
+ ip -net "$ns" route del default via "${AD6[$ADLAN]}"
+ ip -net "$ns" route del default via "${AD4[$ADLAN]}"
+ elif [[ "$ns" == "nsclientwan"* ]]; then
+ ip -net "$ns" route del default via "${AD6[$ADWAN]}"
+ fi
+ ip -net "$ns" addr del "${ad6}/64" dev "$dev" nodad
+ ip -net "$ns" addr del "${ad4}/24" dev "$dev"
+}
+
+set_client()
+{
+ local i=$1
+ local vlan=$2
+ local arg=$3
+ local ns=${nsa[$i]}
+ local vdev="${vethcl[$i]}"
+ local brdev="$BRCL"
+ local proto=""
+ local pvidslave=""
+
+ unset_client "$i"
+
+ if [[ "$vlan" == "qq" ]]; then
+ ip -net "$ns" link add link "$vdev" name "$vdev.$VID1" type vlan id $VID1
+ ip -net "$ns" link add link "$vdev.$VID1" name "$vdev.$VID1.$VID2" \
+ type vlan id $VID2
+ ip -net "$ns" link set "$vdev.$VID1" up
+ ip -net "$ns" link set "$vdev.$VID1.$VID2" up
+ add_addr "$i" "$vdev.$VID1.$VID2"
+ return
+ fi
+
+ [[ "$vlan" == "none" ]] && pvidslave="pvid untagged"
+ [[ "$vlan" == "ad" ]] && proto="vlan_protocol 802.1ad"
+
+ # shellcheck disable=SC2086
+ ip -net "$ns" link add "$brdev" type bridge vlan_filtering 1 vlan_default_pvid 0 $proto
+ ip -net "$ns" link set "$vdev" master "$brdev"
+ ip -net "$ns" link set "$brdev" up
+
+ # shellcheck disable=SC2086
+ bridge -net "$ns" vlan add dev "$vdev" vid $VID1 $pvidslave
+ bridge -net "$ns" vlan add dev "$brdev" vid $VID1 pvid untagged self
+
+ if [[ "$vlan" == "ad" ]]; then
+ ip -net "$ns" link add link "$brdev" name "$brdev.$VID2" type vlan id $VID2
+ brdev="$brdev.$VID2"
+ ip -net "$ns" link set "$brdev" up
+ fi
+
+ if [[ "$arg" != "noaddress" ]]; then
+ add_addr "$i" "$brdev"
+ fi
+}
+
+unset_client()
+{
+ local i=$1
+ local ns=${nsa[$i]}
+ local vdev="${vethcl[$i]}"
+ local brdev="$BRCL"
+
+ ip -net "$ns" link del "$brdev" type bridge 2>/dev/null
+ ip -net "$ns" link del "$vdev.$VID1" 2>/dev/null
+}
+
+add_pppoe()
+{
+ local i1=$1
+ local i2=$2
+ local dev1=$3
+ local dev2=$4
+ local desc=$5
+ local ns1=${nsa[$i1]}
+ local ns2=${nsa[$i2]}
+
+ ppp1=0
+ while [ -n "$(ip -net "$ns1" link show ppp$ppp1 2>/dev/null)" ]
+ do ((ppp1++)); done
+ echo "noauth defaultroute noipdefault unit $ppp1" >"$pppoeserveroptions"
+ ppp1="ppp$ppp1"
+
+ if ! ip netns exec "$ns1" pppoe-server -k -L "${AD4[$i1]}" -R "${AD4[$i2]}" \
+ -I "$dev1" -X "$pppoeserverpid" -O "$pppoeserveroptions" >/dev/null; then
+ echo "ERROR: $desc: failed to setup pppoe server" 1>&2
+ return 1
+ fi
+
+ if ! ip netns exec "$ns2" pppd plugin pppoe.so nic-"$dev2" persist holdoff 0 noauth \
+ defaultroute noipdefault noaccomp nodeflate noproxyarp nopcomp \
+ novj novjccomp linkname "selftest-$$" >/dev/null; then
+ echo "ERROR: $desc: failed to setup pppoe client" 1>&2
+ return 1
+ fi
+
+ if ! wait_ping "$i1" "$i2"; then
+ echo "ERROR: $desc: failed to setup functional pppoe connection" 1>&2
+ return 1
+ fi
+
+ ppp2=$(tail -n 1 < "/run/pppd/ppp-selftest-$$.pid")
+
+ ip -net "$ns1" addr add "${AD6[$i1]}/64" dev "$ppp1" nodad
+ ip -net "$ns2" addr add "${AD6[$i2]}/64" dev "$ppp2" nodad
+
+ return 0
+}
+
+del_pppoe()
+{
+ local i1=$1
+ local i2=$2
+ local dev1=$3
+ local dev2=$4
+ local ns1=${nsa[$i1]}
+ local ns2=${nsa[$i2]}
+ local i serverpid clientpid
+
+ serverpid="$(head -n 1 < "$pppoeserverpid")"
+ clientpid="$(head -n 1 < "/run/pppd/ppp-selftest-$$.pid")"
+
+ [[ -n "$ppp1" ]] && ip -net "$ns1" addr del "${AD6[$i1]}/64" dev "$ppp1"
+ [[ -n "$ppp2" ]] && ip -net "$ns2" addr del "${AD6[$i2]}/64" dev "$ppp2"
+
+ for i in $(seq 1 $((PING_TIMEOUT * 5 ))); do
+ if ip -net "$ns2" link show dev "$ppp2" 1>/dev/null 2>/dev/null; then
+ kill -9 "$clientpid" 2>/dev/null
+ elif ip -net "$ns1" link show dev "$ppp1" 1>/dev/null 2>/dev/null; then
+ kill -SIGTERM "$serverpid" 2>/dev/null
+ else return 0
+ fi
+ sleep 0.2
+ done
+ echo "ERROR: failed to remove pppoe connection" 1>&2
+ return 1
+}
+
+listener_ready()
+{
+ local ns=$1
+ local ipv=$2
+
+ ss -N "$ns" --ipv"$ipv" -lnt -o "sport = :8080" | grep -q 8080
+}
+
+test_tcp() {
+ local i1=$1
+ local i2=$2
+ local dofast=$3
+ local desc=$4
+ local ns1=${nsa[$i1]}
+ local ns2=${nsa[$i2]}
+ local i=-1
+ local lret=0
+ local ads=""
+ local ipv ad a lpid bytes limit error
+
+ if [ -n "$do_ipv4" ]; then ads="${AD4[$i2]}"
+ elif [ -n "$do_ipv6" ]; then ads="${AD6[$i2]}"
+ else ads="${AD4[$i2]} ${AD6[$i2]}"
+ fi
+ for ad in $ads; do
+ ((i++))
+ if [[ "$ad" =~ ":" ]]
+ then ipv="6"; a="[${ad}]"
+ else ipv="4"; a="${ad}"
+ fi
+
+ rm -f "$file1out" "$file2out"
+
+ # ip netns exec "$nsrt" nft reset counters >/dev/null
+ # But on some systems this results in 4GB values in packet and byte count, so:
+ (echo "flush ruleset"; ip netns exec "$nsrt" nft --stateless list ruleset) | \
+ ip netns exec "$nsrt" nft -f -
+
+ timeout "$SOCAT_TIMEOUT" ip netns exec "$ns2" socat TCP$ipv-LISTEN:8080,reuseaddr \
+ STDIO <"$filein" >"$file2out" 2>/dev/null &
+ lpid=$!
+ busywait 1000 listener_ready "$ns2" "$ipv"
+
+ timeout "$SOCAT_TIMEOUT" ip netns exec "$ns1" socat TCP$ipv:"$a":8080 \
+ STDIO <"$filein" >"$file1out" 2>/dev/null
+
+ if ! wait $lpid; then
+ error[i]="tcp broken"
+ continue
+ fi
+ if ! cmp "$filein" "$file1out" >/dev/null 2>&1; then
+ error[i]="file mismatch to ${ad}"
+ continue
+ fi
+ if ! cmp "$filein" "$file2out" >/dev/null 2>&1; then
+ error[i]="file mismatch from ${ad}"
+ continue
+ fi
+
+ limit=$((2 * filesize))
+ bytes=$(ip netns exec "$nsrt" nft list counter $family filter "check" | \
+ grep "packets" | cut -d' ' -f4)
+ if [ -z "$dofast" ] && [ "$bytes" -lt "$limit" ]; then
+
+ error[i]="established bytes $bytes < $limit"
+ continue
+ fi
+ if [ -n "$dofast" ] && [ "$bytes" -gt "$((limit/2))" ]; then
+ # Significant reduction of bytes expected
+ error[i]="counted bytes $bytes > $((limit/2))"
+ continue
+ fi
+
+ done
+
+ if [ -n "${error[0]}" ]; then
+ if [[ "${error[0]}" == "${error[1]}" ]]; then
+ error[0]="$desc: ipv4/6: ${error[0]}"
+ error[1]=""
+ else
+ error[0]="$desc: ipv4: ${error[0]}"
+ fi
+ fi
+ if [ -n "${error[1]}" ]; then
+ error[1]="$desc: ipv6: ${error[1]}"
+ fi
+
+ for i in 0 1; do
+ if [ -n "${error[i]}" ]; then
+ if is_known_issue "$desc: ${error[i]}"; then
+ echo "WARN: ${error[i]}" 1>&2
+ lret=$((lret | 1))
+ else
+ echo "ERROR: ${error[i]}" 1>&2
+ lret=$((lret | 2))
+ fi
+ fi
+ done
+ if [ $lret -eq 0 ]; then
+ echo "PASS: $desc"
+ fi
+ return $(( lret & 2 ))
+}
+
+known_issues=(
+'*unaware bridge,*with double q vlan encaps,*without fastpath*established*' # 1
+'*unaware bridge,*with 802.1ad vlan encaps,*without fastpath*established*' # 1
+'*unaware bridge,*with pppoe encap,*without fastpath*established*' # 1
+'*unaware bridge,*with pppoe-in-q encaps,*without fastpath*established*' # 1
+'*forward,*without vlan-device, without vlan encap,*with *fastpath:*counted*' # 2
+'*forward,*without vlan-device, with vlan encap,*with *fastpath:*tcp broken*' # 3
+'*forward,*with vlan-device, without vlan encap,*with *fastpath:*counted*' # 4
+)
+
+is_known_issue() {
+ local err=$1
+ for issue in "${known_issues[@]}"; do
+ # shellcheck disable=SC2053
+ [[ "$err" == $issue ]] && return 0
+ done
+ return 1
+}
+
+test_paths() {
+ local i1=$1
+ local i2=$2
+ local desc=$3
+ local ns1=${nsa[$i1]}
+ local ns2=${nsa[$i2]}
+
+
+ if ! setup_nftables "$i1" "$i2"; then
+ echo "ERROR: $desc: cannot setup nftables" 1>&2
+ return 1
+ fi
+ if ! test_tcp "$i1" "$i2" "" "$desc without fastpath"; then
+ return 1
+ fi
+
+ if ! setup_fastpath "$i1" "$i2" "" 2>/dev/null; then
+ return 0
+ fi
+ if ! test_tcp "$i1" "$i2" "fast" "$desc with fastpath"; then
+ return 1
+ fi
+
+ if ! setup_fastpath "$i1" "$i2" "hw" 2>/dev/null; then
+ return 0
+ fi
+ if ! test_tcp "$i1" "$i2" "fast" "$desc with hw_fastpath"; then
+ return 1
+ fi
+
+ return 0
+
+}
+
+add_masq()
+{
+ if [[ $family != "bridge" ]]; then
+ ip netns exec "$nsrt" nft -f - <<-EOF
+ table ip nat {
+ chain postrouting {
+ type nat hook postrouting priority 0;
+ oifname ${BRWAN} masquerade
+ }
+ }
+ EOF
+ else
+ return 0
+ fi
+}
+
+add_zone()
+{
+ local devs=$1
+
+ if [[ $family == "bridge" ]]; then
+ ip netns exec "$nsrt" nft -f - <<-EOF
+ table ${family} filter {
+ chain preroutingzones {
+ type filter hook prerouting priority -300;
+ iif ${devs} ct zone set 23
+ }
+ }
+ EOF
+ fi
+}
+
+setup_nftables()
+{
+ local devs="{ ${vethrt[$1]} , ${vethrt[$2]} }"
+ local i1=$1
+ local i2=$2
+
+ ip netns exec "$nsrt" nft flush ruleset
+
+ if ! add_masq; then
+ return 1
+ fi
+
+ add_zone "${devs}" 2>/dev/null
+
+ ip netns exec "$nsrt" nft -f - <<-EOF
+ table ${family} filter {
+ counter check { }
+ chain prerouting {
+ type filter hook prerouting priority 0; policy accept;
+ ct state established tcp dport 8080 counter name "check"
+ ct state established tcp sport 8080 counter name "check"
+ }
+ }
+ EOF
+}
+
+setup_fastpath()
+{
+ local devs="{ ${vethrt[$1]} , ${vethrt[$2]} }"
+ local arg=$3
+ local flags=""
+
+ [[ "$arg" == "hw" ]] && flags="flags offload"
+
+ ip netns exec "$nsrt" nft flush ruleset
+
+ if ! add_masq; then
+ return 1
+ fi
+
+ add_zone "${devs}" 2>/dev/null
+
+ ip netns exec "$nsrt" nft -f - <<-EOF
+ table ${family} filter {
+ counter check { }
+ flowtable f {
+ hook ingress priority filter
+ devices = ${devs}
+ ${flags}
+ }
+ chain forward {
+ type filter hook forward priority 0; policy accept;
+ counter name "check"
+ ct state established flow add @f
+ }
+ }
+ EOF
+}
+
+test_unaware_bridge()
+{
+ local lret=0
+ local i
+
+ for i in $LAN1 $LAN2; do
+ set_client "$i" none
+ done
+
+ test_paths $LAN1 $LAN2 "unaware bridge, without encaps, "
+ lret=$((lret | $?))
+
+ for i in $LAN1 $LAN2; do
+ set_client "$i" q
+ done
+
+ test_paths $LAN1 $LAN2 "unaware bridge, with single vlan encap, "
+ lret=$((lret | $?))
+
+ for i in $LAN1 $LAN2; do
+ set_client "$i" qq
+ done
+
+ # Skip testing double tagged packets on real hardware
+ if [ -n "$lan_all_veth" ] || [ -n "$noskip" ]; then
+
+ test_paths $LAN1 $LAN2 "unaware bridge, with double q vlan encaps, "
+ lret=$((lret | $?))
+
+ for i in $LAN1 $LAN2; do
+ set_client "$i" ad
+ done
+
+ test_paths $LAN1 $LAN2 "unaware bridge, with 802.1ad vlan encaps, "
+ lret=$((lret | $?))
+
+ fi
+ # End Skip testing double tagged packets
+
+ if [ -n "$(command -v pppd 2>/dev/null)" ] &&
+ [ -n "$(command -v pppoe-server 2>/dev/null)" ]; then
+ # Start pppoe
+
+ for i in $LAN1 $LAN2; do
+ set_client "$i" none noaddress
+ done
+
+ if add_pppoe $LAN1 $LAN2 "$BRCL" "$BRCL" "unaware bridge, with pppoe encap"; then
+ test_paths $LAN1 $LAN2 "unaware bridge, with pppoe encap, "
+ lret=$((lret | $?))
+ fi
+
+ del_pppoe $LAN1 $LAN2 "$BRCL" "$BRCL"
+ lret=$((lret | $?))
+
+ for i in $LAN1 $LAN2; do
+ set_client "$i" q noaddress
+ done
+
+ if add_pppoe $LAN1 $LAN2 "$BRCL" "$BRCL" "unaware bridge, with pppoe-in-q encaps"; then
+ test_paths $LAN1 $LAN2 "unaware bridge, with pppoe-in-q encaps, "
+ lret=$((lret | $?))
+ fi
+
+ del_pppoe $LAN1 $LAN2 "$BRCL" "$BRCL"
+ lret=$((lret | $?))
+
+ # End pppoe
+ fi
+
+ for i in $LAN1 $LAN2; do
+ unset_client "$i"
+ done
+ return $lret
+}
+
+test_aware_bridge()
+{
+ local lret=0
+ local i
+
+ for i in $LAN1 $LAN2; do
+ bridge -net "$nsrt" vlan add dev "${vethrt[$i]}" vid $VID1 pvid untagged
+ set_client "$i" none
+ done
+ test_paths $LAN1 $LAN2 "aware bridge, without/without vlan encap,"
+ lret=$((lret | $?))
+
+ i=$LAN1
+ bridge -net "$nsrt" vlan del dev "${vethrt[$i]}" vid $VID1 pvid untagged
+ bridge -net "$nsrt" vlan add dev "${vethrt[$i]}" vid $VID1
+ set_client $i q
+
+ test_paths $LAN1 $LAN2 "aware bridge, with/without vlan encap, "
+ lret=$((lret | $?))
+
+ i=$LAN2
+ bridge -net "$nsrt" vlan del dev "${vethrt[$i]}" vid $VID1 pvid untagged
+ bridge -net "$nsrt" vlan add dev "${vethrt[$i]}" vid $VID1
+ set_client $i q
+
+ test_paths $LAN1 $LAN2 "aware bridge, with/with vlan encap, "
+ lret=$((lret | $?))
+
+ i=$LAN1
+ bridge -net "$nsrt" vlan del dev "${vethrt[$i]}" vid $VID1
+ bridge -net "$nsrt" vlan add dev "${vethrt[$i]}" vid $VID1 pvid untagged
+ set_client $i none
+
+ test_paths $LAN1 $LAN2 "aware bridge, without/with vlan encap, "
+ lret=$((lret | $?))
+
+ i=$LAN1
+ bridge -net "$nsrt" vlan del dev "${vethrt[$i]}" vid $VID1 pvid untagged
+ unset_client $i
+ i=$LAN2
+ bridge -net "$nsrt" vlan del dev "${vethrt[$i]}" vid $VID1
+ unset_client $i
+
+ return $lret
+}
+
+test_forward_without_vlandev()
+{
+ local wo=$1
+ local lret=0
+ local i
+
+ [[ "$wo" == "" ]] && wo="without"
+
+ for i in $LAN1 $LAN2; do
+ bridge -net "$nsrt" vlan add dev "${vethrt[$i]}" vid $VID1 pvid untagged
+ set_client "$i" none
+ done
+
+ test_paths $LAN1 $WAN "forward, $wo vlan-device, without vlan encap, client1,"
+ lret=$((lret | $?))
+ if [ -z "$lan_all_veth" ] || [ -n "$noskip" ]; then
+ test_paths $LAN2 $WAN "forward, $wo vlan-device, without vlan encap, client2,"
+ lret=$((lret | $?))
+ fi
+
+ for i in $LAN1 $LAN2; do
+ bridge -net "$nsrt" vlan del dev "${vethrt[$i]}" vid $VID1 pvid untagged
+ bridge -net "$nsrt" vlan add dev "${vethrt[$i]}" vid $VID1
+ set_client "$i" q
+ done
+
+ test_paths $LAN1 $WAN "forward, $wo vlan-device, with vlan encap, client1,"
+ lret=$((lret | $?))
+ if [ -z "$lan_all_veth" ] || [ -n "$noskip" ]; then
+ test_paths $LAN2 $WAN "forward, $wo vlan-device, with vlan encap, client2,"
+ lret=$((lret | $?))
+ fi
+
+ for i in $LAN1 $LAN2; do
+ bridge -net "$nsrt" vlan del dev "${vethrt[$i]}" vid $VID1
+ unset_client "$i"
+ done
+ return $lret
+}
+
+test_forward_with_vlandev()
+{
+ test_forward_without_vlandev "with"
+ return $?
+}
+
+ret=0
+### Start Initial Setup ###
+
+for i in 4 6; do
+ ip netns exec "$nsrt" sysctl -q net.ipv$i.conf.all.forwarding=1
+done
+
+### Use brwan to make sure software fastpath is ###
+### direct xmit in other direction also ###
+
+ip -net "$nsrt" link add $BRWAN type bridge
+ret=$((ret | $?))
+ip -net "$nsrt" link set $BRWAN up
+ret=$((ret | $?))
+if [ $ret -ne 0 ]; then
+ echo "SKIP: Can't create bridge"
+ exit "$ksft_skip"
+fi
+
+# If both lan clients are veth-devices, only test 1 in the forward path
+if [ -z "${vethcl[$LAN1]}" ] && [ -z "${vethcl[$LAN2]}" ]; then
+ lan_all_veth=1
+fi
+
+for i in $WAN $LAN1 $LAN2; do
+ ns="${nsa[$i]}"
+ if [ -z "${vethcl[$i]}" ]; then
+ vethcl[i]="veth${i}cl"
+ vethrt[i]="veth${i}rt"
+ ip link add "${vethcl[$i]}" netns "$ns" type veth \
+ peer name "${vethrt[$i]}" netns "$nsrt"
+ ret=$((ret | $?))
+ else # Use pair of interconnected hardware interfaces
+ ip link set "${vethrt[$i]}" netns "$nsrt"
+ ret=$((ret | $?))
+ ip link set "${vethcl[$i]}" netns "$ns"
+ ret=$((ret | $?))
+ fi
+done
+if [ $ret -ne 0 ]; then
+ echo "SKIP: (v)eth pairs cannot be used"
+ exit "$ksft_skip"
+fi
+
+if [ -n "$showtree" ]; then
+ cat <<-EOF
+ Setup:
+ CLIENT 0
+ ${vethcl[$WAN]}
+ |
+ ${vethrt[$WAN]}
+ WAN
+ ROUTER
+ LAN1 LAN2
+ $(printf "%14.14s" "${vethrt[$LAN1]}") ${vethrt[$LAN2]}
+ | |
+ $(printf "%14.14s" "${vethcl[$LAN1]}") ${vethcl[$LAN2]}
+ CLIENT 1 CLIENT 2
+
+ EOF
+fi
+
+for n in nsclientwan nsclientlan; do
+ routerside=""; clientside=""
+ for i in $WAN $LAN1 $LAN2; do
+ ns="${nsa[$i]}"
+ [[ "$ns" != "$n"* ]] && continue
+ mac=$(check_mac "$ns" "${vethcl[$i]}" "$routerside $clientside")
+ ret=$((ret | $?))
+ clientside+=" $mac"
+ mac=$(check_mac "$nsrt" "${vethrt[$i]}" "$clientside")
+ ret=$((ret | $?))
+ routerside+=" $mac"
+ done
+done
+if [ $ret -ne 0 ]; then
+ echo "SKIP: conflicting mac address"
+ exit "$ksft_skip"
+fi
+
+set_pair_link up $WAN $LAN1 $LAN2
+ret=$((ret | $?))
+if [ $ret -ne 0 ]; then
+ echo "SKIP: setting (v)eth pairs link up failed"
+ exit "$ksft_skip"
+fi
+
+i=$WAN
+ip -net "$nsrt" link set "${vethrt[$i]}" master $BRWAN
+set_client $i none
+add_addr $ADWAN "$BRWAN"
+
+family="bridge"
+if ! setup_nftables $LAN1 $LAN2 2>/dev/null; then
+ echo "INFO: Cannot add nftables table $family"
+ tests[1]=""; tests[2]=""
+fi
+family="inet"
+if ! setup_nftables $WAN $LAN1 2>/dev/null; then
+ echo "INFO: Cannot add nftables table $family"
+ tests[3]=""; tests[4]=""
+fi
+
+### End Initial Setup ###
+
+if [ -n "${tests[1]}" ]; then
+ # Setup brlan as vlan unaware bridge
+ family="bridge"
+ ip -net "$nsrt" link add $BRLAN type bridge
+ ip -net "$nsrt" link set $BRLAN up
+ for i in $LAN1 $LAN2; do
+ ip -net "$nsrt" link set "${vethrt[$i]}" master $BRLAN
+ done
+ test_unaware_bridge
+ ret=$((ret | $?))
+ ip -net "$nsrt" link del $BRLAN type bridge
+fi
+
+if [ -n "${tests[2]}" ] || [ -n "${tests[3]}" ] || [ -n "${tests[4]}" ]; then
+ # Setup brlan as vlan aware bridge
+ family="bridge"
+
+ ip -net "$nsrt" link add $BRLAN type bridge vlan_filtering 1 vlan_default_pvid 0
+ ip -net "$nsrt" link set $BRLAN up
+ bridge -net "$nsrt" vlan add dev $BRLAN vid $VID1 pvid untagged self
+ add_addr $ADLAN "$BRLAN"
+ for i in $LAN1 $LAN2; do
+ ip -net "$nsrt" link set "${vethrt[$i]}" master $BRLAN
+ done
+
+ if [ -n "${tests[2]}" ]; then
+ test_aware_bridge
+ ret=$((ret | $?))
+ fi
+
+ family="inet"
+
+ if [ -n "${tests[3]}" ]; then
+ test_forward_without_vlandev
+ ret=$((ret | $?))
+ fi
+
+ if [ -n "${tests[4]}" ]; then
+ # Setup vlan-device linked to brlan master port
+ del_addr $ADLAN "$BRLAN"
+ ip -net "$nsrt" link set $BRLAN down
+ bridge -net "$nsrt" vlan del dev $BRLAN vid $VID1 pvid untagged self
+ bridge -net "$nsrt" vlan add dev $BRLAN vid $VID1 self
+ ip -net "$nsrt" link add link $BRLAN name $BRLAN.$VID1 type vlan id $VID1
+ ip -net "$nsrt" link set $BRLAN up
+ ip -net "$nsrt" link set "$BRLAN.$VID1" up
+ add_addr $ADLAN "$BRLAN.$VID1"
+ test_forward_with_vlandev
+ ret=$((ret | $?))
+ fi
+
+ ip -net "$nsrt" link del $BRLAN type bridge
+fi
+
+### Finish tests ###
+
+ip -net "$nsrt" link del $BRWAN type bridge
+
+for i in $WAN $LAN1 $LAN2; do
+ unset_client "$i"
+done
+
+set_pair_link down $WAN $LAN1 $LAN2
+
+for i in $WAN $LAN1 $LAN2; do
+ ns="${nsa[$i]}"
+ if [[ "${vethcl[$i]:0:4}" != "veth" ]]; then
+ ip netns exec "$ns" ip link set "${vethcl[$i]}" netns 1
+ fi
+ if [[ "${vethrt[$i]:0:4}" != "veth" ]]; then
+ ip netns exec "$nsrt" ip link set "${vethrt[$i]}" netns 1
+ fi
+done
+
+if [ $ret -eq 0 ]; then
+ echo "PASS: all tests passed"
+else
+ echo "ERROR: bridge fastpath test has failed"
+fi
+
+exit $ret
--
2.50.0
When trying to build the latest BPF selftests, with a debug kernel
config, Pahole 1.30 and CLang 20.1.8 (and GCC 15.2), I got these errors:
progs/dynptr_success.c:579:9: error: call to undeclared function 'bpf_dynptr_slice'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
579 | data = bpf_dynptr_slice(&ptr, 0, NULL, 1);
| ^
progs/dynptr_success.c:579:9: note: did you mean 'bpf_dynptr_size'?
.virtme/build-debug-btf//tools/include/vmlinux.h:120280:14: note: 'bpf_dynptr_size' declared here
120280 | extern __u32 bpf_dynptr_size(const struct bpf_dynptr *p) __weak __ksym;
| ^
progs/dynptr_success.c:579:7: error: incompatible integer to pointer conversion assigning to '__u64 *' (aka 'unsigned long long *') from 'int' [-Wint-conversion]
579 | data = bpf_dynptr_slice(&ptr, 0, NULL, 1);
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
progs/dynptr_success.c:596:9: error: call to undeclared function 'bpf_dynptr_slice'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
596 | data = bpf_dynptr_slice(&ptr, 0, NULL, 10);
| ^
progs/dynptr_success.c:596:7: error: incompatible integer to pointer conversion assigning to 'char *' from 'int' [-Wint-conversion]
596 | data = bpf_dynptr_slice(&ptr, 0, NULL, 10);
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
I don't have these errors without the debug kernel config from
kernel/configs/debug.config. With the debug kernel, bpf_dynptr_slice()
is not declared in vmlinux.h. It is declared there without debug.config.
The fix is similar to what is done in dynptr_fail.c which is also using
bpf_dynptr_slice(): bpf_kfuncs.h is now included.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Notes:
- This patch looks wrong, I guess bpf_dynptr_slice() should be in
vmlinux.h even with a "debug" kernel, but it is not:
$ grep -cw bpf_dynptr_slice .virtme/build-debug-btf/tools/include/vmlinux.h
0
$ grep -w bpf_dynptr_slice .virtme/build-btf/tools/include/vmlinux.h
extern void *bpf_dynptr_slice(...) __weak __ksym;
- This is on top of bpf/master: commit 63d2247e2e37, tag bpf-fixes.
- I only see this error when using kernel/configs/debug.config.
- Because this has not been spot by the BPF CI, I wonder if I'm
building the BPF selftests properly... Here is what I did:
$ virtme-configkernel --arch x86_64 --defconfig \
--custom tools/testing/selftests/net/mptcp/config \
--custom kernel/configs/debug.config \
--custom tools/testing/selftests/bpf/config \
O=${PWD}/.virtme/build-debug-btf
$ ./scripts/config --file ${PWD}/.virtme/build-debug-btf/.config \
-e NET_NS_REFCNT_TRACKER -d SLUB_DEBUG_ON \
-d DEBUG_KMEMLEAK_AUTO_SCAN -e PANIC_ON_OOPS \
-e SOFTLOCKUP_DETECTOR -e BOOTPARAM_SOFTLOCKUP_PANIC \
-e HARDLOCKUP_DETECTOR -e BOOTPARAM_HUNG_TASK_PANIC \
-e DETECT_HUNG_TASK -e BOOTPARAM_HUNG_TASK_PANIC -e DEBUG_INFO \
-e DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT -e GDB_SCRIPTS \
-e DEBUG_INFO_DWARF4 -e DEBUG_INFO_COMPRESSED \
-e DEBUG_INFO_COMPRESSED_ZLIB -e DEBUG_INFO_BTF_MODULES \
-e MODULE_ALLOW_BTF_MISMATCH -d IA32_EMULATION -e DYNAMIC_DEBUG \
--set-val CONSOLE_LOGLEVEL_DEFAULT 8 -e FTRACE -e FUNCTION_TRACER \
-e DYNAMIC_FTRACE -e FTRACE_SYSCALLS -e HIST_TRIGGERS -e DEBUG_NET \
-m KUNIT -e KUNIT_DEBUGFS -d KUNIT_ALL_TESTS -m MPTCP_KUNIT_TEST \
-e BPF_JIT -e BPF_SYSCALL -e TUN -e CRYPTO_USER_API_HASH \
-e CRYPTO_SHA1 -e NET_SCH_TBF -e BRIDGE -d RETPOLINE -d PCCARD \
-d MACINTOSH_DRIVERS -d SOUND -d USB_SUPPORT -d NEW_LEDS -d SCSI \
-d SURFACE_PLATFORMS -d DRM -d FB -d ATA -d MISC_FILESYSTEMS
# sorry, long list used by the MPTCP CI to accelerate builds, etc.
$ make O=${PWD}/.virtme/build-debug-btf olddefconfig
$ make O=${PWD}/.virtme/build-debug-btf -j$(nproc) -l$(nproc)
$ make O=${PWD}/.virtme/build-debug-btf headers_install \
INSTALL_HDR_PATH=${PWD}/.virtme/headers
$ make O=${PWD}/.virtme/build-debug-btf \
KHDR_INCLUDES=-I${PWD}/.virtme/headers/includes \
-C tools/testing/selftests/bpf
- The errors I got should be reproducible using:
$ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
-e INPUT_EXTRA_ENV=INPUT_RUN_TESTS_ONLY=bpftest_all \
--pull always mptcp/mptcp-upstream-virtme-docker:latest \
auto-btf-debug
- These issues were originally spot by our MPTCP CI:
https://github.com/multipath-tcp/mptcp_net-next/actions/runs/18222911614/jo…
- No errors without kernel/configs/debug.config on the CI and on my side
- This CI got different issues, and I had to declare more kfuncs there:
https://github.com/multipath-tcp/mptcp_net-next/commit/4435d4da9f4f
but this CI is currently on top of 'net', with Jiri's patches from
https://lore.kernel.org/20251001122223.170830-1-jolsa@kernel.org
- The builds have been done from a clean build directory each time.
- Do you think the issue is on my side? Dependences? How the selftests
are built? I didn't change the way the BPF selftests are built for a
while. I had other issues with pahole 1.29, but fixed with 1.30.
- Feel free to discard this patch for a better solution (if any).
- I don't know which Fixes tag adding, but I doubt this patch is valid.
---
tools/testing/selftests/bpf/progs/dynptr_success.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index 127dea342e5a67dda33e0a39e84d135206d2f3f1..60daf5ce8eb283d8c8bf2d7853eda6313df4fa87 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -6,6 +6,7 @@
#include <stdbool.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
+#include "bpf_kfuncs.h"
#include "bpf_misc.h"
#include "errno.h"
---
base-commit: 63d2247e2e37d9c589a0a26aa4e684f736a45e29
change-id: 20251003-bpf-sft-fix-build-err-6-18-6a4c032f680a
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Problem
=======
When host APEI is unable to claim a synchronous external abort (SEA)
during guest abort, today KVM directly injects an asynchronous SError
into the VCPU then resumes it. The injected SError usually results in
unpleasant guest kernel panic.
One of the major situation of guest SEA is when VCPU consumes recoverable
uncorrected memory error (UER), which is not uncommon at all in modern
datacenter servers with large amounts of physical memory. Although SError
and guest panic is sufficient to stop the propagation of corrupted memory,
there is room to recover from an UER in a more graceful manner.
Proposed Solution
=================
The idea is, we can replay the SEA to the faulting VCPU. If the memory
error consumption or the fault that cause SEA is not from guest kernel,
the blast radius can be limited to the poison-consuming guest process,
while the VM can keep running.
In addition, instead of doing under the hood without involving userspace,
there are benefits to redirect the SEA to VMM:
- VM customers care about the disruptions caused by memory errors, and
VMM usually has the responsibility to start the process of notifying
the customers of memory error events in their VMs. For example some
cloud provider emits a critical log in their observability UI [1], and
provides a playbook for customers on how to mitigate disruptions to
their workloads.
- VMM can protect future memory error consumption by unmapping the poisoned
pages from stage-2 page table with KVM userfault [2], or by splitting the
memslot that contains the poisoned pages.
- VMM can keep track of SEA events in the VM. When VMM thinks the status
on the host or the VM is bad enough, e.g. number of distinct SEAs
exceeds a threshold, it can restart the VM on another healthy host.
- Behavior parity with x86 architecture. When machine check exception
(MCE) is caused by VCPU, kernel or KVM signals userspace SIGBUS to
let VMM either recover from the MCE, or terminate itself with VM.
The prior RFC proposes to implement SIGBUS on arm64 as well, but
Marc preferred KVM exit over signal [3]. However, implementation
aside, returning SEA to VMM is on par with returning MCE to VMM.
Once SEA is redirected to VMM, among other actions, VMM is encouraged
to inject external aborts into the faulting VCPU.
New UAPIs
=========
This patchset introduces following userspace-visible changes to empower
VMM to control what happens for SEA on guest memory:
- KVM_CAP_ARM_SEA_TO_USER. While taking SEA, if userspace has enabled
this new capability at VM creation, and the SEA is not owned by kernel
allocated memory, instead of injecting SError, return KVM_EXIT_ARM_SEA
to userspace.
- KVM_EXIT_ARM_SEA. This is the VM exit reason VMM gets. The details
about the SEA is provided in arm_sea as much as possible, including
sanitized ESR value at EL2, faulting guest virtual and physical
addresses if available.
* From v2 [4]:
- Rebased on "[PATCH] KVM: arm64: nv: Handle SEAs due to VNCR redirection" [5]
and kvmarm/next commit 7b8346bd9fce ("KVM: arm64: Don't attempt vLPI
mappings when vPE allocation is disabled")
- Took the host_owns_sea implementation from Oliver [6, 7].
- Excluded the guest SEA injection patches.
- Updated selftest.
* From v1 [8]:
- Rebased on commit 4d62121ce9b5 ("KVM: arm64: vgic-debug: Avoid
dereferencing NULL ITE pointer").
- Sanitize ESR_EL2 before reporting it to userspace.
- Do not do KVM_EXIT_ARM_SEA when SEA is caused by memory allocated to
stage-2 translation table.
[1] https://cloud.google.com/solutions/sap/docs/manage-host-errors
[2] https://lore.kernel.org/kvm/20250109204929.1106563-1-jthoughton@google.com
[3] https://lore.kernel.org/kvm/86pljbqqh0.wl-maz@kernel.org
[4] https://lore.kernel.org/kvm/20250604050902.3944054-1-jiaqiyan@google.com/
[5] https://lore.kernel.org/kvmarm/20250729182342.3281742-1-oliver.upton@linux.…
[6] https://lore.kernel.org/kvm/aHFohmTb9qR_JG1E@linux.dev/#t
[7] https://lore.kernel.org/kvm/aHK-DPufhLy5Dtuk@linux.dev/
[8] https://lore.kernel.org/kvm/20250505161412.1926643-1-jiaqiyan@google.com
Jiaqi Yan (3):
KVM: arm64: VM exit to userspace to handle SEA
KVM: selftests: Test for KVM_EXIT_ARM_SEA
Documentation: kvm: new UAPI for handling SEA
Documentation/virt/kvm/api.rst | 61 ++++
arch/arm64/include/asm/kvm_host.h | 2 +
arch/arm64/kvm/arm.c | 5 +
arch/arm64/kvm/mmu.c | 68 +++-
include/uapi/linux/kvm.h | 10 +
tools/arch/arm64/include/asm/esr.h | 2 +
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../testing/selftests/kvm/arm64/sea_to_user.c | 327 ++++++++++++++++++
tools/testing/selftests/kvm/lib/kvm_util.c | 1 +
9 files changed, 476 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/kvm/arm64/sea_to_user.c
--
2.50.1.565.gc32cd1483b-goog
Fix a memory leak in netpoll and introduce netconsole selftests that
expose the issue when running with kmemleak detection enabled.
This patchset includes a selftest for netpoll with multiple concurrent
users (netconsole + bonding), which simulates the scenario from test[1]
that originally demonstrated the issue allegedly fixed by commit
efa95b01da18 ("netpoll: fix use after free") - a commit that is now
being reverted.
Sending this to "net" branch because this is a fix, and the selftest
might help with the backports validation.
Link: https://lore.kernel.org/lkml/96b940137a50e5c387687bb4f57de8b0435a653f.14048… [1]
Signed-off-by: Breno Leitao <leitao(a)debian.org>
---
Changes in v6:
- Expand the tests even more and some small fixups
- Moved the test to bonding selftests
- Link to v5: https://lore.kernel.org/r/20250918-netconsole_torture-v5-0-77e25e0a4eb6@deb…
Changes in v5:
- Set CONFIG_BONDING=m in selftests/drivers/net/config.
- Link to v4: https://lore.kernel.org/r/20250917-netconsole_torture-v4-0-0a5b3b8f81ce@deb…
Changes in v4:
- Added an additional selftest to test multiple netpoll users in
parallel
- Link to v3: https://lore.kernel.org/r/20250905-netconsole_torture-v3-0-875c7febd316@deb…
Changes in v3:
- This patchset is a merge of the fix and the selftest together as
recommended by Jakub.
Changes in v2:
- Reuse the netconsole creation from lib_netcons.sh. Thus, refactoring
the create_dynamic_target() (Jakub)
- Move the "wait" to after all the messages has been sent.
- Link to v1: https://lore.kernel.org/r/20250902-netconsole_torture-v1-1-03c6066598e9@deb…
---
Breno Leitao (4):
net: netpoll: fix incorrect refcount handling causing incorrect cleanup
selftest: netcons: refactor target creation
selftest: netcons: create a torture test
selftest: netcons: add test for netconsole over bonded interfaces
net/core/netpoll.c | 7 +++++--
tools/testing/selftests/drivers/net/Makefile | 1 +
tools/testing/selftests/drivers/net/bonding/Makefile | 2 ++
tools/testing/selftests/drivers/net/bonding/config | 4 ++++
tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh | 221 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh | 189 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
tools/testing/selftests/drivers/net/netcons_torture.sh | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 531 insertions(+), 20 deletions(-)
---
base-commit: f1455695d2d99894b65db233877acac9a0e120b9
change-id: 20250902-netconsole_torture-8fc23f0aca99
Best regards,
--
Breno Leitao <leitao(a)debian.org>
This series backports 19 patches to update minmax.h in the 5.15.y branch,
aligning it with v6.17-rc7.
The ultimate goal is to synchronize all longterm branches so that they
include the full set of minmax.h changes (6.12.y and 6.6.y were already
backported by me and are now aligned, 6.1.y is in progress).
The key motivation is to bring in commit d03eba99f5bf ("minmax: allow
min()/max()/clamp() if the arguments have the same signedness"), which
is missing in kernel 5.10.y.
In mainline, this change enables min()/max()/clamp() to accept mixed
argument types, provided both have the same signedness. Without it,
backported patches that use these forms may trigger compiler warnings,
which escalate to build failures when -Werror is enabled.
Andy Shevchenko (1):
minmax: deduplicate __unconst_integer_typeof()
David Laight (8):
minmax: fix indentation of __cmp_once() and __clamp_once()
minmax.h: add whitespace around operators and after commas
minmax.h: update some comments
minmax.h: reduce the #define expansion of min(), max() and clamp()
minmax.h: use BUILD_BUG_ON_MSG() for the lo < hi test in clamp()
minmax.h: move all the clamp() definitions after the min/max() ones
minmax.h: simplify the variants of clamp()
minmax.h: remove some #defines that are only expanded once
Herve Codina (1):
minmax: Introduce {min,max}_array()
Linus Torvalds (8):
minmax: avoid overly complicated constant expressions in VM code
minmax: make generic MIN() and MAX() macros available everywhere
minmax: add a few more MIN_T/MAX_T users
minmax: simplify and clarify min_t()/max_t() implementation
minmax: simplify min()/max()/clamp() implementation
minmax: don't use max() in situations that want a C constant
expression
minmax: improve macro expansion and type checking
minmax: fix up min3() and max3() too
Matthew Wilcox (Oracle) (1):
minmax: add in_range() macro
arch/arm/mm/pageattr.c | 6 +-
arch/um/drivers/mconsole_user.c | 2 +
arch/x86/mm/pgtable.c | 2 +-
drivers/edac/sb_edac.c | 4 +-
drivers/edac/skx_common.h | 1 -
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +
.../drm/amd/display/modules/hdcp/hdcp_ddc.c | 2 +
.../drm/amd/pm/powerplay/hwmgr/ppevvmath.h | 14 +-
.../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 2 +
.../drm/arm/display/include/malidp_utils.h | 2 +-
.../display/komeda/komeda_pipeline_state.c | 24 +-
drivers/gpu/drm/drm_color_mgmt.c | 2 +-
drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 6 -
drivers/gpu/drm/radeon/evergreen_cs.c | 2 +
drivers/hwmon/adt7475.c | 24 +-
drivers/input/touchscreen/cyttsp4_core.c | 2 +-
drivers/irqchip/irq-sun6i-r.c | 2 +-
drivers/md/dm-integrity.c | 4 +-
drivers/media/dvb-frontends/stv0367_priv.h | 3 +
.../net/ethernet/chelsio/cxgb3/cxgb3_main.c | 18 +-
.../net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +-
drivers/net/fjes/fjes_main.c | 4 +-
drivers/nfc/pn544/i2c.c | 2 -
drivers/platform/x86/sony-laptop.c | 1 -
drivers/scsi/isci/init.c | 6 +-
.../pci/hive_isp_css_include/math_support.h | 5 -
drivers/virt/acrn/ioreq.c | 4 +-
fs/btrfs/misc.h | 2 -
fs/btrfs/tree-checker.c | 2 +-
fs/ext2/balloc.c | 2 -
fs/ext4/ext4.h | 2 -
fs/ufs/util.h | 6 -
include/linux/compiler.h | 9 +
include/linux/minmax.h | 264 +++++++++++++-----
kernel/trace/preemptirq_delay_test.c | 2 -
lib/btree.c | 1 -
lib/decompress_unlzma.c | 2 +
lib/logic_pio.c | 3 -
lib/vsprintf.c | 2 +-
lib/zstd/zstd_internal.h | 2 -
mm/zsmalloc.c | 1 -
net/ipv4/proc.c | 2 +-
net/ipv6/proc.c | 2 +-
net/netfilter/nf_nat_core.c | 6 +-
net/tipc/core.h | 2 +-
net/tipc/link.c | 10 +-
tools/testing/selftests/vm/mremap_test.c | 2 +
47 files changed, 289 insertions(+), 183 deletions(-)
--
2.47.3
Add the benchmark testcase "kprobe-multi-all", which will hook all the
kernel functions during the testing.
This series is separated out from [1].
Changes since V1:
* introduce trace_blacklist instead of copy-pasting strcmp in the 2nd
patch
* use fprintf() instead of printf() in 3rd patch
Link: https://lore.kernel.org/bpf/20250817024607.296117-1-dongml2@chinatelecom.cn/ [1]
Menglong Dong (3):
selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c
selftests/bpf: skip recursive functions for kprobe_multi
selftests/bpf: add benchmark testing for kprobe-multi-all
tools/testing/selftests/bpf/bench.c | 4 +
.../selftests/bpf/benchs/bench_trigger.c | 53 ++++
.../selftests/bpf/benchs/run_bench_trigger.sh | 4 +-
.../bpf/prog_tests/kprobe_multi_test.c | 220 +---------------
.../selftests/bpf/progs/trigger_bench.c | 12 +
tools/testing/selftests/bpf/trace_helpers.c | 234 ++++++++++++++++++
tools/testing/selftests/bpf/trace_helpers.h | 3 +
7 files changed, 311 insertions(+), 219 deletions(-)
--
2.51.0
Regarding my last email, which you haven't replied to. I need your answer today, either for your personal or for your company/business. Let me know if you're interested
Hi Linus,
Please pull this kselftest next update for Linux 6.18-rc1.
- Fixes watchdog test to exit when device doesn't support keep alive
- Fix missing header build complaints during out of tree build
- A few minor fixes to git ignore
- MAINTAINERS file change to update dma_map_benchmark
diff is attached.
thanks,
-- Shuah
----------------------------------------------------------------
The following changes since commit f83ec76bf285bea5727f478a68b894f5543ca76e:
Linux 6.17-rc6 (2025-09-14 14:21:14 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest tags/linux_kselftest-next-6.18-rc1
for you to fetch changes up to 19692013415486febf71343f5cc539a343a2994b:
MAINTAINERS: add myself and Barry to dma_map_benchmark maintainers (2025-09-28 14:41:34 -0600)
----------------------------------------------------------------
linux_kselftest-next-6.18-rc1
- Fixes watchdog test to exit when device doesn't support keep alive
- Fix missing header build complaints during out of tree build
- A few minor fixes to git ignore
- MAINTAINERS file change to update dma_map_benchmark
----------------------------------------------------------------
Akhilesh Patil (1):
selftests: watchdog: skip ping loop if WDIOF_KEEPALIVEPING not supported
Dylan Yudaken (1):
selftests/kexec: Ignore selftest binary
Qinxin Xia (1):
MAINTAINERS: add myself and Barry to dma_map_benchmark maintainers
Thomas Weißschuh (1):
selftests: always install UAPI headers to the correct directory
Yi Lai (1):
selftests/kselftest_harness: Add harness-selftest.expected to TEST_FILES
MAINTAINERS | 5 +++--
tools/testing/selftests/kexec/.gitignore | 2 ++
tools/testing/selftests/kselftest_harness/Makefile | 1 +
tools/testing/selftests/lib.mk | 5 ++++-
tools/testing/selftests/watchdog/watchdog-test.c | 6 ++++++
5 files changed, 16 insertions(+), 3 deletions(-)
create mode 100644 tools/testing/selftests/kexec/.gitignore
----------------------------------------------------------------
Hi Linus,
Please pull the following kunit next update for Linux 6.18-rc1.
A seven patch series adds a new parameterized test features
KUnit parameterized tests currently support two primary methods for
getting parameters:
1. Defining custom logic within a generate_params() function.
2. Using the KUNIT_ARRAY_PARAM() and KUNIT_ARRAY_PARAM_DESC()
macros with a pre-defined static array and passing
the created *_gen_params() to KUNIT_CASE_PARAM().
These methods present limitations when dealing with dynamically
generated parameter arrays, or in scenarios where populating parameters
sequentially via generate_params() is inefficient or overly complex.
These limitations are fixed with a parameterized test method.
- Fixes issues in kunit build artifacts cleanup,
- Fixes parsing skipped test problem in kselftest framework,
- Enables PCI on UML without triggering WARN()
- a few other fixes and adds support for new configs such as MIPS
diff is attached.
thanks,
-- Shuah
----------------------------------------------------------------
The following changes since commit 8f5ae30d69d7543eee0d70083daf4de8fe15d585:
Linux 6.17-rc1 (2025-08-10 19:41:16 +0300)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest tags/linux_kselftest-kunit-6.18-rc1
for you to fetch changes up to 285cae57a51664cc94e85de0ff994f9965b3aca8:
kunit: Extend kconfig help text for KUNIT_UML_PCI (2025-09-16 08:27:09 -0600)
----------------------------------------------------------------
linux_kselftest-kunit-6.18-rc1
- A seven patch series adds a new parameterized test features
KUnit parameterized tests currently support two primary methods for
getting parameters:
1. Defining custom logic within a generate_params() function.
2. Using the KUNIT_ARRAY_PARAM() and KUNIT_ARRAY_PARAM_DESC()
macros with a pre-defined static array and passing
the created *_gen_params() to KUNIT_CASE_PARAM().
These methods present limitations when dealing with dynamically
generated parameter arrays, or in scenarios where populating parameters
sequentially via generate_params() is inefficient or overly complex.
These limitations are fixed with a parameterized test method.
- Fixes issues in kunit build artifacts cleanup,
- Fixes parsing skipped test problem in kselftest framework,
- Enables PCI on UML without triggering WARN()
- a few other fixes and adds support for new configs such as MIPS
----------------------------------------------------------------
David Gow (1):
kunit: tool: Accept --raw_output=full as an alias of 'all'
Kaibo Ma (1):
rust: kunit: allow `cfg` on `test`s
Marie Zhussupova (7):
kunit: Add parent kunit for parameterized test context
kunit: Introduce param_init/exit for parameterized test context management
kunit: Pass parameterized test context to generate_params()
kunit: Enable direct registration of parameter arrays to a KUnit test
kunit: Add example parameterized test with shared resource management using the Resource API
kunit: Add example parameterized test with direct dynamic parameter array setup
Documentation: kunit: Document new parameterized test features
Thomas Weißschuh (5):
kunit: Always descend into kunit directory during build
kunit: tool: Parse skipped tests from kselftest.h
kunit: Enable PCI on UML without triggering WARN()
kunit: qemu_configs: Add MIPS configurations
kunit: Extend kconfig help text for KUNIT_UML_PCI
Documentation/dev-tools/kunit/usage.rst | 342 ++++++++++++++++++++-
drivers/gpu/drm/xe/tests/xe_pci.c | 14 +-
drivers/gpu/drm/xe/tests/xe_pci_test.h | 9 +-
include/kunit/test.h | 95 +++++-
kernel/kcsan/kcsan_test.c | 2 +-
lib/Makefile | 4 -
lib/kunit/Kconfig | 11 +
lib/kunit/Makefile | 2 +-
lib/kunit/kunit-example-test.c | 217 +++++++++++++
lib/kunit/test.c | 94 +++++-
rust/kernel/kunit.rs | 11 +
rust/macros/kunit.rs | 48 ++-
tools/testing/kunit/configs/arch_uml.config | 5 +-
tools/testing/kunit/kunit.py | 4 +-
tools/testing/kunit/kunit_parser.py | 8 +-
tools/testing/kunit/qemu_configs/mips.py | 18 ++
tools/testing/kunit/qemu_configs/mips64.py | 19 ++
tools/testing/kunit/qemu_configs/mips64el.py | 19 ++
tools/testing/kunit/qemu_configs/mipsel.py | 18 ++
.../test_data/test_is_test_passed-kselftest.log | 3 +-
20 files changed, 880 insertions(+), 63 deletions(-)
create mode 100644 tools/testing/kunit/qemu_configs/mips.py
create mode 100644 tools/testing/kunit/qemu_configs/mips64.py
create mode 100644 tools/testing/kunit/qemu_configs/mips64el.py
create mode 100644 tools/testing/kunit/qemu_configs/mipsel.py
----------------------------------------------------------------
The active-backup bonding mode supports XFRM ESP offload. However, when
a bond is added using command like `ip link add bond0 type bond mode 1
miimon 100`, the `ethtool -k` command shows that the XFRM ESP offload is
disabled. This occurs because, in bond_newlink(), we change bond link
first and register bond device later. So the XFRM feature update in
bond_option_mode_set() is not called as the bond device is not yet
registered, leading to the offload feature not being set successfully.
To resolve this issue, we can modify the code order in bond_newlink() to
ensure that the bond device is registered first before changing the bond
link parameters. This change will allow the XFRM ESP offload feature to be
correctly enabled.
Fixes: 007ab5345545 ("bonding: fix feature flag setting at init time")
Signed-off-by: Hangbin Liu <liuhangbin(a)gmail.com>
---
v3: rebase to latest net, no code update
v2: rebase to latest net, no code update
---
drivers/net/bonding/bond_main.c | 2 +-
drivers/net/bonding/bond_netlink.c | 16 +++++++++-------
include/net/bonding.h | 1 +
3 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 57be04f6cb11..f4f0feddd9fa 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4411,7 +4411,7 @@ void bond_work_init_all(struct bonding *bond)
INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
}
-static void bond_work_cancel_all(struct bonding *bond)
+void bond_work_cancel_all(struct bonding *bond)
{
cancel_delayed_work_sync(&bond->mii_work);
cancel_delayed_work_sync(&bond->arp_work);
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 57fff2421f1b..7a9d73ec8e91 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -579,20 +579,22 @@ static int bond_newlink(struct net_device *bond_dev,
struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct bonding *bond = netdev_priv(bond_dev);
struct nlattr **data = params->data;
struct nlattr **tb = params->tb;
int err;
- err = bond_changelink(bond_dev, tb, data, extack);
- if (err < 0)
+ err = register_netdevice(bond_dev);
+ if (err)
return err;
- err = register_netdevice(bond_dev);
- if (!err) {
- struct bonding *bond = netdev_priv(bond_dev);
+ netif_carrier_off(bond_dev);
+ bond_work_init_all(bond);
- netif_carrier_off(bond_dev);
- bond_work_init_all(bond);
+ err = bond_changelink(bond_dev, tb, data, extack);
+ if (err) {
+ bond_work_cancel_all(bond);
+ unregister_netdevice(bond_dev);
}
return err;
diff --git a/include/net/bonding.h b/include/net/bonding.h
index e06f0d63b2c1..bd56ad976cfb 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -711,6 +711,7 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave);
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay);
void bond_work_init_all(struct bonding *bond);
+void bond_work_cancel_all(struct bonding *bond);
#ifdef CONFIG_PROC_FS
void bond_create_proc_entry(struct bonding *bond);
--
2.50.1
The core scheduling is for smt enabled cpus. It is not returns
failure and gives plenty of error messages and not clearly points
to the smt issue if the smt is disabled. It just mention
"not a core sched system" and many other messages. For example:
Not a core sched system
tid=210574, / tgid=210574 / pgid=210574: ffffffffffffffff
Not a core sched system
tid=210575, / tgid=210575 / pgid=210574: ffffffffffffffff
Not a core sched system
tid=210577, / tgid=210575 / pgid=210574: ffffffffffffffff
(similar things many other times)
In this patch, the test will first read /sys/devices/system/cpu/smt/active,
if the file cannot be opened or its value is 0, the test is skipped with
an explanatory message. This helps developers understand why it is skipped
and avoids unnecessary attention when running the full selftest suite.
Signed-off-by: Yifei Liu <yifei.l.liu(a)oracle.com>
---
tools/testing/selftests/sched/cs_prctl_test.c | 23 ++++++++++++++++++-
1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c
index 52d97fae4dbd..7ce8088cde6a 100644
--- a/tools/testing/selftests/sched/cs_prctl_test.c
+++ b/tools/testing/selftests/sched/cs_prctl_test.c
@@ -32,6 +32,8 @@
#include <stdlib.h>
#include <string.h>
+#include "../kselftest.h"
+
#if __GLIBC_PREREQ(2, 30) == 0
#include <sys/syscall.h>
static pid_t gettid(void)
@@ -109,6 +111,22 @@ static void handle_usage(int rc, char *msg)
exit(rc);
}
+int check_smt(void)
+{
+ int c = 0;
+ FILE *file;
+
+ file = fopen("/sys/devices/system/cpu/smt/active", "r");
+ if (!file)
+ return 0;
+ c = fgetc(file) - 0x30;
+ fclose(file);
+ if (c == 0 || c == 1)
+ return c;
+ //if fgetc returns EOF or -1 for correupted files, return 0.
+ return 0;
+}
+
static unsigned long get_cs_cookie(int pid)
{
unsigned long long cookie;
@@ -271,7 +289,10 @@ int main(int argc, char *argv[])
delay = -1;
srand(time(NULL));
-
+ if (!check_smt()) {
+ ksft_test_result_skip("smt not enabled\n");
+ return 1;
+ }
/* put into separate process group */
if (setpgid(0, 0) != 0)
handle_error("process group");
--
2.50.1
Fix functions that return undefined values. These issues were caught by
running clang using LLVM=1 option; and are as follows:
--
ovpn-cli.c:1587:6: warning: variable 'ret' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
1587 | if (!sock) {
| ^~~~~
ovpn-cli.c:1635:9: note: uninitialized use occurs here
1635 | return ret;
| ^~~
ovpn-cli.c:1587:2: note: remove the 'if' if its condition is always false
1587 | if (!sock) {
| ^~~~~~~~~~~~
1588 | fprintf(stderr, "cannot allocate netlink socket\n");
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1589 | goto err_free;
| ~~~~~~~~~~~~~~
1590 | }
| ~
ovpn-cli.c:1584:15: note: initialize the variable 'ret' to silence this warning
1584 | int mcid, ret;
| ^
| = 0
ovpn-cli.c:2107:7: warning: variable 'ret' is used uninitialized whenever switch case is taken [-Wsometimes-uninitialized]
2107 | case CMD_INVALID:
| ^~~~~~~~~~~
ovpn-cli.c:2111:9: note: uninitialized use occurs here
2111 | return ret;
| ^~~
ovpn-cli.c:1939:12: note: initialize the variable 'ret' to silence this warning
1939 | int n, ret;
| ^
|
--
Fixes: 959bc330a439 ("testing/selftests: add test tool and scripts for ovpn module")
ovpn module")
v3:
- Use prefix net.
- Remove so_txtime fix as default case calls error().
- Changelog before sign-off.
- Three dashes after sign-off
v2:
- Use subsystem name "net".
- Add fixes tags.
- Remove txtimestamp fix as default case calls error.
- Assign constant error string instead of NULL.
Signed-off-by: Sidharth Seela <sidharthseela(a)gmail.com>
---
diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c
index 9201f2905f2c..20d00378f34a 100644
--- a/tools/testing/selftests/net/ovpn/ovpn-cli.c
+++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c
@@ -1581,7 +1581,7 @@ static int ovpn_listen_mcast(void)
{
struct nl_sock *sock;
struct nl_cb *cb;
- int mcid, ret;
+ int mcid, ret = -1;
sock = nl_socket_alloc();
if (!sock) {
@@ -1936,7 +1936,7 @@ static int ovpn_run_cmd(struct ovpn_ctx *ovpn)
{
char peer_id[10], vpnip[INET6_ADDRSTRLEN], laddr[128], lport[10];
char raddr[128], rport[10];
- int n, ret;
+ int n, ret = -1;
FILE *fp;
switch (ovpn->cmd) {
--
2.47.3
This is a follow-up series of [1]. It tries to fix a possible UAF in the
fops of cros_ec_chardev after the underlying protocol device has gone by
using revocable.
The 1st patch introduces the revocable which is an implementation of ideas
from the talk [2].
The 2nd and 3rd patches add test cases for revocable in Kunit and selftest.
The 4th patch converts existing protocol devices to resource providers
of cros_ec_device.
The 5th - 7th are PoC patches for moving most revocable code to subsystem
level. Miscdevice is used as it would be simpler for PoC. Note that the
device driver (e.g., cros_ec_chardev) still needs to be revocable-aware.
The driver needs to specify where to save the pointer and thus the resource
is available in fops.
- The 5th patch adds a helper for using revocable API with fops.
- The 6th patch leverages the helper in miscdevice.
- The 7th patch converts cros_ec_chardev to a resource consumer of
cros_ec_device to fix the UAF.
[1] https://lore.kernel.org/chrome-platform/20250721044456.2736300-6-tzungbi@ke…
[2] https://lpc.events/event/17/contributions/1627/
v4:
- Rebase onto next-20250922.
- Remove the 5th patch from v3.
- Add fops replacement PoC in 5th - 7th patches.
v3: https://lore.kernel.org/chrome-platform/20250912081718.3827390-1-tzungbi@ke…
- Rebase onto https://lore.kernel.org/chrome-platform/20250828083601.856083-1-tzungbi@ker…
and next-20250912.
- The 4th patch changed accordingly.
v2: https://lore.kernel.org/chrome-platform/20250820081645.847919-1-tzungbi@ker…
- Rename "ref_proxy" -> "revocable".
- Add test cases in Kunit and selftest.
v1: https://lore.kernel.org/chrome-platform/20250814091020.1302888-1-tzungbi@ke…
Tzung-Bi Shih (7):
revocable: Revocable resource management
revocable: Add Kunit test cases
selftests: revocable: Add kselftest cases
platform/chrome: Protect cros_ec_device lifecycle with revocable
revocable: Add fops replacement
char: misc: Leverage revocable fops replacement
platform/chrome: cros_ec_chardev: Secure cros_ec_device via revocable
.../driver-api/driver-model/index.rst | 1 +
.../driver-api/driver-model/revocable.rst | 87 ++++
MAINTAINERS | 9 +
drivers/base/Kconfig | 8 +
drivers/base/Makefile | 5 +-
drivers/base/revocable.c | 374 ++++++++++++++++++
drivers/base/revocable_test.c | 110 ++++++
drivers/char/misc.c | 7 +
drivers/platform/chrome/cros_ec.c | 5 +
drivers/platform/chrome/cros_ec_chardev.c | 15 +-
include/linux/miscdevice.h | 3 +
include/linux/platform_data/cros_ec_proto.h | 4 +
include/linux/revocable.h | 60 +++
tools/testing/selftests/Makefile | 1 +
.../selftests/drivers/base/revocable/Makefile | 7 +
.../drivers/base/revocable/revocable_test.c | 116 ++++++
.../drivers/base/revocable/test-revocable.sh | 39 ++
.../base/revocable/test_modules/Makefile | 10 +
.../revocable/test_modules/revocable_test.c | 188 +++++++++
19 files changed, 1047 insertions(+), 2 deletions(-)
create mode 100644 Documentation/driver-api/driver-model/revocable.rst
create mode 100644 drivers/base/revocable.c
create mode 100644 drivers/base/revocable_test.c
create mode 100644 include/linux/revocable.h
create mode 100644 tools/testing/selftests/drivers/base/revocable/Makefile
create mode 100644 tools/testing/selftests/drivers/base/revocable/revocable_test.c
create mode 100755 tools/testing/selftests/drivers/base/revocable/test-revocable.sh
create mode 100644 tools/testing/selftests/drivers/base/revocable/test_modules/Makefile
create mode 100644 tools/testing/selftests/drivers/base/revocable/test_modules/revocable_test.c
--
2.51.0.534.gc79095c0ca-goog
This series includes several changes to the MPTCP RX path. The main
goals are improving the RX performances, and increase the long term
maintainability.
Some changes reflects recent(ish) improvements introduced in the TCP
stack: patch 1, 2 and 3 are the MPTCP counter part of SKB deferral free
and auto-tuning improvements. Note that patch 3 could possibly fix
additional issues, and overall such patch should protect from similar
issues to arise in the future.
Patches 4-7 are aimed at introducing the socket backlog usage which will
be done in a later series to process the packets received by the
different subflows while the msk socket is owned.
Patch 8 is not related to the RX path, but it contains additional tests
for new features recently introduced in net-next.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Notes:
- Sorry for sending this series that late, we had quite a few patches
to upstream during this cycle. This is the last batch, and it has
been heavily tested the last 2 weeks.
- If there are some issues with some patches, but not with 1-3, it
would be nice, if possible, if these 3 first patches can be accepted,
to reduce the recently introduced gap with TCP.
- Patches can be grouped like this if needed: 1-3, 4-5, 6-7, 8. 6-7 are
preparing the ground for future on-going work, they can be dropped if
there are issues with them.
---
Matthieu Baerts (NGI0) (1):
selftests: mptcp: join: validate new laminar endp
Paolo Abeni (7):
mptcp: leverage skb deferral free
tcp: make tcp_rcvbuf_grow() accessible to mptcp code
mptcp: rcvbuf auto-tuning improvement
mptcp: introduce the mptcp_init_skb helper
mptcp: remove unneeded mptcp_move_skb()
mptcp: factor out a basic skb coalesce helper
mptcp: minor move_skbs_to_msk() cleanup
include/net/tcp.h | 1 +
net/ipv4/tcp_input.c | 2 +-
net/mptcp/protocol.c | 187 ++++++++++++------------
net/mptcp/protocol.h | 4 +-
tools/testing/selftests/net/mptcp/mptcp_join.sh | 69 +++++++++
tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 9 ++
6 files changed, 177 insertions(+), 95 deletions(-)
---
base-commit: 1493c18fe8696bfc758a97130a485fc4e08387f5
change-id: 20250927-net-next-mptcp-rcv-path-imp-192d8c24c9c7
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
The selftests: kvm: irqfd_test consistently fails across all test platforms
since its introduction in Linux next-20250625. The failure occurs due to
a KVM_IRQFD ioctl returning errno 11 (Resource temporarily unavailable).
This has been observed from day one and is reproducible on all test runs.
Reproducibility: 100% failure on all test platforms since
next-20250625..next-20250929
Test fails on the below list
* graviton4
* rk3399-rock-pi-4b
## Initial Observations:
The test is attempting to register an IRQFD but fails with EAGAIN (errno 11).
This likely indicates resource exhaustion or unsupported behavior on
affected ARM-based platforms.
Could you please advise on the way forward for this test?
Should we treat this as an unsupported case on ARM platforms,
or is there a missing implementation/configuration that needs to be addressed?
## Test log
selftests: kvm: irqfd_test
Random seed: 0x6b8b4567
==== Test Assertion Failure ====
include/kvm_util.h:527: !ret
pid=721 tid=721 errno=11 - Resource temporarily unavailable
1 0x000000000040250f: kvm_irqfd at kvm_util.h:527
2 0x000000000040222f: main at irqfd_test.c:100
3 0x0000ffffbd43229b: ?? ??:0
4 0x0000ffffbd43237b: ?? ??:0
addr2line: 5 0x000000000040206f: DWARF error: mangled line number
section (bad file number)
addr2line: DWARF error: mangled line number section (bad file number)
_start at ??:?
KVM_IRQFD failed, rc: -1 errno: 11 (Resource temporarily unavailable)
not ok 4 selftests: kvm: irqfd_test exit=254
## Links
* https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20250929/te…
* https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20250929/te…
Add a basic test suite for drivers that support PSP. Also, add a PSP
implementation in the netdevsim driver.
The netdevsim implementation does encapsulation and decapsulation of
PSP packets, but no crypto.
The tests cover the basic usage of the uapi, and demonstrate key
exchange and connection setup. The tests and netdevsim support IPv4
and IPv6. Here is an example run on a system with a CX7 NIC.
TAP version 13
1..28
ok 1 psp.data_basic_send_v0_ip4
ok 2 psp.data_basic_send_v0_ip6
ok 3 psp.data_basic_send_v1_ip4
ok 4 psp.data_basic_send_v1_ip6
ok 5 psp.data_basic_send_v2_ip4 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-128')
ok 6 psp.data_basic_send_v2_ip6 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-128')
ok 7 psp.data_basic_send_v3_ip4 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-256')
ok 8 psp.data_basic_send_v3_ip6 # SKIP ('PSP version not supported', 'hdr0-aes-gmac-256')
ok 9 psp.data_mss_adjust_ip4
ok 10 psp.data_mss_adjust_ip6
ok 11 psp.dev_list_devices
ok 12 psp.dev_get_device
ok 13 psp.dev_get_device_bad
ok 14 psp.dev_rotate
ok 15 psp.dev_rotate_spi
ok 16 psp.assoc_basic
ok 17 psp.assoc_bad_dev
ok 18 psp.assoc_sk_only_conn
ok 19 psp.assoc_sk_only_mismatch
ok 20 psp.assoc_sk_only_mismatch_tx
ok 21 psp.assoc_sk_only_unconn
ok 22 psp.assoc_version_mismatch
ok 23 psp.assoc_twice
ok 24 psp.data_send_bad_key
ok 25 psp.data_send_disconnect
ok 26 psp.data_stale_key
ok 27 psp.removal_device_rx # XFAIL Test only works on netdevsim
ok 28 psp.removal_device_bi # XFAIL Test only works on netdevsim
# Totals: pass:22 fail:0 xfail:2 xpass:0 skip:4 error:0
#
# Responder logs (0):
# STDERR:
# Set PSP enable on device 1 to 0x3
# Set PSP enable on device 1 to 0x0
v3:
- fix netdevsim bugs
- rework the skipping
- use errno
- remove duplicated condition
v2: https://lore.kernel.org/20250925211647.3450332-1-daniel.zahka@gmail.com
- fix pylint warnings
- insert CONFIG_INET_PSP in alphebetical order
- use branch to skip all tests
- fix compilation error when CONFIG_INET_PSP is not set
v1: https://lore.kernel.org/20250924194959.2845473-1-daniel.zahka@gmail.com
Jakub Kicinski (8):
netdevsim: a basic test PSP implementation
selftests: drv-net: base device access API test
selftests: drv-net: add PSP responder
selftests: drv-net: psp: add basic data transfer and key rotation
tests
selftests: drv-net: psp: add association tests
selftests: drv-net: psp: add connection breaking tests
selftests: drv-net: psp: add test for auto-adjusting TCP MSS
selftests: drv-net: psp: add tests for destroying devices
drivers/net/netdevsim/Makefile | 4 +
tools/testing/selftests/drivers/net/Makefile | 10 +
drivers/net/netdevsim/netdevsim.h | 27 +
drivers/net/netdevsim/netdev.c | 43 +-
drivers/net/netdevsim/psp.c | 225 +++++++
net/core/skbuff.c | 1 +
.../selftests/drivers/net/psp_responder.c | 483 ++++++++++++++
.../testing/selftests/drivers/net/.gitignore | 1 +
tools/testing/selftests/drivers/net/config | 1 +
.../drivers/net/hw/lib/py/__init__.py | 4 +-
.../selftests/drivers/net/lib/py/__init__.py | 4 +-
.../selftests/drivers/net/lib/py/env.py | 4 +
tools/testing/selftests/drivers/net/psp.py | 627 ++++++++++++++++++
.../testing/selftests/net/lib/py/__init__.py | 2 +-
tools/testing/selftests/net/lib/py/ksft.py | 10 +
tools/testing/selftests/net/lib/py/ynl.py | 5 +
16 files changed, 1440 insertions(+), 11 deletions(-)
create mode 100644 drivers/net/netdevsim/psp.c
create mode 100644 tools/testing/selftests/drivers/net/psp_responder.c
create mode 100755 tools/testing/selftests/drivers/net/psp.py
--
2.51.0
Fix functions that return undefined values. These issues were caught by
running clang using LLVM=1 option.
Clang warnings are as follows:
ovpn-cli.c:1587:6: warning: variable 'ret' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
1587 | if (!sock) {
| ^~~~~
ovpn-cli.c:1635:9: note: uninitialized use occurs here
1635 | return ret;
| ^~~
ovpn-cli.c:1587:2: note: remove the 'if' if its condition is always false
1587 | if (!sock) {
| ^~~~~~~~~~~~
1588 | fprintf(stderr, "cannot allocate netlink socket\n");
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1589 | goto err_free;
| ~~~~~~~~~~~~~~
1590 | }
| ~
ovpn-cli.c:1584:15: note: initialize the variable 'ret' to silence this warning
1584 | int mcid, ret;
| ^
| = 0
ovpn-cli.c:2107:7: warning: variable 'ret' is used uninitialized whenever switch case is taken [-Wsometimes-uninitialized]
2107 | case CMD_INVALID:
| ^~~~~~~~~~~
ovpn-cli.c:2111:9: note: uninitialized use occurs here
2111 | return ret;
| ^~~
ovpn-cli.c:1939:12: note: initialize the variable 'ret' to silence this warning
1939 | int n, ret;
| ^
|
Fixes: 959bc330a439 ("testing/selftests: add test tool and scripts for ovpn module")
ovpn module")
Signed-off-by: Sidharth Seela <sidharthseela(a)gmail.com>
---
v5:
- Assign -ENOMEM to ret inside if block.
- Assign -EINVAL to ret inside case block.
v4:
- Move changelog below sign-off.
- Remove double-hyphens in commit description.
v3:
- Use prefix net.
- Remove so_txtime fix as default case calls error().
- Changelog before sign-off.
- Three dashes after sign-off
v2:
- Use subsystem name "net".
- Add fixes tags.
- Remove txtimestamp fix as default case calls error.
- Assign constant error string instead of NULL.
diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c
index 9201f2905f2c..8d0f2f61923c 100644
--- a/tools/testing/selftests/net/ovpn/ovpn-cli.c
+++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c
@@ -1586,6 +1586,7 @@ static int ovpn_listen_mcast(void)
sock = nl_socket_alloc();
if (!sock) {
fprintf(stderr, "cannot allocate netlink socket\n");
+ ret = -ENOMEM;
goto err_free;
}
@@ -2105,6 +2106,7 @@ static int ovpn_run_cmd(struct ovpn_ctx *ovpn)
ret = ovpn_listen_mcast();
break;
case CMD_INVALID:
+ ret = -EINVAL;
break;
}
--
2.47.3
From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com>
Hello,
Plesae find the v2 AccECN case handling patch series, which covers
several excpetional case handling of Accurate ECN spec (RFC9768),
adds new identifiers to be used by CC modules, adds ecn_delta into
rate_sample, and keeps the ACE counter for computation, etc.
This patch series is part of the full AccECN patch series, which is available at
https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/
Best regards,
Chia-Yu
---
v3:
- Add additional min() check if pkts_acked_ewma is not initialized in #1.
- Change TCP_CONG_WANTS_ECT_1 into individual flag add helper function INET_ECN_xmit_wants_ect_1() in #3.
- Add empty line between variable declarations and code in #4.
- Update commit message to fix old AccECN commits in #5.
- Remove unnecessary brackets in #10.
- Move patch #3 in v2 to a later Prague patch serise and remove patch #13 in v2.
---
Chia-Yu Chang (10):
tcp: L4S ECT(1) identifier and NEEDS_ACCECN for CC modules
tcp: disable RFC3168 fallback identifier for CC modules
tcp: accecn: handle unexpected AccECN negotiation feedback
tcp: accecn: retransmit downgraded SYN in AccECN negotiation
tcp: move increment of num_retrans
tcp: accecn: retransmit SYN/ACK without AccECN option or non-AccECN
SYN/ACK
tcp: accecn: unset ECT if receive or send ACE=0 in AccECN negotiaion
tcp: accecn: fallback outgoing half link to non-AccECN
tcp: accecn: verify ACE counter in 1st ACK after AccECN negotiation
tcp: accecn: enable AccECN
Ilpo Järvinen (2):
tcp: try to avoid safer when ACKs are thinned
gro: flushing when CWR is set negatively affects AccECN
.../networking/net_cachelines/tcp_sock.rst | 1 +
include/linux/tcp.h | 1 +
include/net/inet_ecn.h | 20 ++++-
include/net/tcp.h | 32 ++++++-
include/net/tcp_ecn.h | 90 +++++++++++++------
net/ipv4/sysctl_net_ipv4.c | 2 +-
net/ipv4/tcp.c | 2 +
net/ipv4/tcp_cong.c | 10 ++-
net/ipv4/tcp_input.c | 49 ++++++++--
net/ipv4/tcp_minisocks.c | 40 ++++++---
net/ipv4/tcp_offload.c | 3 +-
net/ipv4/tcp_output.c | 35 +++++---
12 files changed, 218 insertions(+), 67 deletions(-)
--
2.34.1
Hello there,
Static analyser cppcheck says:
linux-6.17/tools/testing/selftests/landlock/fs_test.c:5631:23: style: A pointer can not be negative so it is either pointless or an error to check if it is. [pointerLessThanZero]
Source code is
if (log_match_cursor < 0)
return (long long)log_match_cursor;
but
char *log_match_cursor = log_match;
Suggest remove code.
Regards
David Binderman
From: Jack Thomson <jackabt(a)amazon.com>
Overview:
This patch series adds ARM64 support for the KVM_PRE_FAULT_MEMORY
feature, which was previously only available on x86 [1]. This allows
a reduction in the number of stage-2 faults during execution. This is
beneficial in post-copy migration scenarios, particularly in memory
intensive applications, where high latencies are experienced due to
the stage-2 faults when pre-populating memory via UFFD / memcpy.
Patch Overview:
- The first patch is a preparatory refactor.
- The second patch is adding a page walk flag for pre-faulting.
- The third patch adds support for the KVM_PRE_FAULT_MEMORY ioctl
on arm64.
- The fourth patch fixes an issue with unaligned mmap allocations
in the selftests.
- The fifth patch updates the pre_fault_memory_test to support
arm64.
- The last patch extends the pre_fault_memory_test to cover
different vm memory backings.
[1]: https://lore.kernel.org/kvm/20240710174031.312055-1-pbonzini@redhat.com
Jack Thomson (6):
KVM: arm64: Add __gmem_abort and __user_mem_abort
KVM: arm64: Add KVM_PGTABLE_WALK_PRE_FAULT walk flag
KVM: arm64: Add pre_fault_memory implementation
KVM: selftests: Fix unaligned mmap allocations
KVM: selftests: Enable pre_fault_memory_test for arm64
KVM: selftests: Add option for different backing in pre-fault tests
arch/arm64/include/asm/kvm_pgtable.h | 3 +
arch/arm64/kvm/Kconfig | 1 +
arch/arm64/kvm/arm.c | 1 +
arch/arm64/kvm/hyp/pgtable.c | 6 +-
arch/arm64/kvm/mmu.c | 97 +++++++++++++--
tools/testing/selftests/kvm/Makefile.kvm | 1 +
tools/testing/selftests/kvm/lib/kvm_util.c | 12 +-
.../selftests/kvm/pre_fault_memory_test.c | 110 +++++++++++++-----
8 files changed, 186 insertions(+), 45 deletions(-)
base-commit: 42188667be387867d2bf763d028654cbad046f7b
--
2.43.0
This series backports 19 patches to update minmax.h in the 6.1.y branch,
aligning it with v6.17-rc7.
The ultimate goal is to synchronize all longterm branches so that they
include the full set of minmax.h changes.
Previous work to update 6.12.48:
https://lore.kernel.org/stable/20250922103123.14538-1-farbere@amazon.com/T/…
and 6.6.107:
https://lore.kernel.org/stable/20250922103241.16213-1-farbere@amazon.com/T/…
The key motivation is to bring in commit d03eba99f5bf ("minmax: allow
min()/max()/clamp() if the arguments have the same signedness"), which
is missing in older kernels.
In mainline, this change enables min()/max()/clamp() to accept mixed
argument types, provided both have the same signedness. Without it,
backported patches that use these forms may trigger compiler warnings,
which escalate to build failures when -Werror is enabled.
Andy Shevchenko (1):
minmax: deduplicate __unconst_integer_typeof()
David Laight (8):
minmax: fix indentation of __cmp_once() and __clamp_once()
minmax.h: add whitespace around operators and after commas
minmax.h: update some comments
minmax.h: reduce the #define expansion of min(), max() and clamp()
minmax.h: use BUILD_BUG_ON_MSG() for the lo < hi test in clamp()
minmax.h: move all the clamp() definitions after the min/max() ones
minmax.h: simplify the variants of clamp()
minmax.h: remove some #defines that are only expanded once
Herve Codina (1):
minmax: Introduce {min,max}_array()
Linus Torvalds (8):
minmax: avoid overly complicated constant expressions in VM code
minmax: simplify and clarify min_t()/max_t() implementation
minmax: make generic MIN() and MAX() macros available everywhere
minmax: add a few more MIN_T/MAX_T users
minmax: simplify min()/max()/clamp() implementation
minmax: don't use max() in situations that want a C constant
expression
minmax: improve macro expansion and type checking
minmax: fix up min3() and max3() too
Matthew Wilcox (Oracle) (1):
minmax: add in_range() macro
arch/arm/mm/pageattr.c | 6 +-
arch/um/drivers/mconsole_user.c | 2 +
arch/x86/mm/pgtable.c | 2 +-
drivers/edac/sb_edac.c | 4 +-
drivers/edac/skx_common.h | 1 -
.../drm/amd/display/modules/hdcp/hdcp_ddc.c | 2 +
.../drm/amd/pm/powerplay/hwmgr/ppevvmath.h | 14 +-
drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 2 +-
.../drm/arm/display/include/malidp_utils.h | 2 +-
.../display/komeda/komeda_pipeline_state.c | 24 +-
drivers/gpu/drm/drm_color_mgmt.c | 2 +-
drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 6 -
drivers/gpu/drm/radeon/evergreen_cs.c | 2 +
drivers/hwmon/adt7475.c | 24 +-
drivers/input/touchscreen/cyttsp4_core.c | 2 +-
drivers/irqchip/irq-sun6i-r.c | 2 +-
drivers/md/dm-integrity.c | 2 +-
drivers/media/dvb-frontends/stv0367_priv.h | 3 +
.../net/ethernet/chelsio/cxgb3/cxgb3_main.c | 18 +-
.../net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +-
drivers/net/fjes/fjes_main.c | 4 +-
drivers/nfc/pn544/i2c.c | 2 -
drivers/platform/x86/sony-laptop.c | 1 -
drivers/scsi/isci/init.c | 6 +-
.../pci/hive_isp_css_include/math_support.h | 5 -
drivers/virt/acrn/ioreq.c | 4 +-
fs/btrfs/misc.h | 2 -
fs/btrfs/tree-checker.c | 2 +-
fs/ext2/balloc.c | 2 -
fs/ext4/ext4.h | 2 -
fs/ufs/util.h | 6 -
include/linux/compiler.h | 9 +
include/linux/minmax.h | 264 +++++++++++++-----
include/linux/pageblock-flags.h | 2 +-
kernel/trace/preemptirq_delay_test.c | 2 -
lib/btree.c | 1 -
lib/decompress_unlzma.c | 2 +
lib/logic_pio.c | 3 -
lib/vsprintf.c | 2 +-
mm/zsmalloc.c | 1 -
net/ipv4/proc.c | 2 +-
net/ipv6/proc.c | 2 +-
net/netfilter/nf_nat_core.c | 6 +-
net/tipc/core.h | 2 +-
net/tipc/link.c | 10 +-
.../selftests/bpf/progs/get_branch_snapshot.c | 4 +-
tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +
tools/testing/selftests/vm/mremap_test.c | 2 +
48 files changed, 290 insertions(+), 184 deletions(-)
--
2.47.3
During the discussion of the clone3() support for shadow stacks concerns
were raised from the glibc side that since it is not possible to reuse
the allocated shadow stack[1]. This means that the benefit of being able
to manage allocations is greatly reduced, for example it is not possible
to integrate the shadow stacks into the glibc thread stack cache. The
stack can be inspected but otherwise it would have to be unmapped and
remapped before it could be used again, it's not clear that this is
better than managing things in the kernel.
In that discussion I suggested that we could enable reuse by writing a
token to the shadow stack of exiting threads, mirroring how the
userspace stack pivot instructions write a token to the outgoing stack.
As mentioned by Florian[2] glibc already unwinds the stack and exits the
thread from the start routine which would integrate nicely with this,
the shadow stack pointer will be at the same place as it was when the
thread started.
This would not write a token if the thread doesn't exit cleanly, that
seems viable to me - users should probably handle this by double
checking that a token is present after waiting for the thread.
This is tagged as a RFC since I put it together fairly quickly to
demonstrate the proposal and the suggestion hasn't had much response
either way from the glibc developers. At the very least we don't
currently handle scheduling during exit(), or distinguish why the thread
is exiting. I've also not done anything about x86.
[1] https://marc.info/?l=glibc-alpha&m=175821637429537&w=2
[2] https://marc.info/?l=glibc-alpha&m=175733266913483&w=2
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Mark Brown (3):
arm64/gcs: Support reuse of GCS for exited threads
kselftest/arm64: Validate PR_SHADOW_STACK_EXIT_TOKEN in basic-gcs
kselftest/arm64: Add PR_SHADOW_STACK_EXIT_TOKEN to gcs-locking
arch/arm64/include/asm/gcs.h | 3 +-
arch/arm64/mm/gcs.c | 25 ++++-
include/uapi/linux/prctl.h | 1 +
tools/testing/selftests/arm64/gcs/basic-gcs.c | 121 ++++++++++++++++++++++++
tools/testing/selftests/arm64/gcs/gcs-locking.c | 23 +++++
tools/testing/selftests/arm64/gcs/gcs-util.h | 3 +-
6 files changed, 173 insertions(+), 3 deletions(-)
---
base-commit: 0b67d4b724b4afed2690c21bef418b8a803c5be2
change-id: 20250919-arm64-gcs-exit-token-82c3c2570aad
prerequisite-change-id: 20231019-clone3-shadow-stack-15d40d2bf536
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Fix a memory leak in netpoll and introduce netconsole selftests that
expose the issue when running with kmemleak detection enabled.
This patchset includes a selftest for netpoll with multiple concurrent
users (netconsole + bonding), which simulates the scenario from test[1]
that originally demonstrated the issue allegedly fixed by commit
efa95b01da18 ("netpoll: fix use after free") - a commit that is now
being reverted.
Sending this to "net" branch because this is a fix, and the selftest
might help with the backports validation.
Link: https://lore.kernel.org/lkml/96b940137a50e5c387687bb4f57de8b0435a653f.14048… [1]
Signed-off-by: Breno Leitao <leitao(a)debian.org>
---
Changes in v5:
- Set CONFIG_BONDING=m in selftests/drivers/net/config.
- Link to v4: https://lore.kernel.org/r/20250917-netconsole_torture-v4-0-0a5b3b8f81ce@deb…
Changes in v4:
- Added an additional selftest to test multiple netpoll users in
parallel
- Link to v3: https://lore.kernel.org/r/20250905-netconsole_torture-v3-0-875c7febd316@deb…
Changes in v3:
- This patchset is a merge of the fix and the selftest together as
recommended by Jakub.
Changes in v2:
- Reuse the netconsole creation from lib_netcons.sh. Thus, refactoring
the create_dynamic_target() (Jakub)
- Move the "wait" to after all the messages has been sent.
- Link to v1: https://lore.kernel.org/r/20250902-netconsole_torture-v1-1-03c6066598e9@deb…
---
Breno Leitao (4):
net: netpoll: fix incorrect refcount handling causing incorrect cleanup
selftest: netcons: refactor target creation
selftest: netcons: create a torture test
selftest: netcons: add test for netconsole over bonded interfaces
net/core/netpoll.c | 7 +-
tools/testing/selftests/drivers/net/Makefile | 2 +
tools/testing/selftests/drivers/net/config | 1 +
.../selftests/drivers/net/lib/sh/lib_netcons.sh | 197 ++++++++++++++++++---
.../selftests/drivers/net/netcons_over_bonding.sh | 76 ++++++++
.../selftests/drivers/net/netcons_torture.sh | 127 +++++++++++++
6 files changed, 385 insertions(+), 25 deletions(-)
---
base-commit: 5e87fdc37f8dc619549d49ba5c951b369ce7c136
change-id: 20250902-netconsole_torture-8fc23f0aca99
Best regards,
--
Breno Leitao <leitao(a)debian.org>
Hi Thomas and David,
I am seeing the following error during "kunit.py run --alltests run"
next-20250926.
$ make all compile_commands.json scripts_gdb ARCH=um O=.kunit --jobs=16
ERROR:root:/usr/bin/ld: drivers/net/wireless/intel/iwlwifi/tests/devinfo.o: in function `devinfo_pci_ids_config':
devinfo.c:(.text+0x2d): undefined reference to `iwl_bz_mac_cfg'
collect2: error: ld returned 1 exit status
make[3]: *** [../scripts/Makefile.vmlinux:72: vmlinux.unstripped] Error 1
make[2]: *** [/linux/linux_next/Makefile:1242: vmlinux] Error 2
make[1]: *** [/linux/linux_next/Makefile:248: __sub-make] Error 2
make: *** [Makefile:248: __sub-make] Error 2
Possile intearction between these two commits: Note: linux-kselftext
kunit branch is fine I am going send kunit pr to Linus later today.
Heads up that "kunit.py run --alltests run" is failing on next-20250926
commit 031cdd3bc3f369553933c1b0f4cb18000162c8ff
Author: Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
Date: Mon Sep 8 09:03:38 2025 +0200
kunit: Enable PCI on UML without triggering WARN()
commit 137b0bb916f1addb2ffbefd09a6e3e9d15fe6100
Author: Johannes Berg <johannes.berg(a)intel.com>
Date: Mon Sep 15 11:34:28 2025 +0300
wifi: iwlwifi: tests: check listed PCI IDs have configs
Note: linux-kselftext build just fine.
thanks,
-- Shuah
Hi all,
The test_xsk.sh script covers many AF_XDP use cases. The tests it runs
are defined in xksxceiver.c. Since this script is used to test real
hardware, the goal here is to leave it as it is, and only integrate the
tests that run on veth peers into the test_progs framework.
Some tests are flaky so they can't be integrated in the CI as they are.
I think that fixing their flakyness would require a significant amount of
work. So, as first step, I've excluded them from the list of tests
migrated to the CI (cf PATCH 14). If these tests get fixed at some
point, integrating them into the CI will be straightforward.
I noticed a small error on a function's return value while investigating
on the report's summary issue pointed out by Maciej in previous iteration,
the new PATCH 3 fixes it.
PATCH 1 extracts test_xsk[.c/.h] from xskxceiver[.c/.h] to make the
tests available to test_progs.
PATCH 2 to 7 fix small issues in the current test
PATCH 8 to 13 handle all errors to release resources instead of calling
exit() when any error occurs.
PATCH 14 isolates some flaky tests
PATCH 15 integrate the non-flaky tests to the test_progs framework
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet(a)bootlin.com>
---
Changes in v4:
- Fix test_xsk.sh's summary report.
- Merge PATCH 11 & 12 together, otherwise PATCH 11 fails to build.
- Split old PATCH 3 in two patches. The first one fixes
testapp_stats_rx_dropped(), the second one fixes
testapp_xdp_shared_umem(). The unecessary frees (in
testapp_stats_rx_full() and testapp_stats_fill_empty() are removed)
- Link to v3: https://lore.kernel.org/r/20250904-xsk-v3-0-ce382e331485@bootlin.com
Changes in v3:
- Rebase on latest bpf-next_base to integrate commit c9110e6f7237 ("selftests/bpf:
Fix count write in testapp_xdp_metadata_copy()").
- Move XDP_METADATA_COPY_* tests from flaky-tests to nominal tests
- Link to v2: https://lore.kernel.org/r/20250902-xsk-v2-0-17c6345d5215@bootlin.com
Changes in v2:
- Rebase on the latest bpf-next_base and integrate the newly added tests
to the work (adjust_tail* and tx_queue_consumer tests)
- Re-order patches to split xkxceiver sooner.
- Fix the bug reported by Maciej.
- Fix verbose mode in test_xsk.sh by keeping kselftest (remove PATCH 1,
7 and 8)
- Link to v1: https://lore.kernel.org/r/20250313-xsk-v1-0-7374729a93b9@bootlin.com
---
Bastien Curutchet (eBPF Foundation) (15):
selftests/bpf: test_xsk: Split xskxceiver
selftests/bpf: test_xsk: Initialize bitmap before use
selftests/bpf: test_xsk: Fix __testapp_validate_traffic()'s return value
selftests/bpf: test_xsk: fix memory leak in testapp_stats_rx_dropped()
selftests/bpf: test_xsk: fix memory leak in testapp_xdp_shared_umem()
selftests/bpf: test_xsk: Wrap test clean-up in functions
selftests/bpf: test_xsk: Release resources when swap fails
selftests/bpf: test_xsk: Add return value to init_iface()
selftests/bpf: test_xsk: Don't exit immediately when xsk_attach fails
selftests/bpf: test_xsk: Don't exit immediately when gettimeofday fails
selftests/bpf: test_xsk: Don't exit immediately when workers fail
selftests/bpf: test_xsk: Don't exit immediately if validate_traffic fails
selftests/bpf: test_xsk: Don't exit immediately on allocation failures
selftests/bpf: test_xsk: Isolate flaky tests
selftests/bpf: test_xsk: Integrate test_xsk.c to test_progs framework
tools/testing/selftests/bpf/Makefile | 11 +-
tools/testing/selftests/bpf/prog_tests/test_xsk.c | 2595 ++++++++++++++++++++
tools/testing/selftests/bpf/prog_tests/test_xsk.h | 294 +++
tools/testing/selftests/bpf/prog_tests/xsk.c | 146 ++
tools/testing/selftests/bpf/xskxceiver.c | 2696 +--------------------
tools/testing/selftests/bpf/xskxceiver.h | 156 --
6 files changed, 3174 insertions(+), 2724 deletions(-)
---
base-commit: 1bd67e08d0f3fcb8cc69a73fb7aab9f048be4b8e
change-id: 20250218-xsk-0cf90e975d14
Best regards,
--
Bastien Curutchet (eBPF Foundation) <bastien.curutchet(a)bootlin.com>
A task in the kernel (task_mm_cid_work) runs somewhat periodically to
compact the mm_cid for each process. Add a test to validate that it runs
correctly and timely.
The test spawns 1 thread pinned to each CPU, then each thread, including
the main one, runs in short bursts for some time. During this period, the
mm_cids should be spanning all numbers between 0 and nproc.
At the end of this phase, a thread with high enough mm_cid (>= nproc/2)
is selected to be the new leader, all other threads terminate.
After some time, the only running thread should see 0 as mm_cid, if that
doesn't happen, the compaction mechanism didn't work and the test fails.
The test never fails if only 1 core is available, in which case, we
cannot test anything as the only available mm_cid is 0.
Acked-by: Shuah Khan <skhan(a)linuxfoundation.org>
Signed-off-by: Gabriele Monaco <gmonaco(a)redhat.com>
---
tools/testing/selftests/rseq/.gitignore | 1 +
tools/testing/selftests/rseq/Makefile | 2 +-
.../selftests/rseq/mm_cid_compaction_test.c | 204 ++++++++++++++++++
3 files changed, 206 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/rseq/mm_cid_compaction_test.c
diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore
index 0fda241fa62b..b3920c59bf40 100644
--- a/tools/testing/selftests/rseq/.gitignore
+++ b/tools/testing/selftests/rseq/.gitignore
@@ -3,6 +3,7 @@ basic_percpu_ops_test
basic_percpu_ops_mm_cid_test
basic_test
basic_rseq_op_test
+mm_cid_compaction_test
param_test
param_test_benchmark
param_test_compare_twice
diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index 0d0a5fae5954..bc4d940f66d4 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -17,7 +17,7 @@ OVERRIDE_TARGETS = 1
TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \
param_test_benchmark param_test_compare_twice param_test_mm_cid \
param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \
- syscall_errors_test
+ syscall_errors_test mm_cid_compaction_test
TEST_GEN_PROGS_EXTENDED = librseq.so
diff --git a/tools/testing/selftests/rseq/mm_cid_compaction_test.c b/tools/testing/selftests/rseq/mm_cid_compaction_test.c
new file mode 100644
index 000000000000..d13623625f5a
--- /dev/null
+++ b/tools/testing/selftests/rseq/mm_cid_compaction_test.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+
+#include "../kselftest.h"
+#include "rseq.h"
+
+#define VERBOSE 0
+#define printf_verbose(fmt, ...) \
+ do { \
+ if (VERBOSE) \
+ printf(fmt, ##__VA_ARGS__); \
+ } while (0)
+
+/* 50 ms */
+#define RUNNER_PERIOD 50000
+/*
+ * Number of runs before we terminate or get the token.
+ * The number is slowly increasing with the number of CPUs as the compaction
+ * process can take longer on larger systems. This is an arbitrary value.
+ */
+#define THREAD_RUNS (3 + args->num_cpus/8)
+
+/*
+ * Number of times we check that the mm_cid were compacted.
+ * Checks are repeated every RUNNER_PERIOD.
+ */
+#define MM_CID_COMPACT_TIMEOUT 10
+
+struct thread_args {
+ int cpu;
+ int num_cpus;
+ pthread_mutex_t *token;
+ pthread_barrier_t *barrier;
+ pthread_t *tinfo;
+ struct thread_args *args_head;
+};
+
+static void __noreturn *thread_runner(void *arg)
+{
+ struct thread_args *args = arg;
+ int i, ret, curr_mm_cid;
+ cpu_set_t cpumask;
+
+ CPU_ZERO(&cpumask);
+ CPU_SET(args->cpu, &cpumask);
+ ret = pthread_setaffinity_np(pthread_self(), sizeof(cpumask), &cpumask);
+ if (ret) {
+ errno = ret;
+ perror("Error: failed to set affinity");
+ abort();
+ }
+ pthread_barrier_wait(args->barrier);
+
+ for (i = 0; i < THREAD_RUNS; i++)
+ usleep(RUNNER_PERIOD);
+ curr_mm_cid = rseq_current_mm_cid();
+ /*
+ * We select one thread with high enough mm_cid to be the new leader.
+ * All other threads (including the main thread) will terminate.
+ * After some time, the mm_cid of the only remaining thread should
+ * converge to 0, if not, the test fails.
+ */
+ if (curr_mm_cid >= args->num_cpus / 2 &&
+ !pthread_mutex_trylock(args->token)) {
+ printf_verbose(
+ "cpu%d has mm_cid=%d and will be the new leader.\n",
+ sched_getcpu(), curr_mm_cid);
+ for (i = 0; i < args->num_cpus; i++) {
+ if (args->tinfo[i] == pthread_self())
+ continue;
+ ret = pthread_join(args->tinfo[i], NULL);
+ if (ret) {
+ errno = ret;
+ perror("Error: failed to join thread");
+ abort();
+ }
+ }
+ pthread_barrier_destroy(args->barrier);
+ free(args->tinfo);
+ free(args->token);
+ free(args->barrier);
+ free(args->args_head);
+
+ for (i = 0; i < MM_CID_COMPACT_TIMEOUT; i++) {
+ curr_mm_cid = rseq_current_mm_cid();
+ printf_verbose("run %d: mm_cid=%d on cpu%d.\n", i,
+ curr_mm_cid, sched_getcpu());
+ if (curr_mm_cid == 0)
+ exit(EXIT_SUCCESS);
+ usleep(RUNNER_PERIOD);
+ }
+ exit(EXIT_FAILURE);
+ }
+ printf_verbose("cpu%d has mm_cid=%d and is going to terminate.\n",
+ sched_getcpu(), curr_mm_cid);
+ pthread_exit(NULL);
+}
+
+int test_mm_cid_compaction(void)
+{
+ cpu_set_t affinity;
+ int i, j, ret = 0, num_threads;
+ pthread_t *tinfo;
+ pthread_mutex_t *token;
+ pthread_barrier_t *barrier;
+ struct thread_args *args;
+
+ sched_getaffinity(0, sizeof(affinity), &affinity);
+ num_threads = CPU_COUNT(&affinity);
+ tinfo = calloc(num_threads, sizeof(*tinfo));
+ if (!tinfo) {
+ perror("Error: failed to allocate tinfo");
+ return -1;
+ }
+ args = calloc(num_threads, sizeof(*args));
+ if (!args) {
+ perror("Error: failed to allocate args");
+ ret = -1;
+ goto out_free_tinfo;
+ }
+ token = malloc(sizeof(*token));
+ if (!token) {
+ perror("Error: failed to allocate token");
+ ret = -1;
+ goto out_free_args;
+ }
+ barrier = malloc(sizeof(*barrier));
+ if (!barrier) {
+ perror("Error: failed to allocate barrier");
+ ret = -1;
+ goto out_free_token;
+ }
+ if (num_threads == 1) {
+ fprintf(stderr, "Cannot test on a single cpu. "
+ "Skipping mm_cid_compaction test.\n");
+ /* only skipping the test, this is not a failure */
+ goto out_free_barrier;
+ }
+ pthread_mutex_init(token, NULL);
+ ret = pthread_barrier_init(barrier, NULL, num_threads);
+ if (ret) {
+ errno = ret;
+ perror("Error: failed to initialise barrier");
+ goto out_free_barrier;
+ }
+ for (i = 0, j = 0; i < CPU_SETSIZE && j < num_threads; i++) {
+ if (!CPU_ISSET(i, &affinity))
+ continue;
+ args[j].num_cpus = num_threads;
+ args[j].tinfo = tinfo;
+ args[j].token = token;
+ args[j].barrier = barrier;
+ args[j].cpu = i;
+ args[j].args_head = args;
+ if (!j) {
+ /* The first thread is the main one */
+ tinfo[0] = pthread_self();
+ ++j;
+ continue;
+ }
+ ret = pthread_create(&tinfo[j], NULL, thread_runner, &args[j]);
+ if (ret) {
+ errno = ret;
+ perror("Error: failed to create thread");
+ abort();
+ }
+ ++j;
+ }
+ printf_verbose("Started %d threads.\n", num_threads);
+
+ /* Also main thread will terminate if it is not selected as leader */
+ thread_runner(&args[0]);
+
+ /* only reached in case of errors */
+out_free_barrier:
+ free(barrier);
+out_free_token:
+ free(token);
+out_free_args:
+ free(args);
+out_free_tinfo:
+ free(tinfo);
+
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ if (!rseq_mm_cid_available()) {
+ fprintf(stderr, "Error: rseq_mm_cid unavailable\n");
+ return -1;
+ }
+ if (test_mm_cid_compaction())
+ return -1;
+ return 0;
+}
--
2.51.0
From: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
During a handshake, an endpoint may specify a maximum record size limit.
Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the
maximum record size. Meaning that, the outgoing records from the kernel
can exceed a lower size negotiated during the handshake. In such a case,
the TLS endpoint must send a fatal "record_overflow" alert [1], and
thus the record is discarded.
Upcoming Western Digital NVMe-TCP hardware controllers implement TLS
support. For these devices, supporting TLS record size negotiation is
necessary because the maximum TLS record size supported by the controller
is less than the default 16KB currently used by the kernel.
This patch adds support for retrieving the negotiated record size limit
during a handshake, and enforcing it at the TLS layer such that outgoing
records are no larger than the size negotiated. This patch depends on
the respective userspace support in tlshd and GnuTLS [2].
[1] https://www.rfc-editor.org/rfc/rfc8449
[2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005
Signed-off-by: Wilfred Mallawa <wilfred.mallawa(a)wdc.com>
---
Changes V3 -> V4:
* Added record_size_limit RFC reference to documentation
* Always export the record size limit in tls_get_info()
* Disallow user space to change the record_size_limit from under us
if an open record is pending.
* Added record_size_limit minimum size check as per RFC
* Allow space for the ContentType byte for TLS 1.3. The expected
behaviour is that userspace directly uses the negotiated
record_size_limit, kernel will limit the plaintext buffer size
appropirately.
* New patch to add self-tests.
---
Documentation/networking/tls.rst | 12 +++++
include/net/tls.h | 5 +++
include/uapi/linux/tls.h | 2 +
net/tls/tls_device.c | 2 +-
net/tls/tls_main.c | 75 ++++++++++++++++++++++++++++++++
net/tls/tls_sw.c | 2 +-
6 files changed, 96 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 36cc7afc2527..d24bf8911bb8 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -280,6 +280,18 @@ If the record decrypted turns out to had been padded or is not a data
record it will be decrypted again into a kernel buffer without zero copy.
Such events are counted in the ``TlsDecryptRetry`` statistic.
+TLS_TX_RECORD_SIZE_LIM
+~~~~~~~~~~~~~~~~~~~~~~
+
+Sets the maximum size for the plaintext of a protected record.
+
+The provided value should correspond to the limit negotiated during the TLS
+handshake via the `record_size_limit` extension (RFC 8449)[1]. When this
+option is set, the kernel enforces this limit on all transmitted TLS records,
+ensuring no plaintext fragment exceeds the specified size.
+
+[1] https://datatracker.ietf.org/doc/html/rfc8449
+
Statistics
==========
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..32f053770ec4 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -53,6 +53,8 @@ struct tls_rec;
/* Maximum data size carried in a TLS record */
#define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14)
+/* Minimum record size limit as per RFC8449 */
+#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6)
#define TLS_HEADER_SIZE 5
#define TLS_NONCE_OFFSET TLS_HEADER_SIZE
@@ -226,6 +228,9 @@ struct tls_context {
u8 rx_conf:3;
u8 zerocopy_sendfile:1;
u8 rx_no_pad:1;
+ u16 tx_record_size_limit; /* Max plaintext fragment size. For TLS 1.3,
+ * this excludes the ContentType.
+ */
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index b66a800389cc..3add266d5916 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -41,6 +41,7 @@
#define TLS_RX 2 /* Set receive parameters */
#define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */
#define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */
+#define TLS_TX_RECORD_SIZE_LIM 5 /* Maximum record size */
/* Supported versions */
#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF)
@@ -194,6 +195,7 @@ enum {
TLS_INFO_RXCONF,
TLS_INFO_ZC_RO_TX,
TLS_INFO_RX_NO_PAD,
+ TLS_INFO_TX_RECORD_SIZE_LIM,
__TLS_INFO_MAX,
};
#define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index f672a62a9a52..bf16ceb41dde 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -459,7 +459,7 @@ static int tls_push_data(struct sock *sk,
/* TLS_HEADER_SIZE is not counted as part of the TLS record, and
* we need to leave room for an authentication tag.
*/
- max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
+ max_open_record_len = tls_ctx->tx_record_size_limit +
prot->prepend_size;
do {
rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a3ccb3135e51..09883d9c6c96 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -544,6 +544,31 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval,
return 0;
}
+static int do_tls_getsockopt_tx_record_size(struct sock *sk, char __user *optval,
+ int __user *optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ int len;
+ /* TLS 1.3: Record length contains ContentType */
+ u16 record_size_limit = ctx->prot_info.version == TLS_1_3_VERSION ?
+ ctx->tx_record_size_limit + 1 :
+ ctx->tx_record_size_limit;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < sizeof(record_size_limit))
+ return -EINVAL;
+
+ if (put_user(sizeof(record_size_limit), optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &record_size_limit, sizeof(record_size_limit)))
+ return -EFAULT;
+
+ return 0;
+}
+
static int do_tls_getsockopt(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
@@ -563,6 +588,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_getsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_RECORD_SIZE_LIM:
+ rc = do_tls_getsockopt_tx_record_size(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -812,6 +840,43 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval,
return rc;
}
+static int do_tls_setsockopt_tx_record_size(struct sock *sk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx);
+ u16 value;
+
+ if (sw_ctx->open_rec)
+ return -EBUSY;
+
+ if (sockptr_is_null(optval) || optlen != sizeof(value))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&value, optval, sizeof(value)))
+ return -EFAULT;
+
+ if (value < TLS_MIN_RECORD_SIZE_LIM)
+ return -EINVAL;
+
+ if (ctx->prot_info.version == TLS_1_2_VERSION &&
+ value > TLS_MAX_PAYLOAD_SIZE)
+ return -EINVAL;
+
+ if (ctx->prot_info.version == TLS_1_3_VERSION &&
+ value - 1 > TLS_MAX_PAYLOAD_SIZE)
+ return -EINVAL;
+
+ /*
+ * For TLS 1.3: 'value' includes one byte for the appended ContentType.
+ * Adjust the kernel's internal plaintext limit accordingly.
+ */
+ ctx->tx_record_size_limit = ctx->prot_info.version == TLS_1_3_VERSION ?
+ value - 1 : value;
+
+ return 0;
+}
+
static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
@@ -833,6 +898,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
case TLS_RX_EXPECT_NO_PAD:
rc = do_tls_setsockopt_no_pad(sk, optval, optlen);
break;
+ case TLS_TX_RECORD_SIZE_LIM:
+ rc = do_tls_setsockopt_tx_record_size(sk, optval, optlen);
+ break;
default:
rc = -ENOPROTOOPT;
break;
@@ -1022,6 +1090,7 @@ static int tls_init(struct sock *sk)
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
+ ctx->tx_record_size_limit = TLS_MAX_PAYLOAD_SIZE;
update_sk_prot(sk, ctx);
out:
write_unlock_bh(&sk->sk_callback_lock);
@@ -1111,6 +1180,11 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin)
goto nla_failure;
}
+ err = nla_put_u16(skb, TLS_INFO_TX_RECORD_SIZE_LIM,
+ ctx->tx_record_size_limit);
+ if (err)
+ goto nla_failure;
+
rcu_read_unlock();
nla_nest_end(skb, start);
return 0;
@@ -1132,6 +1206,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin)
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */
nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_RECORD_SIZE_LIM */
0;
return size;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index bac65d0d4e3e..28fb796573d1 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
orig_size = msg_pl->sg.size;
full_record = false;
try_to_copy = msg_data_left(msg);
- record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
+ record_room = tls_ctx->tx_record_size_limit - msg_pl->sg.size;
if (try_to_copy >= record_room) {
try_to_copy = record_room;
full_record = true;
--
2.51.0
This check was removed in commit e6f497955fb6 ("ipv6: Check GATEWAY
in rtm_to_fib6_multipath_config().") as part of rt6_qualify_for ecmp().
The author correctly recognises that rt6_qualify_for_ecmp() returns
false if fb_nh_gw_family is set to AF_UNSPEC, but then mistakes
AF_UNSPEC for AF_INET6 when reasoning that the check is unnecessary.
This means certain malformed entries don't get caught in
ip6_route_multipath_add().
This patch reintroduces the AF_UNSPEC check while respecting changes
of the initial patch.
Reported-by: syzbot+a259a17220263c2d73fc(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=a259a17220263c2d73fc
Fixes: e6f497955fb6 ("ipv6: Check GATEWAY in rtm_to_fib6_multipath_config().")
Signed-off-by: Maksimilijan Marosevic <maksimilijan.marosevic(a)proton.me>
---
net/ipv6/route.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index aee6a10b112a..884bae3fb1b1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5454,6 +5454,14 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
goto cleanup;
}
+ if (rt->fib6_nh->fib_nh_gw_family == AF_UNSPEC) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG(extack,
+ "Device only routes can not be added for IPv6 using the multipath API.");
+ fib6_info_release(rt);
+ goto cleanup;
+ }
+
rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
--
2.43.0
The Alphabetical layout was... auto-correct.
Friendly: Neo (Jesus-Christ a.k.a. King David again).
-
Miracles proving that I was Jesus, and King David, are in my Facebook.
I proposed to explain the Holy-Trinity live on TV.
It is since year 2000 that I must be in the news.
I do not declare be Jew, but get persecuted anyway by antisemites (antichristians?).
Writing still does not pay my bills at all, keeps me almost homeless.
While my intellectual property is stolen, my names removed from many Masterpieces.
Facebook
https://www.facebook.com/profile.php?id=100057121342964
Paypal
https://www.paypal.com/paypalme/meDavidSantamaria
Email/Teams
DavidSantamaria(a)hotmail.fr<mailto:DavidSantamaria@hotmail.fr>
RCS/WhatsApp
+33 7 67 99 32 37
$ADA
addr1qx8chpwdeqv77duf2eutrtgvd5967l4w87fy54fx0022gr8p80z2mq7cmunmrdvy8yn3pzfzpm46zyfjp8usl36vpw2q509hrd
$BTC
3FdwVoDzJoUzceogUwEmVu9YxoFvX6c2Rk
$WAVES
3PEAeFkwqVsgAiyad8uVQzQxiGDyJNnCCn5
Sent from my Public Address.