[ Upstream commit 4acfe3dfde685a5a9eaec5555351918e2d7266a1 ]
Dan Carpenter spotted a race condition in a couple of situations like
these in the test_firmware driver:
static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg)
{
u8 val;
int ret;
ret = kstrtou8(buf, 10, &val);
if (ret)
return ret;
mutex_lock(&test_fw_mutex);
*(u8 *)cfg = val;
mutex_unlock(&test_fw_mutex);
/* Always return full write size even if we didn't consume all */
return size;
}
static ssize_t config_num_requests_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
int rc;
mutex_lock(&test_fw_mutex);
if (test_fw_config->reqs) {
pr_err("Must call release_all_firmware prior to changing config\n");
rc = -EINVAL;
mutex_unlock(&test_fw_mutex);
goto out;
}
mutex_unlock(&test_fw_mutex);
// NOTE: HERE is the race!!! Function can be preempted!
// test_fw_config->reqs can change between the release of
// the lock above and the acquisition of the lock in
// test_dev_config_update_u8()
rc = test_dev_config_update_u8(buf, count,
&test_fw_config->num_requests);
out:
return rc;
}
static ssize_t config_read_fw_idx_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
return test_dev_config_update_u8(buf, count,
&test_fw_config->read_fw_idx);
}
The function test_dev_config_update_u8() is called from both the locked
and the unlocked context, namely from config_num_requests_store() and
config_read_fw_idx_store(), which can both be called asynchronously as
they are the driver's methods, while test_dev_config_update_u8() and its
siblings modify the object pointed to by u8 *cfg or a similar pointer.
To avoid deadlock on test_fw_mutex, the lock is dropped before calling
test_dev_config_update_u8() and re-acquired within test_dev_config_update_u8()
itself, but alas this creates a race condition.
Having two locks wouldn't assure a race-proof mutual exclusion.
This situation is best avoided by the introduction of a new, unlocked
function __test_dev_config_update_u8() which can be called from the locked
context and reducing test_dev_config_update_u8() to:
static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg)
{
int ret;
mutex_lock(&test_fw_mutex);
ret = __test_dev_config_update_u8(buf, size, cfg);
mutex_unlock(&test_fw_mutex);
return ret;
}
doing the locking and calling the unlocked primitive, which enables both
locked and unlocked versions without duplication of code.
Fixes: c92316bf8e948 ("test_firmware: add batched firmware tests")
Cc: Luis R. Rodriguez <mcgrof(a)kernel.org>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Russ Weight <russell.h.weight(a)intel.com>
Cc: Takashi Iwai <tiwai(a)suse.de>
Cc: Tianfei Zhang <tianfei.zhang(a)intel.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Colin Ian King <colin.i.king(a)gmail.com>
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Cc: linux-kselftest(a)vger.kernel.org
Cc: stable(a)vger.kernel.org # v5.4, 4.19, 4.14
Suggested-by: Dan Carpenter <error27(a)gmail.com>
Link: https://lore.kernel.org/r/20230509084746.48259-1-mirsad.todorovac@alu.unizg…
Signed-off-by: Mirsad Goran Todorovac <mirsad.todorovac(a)alu.unizg.hr>
[ This is the patch to fix the race condition in locking for the 5.4, ]
[ 4.19 and 4.14 stable branches. Not all the fixes from the upstream ]
[ commit apply, but those which do are verbatim equal to those in the ]
[ upstream commit. ]
---
v4:
verbatim the same patch as for the 5.4 stable tree which patchwork didn't apply
lib/test_firmware.c | 37 ++++++++++++++++++++++++++++---------
1 file changed, 28 insertions(+), 9 deletions(-)
diff --git a/lib/test_firmware.c b/lib/test_firmware.c
index 34210306ea66..d407e5e670f3 100644
--- a/lib/test_firmware.c
+++ b/lib/test_firmware.c
@@ -283,16 +283,26 @@ static ssize_t config_test_show_str(char *dst,
return len;
}
-static int test_dev_config_update_bool(const char *buf, size_t size,
- bool *cfg)
+static inline int __test_dev_config_update_bool(const char *buf, size_t size,
+ bool *cfg)
{
int ret;
- mutex_lock(&test_fw_mutex);
if (strtobool(buf, cfg) < 0)
ret = -EINVAL;
else
ret = size;
+
+ return ret;
+}
+
+static int test_dev_config_update_bool(const char *buf, size_t size,
+ bool *cfg)
+{
+ int ret;
+
+ mutex_lock(&test_fw_mutex);
+ ret = __test_dev_config_update_bool(buf, size, cfg);
mutex_unlock(&test_fw_mutex);
return ret;
@@ -322,7 +332,7 @@ static ssize_t test_dev_config_show_int(char *buf, int cfg)
return snprintf(buf, PAGE_SIZE, "%d\n", val);
}
-static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg)
+static inline int __test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg)
{
int ret;
long new;
@@ -334,14 +344,23 @@ static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg)
if (new > U8_MAX)
return -EINVAL;
- mutex_lock(&test_fw_mutex);
*(u8 *)cfg = new;
- mutex_unlock(&test_fw_mutex);
/* Always return full write size even if we didn't consume all */
return size;
}
+static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg)
+{
+ int ret;
+
+ mutex_lock(&test_fw_mutex);
+ ret = __test_dev_config_update_u8(buf, size, cfg);
+ mutex_unlock(&test_fw_mutex);
+
+ return ret;
+}
+
static ssize_t test_dev_config_show_u8(char *buf, u8 cfg)
{
u8 val;
@@ -374,10 +393,10 @@ static ssize_t config_num_requests_store(struct device *dev,
mutex_unlock(&test_fw_mutex);
goto out;
}
- mutex_unlock(&test_fw_mutex);
- rc = test_dev_config_update_u8(buf, count,
- &test_fw_config->num_requests);
+ rc = __test_dev_config_update_u8(buf, count,
+ &test_fw_config->num_requests);
+ mutex_unlock(&test_fw_mutex);
out:
return rc;
--
2.34.1
Add new feature checks related to crypto to the hwcap test.
The following is a log snippet from my local testing environment
based on for-next/selftests:
~~~
TAP version 13
1..111
# AES present
ok 1 cpuinfo_match_AES
ok 2 sigill_AES
ok 3 # SKIP sigbus_AES
# CRC32 present
ok 4 cpuinfo_match_CRC32
ok 5 sigill_CRC32
ok 6 # SKIP sigbus_CRC32
ok 7 cpuinfo_match_CSSC
# sigill_reported for CSSC
ok 8 # SKIP sigill_CSSC
ok 9 # SKIP sigbus_CSSC
# FP present
ok 10 cpuinfo_match_FP
ok 11 sigill_FP
ok 12 # SKIP sigbus_FP
# JSCVT present
ok 13 cpuinfo_match_JSCVT
ok 14 sigill_JSCVT
ok 15 # SKIP sigbus_JSCVT
# LRCPC present
ok 16 cpuinfo_match_LRCPC
ok 17 sigill_LRCPC
ok 18 # SKIP sigbus_LRCPC
# LRCPC2 present
ok 19 cpuinfo_match_LRCPC2
ok 20 sigill_LRCPC2
ok 21 # SKIP sigbus_LRCPC2
# LSE present
ok 22 cpuinfo_match_LSE
ok 23 sigill_LSE
ok 24 # SKIP sigbus_LSE
# LSE2 present
ok 25 cpuinfo_match_LSE2
ok 26 sigill_LSE2
ok 27 sigbus_LSE2
ok 28 cpuinfo_match_MOPS
ok 29 sigill_MOPS
ok 30 # SKIP sigbus_MOPS
# RNG present
ok 31 cpuinfo_match_RNG
ok 32 sigill_RNG
ok 33 # SKIP sigbus_RNG
# PMULL present
ok 34 cpuinfo_match_PMULL
ok 35 sigill_PMULL
ok 36 # SKIP sigbus_PMULL
ok 37 cpuinfo_match_RPRFM
ok 38 # SKIP sigill_RPRFM
ok 39 # SKIP sigbus_RPRFM
# SHA1 present
ok 40 cpuinfo_match_SHA1
ok 41 sigill_SHA1
ok 42 # SKIP sigbus_SHA1
# SHA2 present
ok 43 cpuinfo_match_SHA2
ok 44 sigill_SHA2
ok 45 # SKIP sigbus_SHA2
# SHA512 present
ok 46 cpuinfo_match_SHA512
ok 47 sigill_SHA512
ok 48 # SKIP sigbus_SHA512
ok 49 cpuinfo_match_SME
ok 50 sigill_SME
ok 51 # SKIP sigbus_SME
ok 52 cpuinfo_match_SME2
ok 53 sigill_SME2
ok 54 # SKIP sigbus_SME2
ok 55 cpuinfo_match_SME 2.1
# sigill_reported for SME 2.1
ok 56 # SKIP sigill_SME 2.1
ok 57 # SKIP sigbus_SME 2.1
ok 58 cpuinfo_match_SME I16I32
# sigill_reported for SME I16I32
ok 59 # SKIP sigill_SME I16I32
ok 60 # SKIP sigbus_SME I16I32
ok 61 cpuinfo_match_SME BI32I32
# sigill_reported for SME BI32I32
ok 62 # SKIP sigill_SME BI32I32
ok 63 # SKIP sigbus_SME BI32I32
ok 64 cpuinfo_match_SME B16B16
# sigill_reported for SME B16B16
ok 65 # SKIP sigill_SME B16B16
ok 66 # SKIP sigbus_SME B16B16
ok 67 cpuinfo_match_SME F16F16
# sigill_reported for SME F16F16
ok 68 # SKIP sigill_SME F16F16
ok 69 # SKIP sigbus_SME F16F16
# SVE present
ok 70 cpuinfo_match_SVE
ok 71 sigill_SVE
ok 72 # SKIP sigbus_SVE
ok 73 cpuinfo_match_SVE 2
# sigill_reported for SVE 2
ok 74 # SKIP sigill_SVE 2
ok 75 # SKIP sigbus_SVE 2
ok 76 cpuinfo_match_SVE 2.1
# sigill_reported for SVE 2.1
ok 77 # SKIP sigill_SVE 2.1
ok 78 # SKIP sigbus_SVE 2.1
ok 79 cpuinfo_match_SVE AES
# sigill_reported for SVE AES
ok 80 # SKIP sigill_SVE AES
ok 81 # SKIP sigbus_SVE AES
ok 82 cpuinfo_match_SVE2 PMULL
# sigill_reported for SVE2 PMULL
ok 83 # SKIP sigill_SVE2 PMULL
ok 84 # SKIP sigbus_SVE2 PMULL
ok 85 cpuinfo_match_SVE2 BITPERM
# sigill_reported for SVE2 BITPERM
ok 86 # SKIP sigill_SVE2 BITPERM
ok 87 # SKIP sigbus_SVE2 BITPERM
ok 88 cpuinfo_match_SVE2 SHA3
# sigill_reported for SVE2 SHA3
ok 89 # SKIP sigill_SVE2 SHA3
ok 90 # SKIP sigbus_SVE2 SHA3
ok 91 cpuinfo_match_SVE2 SM4
# sigill_reported for SVE2 SM4
ok 92 # SKIP sigill_SVE2 SM4
ok 93 # SKIP sigbus_SVE2 SM4
# SVE2 I8MM present
ok 94 cpuinfo_match_SVE2 I8MM
ok 95 sigill_SVE2 I8MM
ok 96 # SKIP sigbus_SVE2 I8MM
# SVE2 F32MM present
ok 97 cpuinfo_match_SVE2 F32MM
ok 98 sigill_SVE2 F32MM
ok 99 # SKIP sigbus_SVE2 F32MM
# SVE2 F64MM present
ok 100 cpuinfo_match_SVE2 F64MM
ok 101 sigill_SVE2 F64MM
ok 102 # SKIP sigbus_SVE2 F64MM
# SVE2 BF16 present
ok 103 cpuinfo_match_SVE2 BF16
ok 104 sigill_SVE2 BF16
ok 105 # SKIP sigbus_SVE2 BF16
ok 106 cpuinfo_match_SVE2 EBF16
ok 107 # SKIP sigill_SVE2 EBF16
ok 108 # SKIP sigbus_SVE2 EBF16
ok 109 cpuinfo_match_HBC
ok 110 sigill_HBC
ok 111 # SKIP sigbus_HBC
# Totals: pass:60 fail:0 xfail:0 xpass:0 skip:51 error:0
~~~
Zeng Heng (4):
kselftest/arm64: add SHA1 and related features to hwcap test
kselftest/arm64: add AES feature check to hwcap test
kselftest/arm64: add pmull feature to hwcap test
kselftest/arm64: add jscvt feature to hwcap test
tools/testing/selftests/arm64/abi/hwcap.c | 77 +++++++++++++++++++++++
1 file changed, 77 insertions(+)
--
2.25.1
Observed occasional failures in the futex_wait_timeout test:
ok 1 futex_wait relative succeeds
ok 2 futex_wait_bitset realtime succeeds
ok 3 futex_wait_bitset monotonic succeeds
ok 4 futex_wait_requeue_pi realtime succeeds
ok 5 futex_wait_requeue_pi monotonic succeeds
not ok 6 futex_lock_pi realtime returned 0
......
The test expects the child thread to complete some steps before
the parent thread gets to run. There is an implicit expectation
of the order of invocation of futex_lock_pi between the child thread
and the parent thread. Make this order explicit. If the order is
not met, the futex_lock_pi call in the parent thread succeeds and
will not timeout.
Fixes: f4addd54b161 ("selftests: futex: Expand timeout test")
Signed-off-by: Nysal Jan K.A <nysal(a)linux.ibm.com>
---
.../selftests/futex/functional/futex_wait_timeout.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
index 3651ce17beeb..d183f878360b 100644
--- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c
+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
@@ -24,6 +24,7 @@
static long timeout_ns = 100000; /* 100us default timeout */
static futex_t futex_pi;
+static pthread_barrier_t barrier;
void usage(char *prog)
{
@@ -48,6 +49,8 @@ void *get_pi_lock(void *arg)
if (ret != 0)
error("futex_lock_pi failed\n", ret);
+ pthread_barrier_wait(&barrier);
+
/* Blocks forever */
ret = futex_wait(&lock, 0, NULL, 0);
error("futex_wait failed\n", ret);
@@ -130,6 +133,7 @@ int main(int argc, char *argv[])
basename(argv[0]));
ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns);
+ pthread_barrier_init(&barrier, NULL, 2);
pthread_create(&thread, NULL, get_pi_lock, NULL);
/* initialize relative timeout */
@@ -163,6 +167,9 @@ int main(int argc, char *argv[])
res = futex_wait_requeue_pi(&f1, f1, &futex_pi, &to, 0);
test_timeout(res, &ret, "futex_wait_requeue_pi monotonic", ETIMEDOUT);
+ /* Wait until the other thread calls futex_lock_pi() */
+ pthread_barrier_wait(&barrier);
+ pthread_barrier_destroy(&barrier);
/*
* FUTEX_LOCK_PI with CLOCK_REALTIME
* Due to historical reasons, FUTEX_LOCK_PI supports only realtime
--
2.41.0
The user_events selftests were removed from the standard set of
selftests due to the uapi header it relies on having been temporarily
removed. That header is now reinstated so we can reenable the tests.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
tools/testing/selftests/Makefile | 1 +
tools/testing/selftests/user_events/Makefile | 8 --------
2 files changed, 1 insertion(+), 8 deletions(-)
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 666b56f22a41..4e221bb620b4 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -89,6 +89,7 @@ TARGETS += tmpfs
TARGETS += tpm2
TARGETS += tty
TARGETS += user
+TARGETS += user_events
TARGETS += vDSO
TARGETS += mm
TARGETS += x86
diff --git a/tools/testing/selftests/user_events/Makefile b/tools/testing/selftests/user_events/Makefile
index 9e95bd41b0b4..10fcd0066203 100644
--- a/tools/testing/selftests/user_events/Makefile
+++ b/tools/testing/selftests/user_events/Makefile
@@ -2,14 +2,6 @@
CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
LDLIBS += -lrt -lpthread -lm
-# Note:
-# This test depends on <linux/user_events.h> exported in uapi
-# The following commit removed user_events.h out of uapi:
-# commit 5cfff569cab8bf544bab62c911c5d6efd5af5e05
-# tracing: Move user_events.h temporarily out of include/uapi
-# This test will not compile until user_events.h is added
-# back to uapi.
-
TEST_GEN_PROGS = ftrace_test dyn_test perf_test abi_test
TEST_FILES := settings
---
base-commit: 5d0c230f1de8c7515b6567d9afba1f196fb4e2f4
change-id: 20230805-kselftest-user-events-e4beff821b0f
Best regards,
--
Mark Brown <broonie(a)kernel.org>
We have some dmabuf-heaps and perf_events tests but they are not hooked
up to the kselftest build infrastructure which is a bit of an obstacle
to running them in systems with generic infrastructure for selftests.
Add them to the top level kselftest Makefile so they get built as
standard.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
tools/testing/selftests/Makefile | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 666b56f22a41..bdee501596ef 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -12,6 +12,7 @@ TARGETS += core
TARGETS += cpufreq
TARGETS += cpu-hotplug
TARGETS += damon
+TARGETS += dmabuf-heaps
TARGETS += drivers/dma-buf
TARGETS += drivers/s390x/uvdevice
TARGETS += drivers/net/bonding
@@ -56,6 +57,7 @@ TARGETS += net/mptcp
TARGETS += net/openvswitch
TARGETS += netfilter
TARGETS += nsfs
+TARGETS += perf_events
TARGETS += pidfd
TARGETS += pid_namespace
TARGETS += powerpc
@@ -88,6 +90,7 @@ endif
TARGETS += tmpfs
TARGETS += tpm2
TARGETS += tty
+TARGETS += uevents
TARGETS += user
TARGETS += vDSO
TARGETS += mm
---
base-commit: 5d0c230f1de8c7515b6567d9afba1f196fb4e2f4
change-id: 20230805-kselftest-perf-events-build-c0e0f1182bae
Best regards,
--
Mark Brown <broonie(a)kernel.org>
In busybox, the mktemp requires that the generated filename be
suffixed with at least six consecutive 'X' characters. Otherwise,
it will return an "Invalid argument" error.
Signed-off-by: Hui Min Mina Chou <minachou(a)andestech.com>
---
tools/testing/selftests/filesystems/fat/run_fat_tests.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh
index 7f35dc3d15df..d61264d4795d 100755
--- a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh
+++ b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh
@@ -12,7 +12,7 @@ set -u
set -o pipefail
BASE_DIR="$(dirname $0)"
-TMP_DIR="$(mktemp -d /tmp/fat_tests_tmp.XXXX)"
+TMP_DIR="$(mktemp -d /tmp/fat_tests_tmp.XXXXXX)"
IMG_PATH="${TMP_DIR}/fat.img"
MNT_PATH="${TMP_DIR}/mnt"
--
2.34.1
The lwt xmit hook does not expect positive return values in the functions
ip_finish_output2 and ip6_finish_output. However, BPF programs can
directly return positive statuses such as NET_XMIT_DROP, NET_RX_DROP,
etc. to the caller. Such return values would make the kernel continue
processing already freed skbs and eventually panic.
This set fixes the return values from BPF ops to avoid unexpectedly
continuing processing, and strictly checks for the correct continue
condition to be future-proof. In addition, add missing selftests for
BPF_REDIRECT and BPF_REROUTE cases for BPF-CI.
v4: https://lore.kernel.org/bpf/ZMD1sFTW8SFiex+x@debian.debian/T/
v3: https://lore.kernel.org/bpf/cover.1690255889.git.yan@cloudflare.com/
v2: https://lore.kernel.org/netdev/ZLdY6JkWRccunvu0@debian.debian/
v1: https://lore.kernel.org/bpf/ZLbYdpWC8zt9EJtq@debian.debian/
changes since v4:
* fixed same error on BPF_REROUTE path
* re-implemented selftests under BPF-CI requirement
changes since v3:
* minor change in commit message and changelogs
* tested by Jakub Sitnicki
changes since v2:
* subject name changed
* also covered redirect to ingress case
* added selftests
changes since v1:
* minor code style changes
Yan Zhai (4):
lwt: fix return values of BPF ops
lwt: check LWTUNNEL_XMIT_CONTINUE strictly
selftests/bpf: add lwt_xmit tests for BPF_REDIRECT
selftests/bpf: add lwt_xmit tests for BPF_REROUTE
include/net/lwtunnel.h | 5 +-
net/core/lwt_bpf.c | 7 +-
net/ipv4/ip_output.c | 2 +-
net/ipv6/ip6_output.c | 2 +-
.../selftests/bpf/prog_tests/lwt_helpers.h | 139 ++++++++
.../selftests/bpf/prog_tests/lwt_redirect.c | 319 ++++++++++++++++++
.../selftests/bpf/prog_tests/lwt_reroute.c | 256 ++++++++++++++
.../selftests/bpf/progs/test_lwt_redirect.c | 58 ++++
.../selftests/bpf/progs/test_lwt_reroute.c | 36 ++
9 files changed, 817 insertions(+), 7 deletions(-)
create mode 100644 tools/testing/selftests/bpf/prog_tests/lwt_helpers.h
create mode 100644 tools/testing/selftests/bpf/prog_tests/lwt_redirect.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/lwt_reroute.c
create mode 100644 tools/testing/selftests/bpf/progs/test_lwt_redirect.c
create mode 100644 tools/testing/selftests/bpf/progs/test_lwt_reroute.c
--
2.30.2
Hi Shuah, hi Paul,
I'm sending you the list of planned nolibc changes for 6.6. A doc update
may possibly follow a bit later to try to document the contribution
process. We also noticed a slight increase in binary sizes that might
be fixed soon but I wouldn't bet on this since it will require a lot of
testing again and I'd rather postpone this by default. In any case I
have no intent to push any significant updates/fixes for 6.6 at this
point.
I'm also pasting a summary of the changes in this pull request, feel
free to use it for the merge commit message if you need.
If you have any questions or if anything is unclear, do not hesitate to ask!
Thanks,
Willy
----- changes ------
Nolibc:
- improved portability by removing build errors with -ENOSYS
- added syscall6() on MIPS to support pselect6() and mmap()
- added setvbuf(), rmdir(), pipe(), pipe2()
- add support for ppc/ppc64
- environ is no longer optional
- fixed frame pointer issues at -O0
- dropped sys_stat() in favor of sys_statx()
- centralized _start_c() to remove lots of asm code
- switched size_t to __SIZE_TYPE__
Selftests:
- improved status reporting (success/warning/failure counts, path to log file)
- various code cleanups (indent, unused variables, ...)
- more consistent test numbering
- enabled compiler warnings
- dropped unreliable chmod_net test
- improved reliability (create /dev/zero & /tmp, rely less on /proc)
- new tests (brk/sbrk/mmap/munmap)
- improved compatibility with musl
- new run-nolibc-test target to build and run natively
- new run-libc-test target to build and run against native libc
- made the cmdline parser more reliable against boolean arguments
- dropped dependency on memfd for vfprintf() test
- nolibc-test is no longer stripped
- added support for extending ARCH via XARCH
Other:
- add Thomas as co-maintainer
-----------
The following changes since commit 06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5:
Linux 6.5-rc1 (2023-07-09 13:53:13 -0700)
are available in the Git repository at:
https://git.kernel.org/pub/scm/linux/kernel/git/wtarreau/nolibc.git/ 20230806-for-6.6-1
for you to fetch changes up to d98c1e27e46e47a3ae67e1d048f153598ba82611:
tools/nolibc: stackprotector.h: make __stack_chk_init static (2023-08-06 18:44:47 +0200)
----------------------------------------------------------------
Ryan Roberts (1):
tools/nolibc/stdio: add setvbuf() to set buffering mode
Thomas Weißschuh (22):
selftests/nolibc: drop test chmod_net
selftests/nolibc: simplify call to ioperm
tools/nolibc: completely remove optional environ support
selftests/nolibc: make evaluation of test conditions
selftests/nolibc: simplify status printing
selftests/nolibc: avoid gaps in test numbers
selftests/nolibc: avoid buffer underrun in space printing
tools/nolibc: drop unused variables
tools/nolibc: fix return type of getpagesize()
tools/nolibc: setvbuf: avoid unused parameter warnings
tools/nolibc: sys: avoid implicit sign cast
tools/nolibc: stdint: use __SIZE_TYPE__ for size_t
selftests/nolibc: drop unused variables
selftests/nolibc: mark test helpers as potentially unused
selftests/nolibc: make functions static if possible
selftests/nolibc: avoid unused parameter warnings
selftests/nolibc: avoid sign-compare warnings
selftests/nolibc: use correct return type for read() and write()
selftests/nolibc: prevent out of bounds access in expect_vfprintf
selftests/nolibc: don't strip nolibc-test
selftests/nolibc: enable compiler warnings
MAINTAINERS: nolibc: add myself as co-maintainer
Willy Tarreau (1):
selftests/nolibc: avoid warnings during intptr tests
Yuan Tan (2):
tools/nolibc: add pipe() and pipe2() support
selftests/nolibc: add testcase for pipe
Zhangjin Wu (74):
selftests/nolibc: add a standalone test report macro
selftests/nolibc: always print the path to test log file
selftests/nolibc: restore the failed tests print
tools/nolibc: fix up #error compile failures with -ENOSYS
tools/nolibc: fix up undeclared syscall macros with #ifdef and -ENOSYS
tools/nolibc: sys.h: add a syscall return helper
tools/nolibc: unistd.h: apply __sysret() helper
tools/nolibc: sys.h: apply __sysret() helper
tools/nolibc: unistd.h: reorder the syscall macros
tools/nolibc: arch-*.h: fix up code indent errors
toolc/nolibc: arch-*.h: clean up whitespaces after __asm__
tools/nolibc: arch-loongarch.h: shrink with _NOLIBC_SYSCALL_CLOBBERLIST
tools/nolibc: arch-mips.h: shrink with _NOLIBC_SYSCALL_CLOBBERLIST
tools/nolibc: add missing my_syscall6() for mips
tools/nolibc: __sysret: support syscalls who return a pointer
tools/nolibc: clean up mmap() routine
tools/nolibc: clean up sbrk() routine
selftests/nolibc: export argv0 for some tests
selftests/nolibc: prepare: create /dev/zero
selftests/nolibc: add EXPECT_PTREQ, EXPECT_PTRNE and EXPECT_PTRER
selftests/nolibc: add sbrk_0 to test current brk getting
selftests/nolibc: add mmap_bad test case
selftests/nolibc: add munmap_bad test case
selftests/nolibc: add mmap_munmap_good test case
selftests/nolibc: add run-libc-test target
selftests/nolibc: stat_fault: silence NULL argument warning with glibc
selftests/nolibc: gettid: restore for glibc and musl
selftests/nolibc: add _LARGEFILE64_SOURCE for musl
selftests/nolibc: fix up int_fast16/32_t test cases for musl
tools/nolibc: types.h: add RB_ flags for reboot()
selftests/nolibc: prefer <sys/reboot.h> to <linux/reboot.h>
selftests/nolibc: fix up kernel parameters support
selftests/nolibc: link_cross: use /proc/self/cmdline
tools/nolibc: add rmdir() support
selftests/nolibc: add a new rmdir() test case
selftests/nolibc: fix up failures when CONFIG_PROC_FS=n
selftests/nolibc: prepare /tmp for tests that need to write
selftests/nolibc: vfprintf: remove MEMFD_CREATE dependency
selftests/nolibc: chdir_root: restore current path after test
selftests/nolibc: stat_timestamps: remove procfs dependency
selftests/nolibc: chroot_exe: remove procfs dependency
selftests/nolibc: add chmod_argv0 test
selftests/nolibc: report: print a summarized test status
selftests/nolibc: report: print total tests
selftests/nolibc: report: align passed, skipped and failed
selftests/nolibc: report: extrude the test status line
selftests/nolibc: report: add newline before test failures
tools/nolibc: arch-*.h: add missing space after ','
tools/nolibc: fix up startup failures for -O0 under gcc < 11.1.0
tools/nolibc: remove the old sys_stat support
tools/nolibc: add new crt.h with _start_c
tools/nolibc: stackprotector.h: add empty __stack_chk_init for !_NOLIBC_STACKPROTECTOR
tools/nolibc: crt.h: initialize stack protector
tools/nolibc: arm: shrink _start with _start_c
tools/nolibc: aarch64: shrink _start with _start_c
tools/nolibc: i386: shrink _start with _start_c
tools/nolibc: x86_64: shrink _start with _start_c
tools/nolibc: mips: shrink _start with _start_c
tools/nolibc: loongarch: shrink _start with _start_c
tools/nolibc: riscv: shrink _start with _start_c
tools/nolibc: s390: shrink _start with _start_c
selftests/nolibc: add EXPECT_PTRGE, EXPECT_PTRGT, EXPECT_PTRLE, EXPECT_PTRLT
selftests/nolibc: add testcases for startup code
selftests/nolibc: allow run nolibc-test locally
selftests/nolibc: allow test -include /path/to/nolibc.h
selftests/nolibc: mmap_munmap_good: fix up return value
tools/nolibc: add support for powerpc
tools/nolibc: add support for powerpc64
selftests/nolibc: add XARCH and ARCH mapping support
selftests/nolibc: add test support for ppc
selftests/nolibc: add test support for ppc64le
selftests/nolibc: add test support for ppc64
selftests/nolibc: allow report with existing test log
tools/nolibc: stackprotector.h: make __stack_chk_init static
MAINTAINERS | 1 +
tools/include/nolibc/Makefile | 1 +
tools/include/nolibc/arch-aarch64.h | 85 +---
tools/include/nolibc/arch-arm.h | 111 +----
tools/include/nolibc/arch-i386.h | 86 +---
tools/include/nolibc/arch-loongarch.h | 83 +---
tools/include/nolibc/arch-mips.h | 147 +++----
tools/include/nolibc/arch-powerpc.h | 213 ++++++++++
tools/include/nolibc/arch-riscv.h | 83 +---
tools/include/nolibc/arch-s390.h | 77 +---
tools/include/nolibc/arch-x86_64.h | 86 +---
tools/include/nolibc/arch.h | 2 +
tools/include/nolibc/crt.h | 61 +++
tools/include/nolibc/nolibc.h | 9 +-
tools/include/nolibc/stackprotector.h | 5 +-
tools/include/nolibc/stdint.h | 2 +-
tools/include/nolibc/stdio.h | 27 ++
tools/include/nolibc/stdlib.h | 12 +-
tools/include/nolibc/sys.h | 554 +++++++-----------------
tools/include/nolibc/types.h | 22 +-
tools/include/nolibc/unistd.h | 13 +-
tools/testing/selftests/nolibc/Makefile | 109 +++--
tools/testing/selftests/nolibc/nolibc-test.c | 609 ++++++++++++++++++++-------
23 files changed, 1216 insertions(+), 1182 deletions(-)
create mode 100644 tools/include/nolibc/arch-powerpc.h
create mode 100644 tools/include/nolibc/crt.h
When the command "ip netns delete client" is run, device link1_1 has
already been deleted. So there is no need to delete link1_1 again. Remove
the redundant deletion.
Signed-off-by: Zhengchao Shao <shaozhengchao(a)huawei.com>
---
.../drivers/net/bonding/bond-arp-interval-causes-panic.sh | 1 -
1 file changed, 1 deletion(-)
diff --git a/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
index 71c00bfafbc9..7b2d421f09cf 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
@@ -11,7 +11,6 @@ finish()
{
ip netns delete server || true
ip netns delete client || true
- ip link del link1_1 || true
}
trap finish EXIT
--
2.34.1
*Changes in v30*:
- Rebase on top of next-20230815
- Minor nitpicks
*Changes in v29:*
- Polish IOCTL and improve documentation
*Changes in v28:*
- Fix walk_end and add 17 test cases in selftests patch
*Changes in v27:*
- Handle review comments and minor improvements
- Add performance improvement patch on top with test for easy review
*Changes in v26:*
- Code restructuring and API changes in PAGEMAP_IOCTL
*Changes in v25*:
- Do proper filtering on hole as well (hole got missed earlier)
*Changes in v24*:
- Rebase on top of next-20230710
- Place WP markers in case of hole as well
*Changes in v23*:
- Set vec_buf_index in loop only when vec_buf_index is set
- Return -EFAULT instead of -EINVAL if vec is NULL
- Correctly return the walk ending address to the page granularity
*Changes in v22*:
- Interface change:
- Replace [start start + len) with [start, end)
- Return the ending address of the address walk in start
*Changes in v21*:
- Abort walk instead of returning error if WP is to be performed on
partial hugetlb
*Changes in v20*
- Correct PAGE_IS_FILE and add PAGE_IS_PFNZERO
*Changes in v19*
- Minor changes and interface updates
*Changes in v18*
- Rebase on top of next-20230613
- Minor updates
*Changes in v17*
- Rebase on top of next-20230606
- Minor improvements in PAGEMAP_SCAN IOCTL patch
*Changes in v16*
- Fix a corner case
- Add exclusive PM_SCAN_OP_WP back
*Changes in v15*
- Build fix (Add missed build fix in RESEND)
*Changes in v14*
- Fix build error caused by #ifdef added at last minute in some configs
*Changes in v13*
- Rebase on top of next-20230414
- Give-up on using uffd_wp_range() and write new helpers, flush tlb only
once
*Changes in v12*
- Update and other memory types to UFFD_FEATURE_WP_ASYNC
- Rebase on top of next-20230406
- Review updates
*Changes in v11*
- Rebase on top of next-20230307
- Base patches on UFFD_FEATURE_WP_UNPOPULATED
- Do a lot of cosmetic changes and review updates
- Remove ENGAGE_WP + !GET operation as it can be performed with
UFFDIO_WRITEPROTECT
*Changes in v10*
- Add specific condition to return error if hugetlb is used with wp
async
- Move changes in tools/include/uapi/linux/fs.h to separate patch
- Add documentation
*Changes in v9:*
- Correct fault resolution for userfaultfd wp async
- Fix build warnings and errors which were happening on some configs
- Simplify pagemap ioctl's code
*Changes in v8:*
- Update uffd async wp implementation
- Improve PAGEMAP_IOCTL implementation
*Changes in v7:*
- Add uffd wp async
- Update the IOCTL to use uffd under the hood instead of soft-dirty
flags
*Motivation*
The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows
GetWriteWatch() and ResetWriteWatch() syscalls [1]. The GetWriteWatch()
retrieves the addresses of the pages that are written to in a region of
virtual memory.
This syscall is used in Windows applications, games, etc. It is currently
emulated in a rather slow manner in userspace. Our purpose is to
enhance the kernel so that it can be emulated efficiently.
Currently some out-of-tree hack patches are being used to efficiently
emulate it in some kernels. We intend to replace those with these patches,
so that gaming on Linux as a whole can benefit from this. It
means there would be tons of users of this code.
CRIU use case [2] was mentioned by Andrei and Danylo:
> Use cases for migrating sparse VMAs are binaries sanitized with ASAN,
> MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of
> shadow memory [4]. Being able to migrate such binaries allows to highly
> reduce the amount of work needed to identify and fix post-migration
> crashes, which happen constantly.
Andrei defines the following uses of this code:
* it is more granular and allows us to track changed pages more
effectively. The current interface can clear dirty bits for the entire
process only. In addition, reading info about pages is a separate
operation. It means we must freeze the process to read information
about all its pages, reset dirty bits, only then we can start dumping
pages. The information about pages becomes more and more outdated,
while we are processing pages. The new interface solves both these
downsides. First, it allows us to read pte bits and clear the
soft-dirty bit atomically. It means that CRIU will not need to freeze
processes to pre-dump their memory. Second, it clears soft-dirty bits
for a specified region of memory. It means CRIU will have up-to-date info
about pages at the moment of dumping them.
* The new interface has to be much faster because basic page filtering
is happening in the kernel. With the old interface, we have to read
pagemap for each page.
*Implementation Evolution (Short Summary)*
From the definition of GetWriteWatch(), we feel like kernel's soft-dirty
feature can be used under the hood with some additions like:
* reset soft-dirty flag for only a specific region of memory instead of
clearing the flag for the entire process
* get and clear soft-dirty flag for a specific region atomically
So we decided to use an ioctl on the pagemap file to read and/or reset the
soft-dirty flag. But using the soft-dirty flag, we sometimes get extra pages
which weren't even written. They had become soft-dirty because of VMA merging
and the VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We
were able to bypass this shortcoming by ignoring VM_SOFTDIRTY until David
reported that mprotect etc. messes up the soft-dirty flag while ignoring
VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We
discussed whether we could revert these patches, but we could not reach any
conclusion. So at this point, I made a couple of attempts to solve this whole
VM_SOFTDIRTY issue by correcting the soft-dirty implementation:
* [7] Correct the bug fixed wrongly back in 2014. It had the potential to
cause regressions. We left it behind.
* [8] Keep a list of the soft-dirty parts of a VMA across splits and merges.
The reply I got was not to increase the size of the VMA by 8 bytes.
At this point, we left soft-dirty behind, considering it too delicate, and
userfaultfd [9] seemed like the only way forward. From there onward, we
have been basing soft-dirty emulation on the userfaultfd wp feature, where
the kernel resolves the faults itself when the WP_ASYNC feature is used. It
was straightforward to add the WP_ASYNC feature to userfaultfd. Now only
those pages which have really been written to are reported as dirty or
written-to. (PS: another userfaultfd feature, WP_UNPOPULATED, is required
in order to avoid pre-faulting memory before write-protecting [9].)
All the different masks were added at the request of the CRIU devs to make
the interface more generic and better.
[1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-…
[2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com
[3] https://github.com/google/sanitizers
[4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit
[5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com
[6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/
[7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com
[10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com
*Original Cover letter from v8*
Hello,
Note:
Soft-dirty pages and pages which have been written-to are synonyms. As
kernel already has soft-dirty feature inside which we have given up to
use, we are using written-to terminology while using UFFD async WP under
the hood.
It is possible to find and clear soft-dirty pages entirely in userspace.
But it isn't efficient:
- The mprotect and SIGSEGV handler for bookkeeping
- The userfaultfd wp (synchronous) with the handler for bookkeeping
Some benchmarks can be seen here[1]. This series adds features that weren't
present earlier:
- There is no atomic get soft-dirty/Written-to status and clear present in
the kernel.
- The pages which have been written-to cannot be found in an accurate way.
(Kernel's soft-dirty PTE bit + soft-dirty VMA bit show more soft-dirty
pages than there actually are.)
Historically, soft-dirty PTE bit tracking has been used in the CRIU
project. The procfs interface is enough for finding the soft-dirty bit
status and clearing the soft-dirty bit of all the pages of a process.
We have the use case where we need to track the soft-dirty PTE bit for
only specific pages on-demand. We need this tracking and clear mechanism
of a region of memory while the process is running to emulate the
GetWriteWatch() syscall of Windows.
*(Moved to using UFFD instead of soft-dirty feature to find pages which
have been written-to from v7 patch series)*:
Stop using the soft-dirty flags for finding which pages have been
written to. It is too delicate and wrong as it shows more soft-dirty
pages than the actual soft-dirty pages. There is no interest in
correcting it [2][3] as this is how the feature was written years ago.
It shouldn't be updated to change behaviour. Peter Xu has suggested
using the async version of the UFFD WP [4] as it is based inherently
on the PTEs.
So in this patch series, I've added a new mode to the UFFD which is
asynchronous version of the write protect. When this variant of the
UFFD WP is used, the page faults are resolved automatically by the
kernel. The pages which have been written-to can be found by reading
pagemap file (!PM_UFFD_WP). This feature can be used successfully to
find which pages have been written to from the time the pages were
write protected. This works just like the soft-dirty flag without
showing any extra pages which aren't soft-dirty in reality.
The information related to pages if the page is file mapped, present and
swapped is required for the CRIU project [5][6]. The addition of the
required mask, any mask, excluded mask and return masks are also required
for the CRIU project [5].
The IOCTL returns the addresses of the pages which match the specific
masks. The page addresses are returned in struct page_region in a compact
form. The max_pages is needed to support a use case where user only wants
to get a specific number of pages. So there is no need to find all the
pages of interest in the range when max_pages is specified. The IOCTL
returns when the maximum number of the pages are found. The max_pages is
optional. If max_pages is specified, it must be equal to or greater than the
vec_size. This restriction is needed to handle the worst case when one
page_region only contains info of one page and it cannot be compacted.
This is needed to emulate the Windows GetWriteWatch() syscall.
The patch series includes a detailed selftest which can be used as an
example for the uffd async wp test and PAGEMAP_IOCTL. It shows the
interface usage as well.
[1] https://lore.kernel.org/lkml/54d4c322-cd6e-eefd-b161-2af2b56aae24@collabora…
[2] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[3] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[4] https://lore.kernel.org/all/Y6Hc2d+7eTKs7AiH@x1n
[5] https://lore.kernel.org/all/YyiDg79flhWoMDZB@gmail.com/
[6] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com/
Regards,
Muhammad Usama Anjum
Muhammad Usama Anjum (5):
fs/proc/task_mmu: Implement IOCTL to get and optionally clear info
about PTEs
fs/proc/task_mmu: Add fast paths to get/clear PAGE_IS_WRITTEN flag
tools headers UAPI: Update linux/fs.h with the kernel sources
mm/pagemap: add documentation of PAGEMAP_SCAN IOCTL
selftests: mm: add pagemap ioctl tests
Peter Xu (1):
userfaultfd: UFFD_FEATURE_WP_ASYNC
Documentation/admin-guide/mm/pagemap.rst | 89 +
Documentation/admin-guide/mm/userfaultfd.rst | 35 +
fs/proc/task_mmu.c | 708 ++++++++
fs/userfaultfd.c | 26 +-
include/linux/hugetlb.h | 1 +
include/linux/userfaultfd_k.h | 28 +-
include/uapi/linux/fs.h | 59 +
include/uapi/linux/userfaultfd.h | 9 +-
mm/hugetlb.c | 34 +-
mm/memory.c | 28 +-
tools/include/uapi/linux/fs.h | 59 +
tools/testing/selftests/mm/.gitignore | 2 +
tools/testing/selftests/mm/Makefile | 3 +-
tools/testing/selftests/mm/config | 1 +
tools/testing/selftests/mm/pagemap_ioctl.c | 1660 ++++++++++++++++++
tools/testing/selftests/mm/run_vmtests.sh | 4 +
16 files changed, 2722 insertions(+), 24 deletions(-)
create mode 100644 tools/testing/selftests/mm/pagemap_ioctl.c
--
2.40.1
*Changes in v30*:
- Rebase on top of next-20230815
- Minor nitpicks
*Changes in v29:*
- Polish IOCTL and improve documentation
*Changes in v28:*
- Fix walk_end and add 17 test cases in selftests patch
*Changes in v27:*
- Handle review comments and minor improvements
- Add performance improvement patch on top with test for easy review
*Changes in v26:*
- Code restructuring and API changes in PAGEMAP_IOCTL
*Changes in v25*:
- Do proper filtering on hole as well (hole got missed earlier)
*Changes in v24*:
- Rebase on top of next-20230710
- Place WP markers in case of hole as well
*Changes in v23*:
- Set vec_buf_index in loop only when vec_buf_index is set
- Return -EFAULT instead of -EINVAL if vec is NULL
- Correctly return the walk ending address to the page granularity
*Changes in v22*:
- Interface change:
- Replace [start, start + len) with [start, end)
- Return the ending address of the address walk in start
*Changes in v21*:
- Abort walk instead of returning error if WP is to be performed on
partial hugetlb
*Changes in v20*
- Correct PAGE_IS_FILE and add PAGE_IS_PFNZERO
*Changes in v19*
- Minor changes and interface updates
*Changes in v18*
- Rebase on top of next-20230613
- Minor updates
*Changes in v17*
- Rebase on top of next-20230606
- Minor improvements in PAGEMAP_SCAN IOCTL patch
*Changes in v16*
- Fix a corner case
- Add exclusive PM_SCAN_OP_WP back
*Changes in v15*
- Build fix (Add missed build fix in RESEND)
*Changes in v14*
- Fix build error caused by #ifdef added at last minute in some configs
*Changes in v13*
- Rebase on top of next-20230414
- Give-up on using uffd_wp_range() and write new helpers, flush tlb only
once
*Changes in v12*
- Update and other memory types to UFFD_FEATURE_WP_ASYNC
- Rebase on top of next-20230406
- Review updates
*Changes in v11*
- Rebase on top of next-20230307
- Base patches on UFFD_FEATURE_WP_UNPOPULATED
- Do a lot of cosmetic changes and review updates
- Remove ENGAGE_WP + !GET operation as it can be performed with
UFFDIO_WRITEPROTECT
*Changes in v10*
- Add specific condition to return error if hugetlb is used with wp
async
- Move changes in tools/include/uapi/linux/fs.h to separate patch
- Add documentation
*Changes in v9:*
- Correct fault resolution for userfaultfd wp async
- Fix build warnings and errors which were happening on some configs
- Simplify pagemap ioctl's code
*Changes in v8:*
- Update uffd async wp implementation
- Improve PAGEMAP_IOCTL implementation
*Changes in v7:*
- Add uffd wp async
- Update the IOCTL to use uffd under the hood instead of soft-dirty
flags
*Motivation*
The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows
GetWriteWatch() and ResetWriteWatch() syscalls [1]. The GetWriteWatch()
retrieves the addresses of the pages that are written to in a region of
virtual memory.
This syscall is used in Windows applications and games etc. This syscall is
being emulated in a pretty slow manner in userspace. Our purpose is to
enhance the kernel such that we translate it efficiently in a better way.
Currently some out of tree hack patches are being used to efficiently
emulate it in some kernels. We intend to replace those with these patches.
So the whole gaming on Linux can effectively get benefit from this. It
means there would be tons of users of this code.
CRIU use case [2] was mentioned by Andrei and Danylo:
> Use cases for migrating sparse VMAs are binaries sanitized with ASAN,
> MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of
> shadow memory [4]. Being able to migrate such binaries allows to highly
> reduce the amount of work needed to identify and fix post-migration
> crashes, which happen constantly.
Andrei defines the following uses of this code:
* it is more granular and allows us to track changed pages more
effectively. The current interface can clear dirty bits for the entire
process only. In addition, reading info about pages is a separate
operation. It means we must freeze the process to read information
about all its pages, reset dirty bits, only then we can start dumping
pages. The information about pages becomes more and more outdated,
while we are processing pages. The new interface solves both these
downsides. First, it allows us to read pte bits and clear the
soft-dirty bit atomically. It means that CRIU will not need to freeze
processes to pre-dump their memory. Second, it clears soft-dirty bits
for a specified region of memory. It means CRIU will have actual info
about pages to the moment of dumping them.
* The new interface has to be much faster because basic page filtering
is happening in the kernel. With the old interface, we have to read
pagemap for each page.
*Implementation Evolution (Short Summary)*
From the definition of GetWriteWatch(), we feel like kernel's soft-dirty
feature can be used under the hood with some additions like:
* reset soft-dirty flag for only a specific region of memory instead of
clearing the flag for the entire process
* get and clear soft-dirty flag for a specific region atomically
So we decided to use ioctl on pagemap file to read or/and reset soft-dirty
flag. But using soft-dirty flag, sometimes we get extra pages which weren't
even written. They had become soft-dirty because of VMA merging and
VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were
able to bypass this shortcoming by ignoring VM_SOFTDIRTY until David
reported that mprotect etc. messes up the soft-dirty flag while ignoring
VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We
discussed whether we could revert these patches, but we could not reach any
conclusion. So at this point, I made a couple of attempts to solve this whole
VM_SOFTDIRTY issue by correcting the soft-dirty implementation:
* [7] Correct the bug fixed wrongly back in 2014. It had potential to cause
regression. We left it behind.
* [8] Keep a list of the soft-dirty parts of a VMA across splits and merges. I
got the reply: don't increase the size of the VMA by 8 bytes.
At this point, we left soft-dirty behind, considering it too delicate, and
userfaultfd [9] seemed like the only way forward. From there onward, we
have been basing soft-dirty emulation on the userfaultfd wp feature where the
kernel resolves the faults itself when the WP_ASYNC feature is used. It was
straightforward to add the WP_ASYNC feature to userfaultfd. Now we get only
those pages reported as dirty or written-to which have really been written.
(PS: Another userfaultfd feature, WP_UNPOPULATED, is also required in order
to avoid pre-faulting memory before write-protecting [9].)
All the different masks were added at the request of the CRIU devs to make
the interface more generic and better.
[1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-…
[2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com
[3] https://github.com/google/sanitizers
[4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit
[5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com
[6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/
[7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com
[10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com
*Original Cover letter from v8*
Hello,
Note:
Soft-dirty pages and pages which have been written-to are synonyms. As
kernel already has soft-dirty feature inside which we have given up to
use, we are using written-to terminology while using UFFD async WP under
the hood.
It is possible to find and clear soft-dirty pages entirely in userspace.
But it isn't efficient:
- The mprotect and SIGSEGV handler for bookkeeping
- The userfaultfd wp (synchronous) with the handler for bookkeeping
Some benchmarks can be seen here[1]. This series adds features that weren't
present earlier:
- There is no atomic get soft-dirty/Written-to status and clear present in
the kernel.
- The pages which have been written-to cannot be found in an accurate way.
(Kernel's soft-dirty PTE bit + soft-dirty VMA bit show more soft-dirty
pages than there actually are.)
Historically, soft-dirty PTE bit tracking has been used in the CRIU
project. The procfs interface is enough for finding the soft-dirty bit
status and clearing the soft-dirty bit of all the pages of a process.
We have the use case where we need to track the soft-dirty PTE bit for
only specific pages on-demand. We need this tracking and clear mechanism
of a region of memory while the process is running to emulate the
GetWriteWatch() syscall of Windows.
*(Moved to using UFFD instead of soft-dirty feature to find pages which
have been written-to from v7 patch series)*:
Stop using the soft-dirty flags for finding which pages have been
written to. It is too delicate and wrong as it shows more soft-dirty
pages than the actual soft-dirty pages. There is no interest in
correcting it [2][3] as this is how the feature was written years ago.
It shouldn't be updated to change behaviour. Peter Xu has suggested
using the async version of the UFFD WP [4] as it is based inherently
on the PTEs.
So in this patch series, I've added a new mode to the UFFD which is
asynchronous version of the write protect. When this variant of the
UFFD WP is used, the page faults are resolved automatically by the
kernel. The pages which have been written-to can be found by reading
pagemap file (!PM_UFFD_WP). This feature can be used successfully to
find which pages have been written to from the time the pages were
write protected. This works just like the soft-dirty flag without
showing any extra pages which aren't soft-dirty in reality.
The information related to pages if the page is file mapped, present and
swapped is required for the CRIU project [5][6]. The addition of the
required mask, any mask, excluded mask and return masks are also required
for the CRIU project [5].
The IOCTL returns the addresses of the pages which match the specific
masks. The page addresses are returned in struct page_region in a compact
form. The max_pages is needed to support a use case where user only wants
to get a specific number of pages. So there is no need to find all the
pages of interest in the range when max_pages is specified. The IOCTL
returns when the maximum number of the pages are found. The max_pages is
optional. If max_pages is specified, it must be equal to or greater than the
vec_size. This restriction is needed to handle the worst case when one
page_region only contains info of one page and it cannot be compacted.
This is needed to emulate the Windows GetWriteWatch() syscall.
The patch series includes a detailed selftest which can be used as an
example for the uffd async wp test and PAGEMAP_IOCTL. It shows the
interface usage as well.
[1] https://lore.kernel.org/lkml/54d4c322-cd6e-eefd-b161-2af2b56aae24@collabora…
[2] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[3] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[4] https://lore.kernel.org/all/Y6Hc2d+7eTKs7AiH@x1n
[5] https://lore.kernel.org/all/YyiDg79flhWoMDZB@gmail.com/
[6] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com/
Regards,
Muhammad Usama Anjum
Muhammad Usama Anjum (5):
fs/proc/task_mmu: Implement IOCTL to get and optionally clear info
about PTEs
fs/proc/task_mmu: Add fast paths to get/clear PAGE_IS_WRITTEN flag
tools headers UAPI: Update linux/fs.h with the kernel sources
mm/pagemap: add documentation of PAGEMAP_SCAN IOCTL
selftests: mm: add pagemap ioctl tests
Peter Xu (1):
userfaultfd: UFFD_FEATURE_WP_ASYNC
Documentation/admin-guide/mm/pagemap.rst | 89 +
Documentation/admin-guide/mm/userfaultfd.rst | 35 +
fs/proc/task_mmu.c | 705 ++++++++
fs/userfaultfd.c | 26 +-
include/linux/hugetlb.h | 1 +
include/linux/userfaultfd_k.h | 28 +-
include/uapi/linux/fs.h | 59 +
include/uapi/linux/userfaultfd.h | 9 +-
mm/hugetlb.c | 34 +-
mm/memory.c | 28 +-
tools/include/uapi/linux/fs.h | 59 +
tools/testing/selftests/mm/.gitignore | 2 +
tools/testing/selftests/mm/Makefile | 3 +-
tools/testing/selftests/mm/config | 1 +
tools/testing/selftests/mm/pagemap_ioctl.c | 1660 ++++++++++++++++++
tools/testing/selftests/mm/run_vmtests.sh | 4 +
16 files changed, 2719 insertions(+), 24 deletions(-)
create mode 100644 tools/testing/selftests/mm/pagemap_ioctl.c
--
2.40.1
In the Segment Routing (SR) architecture a list of instructions, called
segments, can be added to the packet headers to influence the forwarding and
processing of the packets in an SR enabled network.
Considering the Segment Routing over IPv6 data plane (SRv6) [1], the segment
identifiers (SIDs) are IPv6 addresses (128 bits) and the segment list (SID
List) is carried in the Segment Routing Header (SRH). A segment may correspond
to a "behavior" that is executed by a node when the packet is received.
The Linux kernel currently supports a large subset of the behaviors described
in [2] (e.g., End, End.X, End.T and so on).
In some SRv6 scenarios, the number of segments carried by the SID List may
increase dramatically, reducing the MTU (Maximum Transfer Unit) size and/or
limiting the processing power of legacy hardware devices (due to longer IPv6
headers).
The NEXT-C-SID mechanism [3] extends the SRv6 architecture by providing several
ways to efficiently represent the SID List.
By leveraging the NEXT-C-SID, it is possible to encode several SRv6 segments
within a single 128 bit SID address (also referenced as Compressed SID
Container). In this way, the length of the SID List can be drastically reduced.
The NEXT-C-SID mechanism is built upon the "flavors" framework defined in [2].
This framework is already supported by the Linux SRv6 subsystem and is used to
modify and/or extend a subset of existing behaviors.
In this patchset, we extend the SRv6 End.X behavior in order to support the
NEXT-C-SID mechanism.
In details, the patchset is made of:
- patch 1/2: add NEXT-C-SID support for SRv6 End.X behavior;
- patch 2/2: add selftest for NEXT-C-SID in SRv6 End.X behavior.
From the user space perspective, we do not need to change the iproute2 code to
support the NEXT-C-SID flavor for the SRv6 End.X behavior. However, we will
update the man page considering the NEXT-C-SID flavor applied to the SRv6 End.X
behavior in a separate patch.
Comments, improvements and suggestions are always appreciated.
Thank you all,
Andrea
[1] - https://datatracker.ietf.org/doc/html/rfc8754
[2] - https://datatracker.ietf.org/doc/html/rfc8986
[3] - https://datatracker.ietf.org/doc/html/draft-ietf-spring-srv6-srh-compression
v1 -> v2:
- Fix author tags in the commit message in patch 2/2, thanks to Paolo Abeni;
- Remove unnecessary supp_ops == 0 check in patch 1/2, thanks to Hangbin Liu;
- Fix 'is it possible' -> 'it is possible' in cover letter, thanks to
Hangbin Liu.
Andrea Mayer (1):
seg6: add NEXT-C-SID support for SRv6 End.X behavior
Paolo Lungaroni (1):
selftests: seg6: add selftest for NEXT-C-SID flavor in SRv6 End.X
behavior
net/ipv6/seg6_local.c | 108 +-
tools/testing/selftests/net/Makefile | 1 +
.../net/srv6_end_x_next_csid_l3vpn_test.sh | 1213 +++++++++++++++++
3 files changed, 1302 insertions(+), 20 deletions(-)
create mode 100755 tools/testing/selftests/net/srv6_end_x_next_csid_l3vpn_test.sh
--
2.20.1
iommufd gives userspace the capability to manipulate the iommu subsystem,
e.g. DMA map/unmap etc. In the near future, it will support iommu nested
translation. Different platform vendors have different implementations of
the nested translation. For example, Intel VT-d supports using the guest I/O
page table as the stage-1 translation table. This requires the guest I/O page
table to be compatible with the hardware IOMMU. So before setting up nested
translation, userspace needs to know the hardware iommu information to
understand the nested translation requirements.
This series reports the iommu hardware information for a given device
which has been bound to iommufd. It is preparation work for userspace to
allocate a hwpt for a given device, as in the nested translation support [1].
This series introduces an iommu op to report the iommu hardware info,
and an ioctl IOMMU_GET_HW_INFO is added to report such hardware info to
user. enum iommu_hw_info_type is defined to differentiate the iommu hardware
info reported to user hence user can decode them. This series only adds the
framework for iommu hw info reporting, the complete reporting path needs vendor
specific definition and driver support. The full code is available in [1]
as well.
[1] https://github.com/yiliu1765/iommufd/tree/wip/iommufd_nesting_08112023-yi
(only the hw_info report path is the latest, other parts are wip)
Change log:
v7:
- Use clear_user() (Jason)
- Add fail_nth for hw_info (Jason)
v6: https://lore.kernel.org/linux-iommu/20230808153510.4170-1-yi.l.liu@intel.co…
- Add Jingqi's comment on patch 02
- Add Baolu's r-b to patch 03
- Address Jason's comment on patch 03
v5: https://lore.kernel.org/linux-iommu/20230803143144.200945-1-yi.l.liu@intel.…
- Return hw_info_type in the .hw_info op, hence drop hw_info_type field in iommu_ops (Kevin)
- Add Jason's r-b for patch 01
- Address coding style comments from Jason and Kevin w.r.t. patch 02, 03 and 04
v4: https://lore.kernel.org/linux-iommu/20230724105936.107042-1-yi.l.liu@intel.…
- Rename ioctl to IOMMU_GET_HW_INFO and structure to iommu_hw_info
- Move the iommufd_get_hw_info handler to main.c
- Place iommu_hw_info prior to iommu_hwpt_alloc
- Update the function namings accordingly
- Update uapi kdocs
v3: https://lore.kernel.org/linux-iommu/20230511143024.19542-1-yi.l.liu@intel.c…
- Add r-b from Baolu
- Rename IOMMU_HW_INFO_TYPE_DEFAULT to be IOMMU_HW_INFO_TYPE_NONE to
better suit what it means
- Let IOMMU_DEVICE_GET_HW_INFO succeed even if the underlying iommu driver
does not have driver-specific data to report, per the remark below.
https://lore.kernel.org/kvm/ZAcwJSK%2F9UVI9LXu@nvidia.com/
v2: https://lore.kernel.org/linux-iommu/20230309075358.571567-1-yi.l.liu@intel.…
- Drop patch 05 of v1 as it is already covered by other series
- Rename the capability info to be iommu hardware info
v1: https://lore.kernel.org/linux-iommu/20230209041642.9346-1-yi.l.liu@intel.co…
Regards,
Yi Liu
Lu Baolu (1):
iommu: Add new iommu op to get iommu hardware information
Nicolin Chen (1):
iommufd/selftest: Add coverage for IOMMU_GET_HW_INFO ioctl
Yi Liu (2):
iommu: Move dev_iommu_ops() to private header
iommufd: Add IOMMU_GET_HW_INFO
drivers/iommu/iommu-priv.h | 11 +++
drivers/iommu/iommufd/iommufd_test.h | 9 ++
drivers/iommu/iommufd/main.c | 85 +++++++++++++++++++
drivers/iommu/iommufd/selftest.c | 16 ++++
include/linux/iommu.h | 20 ++---
include/uapi/linux/iommufd.h | 45 ++++++++++
tools/testing/selftests/iommu/iommufd.c | 28 +++++-
.../selftests/iommu/iommufd_fail_nth.c | 4 +
tools/testing/selftests/iommu/iommufd_utils.h | 47 ++++++++++
9 files changed, 253 insertions(+), 12 deletions(-)
--
2.34.1
Add functionality to run built-in tests after boot by writing to a
debugfs file.
Add a new debugfs file labeled "run" for each test suite to use for
this purpose.
As an example, write to the file using the following:
echo "any string" > /sys/kernel/debug/kunit/<testsuite>/run
This will trigger the test suite to run and will print results to the
kernel log.
Note that what you "write" to the debugfs file will not be saved.
To guard against running tests concurrently with this feature, add a
mutex lock around running kunit. This supports the current practice of
not allowing tests to be run concurrently on the same kernel.
This functionality may not work for all tests.
This new functionality could be used to design a parameter
injection feature in the future.
Signed-off-by: Rae Moar <rmoar(a)google.com>
---
Interested in what people think of this idea. I will be adding
documentation in v2.
Note this may need to be changed once the patches on extending logs
land.
Thanks!
-Rae
lib/kunit/debugfs.c | 66 +++++++++++++++++++++++++++++++++++++++++++++
lib/kunit/test.c | 13 +++++++++
2 files changed, 79 insertions(+)
diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c
index 22c5c496a68f..7f76cb909a97 100644
--- a/lib/kunit/debugfs.c
+++ b/lib/kunit/debugfs.c
@@ -8,12 +8,14 @@
#include <linux/module.h>
#include <kunit/test.h>
+#include <kunit/test-bug.h>
#include "string-stream.h"
#include "debugfs.h"
#define KUNIT_DEBUGFS_ROOT "kunit"
#define KUNIT_DEBUGFS_RESULTS "results"
+#define KUNIT_DEBUGFS_RUN "run"
/*
* Create a debugfs representation of test suites:
@@ -21,6 +23,8 @@
* Path Semantics
* /sys/kernel/debug/kunit/<testsuite>/results Show results of last run for
* testsuite
+ * /sys/kernel/debug/kunit/<testsuite>/run Write to this file to trigger
+ * testsuite to run
*
*/
@@ -93,6 +97,51 @@ static int debugfs_results_open(struct inode *inode, struct file *file)
return single_open(file, debugfs_print_results, suite);
}
+/*
+ * Print a usage message to the debugfs "run" file
+ * (/sys/kernel/debug/kunit/<testsuite>/run) if opened.
+ */
+static int debugfs_print_run(struct seq_file *seq, void *v)
+{
+ struct kunit_suite *suite = (struct kunit_suite *)seq->private;
+
+ seq_puts(seq, "Write to this file to trigger the test suite to run.\n");
+ seq_printf(seq, "usage: echo \"any string\" > /sys/kernel/debugfs/kunit/%s/run\n",
+ suite->name);
+ return 0;
+}
+
+/*
+ * The debugfs "run" file (/sys/kernel/debug/kunit/<testsuite>/run)
+ * contains no information. Write to the file to trigger the test suite
+ * to run.
+ */
+static int debugfs_run_open(struct inode *inode, struct file *file)
+{
+ struct kunit_suite *suite;
+
+ suite = (struct kunit_suite *)inode->i_private;
+
+ return single_open(file, debugfs_print_run, suite);
+}
+
+/*
+ * Trigger a test suite to run by writing to the suite's "run" debugfs
+ * file found at: /sys/kernel/debug/kunit/<testsuite>/run
+ *
+ * Note: what is written to this file will not be saved.
+ */
+static ssize_t debugfs_run(struct file *file,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ struct inode *f_inode = file->f_inode;
+ struct kunit_suite *suite = (struct kunit_suite *) f_inode->i_private;
+
+ __kunit_test_suites_init(&suite, 1);
+
+ return count;
+}
+
static const struct file_operations debugfs_results_fops = {
.open = debugfs_results_open,
.read = seq_read,
@@ -100,10 +149,23 @@ static const struct file_operations debugfs_results_fops = {
.release = debugfs_release,
};
+static const struct file_operations debugfs_run_fops = {
+ .open = debugfs_run_open,
+ .read = seq_read,
+ .write = debugfs_run,
+ .llseek = seq_lseek,
+ .release = debugfs_release,
+};
+
void kunit_debugfs_create_suite(struct kunit_suite *suite)
{
struct kunit_case *test_case;
+ if (suite->log) {
+ /* Clear the suite log that's leftover from a previous run. */
+ suite->log[0] = '\0';
+ return;
+ }
/* Allocate logs before creating debugfs representation. */
suite->log = kzalloc(KUNIT_LOG_SIZE, GFP_KERNEL);
kunit_suite_for_each_test_case(suite, test_case)
@@ -114,6 +176,10 @@ void kunit_debugfs_create_suite(struct kunit_suite *suite)
debugfs_create_file(KUNIT_DEBUGFS_RESULTS, S_IFREG | 0444,
suite->debugfs,
suite, &debugfs_results_fops);
+
+ debugfs_create_file(KUNIT_DEBUGFS_RUN, S_IFREG | 0644,
+ suite->debugfs,
+ suite, &debugfs_run_fops);
}
void kunit_debugfs_destroy_suite(struct kunit_suite *suite)
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index 49698a168437..5058a72d9e8a 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -13,6 +13,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
+#include <linux/mutex.h>
#include <linux/panic.h>
#include <linux/sched/debug.h>
#include <linux/sched.h>
@@ -22,6 +23,8 @@
#include "string-stream.h"
#include "try-catch-impl.h"
+static struct mutex kunit_run_lock;
+
/*
* Hook to fail the current test and print an error message to the log.
*/
@@ -702,6 +705,11 @@ int __kunit_test_suites_init(struct kunit_suite * const * const suites, int num_
return 0;
}
+ /* Use mutex lock to guard against running tests concurrently. */
+ if (mutex_lock_interruptible(&kunit_run_lock)) {
+ pr_err("kunit: test interrupted\n");
+ return -EINTR;
+ }
static_branch_inc(&kunit_running);
for (i = 0; i < num_suites; i++) {
@@ -710,6 +718,7 @@ int __kunit_test_suites_init(struct kunit_suite * const * const suites, int num_
}
static_branch_dec(&kunit_running);
+ mutex_unlock(&kunit_run_lock);
return 0;
}
EXPORT_SYMBOL_GPL(__kunit_test_suites_init);
@@ -869,6 +878,10 @@ static int __init kunit_init(void)
kunit_install_hooks();
kunit_debugfs_init();
+
+ /* Initialize lock to guard against running tests concurrently. */
+ mutex_init(&kunit_run_lock);
+
#ifdef CONFIG_MODULES
return register_module_notifier(&kunit_mod_nb);
#else
base-commit: 582eb3aeed2d06b122fba95518b84506d3d4ceb9
--
2.41.0.694.ge786442a9b-goog
As is described in the "How to use MPTCP?" section in MPTCP wiki [1]:
"Your app should create sockets with IPPROTO_MPTCP as the proto:
( socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); ). Legacy apps can be
forced to create and use MPTCP sockets instead of TCP ones via the
mptcpize command bundled with the mptcpd daemon."
But the mptcpize (LD_PRELOAD technique) command has some limitations
[2]:
- it doesn't work if the application is not using libc (e.g. GoLang
apps)
- in some envs, it might not be easy to set env vars / change the way
apps are launched, e.g. on Android
- mptcpize needs to be launched with all apps that want MPTCP: we could
have more control from BPF to enable MPTCP only for some apps or all the
ones of a netns or a cgroup, etc.
- it is not in BPF, we cannot talk about it at netdev conf.
So this patchset attempts to use BPF to implement functions similar to
mptcpize.
The main idea is to add a hook in sys_socket() to change the protocol id
from IPPROTO_TCP (or 0) to IPPROTO_MPTCP.
[1]
https://github.com/multipath-tcp/mptcp_net-next/wiki
[2]
https://github.com/multipath-tcp/mptcp_net-next/issues/79
v13:
- drop "Use random netns name for mptcp" patch.
v12:
- update diag_* log of update_socket_protocol.
- add 'ip netns show' after 'ip netns del' to check if there is
a test that did not clean up its netns.
- return libbpf_get_error() instead of -EIO for the error from
open_and_load().
- Use getsockopt(SOL_PROTOCOL) to verify mptcp protocol instead of
using 'ss -tOni'.
v11:
- add comments about outputs of 'ss' and 'nstat'.
- use "err = verify_mptcpify()" instead of using =+.
v10:
- drop "#ifdef CONFIG_BPF_JIT".
- include vmlinux.h and bpf_tracing_net.h to avoid defining some
macros.
- drop unneeded checks for mptcp.
v9:
- update comment for 'update_socket_protocol'.
v8:
- drop the additional checks on the 'protocol' value after the
'update_socket_protocol()' call.
v7:
- add __weak and __diag_* for update_socket_protocol.
v6:
- add update_socket_protocol.
v5:
- add bpf_mptcpify helper.
v4:
- use lsm_cgroup/socket_create
v3:
- patch 8: char cmd[128]; -> char cmd[256];
v2:
- Fix build selftests errors reported by CI
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/79
Geliang Tang (4):
bpf: Add update_socket_protocol hook
selftests/bpf: Add two mptcp netns helpers
selftests/bpf: Fix error checks of mptcp open_and_load
selftests/bpf: Add mptcpify test
net/mptcp/bpf.c | 15 ++
net/socket.c | 26 +++-
.../testing/selftests/bpf/prog_tests/mptcp.c | 141 +++++++++++++++---
tools/testing/selftests/bpf/progs/mptcpify.c | 20 +++
4 files changed, 182 insertions(+), 20 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/mptcpify.c
--
2.35.3
From: Joel Fernandes (Google) <joel(a)joelfernandes.org>
Often times during debugging, it is difficult to jump to the ftrace dump
in the console log and treat it independent of the result of the log file.
Copy the contents of the buffers into its own file to make it easier to refer
to the ftrace dump. The original ftrace dump is still available in the
console log if it is desired to refer to it there.
Signed-off-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
---
v1-v2: Change log updates, "From:" updates.
.../selftests/rcutorture/bin/functions.sh | 24 +++++++++++++++++++
.../selftests/rcutorture/bin/parse-console.sh | 7 ++++++
2 files changed, 31 insertions(+)
mode change 100644 => 100755 tools/testing/selftests/rcutorture/bin/functions.sh
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
old mode 100644
new mode 100755
index b8e2ea23cb3f..2ec4ab87a7f0
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -331,3 +331,27 @@ specify_qemu_net () {
echo $1 -net none
fi
}
+
+# Extract the ftrace output from the console log output
+# The ftrace output looks in the logs looks like:
+# Dumping ftrace buffer:
+# ---------------------------------
+# [...]
+# ---------------------------------
+extract_ftrace_from_console() {
+ awk '
+ /Dumping ftrace buffer:/ {
+ capture = 1
+ next
+ }
+ /---------------------------------/ {
+ if(capture == 1) {
+ capture = 2
+ next
+ } else if(capture == 2) {
+ capture = 0
+ }
+ }
+ capture == 2
+ ' "$1";
+}
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 9ab0f6bc172c..e3d2f69ec0fb 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -182,3 +182,10 @@ if ! test -s $file.diags
then
rm -f $file.diags
fi
+
+# Call extract_ftrace_from_console function, if the output is empty,
+# don't create $file.ftrace. Otherwise output the results to $file.ftrace
+extract_ftrace_from_console $file > $file.ftrace
+if [ ! -s $file.ftrace ]; then
+ rm -f $file.ftrace
+fi
--
2.41.0.640.ga95def55d0-goog
Hi all:
The core frequency is subjected to the process variation in semiconductors.
Not all cores are able to reach the maximum frequency respecting the
infrastructure limits. Consequently, AMD has redefined the concept of
maximum frequency of a part. This means that a fraction of cores can reach
maximum frequency. To find the best process scheduling policy for a given
scenario, OS needs to know the core ordering informed by the platform through
highest performance capability register of the CPPC interface.
Earlier implementations of AMD Pstate Preferred Core only support a static
core ranking and targeted performance. Now it has the ability to dynamically
change the preferred core based on the workload and platform conditions and
accounting for thermals and aging.
AMD Pstate driver utilizes the functions and data structures provided by
the ITMT architecture to enable the scheduler to favor scheduling on cores
which can get a higher frequency with lower voltage.
We call it AMD Pstate Preferred Core.
Here sched_set_itmt_core_prio() is called to set priorities and
sched_set_itmt_support() is called to enable ITMT feature.
AMD Pstate driver uses the highest performance value to indicate
the priority of CPU. The higher value has a higher priority.
AMD Pstate driver will provide an initial core ordering at boot time.
It relies on the CPPC interface to communicate the core ranking to the
operating system and scheduler to make sure that the OS chooses the cores
with the highest performance first when scheduling a process. When AMD Pstate
driver receives a message with the highest performance change, it will
update the core ranking.
Changes from V1->V2:
- acpi: cppc:
- - Add reference link.
- cpufreq:
- - Modify link error.
- cpufreq: amd-pstate:
- - Init the priorities of all online CPUs
- - Use a single variable to represent the status of Preferred Core.
- Documentation:
- - Default enabled preferred core.
- Documentation: amd-pstate:
- - Modify inappropriate descriptions.
- - Default enabled preferred core.
- - Use a single variable to represent the status of Preferred Core.
Meng Li (7):
x86: Drop CPU_SUP_INTEL from SCHED_MC_PRIO for the expansion.
acpi: cppc: Add get the highest performance cppc control
cpufreq: amd-pstate: Enable AMD Pstate Preferred Core Supporting.
cpufreq: Add a notification message that the highest perf has changed
cpufreq: amd-pstate: Update AMD Pstate Preferred Core ranking
dynamically
Documentation: amd-pstate: introduce AMD Pstate Preferred Core
Documentation: introduce AMD Pstate Preferrd Core mode kernel command
line options
.../admin-guide/kernel-parameters.txt | 5 +
Documentation/admin-guide/pm/amd-pstate.rst | 54 +++++++
arch/x86/Kconfig | 3 +-
drivers/acpi/cppc_acpi.c | 13 ++
drivers/acpi/processor_driver.c | 6 +
drivers/cpufreq/amd-pstate.c | 152 ++++++++++++++++--
drivers/cpufreq/cpufreq.c | 13 ++
include/acpi/cppc_acpi.h | 5 +
include/linux/amd-pstate.h | 1 +
include/linux/cpufreq.h | 4 +
10 files changed, 239 insertions(+), 17 deletions(-)
--
2.34.1
Add new feature checks related to crypto to the hwcap test.
The following is a log snippet from my local testing environment
based on for-next/selftests:
~~~
TAP version 13
1..111
# AES present
ok 1 cpuinfo_match_AES
ok 2 sigill_AES
ok 3 # SKIP sigbus_AES
# CRC32 present
ok 4 cpuinfo_match_CRC32
ok 5 sigill_CRC32
ok 6 # SKIP sigbus_CRC32
ok 7 cpuinfo_match_CSSC
# sigill_reported for CSSC
ok 8 # SKIP sigill_CSSC
ok 9 # SKIP sigbus_CSSC
# FP present
ok 10 cpuinfo_match_FP
ok 11 sigill_FP
ok 12 # SKIP sigbus_FP
# JSCVT present
ok 13 cpuinfo_match_JSCVT
ok 14 sigill_JSCVT
ok 15 # SKIP sigbus_JSCVT
# LRCPC present
ok 16 cpuinfo_match_LRCPC
ok 17 sigill_LRCPC
ok 18 # SKIP sigbus_LRCPC
# LRCPC2 present
ok 19 cpuinfo_match_LRCPC2
ok 20 sigill_LRCPC2
ok 21 # SKIP sigbus_LRCPC2
# LSE present
ok 22 cpuinfo_match_LSE
ok 23 sigill_LSE
ok 24 # SKIP sigbus_LSE
# LSE2 present
ok 25 cpuinfo_match_LSE2
ok 26 sigill_LSE2
ok 27 sigbus_LSE2
ok 28 cpuinfo_match_MOPS
ok 29 sigill_MOPS
ok 30 # SKIP sigbus_MOPS
# RNG present
ok 31 cpuinfo_match_RNG
ok 32 sigill_RNG
ok 33 # SKIP sigbus_RNG
# PMULL present
ok 34 cpuinfo_match_PMULL
ok 35 sigill_PMULL
ok 36 # SKIP sigbus_PMULL
ok 37 cpuinfo_match_RPRFM
ok 38 # SKIP sigill_RPRFM
ok 39 # SKIP sigbus_RPRFM
# SHA1 present
ok 40 cpuinfo_match_SHA1
ok 41 sigill_SHA1
ok 42 # SKIP sigbus_SHA1
# SHA2 present
ok 43 cpuinfo_match_SHA2
ok 44 sigill_SHA2
ok 45 # SKIP sigbus_SHA2
# SHA512 present
ok 46 cpuinfo_match_SHA512
ok 47 sigill_SHA512
ok 48 # SKIP sigbus_SHA512
ok 49 cpuinfo_match_SME
ok 50 sigill_SME
ok 51 # SKIP sigbus_SME
ok 52 cpuinfo_match_SME2
ok 53 sigill_SME2
ok 54 # SKIP sigbus_SME2
ok 55 cpuinfo_match_SME 2.1
# sigill_reported for SME 2.1
ok 56 # SKIP sigill_SME 2.1
ok 57 # SKIP sigbus_SME 2.1
ok 58 cpuinfo_match_SME I16I32
# sigill_reported for SME I16I32
ok 59 # SKIP sigill_SME I16I32
ok 60 # SKIP sigbus_SME I16I32
ok 61 cpuinfo_match_SME BI32I32
# sigill_reported for SME BI32I32
ok 62 # SKIP sigill_SME BI32I32
ok 63 # SKIP sigbus_SME BI32I32
ok 64 cpuinfo_match_SME B16B16
# sigill_reported for SME B16B16
ok 65 # SKIP sigill_SME B16B16
ok 66 # SKIP sigbus_SME B16B16
ok 67 cpuinfo_match_SME F16F16
# sigill_reported for SME F16F16
ok 68 # SKIP sigill_SME F16F16
ok 69 # SKIP sigbus_SME F16F16
# SVE present
ok 70 cpuinfo_match_SVE
ok 71 sigill_SVE
ok 72 # SKIP sigbus_SVE
ok 73 cpuinfo_match_SVE 2
# sigill_reported for SVE 2
ok 74 # SKIP sigill_SVE 2
ok 75 # SKIP sigbus_SVE 2
ok 76 cpuinfo_match_SVE 2.1
# sigill_reported for SVE 2.1
ok 77 # SKIP sigill_SVE 2.1
ok 78 # SKIP sigbus_SVE 2.1
ok 79 cpuinfo_match_SVE AES
# sigill_reported for SVE AES
ok 80 # SKIP sigill_SVE AES
ok 81 # SKIP sigbus_SVE AES
ok 82 cpuinfo_match_SVE2 PMULL
# sigill_reported for SVE2 PMULL
ok 83 # SKIP sigill_SVE2 PMULL
ok 84 # SKIP sigbus_SVE2 PMULL
ok 85 cpuinfo_match_SVE2 BITPERM
# sigill_reported for SVE2 BITPERM
ok 86 # SKIP sigill_SVE2 BITPERM
ok 87 # SKIP sigbus_SVE2 BITPERM
ok 88 cpuinfo_match_SVE2 SHA3
# sigill_reported for SVE2 SHA3
ok 89 # SKIP sigill_SVE2 SHA3
ok 90 # SKIP sigbus_SVE2 SHA3
ok 91 cpuinfo_match_SVE2 SM4
# sigill_reported for SVE2 SM4
ok 92 # SKIP sigill_SVE2 SM4
ok 93 # SKIP sigbus_SVE2 SM4
# SVE2 I8MM present
ok 94 cpuinfo_match_SVE2 I8MM
ok 95 sigill_SVE2 I8MM
ok 96 # SKIP sigbus_SVE2 I8MM
# SVE2 F32MM present
ok 97 cpuinfo_match_SVE2 F32MM
ok 98 sigill_SVE2 F32MM
ok 99 # SKIP sigbus_SVE2 F32MM
# SVE2 F64MM present
ok 100 cpuinfo_match_SVE2 F64MM
ok 101 sigill_SVE2 F64MM
ok 102 # SKIP sigbus_SVE2 F64MM
# SVE2 BF16 present
ok 103 cpuinfo_match_SVE2 BF16
ok 104 sigill_SVE2 BF16
ok 105 # SKIP sigbus_SVE2 BF16
ok 106 cpuinfo_match_SVE2 EBF16
ok 107 # SKIP sigill_SVE2 EBF16
ok 108 # SKIP sigbus_SVE2 EBF16
ok 109 cpuinfo_match_HBC
ok 110 sigill_HBC
ok 111 # SKIP sigbus_HBC
# Totals: pass:60 fail:0 xfail:0 xpass:0 skip:51 error:0
~~~
Zeng Heng (4):
kselftest/arm64: add SHA1 and related features to hwcap test
kselftest/arm64: add AES feature check to hwcap test
kselftest/arm64: add pmull feature to hwcap test
kselftest/arm64: add jscvt feature to hwcap test
tools/testing/selftests/arm64/abi/hwcap.c | 77 +++++++++++++++++++++++
1 file changed, 77 insertions(+)
--
2.25.1
Add new feature checks related to crypto to the hwcap test.
The following is a log snippet from my local testing environment
based on for-next/selftests:
~~~
TAP version 13
1..111
# AES present
ok 1 cpuinfo_match_AES
ok 2 sigill_AES
ok 3 # SKIP sigbus_AES
# CRC32 present
ok 4 cpuinfo_match_CRC32
ok 5 sigill_CRC32
ok 6 # SKIP sigbus_CRC32
ok 7 cpuinfo_match_CSSC
# sigill_reported for CSSC
ok 8 # SKIP sigill_CSSC
ok 9 # SKIP sigbus_CSSC
# FP present
ok 10 cpuinfo_match_FP
ok 11 sigill_FP
ok 12 # SKIP sigbus_FP
# JSCVT present
ok 13 cpuinfo_match_JSCVT
ok 14 sigill_JSCVT
ok 15 # SKIP sigbus_JSCVT
# LRCPC present
ok 16 cpuinfo_match_LRCPC
ok 17 sigill_LRCPC
ok 18 # SKIP sigbus_LRCPC
# LRCPC2 present
ok 19 cpuinfo_match_LRCPC2
ok 20 sigill_LRCPC2
ok 21 # SKIP sigbus_LRCPC2
# LSE present
ok 22 cpuinfo_match_LSE
ok 23 sigill_LSE
ok 24 # SKIP sigbus_LSE
# LSE2 present
ok 25 cpuinfo_match_LSE2
ok 26 sigill_LSE2
ok 27 sigbus_LSE2
ok 28 cpuinfo_match_MOPS
ok 29 sigill_MOPS
ok 30 # SKIP sigbus_MOPS
# RNG present
ok 31 cpuinfo_match_RNG
ok 32 sigill_RNG
ok 33 # SKIP sigbus_RNG
# PMULL present
ok 34 cpuinfo_match_PMULL
ok 35 sigill_PMULL
ok 36 # SKIP sigbus_PMULL
ok 37 cpuinfo_match_RPRFM
ok 38 # SKIP sigill_RPRFM
ok 39 # SKIP sigbus_RPRFM
# SHA1 present
ok 40 cpuinfo_match_SHA1
ok 41 sigill_SHA1
ok 42 # SKIP sigbus_SHA1
# SHA2 present
ok 43 cpuinfo_match_SHA2
ok 44 sigill_SHA2
ok 45 # SKIP sigbus_SHA2
# SHA512 present
ok 46 cpuinfo_match_SHA512
ok 47 sigill_SHA512
ok 48 # SKIP sigbus_SHA512
ok 49 cpuinfo_match_SME
ok 50 sigill_SME
ok 51 # SKIP sigbus_SME
ok 52 cpuinfo_match_SME2
ok 53 sigill_SME2
ok 54 # SKIP sigbus_SME2
ok 55 cpuinfo_match_SME 2.1
# sigill_reported for SME 2.1
ok 56 # SKIP sigill_SME 2.1
ok 57 # SKIP sigbus_SME 2.1
ok 58 cpuinfo_match_SME I16I32
# sigill_reported for SME I16I32
ok 59 # SKIP sigill_SME I16I32
ok 60 # SKIP sigbus_SME I16I32
ok 61 cpuinfo_match_SME BI32I32
# sigill_reported for SME BI32I32
ok 62 # SKIP sigill_SME BI32I32
ok 63 # SKIP sigbus_SME BI32I32
ok 64 cpuinfo_match_SME B16B16
# sigill_reported for SME B16B16
ok 65 # SKIP sigill_SME B16B16
ok 66 # SKIP sigbus_SME B16B16
ok 67 cpuinfo_match_SME F16F16
# sigill_reported for SME F16F16
ok 68 # SKIP sigill_SME F16F16
ok 69 # SKIP sigbus_SME F16F16
# SVE present
ok 70 cpuinfo_match_SVE
ok 71 sigill_SVE
ok 72 # SKIP sigbus_SVE
ok 73 cpuinfo_match_SVE 2
# sigill_reported for SVE 2
ok 74 # SKIP sigill_SVE 2
ok 75 # SKIP sigbus_SVE 2
ok 76 cpuinfo_match_SVE 2.1
# sigill_reported for SVE 2.1
ok 77 # SKIP sigill_SVE 2.1
ok 78 # SKIP sigbus_SVE 2.1
ok 79 cpuinfo_match_SVE AES
# sigill_reported for SVE AES
ok 80 # SKIP sigill_SVE AES
ok 81 # SKIP sigbus_SVE AES
ok 82 cpuinfo_match_SVE2 PMULL
# sigill_reported for SVE2 PMULL
ok 83 # SKIP sigill_SVE2 PMULL
ok 84 # SKIP sigbus_SVE2 PMULL
ok 85 cpuinfo_match_SVE2 BITPERM
# sigill_reported for SVE2 BITPERM
ok 86 # SKIP sigill_SVE2 BITPERM
ok 87 # SKIP sigbus_SVE2 BITPERM
ok 88 cpuinfo_match_SVE2 SHA3
# sigill_reported for SVE2 SHA3
ok 89 # SKIP sigill_SVE2 SHA3
ok 90 # SKIP sigbus_SVE2 SHA3
ok 91 cpuinfo_match_SVE2 SM4
# sigill_reported for SVE2 SM4
ok 92 # SKIP sigill_SVE2 SM4
ok 93 # SKIP sigbus_SVE2 SM4
# SVE2 I8MM present
ok 94 cpuinfo_match_SVE2 I8MM
ok 95 sigill_SVE2 I8MM
ok 96 # SKIP sigbus_SVE2 I8MM
# SVE2 F32MM present
ok 97 cpuinfo_match_SVE2 F32MM
ok 98 sigill_SVE2 F32MM
ok 99 # SKIP sigbus_SVE2 F32MM
# SVE2 F64MM present
ok 100 cpuinfo_match_SVE2 F64MM
ok 101 sigill_SVE2 F64MM
ok 102 # SKIP sigbus_SVE2 F64MM
# SVE2 BF16 present
ok 103 cpuinfo_match_SVE2 BF16
ok 104 sigill_SVE2 BF16
ok 105 # SKIP sigbus_SVE2 BF16
ok 106 cpuinfo_match_SVE2 EBF16
ok 107 # SKIP sigill_SVE2 EBF16
ok 108 # SKIP sigbus_SVE2 EBF16
ok 109 cpuinfo_match_HBC
ok 110 sigill_HBC
ok 111 # SKIP sigbus_HBC
# Totals: pass:60 fail:0 xfail:0 xpass:0 skip:51 error:0
~~~
Zeng Heng (4):
kselftest/arm64: add SHA1 and related features to hwcap test
kselftest/arm64: add AES feature check to hwcap test
kselftest/arm64: add pmull feature to hwcap test
kselftest/arm64: add jscvt feature to hwcap test
tools/testing/selftests/arm64/abi/hwcap.c | 77 +++++++++++++++++++++++
1 file changed, 77 insertions(+)
--
2.25.1
Hi,
Can anyone give some hints on how to run a single test on the kselftest
framework? The reason that I want such support is because I have to run
test cases inside emulator, which is very slow.
Per the kselftest documents, I can run kvm selftests with "make
-C tools/testing/selftests TARGETS=kvm run_tests", but it does not provide
a mechanism to run a single test in KVM subsystem. It takes a very long
time to finish the KVM subset testing inside the slow emulator while I'm
only trying to replace/add one testcase.
Currently I modify the code like below, to run only a single test. Not
sure if there is a more generic way to do that. If there is no such mechanism
and more people have a similar need, is it possible to add it? I'm more
than happy to contribute with guidance.
Thank you
--jyh
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index c692cc86e7da..25fce1a3ceb8 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -56,7 +56,7 @@ LIBKVM_riscv += lib/riscv/processor.c
LIBKVM_riscv += lib/riscv/ucall.c
# Non-compiled test targets
-TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh
+#TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh
# Compiled test targets
TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
@@ -135,7 +135,7 @@ TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
TEST_GEN_PROGS_x86_64 += system_counter_offset_test
# Compiled outputs used by test targets
-TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test
+#TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test
TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs
TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
@@ -186,6 +186,8 @@ TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH_DIR))
TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH_DIR))
LIBKVM += $(LIBKVM_$(ARCH_DIR))
+TEST_GEN_PROGS = x86_64/cr4_cpuid_sync_test
+
OVERRIDE_TARGETS = 1
# lib.mak defines $(OUTPUT), prepends $(OUTPUT)/ to $(TEST_GEN_PROGS), and most
As reported and suggested by Willy, the inline __sysret() helper
introduces three types of conversions and increases the size:
(1) the "unsigned long" argument to __sysret() forces a sign extension
from all sys_* functions that used to return 'int'
(2) the comparison with the error range now has to be performed on a
'unsigned long' instead of an 'int'
(3) the return value from __sysret() is a 'long' (note, a signed long)
which then has to be turned back to an 'int' before being returned by the
caller to satisfy the caller's prototype.
To fix this up, firstly, let's use a macro instead of an inline function to
preserve the input type and avoid these useless conversions (1), (3).
Secondly, comparison to -MAX_ERRNO inflicts on all integer returns where
we could previously keep a simple sign comparison, let's use a new
__is_pointer() macro suggested by David Laight to limit the comparison
to -MAX_ERRNO (2) only for pointer returns and preserve a simple sign
comparison for integer returns as before. The __builtin_choose_expr()
is suggested by David Laight to choose different comparisons based on
the types to share code.
Thirdly, fix up the following warning by an explicit conversion and let
__sysret() be able to accept the (void *) type of argument:
sysroot/powerpc/include/sys.h: In function 'sbrk':
sysroot/powerpc/include/sys.h:104:16: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
104 | return (void *)__sysret(-ENOMEM);
Fourthly, to further work around the argument type with the 'const' flag,
we must use __auto_type for gcc >= 11.0 and __typeof__((arg) + 0) suggested
by David Laight for old gcc versions.
Suggested-by: Willy Tarreau <w(a)1wt.eu>
Link: https://lore.kernel.org/lkml/20230806095846.GB10627@1wt.eu/
Link: https://lore.kernel.org/lkml/20230806134348.GA19145@1wt.eu/
Suggested-by: David Laight <David.Laight(a)ACULAB.COM>
Link: https://lore.kernel.org/lkml/f51e54bcf470451ea36f24640f000e61@AcuMS.aculab.…
Link: https://lore.kernel.org/lkml/a1732bbffd1542d3b9dd34c92f45076c@AcuMS.aculab.…
Signed-off-by: Zhangjin Wu <falcon(a)tinylab.org>
---
Hi, Willy, Hi, David
v5 applies suggestions from David Laight, it further drops the fixed
'long' conversion branch by using a __typeof__((arg) + 0) trick and also
merges the pointer type and integer type comparisons with
__builtin_choose_expr() and a new __is_pointer() macro, now, the code is
cleaner than previous versions.
David, Thanks a lot!
Like before, tests run for all nolibc supported boards.
Changes from v4 --> v5:
* Use __typeof__((arg) + 0) to lose the 'const' flag for old gcc
versions.
* Import the famous __is_constexpr() macro from kernel side and add a
__is_pointer() macro based on it. (David, to avoid introducing extra
discussion on the proven-in-use __is_constexpr macro, this patch uses the
original version instead of your suggested version, more info here:
https://lore.kernel.org/lkml/20220131204357.1133674-1-keescook@chromium.org/)
* Use __builtin_choose_expr() to merge two comparisons to share the same
errno setting code and the -1L assignment code.
Changes from v3 --> v4:
* fix up a new warning about 'ret < 0' when the input arg type is (void *)
Changes from v2 --> v3:
* define a __GXX_HAS_AUTO_TYPE_WITH_CONST_SUPPORT for gcc >= 11.0 (ABI_VERSION >= 1016)
* split __sysret() to two versions by the macro instead of a mixed unified and unreadable version
* use shorter __ret instead of __sysret_arg
Changes from v1 --> v2:
* fix up argument with 'const' in the type
* support "void *" argument
Best regards,
Zhangjin
---
v4: https://lore.kernel.org/lkml/a4084f7fac7a89f861b5582774bc7a98634d1e76.16913…
v3: https://lore.kernel.org/lkml/8eaab5da2dcbba42e3f3efc2ae686a22c95f84f0.16913…
v2: https://lore.kernel.org/lkml/95fe3e732f455fab653fe1427118d905e4d04257.16913…
v1: https://lore.kernel.org/lkml/20230806131921.52453-1-falcon@tinylab.org/
---
tools/include/nolibc/sys.h | 74 ++++++++++++++++++++++++++++++--------
1 file changed, 59 insertions(+), 15 deletions(-)
diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index 833d6c5e86dc..6bdd18716e84 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -27,23 +27,67 @@
#include "errno.h"
#include "types.h"
+/*
+ * This returns a constant expression while determining if an argument is
+ * a constant expression, most importantly without evaluating the argument.
+ * Glory to Martin Uecker <Martin.Uecker(a)med.uni-goettingen.de>
+ * (from include/linux/const.h)
+ */
+#define __is_constexpr(x) \
+ (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
+
+/*
+ * "(void *)0 isn't 'constant enough' for is_constexpr() - so
+ * is_constexpr((type)0) can be used to detect pointer types."
+ * (from David Laight <David.Laight(a)ACULAB.COM>)
+ */
+#define __is_pointer(x) (!__is_constexpr((__typeof__(x))0))
-/* Syscall return helper for library routines, set errno as -ret when ret is in
- * range of [-MAX_ERRNO, -1]
+/*
+ * To preserve the input type and workaround the 'error: assignment of
+ * read-only variable' when the input type has 'const' flag.
+ *
+ * For gcc >= 11.0 (__GXX_ABI_VERSION = 1016), use the new __auto_type keyword
+ * instead of __typeof__().
*
- * Note, No official reference states the errno range here aligns with musl
- * (src/internal/syscall_ret.c) and glibc (sysdeps/unix/sysv/linux/sysdep.h)
+ * For old gcc versions, "use typeof((x) + 0) to lose the 'const' flag. The
+ * only downside is that char/short become int." (from David Laight
+ * <David.Laight(a)ACULAB.COM>)
*/
-static __inline__ __attribute__((unused, always_inline))
-long __sysret(unsigned long ret)
-{
- if (ret >= (unsigned long)-MAX_ERRNO) {
- SET_ERRNO(-(long)ret);
- return -1;
- }
- return ret;
-}
+#if __GXX_ABI_VERSION >= 1016
+#define __typeofdecl(arg) __auto_type
+#else
+#define __typeofdecl(arg) __typeof__((arg) + 0)
+#endif
+
+/* Syscall return helper for library routines
+ *
+ * - for pointer returns, set errno as -ret when ret is in [-MAX_ERRNO, -1]
+ * - for integer returns, set errno as -ret when ret < 0
+ *
+ * Note,
+ *
+ * - No official reference states the errno range, here aligns with musl
+ * (src/internal/syscall_ret.c) and glibc (sysdeps/unix/sysv/linux/sysdep.h).
+ *
+ * - To reduce binary size by removing useless type conversions and sign
+ * extensions, the helper is defined as a macro to preserve input type and
+ * provide two comparisons for both pointer and integer types during the
+ * compiling stage.
+ */
+
+#define __sysret(arg) \
+({ \
+ __typeofdecl(arg) __ret = (arg); \
+ if (__builtin_choose_expr(__is_pointer(arg), (unsigned long)-(MAX_ERRNO + 1), (long)__ret) \
+ < __builtin_choose_expr(__is_pointer(arg), (unsigned long)__ret, 0)) { \
+ SET_ERRNO(-(long)__ret); \
+ __ret = (__typeof__(arg))-1L; \
+ } \
+ __ret; \
+})
+
/* Functions in this file only describe syscalls. They're declared static so
* that the compiler usually decides to inline them while still being allowed
@@ -94,7 +138,7 @@ void *sbrk(intptr_t inc)
if (ret && sys_brk(ret + inc) == ret + inc)
return ret + inc;
- return (void *)__sysret(-ENOMEM);
+ return __sysret((void *)-ENOMEM);
}
@@ -682,7 +726,7 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd,
static __attribute__((unused))
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
{
- return (void *)__sysret((unsigned long)sys_mmap(addr, length, prot, flags, fd, offset));
+ return __sysret(sys_mmap(addr, length, prot, flags, fd, offset));
}
static __attribute__((unused))
--
2.25.1
The test allocates dcache inside a cgroup, then destroys the cgroups and
then checks the sanity of numbers on the parent level. The reason it
fails is because dentries are freed with an RCU delay - a debugging
sleep shows that usage drops as expected shortly after.
Insert a 1s sleep after completing the cgroup creation/deletions. This
should be good enough, assuming that machines running those tests are
otherwise not very busy. This commit is directly inspired by Johannes
over at the link below.
Link: https://lore.kernel.org/all/20230801135632.1768830-1-hannes@cmpxchg.org/
Signed-off-by: Lucas Karpinski <lkarpins(a)redhat.com>
---
tools/testing/selftests/cgroup/test_kmem.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
index 67cc0182058d..7ac384bbfdd5 100644
--- a/tools/testing/selftests/cgroup/test_kmem.c
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -183,6 +183,9 @@ static int test_kmem_memcg_deletion(const char *root)
if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
goto cleanup;
+ /* wait for RCU freeing */
+ sleep(1);
+
current = cg_read_long(parent, "memory.current");
slab = cg_read_key_long(parent, "memory.stat", "slab ");
anon = cg_read_key_long(parent, "memory.stat", "anon ");
--
2.41.0
This adds support for receiving KeyUpdate messages (RFC 8446, 4.6.3
[1]). A sender transmits a KeyUpdate message and then changes its TX
key. The receiver should react by updating its RX key before
processing the next message.
This patchset implements key updates by:
1. pausing decryption when a KeyUpdate message is received, to avoid
attempting to use the old key to decrypt a record encrypted with
the new key
2. returning -EKEYEXPIRED to syscalls that cannot receive the
KeyUpdate message, until the rekey has been performed by userspace
3. passing the KeyUpdate message to userspace as a control message
4. allowing updates of the crypto_info via the TLS_TX/TLS_RX
setsockopts
This API has been tested with gnutls to make sure that it allows
userspace libraries to implement key updates [2]. Thanks to Frantisek
Krenzelok <fkrenzel(a)redhat.com> for providing the implementation in
gnutls and testing the kernel patches.
=======================================================================
Discussions around v2 of this patchset focused on how HW offload would
interact with rekey.
RX
- The existing SW path will handle all records between the KeyUpdate
message signaling the change of key and the new key becoming known
to the kernel -- those will be queued encrypted, and decrypted in
SW as they are read by userspace (once the key is provided, ie same
as this patchset)
- Call ->tls_dev_del + ->tls_dev_add immediately during
setsockopt(TLS_RX)
TX
- After setsockopt(TLS_TX), switch to the existing SW path (not the
current device_fallback) until we're able to re-enable HW offload
- tls_device_sendmsg will call into tls_sw_sendmsg under lock_sock
to avoid changing socket ops during the rekey while another
thread might be waiting on the lock
- We only re-enable HW offload (call ->tls_dev_add to install the new
key in HW) once all records sent with the old key have been
ACKed. At this point, all unacked records are SW-encrypted with the
new key, and the old key is unused by both HW and retransmissions.
- If there are no unacked records when userspace does
setsockopt(TLS_TX), we can (try to) install the new key in HW
immediately.
- If yet another key has been provided via setsockopt(TLS_TX), we
don't install intermediate keys, only the latest.
- TCP notifies ktls of ACKs via the icsk_clean_acked callback. In
case of a rekey, tls_icsk_clean_acked will record when all data
sent with the most recent past key has been sent. The next call
to sendmsg will install the new key in HW.
- We close and push the current SW record before reenabling
offload.
If ->tls_dev_add fails to install the new key in HW, we stay in SW
mode. We can add a counter to keep track of this.
In addition:
Because we can't change socket ops during a rekey, we'll also have to
modify do_tls_setsockopt_conf to check ctx->tx_conf and only call
either tls_set_device_offload or tls_set_sw_offload. RX already uses
the same ops for both TLS_HW and TLS_SW, so we could switch between HW
and SW mode on rekey.
An alternative would be to have a common sendmsg which locks
the socket and then calls the correct implementation. We'll need that
anyway for the offload under rekey case, so that would only add a test
to the SW path's ops (compared to the current code). That should allow
us to simplify build_protos a bit, but might have a performance
impact - we'll need to check it if we want to go that route.
=======================================================================
Note: in a future series, I'll clean up tls_set_sw_offload and
eliminate the per-cipher copy-paste using tls_cipher_size_desc.
[1] https://www.rfc-editor.org/rfc/rfc8446#section-4.6.3
[2] https://gitlab.com/gnutls/gnutls/-/merge_requests/1625
Sabrina Dubroca (6):
tls: remove tls_context argument from tls_set_sw_offload
tls: block decryption when a rekey is pending
tls: implement rekey for TLS1.3
docs: tls: document TLS1.3 key updates
selftests: tls: add key_generation argument to tls_crypto_info_init
selftests: tls: add rekey tests
Documentation/networking/tls.rst | 21 ++
include/net/tls.h | 3 +
net/tls/tls.h | 3 +-
net/tls/tls_device.c | 2 +-
net/tls/tls_main.c | 47 ++-
net/tls/tls_sw.c | 184 +++++++++---
tools/testing/selftests/net/tls.c | 466 +++++++++++++++++++++++++++++-
7 files changed, 661 insertions(+), 65 deletions(-)
--
2.40.1
From: Rong Tao <rongtao(a)cestc.cn>
commit 686a8bb72349("selftests/mm: split uffd tests into uffd-stress and
uffd-unit-tests") split uffd tests into uffd-stress and uffd-unit-tests,
obviously we need to modify the help information synchronously.
Also modify code indentation.
Signed-off-by: Rong Tao <rongtao(a)cestc.cn>
---
tools/testing/selftests/mm/uffd-stress.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c
index 995ff13e74c7..e40b6d7d2c0e 100644
--- a/tools/testing/selftests/mm/uffd-stress.c
+++ b/tools/testing/selftests/mm/uffd-stress.c
@@ -53,21 +53,21 @@ pthread_attr_t attr;
do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
const char *examples =
- "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
- "./userfaultfd anon 100 99999\n\n"
- "# Run share memory test on 1GiB region with 99 bounces:\n"
- "./userfaultfd shmem 1000 99\n\n"
- "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
- "./userfaultfd hugetlb 256 50\n\n"
- "# Run the same hugetlb test but using private file:\n"
- "./userfaultfd hugetlb-private 256 50\n\n"
- "# 10MiB-~6GiB 999 bounces anonymous test, "
- "continue forever unless an error triggers\n"
- "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
+ "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
+ "./uffd-stress anon 100 99999\n\n"
+ "# Run share memory test on 1GiB region with 99 bounces:\n"
+ "./uffd-stress shmem 1000 99\n\n"
+ "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
+ "./uffd-stress hugetlb 256 50\n\n"
+ "# Run the same hugetlb test but using private file:\n"
+ "./uffd-stress hugetlb-private 256 50\n\n"
+ "# 10MiB-~6GiB 999 bounces anonymous test, "
+ "continue forever unless an error triggers\n"
+ "while ./uffd-stress anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
static void usage(void)
{
- fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces>\n\n");
+ fprintf(stderr, "\nUsage: ./uffd-stress <test type> <MiB> <bounces>\n\n");
fprintf(stderr, "Supported <test type>: anon, hugetlb, "
"hugetlb-private, shmem, shmem-private\n\n");
fprintf(stderr, "Examples:\n\n");
--
2.39.3
We mix up KUNIT_TRIGGER_STATIC_STUB and KUNIT_STATIC_STUB_REDIRECT in
static_stub header. Just correct KUNIT_TRIGGER_STATIC_STUB to
KUNIT_STATIC_STUB_REDIRECT which is documented.
Signed-off-by: Kemeng Shi <shikemeng(a)huaweicloud.com>
Fixes: e047c5eaa763 ("kunit: Expose 'static stub' API to redirect functions")
Reviewed-by: David Gow <davidgow(a)google.com>
---
v1->v2:
-Fix typo
-Add Fixes tag.
-Collect RVB from David
---
include/kunit/static_stub.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/include/kunit/static_stub.h b/include/kunit/static_stub.h
index 9b80150a5d62..85315c80b303 100644
--- a/include/kunit/static_stub.h
+++ b/include/kunit/static_stub.h
@@ -11,7 +11,7 @@
#if !IS_ENABLED(CONFIG_KUNIT)
/* If CONFIG_KUNIT is not enabled, these stubs quietly disappear. */
-#define KUNIT_TRIGGER_STATIC_STUB(real_fn_name, args...) do {} while (0)
+#define KUNIT_STATIC_STUB_REDIRECT(real_fn_name, args...) do {} while (0)
#else
@@ -30,7 +30,7 @@
* This is a function prologue which is used to allow calls to the current
* function to be redirected by a KUnit test. KUnit tests can call
* kunit_activate_static_stub() to pass a replacement function in. The
- * replacement function will be called by KUNIT_TRIGGER_STATIC_STUB(), which
+ * replacement function will be called by KUNIT_STATIC_STUB_REDIRECT(), which
* will then return from the function. If the caller is not in a KUnit context,
* the function will continue execution as normal.
*
@@ -87,7 +87,7 @@ void __kunit_activate_static_stub(struct kunit *test,
* When activated, calls to real_fn_addr from within this test (even if called
* indirectly) will instead call replacement_addr. The function pointed to by
* real_fn_addr must begin with the static stub prologue in
- * KUNIT_TRIGGER_STATIC_STUB() for this to work. real_fn_addr and
+ * KUNIT_STATIC_STUB_REDIRECT() for this to work. real_fn_addr and
* replacement_addr must have the same type.
*
* The redirection can be disabled again with kunit_deactivate_static_stub().
--
2.30.0
Hi, Willy
Here is v2 of the customized CROSS_COMPILE support, this helps a lot
during the testing of the other cross-arch nolibc changes:
$ ARCHS="i386 x86_64 arm64 arm mips ppc ppc64 ppc64le riscv s390"
$ for arch in ${ARCHS[@]}; do printf "%9s: " $arch; make run-user XARCH=$arch | grep status; done
Based on your suggestion, we made these changes:
- The qemu notes patch [1] is removed, welcome your doc file ;-)
- Arnd's crosstools are customized by default
- Import cc-cross-prefix to support local cross toolchains too
- Use mips64 toolchains for mips, like x86_64 toolchains for i386, so that
fewer toolchains need to be downloaded
- Use HOSTCC for libc-test compiling
Changes from v1 --> v2:
* selftests/nolibc: allow use x86_64 toolchain for i386
No change.
* selftests/nolibc: allow use mips64 toolchain for mips
Allows downloading fewer toolchains, saving time and storage space
* selftests/nolibc: libc-test: use HOSTCC instead of CC
libc-test is mainly for local test, use HOSTCC
* selftests/nolibc: allow customize CROSS_COMPILE by architecture
Moved the ../../../scripts/Makefile.include after our customized
CROSS_COMPILE, to let it prefix CC with $(CROSS_COMPILE) for us.
* selftests/nolibc: customize CROSS_COMPILE for all architectures
Use Arnd's crosstools as the default ones
* selftests/nolibc: import cc-cross-prefix macro
selftests/nolibc: allow use cross toolchains from software repository
Import cc-cross-prefix to allow customize a list of the cross
compilers, the ones from local repositories are appended in.
If cross toolchains are already installed from local repos, why not use
them? Let's do it.
Willy, since this series is really important to test the coming
patchsets, I send it here before the others to simplify the testing, but
we can delay its review, it is not urgent.
And here [2] is the simple script I wrote to download, decompress and
configure the PATH variable for Arnd's crosstools, hope it helps.
Best regards,
Zhangjin Wu
---
[1]: https://lore.kernel.org/lkml/6de680acbc2d87e13a680d4453ef022568bf489b.16912…
[2]: https://gitee.com/tinylab/linux-lab/blob/next/tools/nolibc/crosstool.sh
v1: https://lore.kernel.org/lkml/cover.1691263493.git.falcon@tinylab.org/
Zhangjin Wu (7):
selftests/nolibc: allow use x86_64 toolchain for i386
selftests/nolibc: allow use mips64 toolchain for mips
selftests/nolibc: libc-test: use HOSTCC instead of CC
selftests/nolibc: allow customize CROSS_COMPILE by architecture
selftests/nolibc: customize CROSS_COMPILE for all architectures
selftests/nolibc: import cc-cross-prefix macro
selftests/nolibc: allow use cross toolchains from software repository
tools/testing/selftests/nolibc/Makefile | 38 +++++++++++++++++++++----
1 file changed, 33 insertions(+), 5 deletions(-)
--
2.25.1
We mix up KUNIT_TRIGGER_STATIC_STUB and KUNIT_STATIC_STUB_REDIRECT in
static_stub header. Just correct KUNIT_TRIGGER_STATIC_STUB to
KUNIT_STATIC_STUB_REDIRECT which is documented.
Signed-off-by: Kemeng Shi <shikemeng(a)huaweicloud.com>
---
include/kunit/static_stub.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/include/kunit/static_stub.h b/include/kunit/static_stub.h
index 9b80150a5d62..85315c80b303 100644
--- a/include/kunit/static_stub.h
+++ b/include/kunit/static_stub.h
@@ -11,7 +11,7 @@
#if !IS_ENABLED(CONFIG_KUNIT)
/* If CONFIG_KUNIT is not enabled, these stubs quietly disappear. */
-#define KUNIT_TRIGGER_STATIC_STUB(real_fn_name, args...) do {} while (0)
+#define KUNIT_STATIC_STUB_REDIRECT(real_fn_name, args...) do {} while (0)
#else
@@ -30,7 +30,7 @@
* This is a function prologue which is used to allow calls to the current
* function to be redirected by a KUnit test. KUnit tests can call
* kunit_activate_static_stub() to pass a replacement function in. The
- * replacement function will be called by KUNIT_TRIGGER_STATIC_STUB(), which
+ * replacement function will be called by KUNIT_STATIC_STUB_REDIRECT(), which
* will then return from the function. If the caller is not in a KUnit context,
* the function will continue execution as normal.
*
@@ -87,7 +87,7 @@ void __kunit_activate_static_stub(struct kunit *test,
* When activated, calls to real_fn_addr from within this test (even if called
* indirectly) will instead call replacement_addr. The function pointed to by
* real_fn_addr must begin with the static stub prologue in
- * KUNIT_TRIGGER_STATIC_STUB() for this to work. real_fn_addr and
+ * KUNIT_STATIC_STUB_REDIRECT() for this to work. real_fn_addr and
* replacement_addr must have the same type.
*
* The redirection can be disabled again with kunit_deactivate_static_stub().
--
2.30.0
*Changes in v29:*
- Polish IOCTL and improve documentation
*Changes in v28:*
- Fix walk_end and add 17 test cases in selftests patch
*Changes in v27:*
- Handle review comments and minor improvements
- Add performance improvement patch on top with test for easy review
*Changes in v26:*
- Code restructuring and API changes in PAGEMAP_IOCTL
*Changes in v25*:
- Do proper filtering on hole as well (hole got missed earlier)
*Changes in v24*:
- Rebase on top of next-20230710
- Place WP markers in case of hole as well
*Changes in v23*:
- Set vec_buf_index in loop only when vec_buf_index is set
- Return -EFAULT instead of -EINVAL if vec is NULL
- Correctly return the walk ending address to the page granularity
*Changes in v22*:
- Interface change:
- Replace [start start + len) with [start, end)
- Return the ending address of the address walk in start
*Changes in v21*:
- Abort walk instead of returning error if WP is to be performed on
partial hugetlb
*Changes in v20*
- Correct PAGE_IS_FILE and add PAGE_IS_PFNZERO
*Changes in v19*
- Minor changes and interface updates
*Changes in v18*
- Rebase on top of next-20230613
- Minor updates
*Changes in v17*
- Rebase on top of next-20230606
- Minor improvements in PAGEMAP_SCAN IOCTL patch
*Changes in v16*
- Fix a corner case
- Add exclusive PM_SCAN_OP_WP back
*Changes in v15*
- Build fix (Add missed build fix in RESEND)
*Changes in v14*
- Fix build error caused by #ifdef added at last minute in some configs
*Changes in v13*
- Rebase on top of next-20230414
- Give-up on using uffd_wp_range() and write new helpers, flush tlb only
once
*Changes in v12*
- Update and other memory types to UFFD_FEATURE_WP_ASYNC
- Rebase on top of next-20230406
- Review updates
*Changes in v11*
- Rebase on top of next-20230307
- Base patches on UFFD_FEATURE_WP_UNPOPULATED
- Do a lot of cosmetic changes and review updates
- Remove ENGAGE_WP + !GET operation as it can be performed with
UFFDIO_WRITEPROTECT
*Changes in v10*
- Add specific condition to return error if hugetlb is used with wp
async
- Move changes in tools/include/uapi/linux/fs.h to separate patch
- Add documentation
*Changes in v9:*
- Correct fault resolution for userfaultfd wp async
- Fix build warnings and errors which were happening on some configs
- Simplify pagemap ioctl's code
*Changes in v8:*
- Update uffd async wp implementation
- Improve PAGEMAP_IOCTL implementation
*Changes in v7:*
- Add uffd wp async
- Update the IOCTL to use uffd under the hood instead of soft-dirty
flags
*Motivation*
The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows
GetWriteWatch() and ResetWriteWatch() syscalls [1]. The GetWriteWatch()
retrieves the addresses of the pages that are written to in a region of
virtual memory.
This syscall is used in Windows applications and games etc. This syscall is
being emulated in pretty slow manner in userspace. Our purpose is to
enhance the kernel such that we translate it efficiently in a better way.
Currently some out of tree hack patches are being used to efficiently
emulate it in some kernels. We intend to replace those with these patches.
So the whole gaming on Linux can effectively get benefit from this. It
means there would be tons of users of this code.
CRIU use case [2] was mentioned by Andrei and Danylo:
> Use cases for migrating sparse VMAs are binaries sanitized with ASAN,
> MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of
> shadow memory [4]. Being able to migrate such binaries allows to highly
> reduce the amount of work needed to identify and fix post-migration
> crashes, which happen constantly.
Andrei defines the following uses of this code:
* it is more granular and allows us to track changed pages more
effectively. The current interface can clear dirty bits for the entire
process only. In addition, reading info about pages is a separate
operation. It means we must freeze the process to read information
about all its pages, reset dirty bits, only then we can start dumping
pages. The information about pages becomes more and more outdated,
while we are processing pages. The new interface solves both these
downsides. First, it allows us to read pte bits and clear the
soft-dirty bit atomically. It means that CRIU will not need to freeze
processes to pre-dump their memory. Second, it clears soft-dirty bits
for a specified region of memory. It means CRIU will have actual info
about pages to the moment of dumping them.
* The new interface has to be much faster because basic page filtering
is happening in the kernel. With the old interface, we have to read
pagemap for each page.
*Implementation Evolution (Short Summary)*
From the definition of GetWriteWatch(), we feel like kernel's soft-dirty
feature can be used under the hood with some additions like:
* reset soft-dirty flag for only a specific region of memory instead of
clearing the flag for the entire process
* get and clear soft-dirty flag for a specific region atomically
So we decided to use ioctl on pagemap file to read or/and reset soft-dirty
flag. But using soft-dirty flag, sometimes we get extra pages which weren't
even written. They had become soft-dirty because of VMA merging and
VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were
able to bypass this shortcoming by ignoring VM_SOFTDIRTY until David
reported that mprotect etc messes up the soft-dirty flag while ignoring
VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We
discussed whether we could revert these patches, but we could not reach any
conclusion. So at this point, I made a couple of attempts to solve this whole
VM_SOFTDIRTY issue by correcting the soft-dirty implementation:
* [7] Correct the bug fixed wrongly back in 2014. It had potential to cause
regression. We left it behind.
* [8] Keep a list of the soft-dirty parts of a VMA across splits and merges. I
got the reply: don't increase the size of the VMA by 8 bytes.
At this point, we left soft-dirty behind, considering it too delicate, and
userfaultfd [9] seemed like the only way forward. From there onward, we
have been basing soft-dirty emulation on userfaultfd wp feature where
kernel resolves the faults itself when WP_ASYNC feature is used. It was
straightforward to add the WP_ASYNC feature to userfaultfd. Now we get only
those pages reported as dirty or written-to which were really written. (PS:
another userfaultfd feature, WP_UNPOPULATED, is also required; it is
needed to avoid pre-faulting memory before write-protecting [9].)
All the different masks were added at the request of the CRIU devs to make
the interface more generic and better.
[1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-…
[2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com
[3] https://github.com/google/sanitizers
[4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit
[5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com
[6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/
[7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com
[10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com
* Original Cover letter from v8*
Hello,
Note:
Soft-dirty pages and pages which have been written-to are synonyms. As
the kernel already has a soft-dirty feature which we have given up on
using, we use the written-to terminology while using UFFD async WP under
the hood.
It is possible to find and clear soft-dirty pages entirely in userspace.
But it isn't efficient:
- The mprotect and SIGSEGV handler for bookkeeping
- The userfaultfd wp (synchronous) with the handler for bookkeeping
Some benchmarks can be seen here[1]. This series adds features that weren't
present earlier:
- There is no atomic get soft-dirty/Written-to status and clear present in
the kernel.
- The pages which have been written-to can not be found in accurate way.
(Kernel's soft-dirty PTE bit + soft-dirty VMA bit shows more soft-dirty
pages than there actually are.)
Historically, soft-dirty PTE bit tracking has been used in the CRIU
project. The procfs interface is enough for finding the soft-dirty bit
status and clearing the soft-dirty bit of all the pages of a process.
We have the use case where we need to track the soft-dirty PTE bit for
only specific pages on-demand. We need this tracking and clear mechanism
of a region of memory while the process is running to emulate the
getWriteWatch() syscall of Windows.
*(Moved to using UFFD instead of soft-dirty feature to find pages which
have been written-to from v7 patch series)*:
Stop using the soft-dirty flags for finding which pages have been
written to. It is too delicate and wrong as it shows more soft-dirty
pages than the actual soft-dirty pages. There is no interest in
correcting it [2][3] as this is how the feature was written years ago.
It shouldn't be updated to changed behaviour. Peter Xu has suggested
using the async version of the UFFD WP [4] as it is based inherently
on the PTEs.
So in this patch series, I've added a new mode to the UFFD which is
asynchronous version of the write protect. When this variant of the
UFFD WP is used, the page faults are resolved automatically by the
kernel. The pages which have been written-to can be found by reading
pagemap file (!PM_UFFD_WP). This feature can be used successfully to
find which pages have been written to from the time the pages were
write protected. This works just like the soft-dirty flag without
showing any extra pages which aren't soft-dirty in reality.
The information related to pages if the page is file mapped, present and
swapped is required for the CRIU project [5][6]. The addition of the
required mask, any mask, excluded mask and return masks are also required
for the CRIU project [5].
The IOCTL returns the addresses of the pages which match the specific
masks. The page addresses are returned in struct page_region in a compact
form. The max_pages is needed to support a use case where user only wants
to get a specific number of pages. So there is no need to find all the
pages of interest in the range when max_pages is specified. The IOCTL
returns when the maximum number of the pages are found. The max_pages is
optional. If max_pages is specified, it must be equal or greater than the
vec_size. This restriction is needed to handle the worst case when one
page_region only contains info of one page and it cannot be compacted.
This is needed to emulate the Windows getWriteWatch() syscall.
The patch series include the detailed selftest which can be used as an
example for the uffd async wp test and PAGEMAP_IOCTL. It shows the
interface usages as well.
[1] https://lore.kernel.org/lkml/54d4c322-cd6e-eefd-b161-2af2b56aae24@collabora…
[2] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[3] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[4] https://lore.kernel.org/all/Y6Hc2d+7eTKs7AiH@x1n
[5] https://lore.kernel.org/all/YyiDg79flhWoMDZB@gmail.com/
[6] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com/
Regards,
Muhammad Usama Anjum
Muhammad Usama Anjum (5):
fs/proc/task_mmu: Implement IOCTL to get and optionally clear info
about PTEs
fs/proc/task_mmu: Add fast paths to get/clear PAGE_IS_WRITTEN flag
tools headers UAPI: Update linux/fs.h with the kernel sources
mm/pagemap: add documentation of PAGEMAP_SCAN IOCTL
selftests: mm: add pagemap ioctl tests
Peter Xu (1):
userfaultfd: UFFD_FEATURE_WP_ASYNC
Documentation/admin-guide/mm/pagemap.rst | 89 +
Documentation/admin-guide/mm/userfaultfd.rst | 35 +
fs/proc/task_mmu.c | 709 ++++++++
fs/userfaultfd.c | 26 +-
include/linux/hugetlb.h | 1 +
include/linux/userfaultfd_k.h | 21 +-
include/uapi/linux/fs.h | 59 +
include/uapi/linux/userfaultfd.h | 9 +-
mm/hugetlb.c | 34 +-
mm/memory.c | 27 +-
tools/include/uapi/linux/fs.h | 59 +
tools/testing/selftests/mm/.gitignore | 2 +
tools/testing/selftests/mm/Makefile | 3 +-
tools/testing/selftests/mm/config | 1 +
tools/testing/selftests/mm/pagemap_ioctl.c | 1660 ++++++++++++++++++
tools/testing/selftests/mm/run_vmtests.sh | 4 +
16 files changed, 2715 insertions(+), 24 deletions(-)
create mode 100644 tools/testing/selftests/mm/pagemap_ioctl.c
--
2.40.1
On Fri, Aug 11, 2023 at 05:59:27PM +0200, Petr Machata wrote:
+ Shuah Khan <shuah(a)kernel.org>, linux-kselftest(a)vger.kernel.org
> This test verifies whether the encapsulated packets have the correct
> configured TTL. It does so by sending ICMP packets through the test
> topology and mirroring them to a gretap netdevice. On a busy host
> however, more than just the test ICMP packets may end up flowing
> through the topology, get mirrored, and counted. This leads to
> potential spurious failures as the test observes much more mirrored
> packets than the sent test packets, and assumes a bug.
>
> Fix this by tightening up the mirror action match. Change it from
> matchall to a flower classifier matching on ICMP packets specifically.
>
> Fixes: 45315673e0c5 ("selftests: forwarding: Test changes in mirror-to-gretap")
> Signed-off-by: Petr Machata <petrm(a)nvidia.com>
> Tested-by: Mirsad Todorovac <mirsad.todorovac(a)alu.unizg.hr>
> Reviewed-by: Ido Schimmel <idosch(a)nvidia.com>
Reviewed-by: Simon Horman <horms(a)kernel.org>
> ---
> tools/testing/selftests/net/forwarding/mirror_gre_changes.sh | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
> index aff88f78e339..5ea9d63915f7 100755
> --- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
> +++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
> @@ -72,7 +72,8 @@ test_span_gre_ttl()
>
> RET=0
>
> - mirror_install $swp1 ingress $tundev "matchall $tcflags"
> + mirror_install $swp1 ingress $tundev \
> + "prot ip flower $tcflags ip_prot icmp"
> tc filter add dev $h3 ingress pref 77 prot $prot \
> flower skip_hw ip_ttl 50 action pass
>
> --
> 2.41.0
>
>
*Changes in v28:*
- Fix walk_end and add 17 test cases in selftests patch
*Changes in v27:*
- Handle review comments and minor improvements
- Add performance improvement patch on top with test for easy review
*Changes in v26:*
- Code restructuring and API changes in PAGEMAP_IOCTL
*Changes in v25*:
- Do proper filtering on hole as well (hole got missed earlier)
*Changes in v24*:
- Rebase on top of next-20230710
- Place WP markers in case of hole as well
*Changes in v23*:
- Set vec_buf_index in loop only when vec_buf_index is set
- Return -EFAULT instead of -EINVAL if vec is NULL
- Correctly return the walk ending address to the page granularity
*Changes in v22*:
- Interface change:
- Replace [start start + len) with [start, end)
- Return the ending address of the address walk in start
*Changes in v21*:
- Abort walk instead of returning error if WP is to be performed on
partial hugetlb
*Changes in v20*
- Correct PAGE_IS_FILE and add PAGE_IS_PFNZERO
*Changes in v19*
- Minor changes and interface updates
*Changes in v18*
- Rebase on top of next-20230613
- Minor updates
*Changes in v17*
- Rebase on top of next-20230606
- Minor improvements in PAGEMAP_SCAN IOCTL patch
*Changes in v16*
- Fix a corner case
- Add exclusive PM_SCAN_OP_WP back
*Changes in v15*
- Build fix (Add missed build fix in RESEND)
*Changes in v14*
- Fix build error caused by #ifdef added at last minute in some configs
*Changes in v13*
- Rebase on top of next-20230414
- Give-up on using uffd_wp_range() and write new helpers, flush tlb only
once
*Changes in v12*
- Update and other memory types to UFFD_FEATURE_WP_ASYNC
- Rebase on top of next-20230406
- Review updates
*Changes in v11*
- Rebase on top of next-20230307
- Base patches on UFFD_FEATURE_WP_UNPOPULATED
- Do a lot of cosmetic changes and review updates
- Remove ENGAGE_WP + !GET operation as it can be performed with
UFFDIO_WRITEPROTECT
*Changes in v10*
- Add specific condition to return error if hugetlb is used with wp
async
- Move changes in tools/include/uapi/linux/fs.h to separate patch
- Add documentation
*Changes in v9:*
- Correct fault resolution for userfaultfd wp async
- Fix build warnings and errors which were happening on some configs
- Simplify pagemap ioctl's code
*Changes in v8:*
- Update uffd async wp implementation
- Improve PAGEMAP_IOCTL implementation
*Changes in v7:*
- Add uffd wp async
- Update the IOCTL to use uffd under the hood instead of soft-dirty
flags
*Motivation*
The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows
GetWriteWatch() and ResetWriteWatch() syscalls [1]. The GetWriteWatch()
retrieves the addresses of the pages that are written to in a region of
virtual memory.
This syscall is used in Windows applications and games etc. This syscall is
being emulated in pretty slow manner in userspace. Our purpose is to
enhance the kernel such that we translate it efficiently in a better way.
Currently some out of tree hack patches are being used to efficiently
emulate it in some kernels. We intend to replace those with these patches.
So the whole gaming on Linux can effectively get benefit from this. It
means there would be tons of users of this code.
CRIU use case [2] was mentioned by Andrei and Danylo:
> Use cases for migrating sparse VMAs are binaries sanitized with ASAN,
> MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of
> shadow memory [4]. Being able to migrate such binaries allows to highly
> reduce the amount of work needed to identify and fix post-migration
> crashes, which happen constantly.
Andrei defines the following uses of this code:
* it is more granular and allows us to track changed pages more
effectively. The current interface can clear dirty bits for the entire
process only. In addition, reading info about pages is a separate
operation. It means we must freeze the process to read information
about all its pages, reset dirty bits, only then we can start dumping
pages. The information about pages becomes more and more outdated,
while we are processing pages. The new interface solves both these
downsides. First, it allows us to read pte bits and clear the
soft-dirty bit atomically. It means that CRIU will not need to freeze
processes to pre-dump their memory. Second, it clears soft-dirty bits
for a specified region of memory. It means CRIU will have actual info
about pages to the moment of dumping them.
* The new interface has to be much faster because basic page filtering
is happening in the kernel. With the old interface, we have to read
pagemap for each page.
*Implementation Evolution (Short Summary)*
From the definition of GetWriteWatch(), we feel like kernel's soft-dirty
feature can be used under the hood with some additions like:
* reset soft-dirty flag for only a specific region of memory instead of
clearing the flag for the entire process
* get and clear soft-dirty flag for a specific region atomically
So we decided to use ioctl on pagemap file to read or/and reset soft-dirty
flag. But using soft-dirty flag, sometimes we get extra pages which weren't
even written. They had become soft-dirty because of VMA merging and
VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were
able to bypass this shortcoming by ignoring VM_SOFTDIRTY until David
reported that mprotect etc messes up the soft-dirty flag while ignoring
VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We
discussed whether we could revert these patches, but we could not reach any
conclusion. So at this point, I made a couple of attempts to solve this whole
VM_SOFTDIRTY issue by correcting the soft-dirty implementation:
* [7] Correct the bug fixed wrongly back in 2014. It had potential to cause
regression. We left it behind.
* [8] Keep a list of the soft-dirty parts of a VMA across splits and merges. I
got the reply: don't increase the size of the VMA by 8 bytes.
At this point, we left soft-dirty behind, considering it too delicate, and
userfaultfd [9] seemed like the only way forward. From there onward, we
have been basing soft-dirty emulation on userfaultfd wp feature where
kernel resolves the faults itself when WP_ASYNC feature is used. It was
straightforward to add the WP_ASYNC feature to userfaultfd. Now we get only
those pages reported as dirty or written-to which were really written. (PS:
another userfaultfd feature, WP_UNPOPULATED, is also required; it is
needed to avoid pre-faulting memory before write-protecting [9].)
All the different masks were added at the request of the CRIU devs to make
the interface more generic and better.
[1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-…
[2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com
[3] https://github.com/google/sanitizers
[4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit
[5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com
[6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/
[7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com
[10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com
* Original Cover letter from v8*
Hello,
Note:
Soft-dirty pages and pages which have been written-to are synonyms. As
the kernel already has a soft-dirty feature which we have given up on
using, we use the written-to terminology while using UFFD async WP under
the hood.
It is possible to find and clear soft-dirty pages entirely in userspace.
But it isn't efficient:
- The mprotect and SIGSEGV handler for bookkeeping
- The userfaultfd wp (synchronous) with the handler for bookkeeping
Some benchmarks can be seen here[1]. This series adds features that weren't
present earlier:
- There is no atomic get soft-dirty/Written-to status and clear present in
the kernel.
- The pages which have been written-to can not be found in accurate way.
(Kernel's soft-dirty PTE bit + soft-dirty VMA bit shows more soft-dirty
pages than there actually are.)
Historically, soft-dirty PTE bit tracking has been used in the CRIU
project. The procfs interface is enough for finding the soft-dirty bit
status and clearing the soft-dirty bit of all the pages of a process.
We have the use case where we need to track the soft-dirty PTE bit for
only specific pages on-demand. We need this tracking and clear mechanism
of a region of memory while the process is running to emulate the
getWriteWatch() syscall of Windows.
*(Moved to using UFFD instead of soft-dirty feature to find pages which
have been written-to from v7 patch series)*:
Stop using the soft-dirty flags for finding which pages have been
written to. It is too delicate and wrong as it shows more soft-dirty
pages than the actual soft-dirty pages. There is no interest in
correcting it [2][3] as this is how the feature was written years ago.
It shouldn't be updated to changed behaviour. Peter Xu has suggested
using the async version of the UFFD WP [4] as it is based inherently
on the PTEs.
So in this patch series, I've added a new mode to the UFFD which is
asynchronous version of the write protect. When this variant of the
UFFD WP is used, the page faults are resolved automatically by the
kernel. The pages which have been written-to can be found by reading
pagemap file (!PM_UFFD_WP). This feature can be used successfully to
find which pages have been written to from the time the pages were
write protected. This works just like the soft-dirty flag without
showing any extra pages which aren't soft-dirty in reality.
The information related to pages if the page is file mapped, present and
swapped is required for the CRIU project [5][6]. The addition of the
required mask, any mask, excluded mask and return masks are also required
for the CRIU project [5].
The IOCTL returns the addresses of the pages which match the specific
masks. The page addresses are returned in struct page_region in a compact
form. The max_pages is needed to support a use case where user only wants
to get a specific number of pages. So there is no need to find all the
pages of interest in the range when max_pages is specified. The IOCTL
returns when the maximum number of the pages are found. The max_pages is
optional. If max_pages is specified, it must be equal or greater than the
vec_size. This restriction is needed to handle the worst case when one
page_region only contains info of one page and it cannot be compacted.
This is needed to emulate the Windows getWriteWatch() syscall.
The patch series include the detailed selftest which can be used as an
example for the uffd async wp test and PAGEMAP_IOCTL. It shows the
interface usages as well.
[1] https://lore.kernel.org/lkml/54d4c322-cd6e-eefd-b161-2af2b56aae24@collabora…
[2] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[3] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[4] https://lore.kernel.org/all/Y6Hc2d+7eTKs7AiH@x1n
[5] https://lore.kernel.org/all/YyiDg79flhWoMDZB@gmail.com/
[6] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com/
Regards,
Muhammad Usama Anjum
Muhammad Usama Anjum (5):
fs/proc/task_mmu: Implement IOCTL to get and optionally clear info
about PTEs
fs/proc/task_mmu: Add fast paths to get/clear PAGE_IS_WRITTEN flag
tools headers UAPI: Update linux/fs.h with the kernel sources
mm/pagemap: add documentation of PAGEMAP_SCAN IOCTL
selftests: mm: add pagemap ioctl tests
Peter Xu (1):
userfaultfd: UFFD_FEATURE_WP_ASYNC
Documentation/admin-guide/mm/pagemap.rst | 64 +
Documentation/admin-guide/mm/userfaultfd.rst | 35 +
fs/proc/task_mmu.c | 716 ++++++++
fs/userfaultfd.c | 26 +-
include/linux/hugetlb.h | 1 +
include/linux/userfaultfd_k.h | 21 +-
include/uapi/linux/fs.h | 59 +
include/uapi/linux/userfaultfd.h | 9 +-
mm/hugetlb.c | 34 +-
mm/memory.c | 27 +-
tools/include/uapi/linux/fs.h | 59 +
tools/testing/selftests/mm/.gitignore | 2 +
tools/testing/selftests/mm/Makefile | 3 +-
tools/testing/selftests/mm/config | 1 +
tools/testing/selftests/mm/pagemap_ioctl.c | 1658 ++++++++++++++++++
tools/testing/selftests/mm/run_vmtests.sh | 4 +
16 files changed, 2695 insertions(+), 24 deletions(-)
create mode 100644 tools/testing/selftests/mm/pagemap_ioctl.c
--
2.39.2
Hi,
This follows the discussion here:
https://lore.kernel.org/linux-kselftest/20230324123157.bbwvfq4gsxnlnfwb@hou…
This shows a couple of inconsistencies with regard to how device-managed
resources are cleaned up. Basically, devm resources will only be cleaned up
if the device is attached to a bus and bound to a driver. Failing any of
these cases, a call to device_unregister will not end up in the devm
resources being released.
We had to work around it in DRM to provide helpers to create a device for
kunit tests, but the current discussion around creating similar, generic,
helpers for kunit resumed interest in fixing this.
This can be tested using the command:
./tools/testing/kunit/kunit.py run --kunitconfig=drivers/base/test/
I added the fix David suggested back in that discussion which does fix
the tests. The SoB is missing, since David didn't provide it back then.
Let me know what you think,
Maxime
Signed-off-by: Maxime Ripard <mripard(a)kernel.org>
---
Changes in v3:
- Reworded the commit logs according to David's feedback
- Rebased on current next
- Link to v2: https://lore.kernel.org/r/20230329-kunit-devm-inconsistencies-test-v2-0-19f…
Changes in v2:
- Use an init function
- Document the tests
- Add a fix for the bugs
- Link to v1: https://lore.kernel.org/r/20230329-kunit-devm-inconsistencies-test-v1-0-c33…
---
David Gow (1):
drivers: base: Free devm resources when unregistering a device
Maxime Ripard (2):
drivers: base: Add basic devm tests for root devices
drivers: base: Add basic devm tests for platform devices
drivers/base/core.c | 11 ++
drivers/base/test/.kunitconfig | 2 +
drivers/base/test/Kconfig | 4 +
drivers/base/test/Makefile | 3 +
drivers/base/test/platform-device-test.c | 220 +++++++++++++++++++++++++++++++
drivers/base/test/root-device-test.c | 108 +++++++++++++++
6 files changed, 348 insertions(+)
---
base-commit: c58c49dd89324b18a812762a2bfa5a0458e4f252
change-id: 20230329-kunit-devm-inconsistencies-test-5e5a7d01e60d
Best regards,
--
Maxime Ripard <mripard(a)kernel.org>
A previous fixup to this commit fixed one issue, but introduced another:
we're now overly strict when validating the src address for UFFDIO_COPY.
Most of the validation in validate_range is useful to apply to src as
well as dst, but page alignment is only a requirement for dst, not src.
So, split the function up so src can use an "unaligned" variant, while
still allowing us to share the majority of the code between the
different cases.
Reported-by: Ryan Roberts <ryan.roberts(a)arm.com>
Closes: https://lore.kernel.org/linux-mm/8fbb5965-28f7-4e9a-ac04-1406ed8fc2d4@arm.c…
Signed-off-by: Axel Rasmussen <axelrasmussen(a)google.com>
---
fs/userfaultfd.c | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bb5c474a0a77..1091cb461747 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1287,13 +1287,11 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
__wake_userfault(ctx, range);
}
-static __always_inline int validate_range(struct mm_struct *mm,
- __u64 start, __u64 len)
+static __always_inline int validate_unaligned_range(
+ struct mm_struct *mm, __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- if (start & ~PAGE_MASK)
- return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
@@ -1309,6 +1307,15 @@ static __always_inline int validate_range(struct mm_struct *mm,
return 0;
}
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ return validate_unaligned_range(mm, start, len);
+}
+
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
@@ -1759,7 +1766,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, uffdio_copy.src, uffdio_copy.len);
+ ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
+ uffdio_copy.len);
if (ret)
goto out;
ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
--
2.41.0.640.ga95def55d0-goog
As is described in the "How to use MPTCP?" section in MPTCP wiki [1]:
"Your app should create sockets with IPPROTO_MPTCP as the proto:
( socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP); ). Legacy apps can be
forced to create and use MPTCP sockets instead of TCP ones via the
mptcpize command bundled with the mptcpd daemon."
But the mptcpize (LD_PRELOAD technique) command has some limitations
[2]:
- it doesn't work if the application is not using libc (e.g. GoLang
apps)
- in some envs, it might not be easy to set env vars / change the way
apps are launched, e.g. on Android
- mptcpize needs to be launched with all apps that want MPTCP: we could
have more control from BPF to enable MPTCP only for some apps or all the
ones of a netns or a cgroup, etc.
- it is not in BPF, we cannot talk about it at netdev conf.
So this patchset attempts to use BPF to implement functions similar to
mptcpize.
The main idea is to add a hook in sys_socket() to change the protocol id
from IPPROTO_TCP (or 0) to IPPROTO_MPTCP.
[1]
https://github.com/multipath-tcp/mptcp_net-next/wiki
[2]
https://github.com/multipath-tcp/mptcp_net-next/issues/79
v11:
- add comments about outputs of 'ss' and 'nstat'.
- use "err = verify_mptcpify()" instead of using =+.
v10:
- drop "#ifdef CONFIG_BPF_JIT".
- include vmlinux.h and bpf_tracing_net.h to avoid defining some
macros.
- drop unneeded checks for mptcp.
v9:
- update comment for 'update_socket_protocol'.
v8:
- drop the additional checks on the 'protocol' value after the
'update_socket_protocol()' call.
v7:
- add __weak and __diag_* for update_socket_protocol.
v6:
- add update_socket_protocol.
v5:
- add bpf_mptcpify helper.
v4:
- use lsm_cgroup/socket_create
v3:
- patch 8: char cmd[128]; -> char cmd[256];
v2:
- Fix build selftests errors reported by CI
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/79
Geliang Tang (5):
bpf: Add update_socket_protocol hook
selftests/bpf: Use random netns name for mptcp
selftests/bpf: Add two mptcp netns helpers
selftests/bpf: Drop unneeded checks for mptcp
selftests/bpf: Add mptcpify test
net/mptcp/bpf.c | 15 ++
net/socket.c | 24 +++
.../testing/selftests/bpf/prog_tests/mptcp.c | 139 +++++++++++++++---
tools/testing/selftests/bpf/progs/mptcpify.c | 20 +++
4 files changed, 179 insertions(+), 19 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/mptcpify.c
--
2.35.3
iommufd gives userspace the capability to manipulate the iommu subsystem.
e.g. DMA map/unmap etc. In the near future, it will support iommu nested
translation. Different platform vendors have different implementation for
the nested translation. For example, Intel VT-d supports using guest I/O
page table as the stage-1 translation table. This requires guest I/O page
table be compatible with hardware IOMMU. So before setting up nested translation,
userspace needs to know the hardware iommu information to understand the
nested translation requirements.
This series reports the iommu hardware information for a given device
which has been bound to iommufd. It is preparation work for userspace to
allocate hwpt for a given device, as in the nested translation support [1].
This series introduces an iommu op to report the iommu hardware info,
and an ioctl IOMMU_GET_HW_INFO is added to report such hardware info to
user. enum iommu_hw_info_type is defined to differentiate the iommu hardware
info reported to user hence user can decode them. This series only adds the
framework for iommu hw info reporting, the complete reporting path needs vendor
specific definition and driver support. The full code is available in [1]
as well.
[1] https://github.com/yiliu1765/iommufd/tree/wip/iommufd_nesting_08082023-yi
(only the hw_info report path is the latest, other parts are wip)
Change log:
v6:
- Add Jingqi's comment on patch 02
- Add Baolu's r-b to patch 03
- Address Jason's comment on patch 03
v5: https://lore.kernel.org/linux-iommu/20230803143144.200945-1-yi.l.liu@intel.…
- Return hw_info_type in the .hw_info op, hence drop hw_info_type field in iommu_ops (Kevin)
- Add Jason's r-b for patch 01
- Address coding style comments from Jason and Kevin w.r.t. patch 02, 03 and 04
v4: https://lore.kernel.org/linux-iommu/20230724105936.107042-1-yi.l.liu@intel.…
- Rename ioctl to IOMMU_GET_HW_INFO and structure to iommu_hw_info
- Move the iommufd_get_hw_info handler to main.c
- Place iommu_hw_info prior to iommu_hwpt_alloc
- Update the function namings accordingly
- Update uapi kdocs
v3: https://lore.kernel.org/linux-iommu/20230511143024.19542-1-yi.l.liu@intel.c…
- Add r-b from Baolu
- Rename IOMMU_HW_INFO_TYPE_DEFAULT to be IOMMU_HW_INFO_TYPE_NONE to
better suit what it means
- Let IOMMU_DEVICE_GET_HW_INFO succeed even if the underlying iommu driver
does not have driver-specific data to report per below remark.
https://lore.kernel.org/kvm/ZAcwJSK%2F9UVI9LXu@nvidia.com/
v2: https://lore.kernel.org/linux-iommu/20230309075358.571567-1-yi.l.liu@intel.…
- Drop patch 05 of v1 as it is already covered by other series
- Rename the capability info to be iommu hardware info
v1: https://lore.kernel.org/linux-iommu/20230209041642.9346-1-yi.l.liu@intel.co…
Regards,
Yi Liu
Lu Baolu (1):
iommu: Add new iommu op to get iommu hardware information
Nicolin Chen (1):
iommufd/selftest: Add coverage for IOMMU_GET_HW_INFO ioctl
Yi Liu (2):
iommu: Move dev_iommu_ops() to private header
iommufd: Add IOMMU_GET_HW_INFO
drivers/iommu/iommu-priv.h | 11 +++
drivers/iommu/iommufd/iommufd_test.h | 9 ++
drivers/iommu/iommufd/main.c | 97 +++++++++++++++++++
drivers/iommu/iommufd/selftest.c | 16 +++
include/linux/iommu.h | 20 ++--
include/uapi/linux/iommufd.h | 45 +++++++++
tools/testing/selftests/iommu/iommufd.c | 17 +++-
tools/testing/selftests/iommu/iommufd_utils.h | 26 +++++
8 files changed, 229 insertions(+), 12 deletions(-)
--
2.34.1
Add new feature checks and provide a test item that supports capturing
the SIGBUS exception signal.
The following is a log snippet from my local testing environment:
~~~
TAP version 13
1..90
# CRC32 present
ok 1 cpuinfo_match_CRC32
ok 2 sigill_CRC32
ok 3 # SKIP sigbus_CRC32
ok 4 cpuinfo_match_CSSC
# sigill_reported for CSSC
ok 5 # SKIP sigill_CSSC
ok 6 # SKIP sigbus_CSSC
# FP present
ok 7 cpuinfo_match_FP
ok 8 sigill_FP
ok 9 # SKIP sigbus_FP
# LRCPC present
ok 10 cpuinfo_match_LRCPC
ok 11 sigill_LRCPC
ok 12 # SKIP sigbus_LRCPC
# LRCPC2 present
ok 13 cpuinfo_match_LRCPC2
ok 14 sigill_LRCPC2
ok 15 # SKIP sigbus_LRCPC2
# LSE present
ok 16 cpuinfo_match_LSE
ok 17 sigill_LSE
ok 18 # SKIP sigbus_LSE
# LSE2 present
ok 19 cpuinfo_match_LSE2
ok 20 sigill_LSE2
ok 21 sigbus_LSE2
ok 22 cpuinfo_match_MOPS
ok 23 sigill_MOPS
ok 24 # SKIP sigbus_MOPS
# RNG present
ok 25 cpuinfo_match_RNG
ok 26 sigill_RNG
ok 27 # SKIP sigbus_RNG
ok 28 cpuinfo_match_RPRFM
ok 29 # SKIP sigill_RPRFM
ok 30 # SKIP sigbus_RPRFM
ok 31 cpuinfo_match_SME
ok 32 sigill_SME
ok 33 # SKIP sigbus_SME
ok 34 cpuinfo_match_SME2
ok 35 sigill_SME2
ok 36 # SKIP sigbus_SME2
ok 37 cpuinfo_match_SME 2.1
# sigill_reported for SME 2.1
ok 38 # SKIP sigill_SME 2.1
ok 39 # SKIP sigbus_SME 2.1
ok 40 cpuinfo_match_SME I16I32
# sigill_reported for SME I16I32
ok 41 # SKIP sigill_SME I16I32
ok 42 # SKIP sigbus_SME I16I32
ok 43 cpuinfo_match_SME BI32I32
# sigill_reported for SME BI32I32
ok 44 # SKIP sigill_SME BI32I32
ok 45 # SKIP sigbus_SME BI32I32
ok 46 cpuinfo_match_SME B16B16
# sigill_reported for SME B16B16
ok 47 # SKIP sigill_SME B16B16
ok 48 # SKIP sigbus_SME B16B16
ok 49 cpuinfo_match_SME F16F16
# sigill_reported for SME F16F16
ok 50 # SKIP sigill_SME F16F16
ok 51 # SKIP sigbus_SME F16F16
# SVE present
ok 52 cpuinfo_match_SVE
ok 53 sigill_SVE
ok 54 # SKIP sigbus_SVE
ok 55 cpuinfo_match_SVE 2
# sigill_reported for SVE 2
ok 56 # SKIP sigill_SVE 2
ok 57 # SKIP sigbus_SVE 2
ok 58 cpuinfo_match_SVE 2.1
# sigill_reported for SVE 2.1
ok 59 # SKIP sigill_SVE 2.1
ok 60 # SKIP sigbus_SVE 2.1
ok 61 cpuinfo_match_SVE AES
# sigill_reported for SVE AES
ok 62 # SKIP sigill_SVE AES
ok 63 # SKIP sigbus_SVE AES
ok 64 cpuinfo_match_SVE2 PMULL
# sigill_reported for SVE2 PMULL
ok 65 # SKIP sigill_SVE2 PMULL
ok 66 # SKIP sigbus_SVE2 PMULL
ok 67 cpuinfo_match_SVE2 BITPERM
# sigill_reported for SVE2 BITPERM
ok 68 # SKIP sigill_SVE2 BITPERM
ok 69 # SKIP sigbus_SVE2 BITPERM
ok 70 cpuinfo_match_SVE2 SHA3
# sigill_reported for SVE2 SHA3
ok 71 # SKIP sigill_SVE2 SHA3
ok 72 # SKIP sigbus_SVE2 SHA3
ok 73 cpuinfo_match_SVE2 SM4
# sigill_reported for SVE2 SM4
ok 74 # SKIP sigill_SVE2 SM4
ok 75 # SKIP sigbus_SVE2 SM4
# SVE2 I8MM present
ok 76 cpuinfo_match_SVE2 I8MM
ok 77 sigill_SVE2 I8MM
ok 78 # SKIP sigbus_SVE2 I8MM
# SVE2 F32MM present
ok 79 cpuinfo_match_SVE2 F32MM
ok 80 sigill_SVE2 F32MM
ok 81 # SKIP sigbus_SVE2 F32MM
# SVE2 F64MM present
ok 82 cpuinfo_match_SVE2 F64MM
ok 83 sigill_SVE2 F64MM
ok 84 # SKIP sigbus_SVE2 F64MM
# SVE2 BF16 present
ok 85 cpuinfo_match_SVE2 BF16
ok 86 sigill_SVE2 BF16
ok 87 # SKIP sigbus_SVE2 BF16
ok 88 cpuinfo_match_SVE2 EBF16
ok 89 # SKIP sigill_SVE2 EBF16
ok 90 # SKIP sigbus_SVE2 EBF16
# Totals: pass:46 fail:0 xfail:0 xpass:0 skip:44 error:0
~~~
Zeng Heng (5):
kselftest/arm64: add float-point feature to hwcap test
kselftest/arm64: add crc32 feature to hwcap test
kselftest/arm64: add DEF_SIGHANDLER_FUNC() and DEF_INST_RAISE_SIG()
helpers
kselftest/arm64: add test item that support to capturing the SIGBUS
signal
kselftest/arm64: add lse and lse2 features to hwcap test
tools/testing/selftests/arm64/abi/hwcap.c | 201 ++++++++++++++++------
1 file changed, 151 insertions(+), 50 deletions(-)
---
v1 -> v2:
- switch fp and crc32 instructions from hand-encoded to assembly language.
There are no logical changes between versions.
--
2.25.1
Our ABI opts to provide future proofing by defining a much larger
SVE_VQ_MAX than the architecture actually supports. Since we use
this define to control the size of our vector data buffers this results
in a lot of overhead when we initialise which can be a very noticeable
problem in emulation: we fill buffers that are orders of magnitude
larger than we will ever actually use even with virtual platforms that
provide the full range of architecturally supported vector lengths.
Define and use the actual architecture maximum to mitigate this.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
tools/testing/selftests/arm64/abi/syscall-abi.c | 38 +++++++++++++++----------
1 file changed, 23 insertions(+), 15 deletions(-)
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index 18cc123e2347..d704511a0955 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -20,12 +20,20 @@
#include "syscall-abi.h"
+/*
+ * The kernel defines a much larger SVE_VQ_MAX than is expressable in
+ * the architecture, this creates a *lot* of overhead filling the
+ * buffers (especially ZA) on emulated platforms so use the actual
+ * architectural maximum instead.
+ */
+#define ARCH_SVE_VQ_MAX 16
+
static int default_sme_vl;
static int sve_vl_count;
-static unsigned int sve_vls[SVE_VQ_MAX];
+static unsigned int sve_vls[ARCH_SVE_VQ_MAX];
static int sme_vl_count;
-static unsigned int sme_vls[SVE_VQ_MAX];
+static unsigned int sme_vls[ARCH_SVE_VQ_MAX];
extern void do_syscall(int sve_vl, int sme_vl);
@@ -130,9 +138,9 @@ static int check_fpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
#define SVE_Z_SHARED_BYTES (128 / 8)
-static uint8_t z_zero[__SVE_ZREG_SIZE(SVE_VQ_MAX)];
-uint8_t z_in[SVE_NUM_ZREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
-uint8_t z_out[SVE_NUM_ZREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
+static uint8_t z_zero[__SVE_ZREG_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t z_in[SVE_NUM_ZREGS * __SVE_ZREG_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t z_out[SVE_NUM_ZREGS * __SVE_ZREG_SIZE(ARCH_SVE_VQ_MAX)];
static void setup_z(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
uint64_t svcr)
@@ -190,8 +198,8 @@ static int check_z(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
return errors;
}
-uint8_t p_in[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
-uint8_t p_out[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t p_in[SVE_NUM_PREGS * __SVE_PREG_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t p_out[SVE_NUM_PREGS * __SVE_PREG_SIZE(ARCH_SVE_VQ_MAX)];
static void setup_p(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
uint64_t svcr)
@@ -222,8 +230,8 @@ static int check_p(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
return errors;
}
-uint8_t ffr_in[__SVE_PREG_SIZE(SVE_VQ_MAX)];
-uint8_t ffr_out[__SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t ffr_in[__SVE_PREG_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t ffr_out[__SVE_PREG_SIZE(ARCH_SVE_VQ_MAX)];
static void setup_ffr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
uint64_t svcr)
@@ -300,8 +308,8 @@ static int check_svcr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
return errors;
}
-uint8_t za_in[ZA_SIG_REGS_SIZE(SVE_VQ_MAX)];
-uint8_t za_out[ZA_SIG_REGS_SIZE(SVE_VQ_MAX)];
+uint8_t za_in[ZA_SIG_REGS_SIZE(ARCH_SVE_VQ_MAX)];
+uint8_t za_out[ZA_SIG_REGS_SIZE(ARCH_SVE_VQ_MAX)];
static void setup_za(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
uint64_t svcr)
@@ -470,9 +478,9 @@ void sve_count_vls(void)
return;
/*
- * Enumerate up to SVE_VQ_MAX vector lengths
+ * Enumerate up to ARCH_SVE_VQ_MAX vector lengths
*/
- for (vq = SVE_VQ_MAX; vq > 0; vq /= 2) {
+ for (vq = ARCH_SVE_VQ_MAX; vq > 0; vq /= 2) {
vl = prctl(PR_SVE_SET_VL, vq * 16);
if (vl == -1)
ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
@@ -496,9 +504,9 @@ void sme_count_vls(void)
return;
/*
- * Enumerate up to SVE_VQ_MAX vector lengths
+ * Enumerate up to ARCH_SVE_VQ_MAX vector lengths
*/
- for (vq = SVE_VQ_MAX; vq > 0; vq /= 2) {
+ for (vq = ARCH_SVE_VQ_MAX; vq > 0; vq /= 2) {
vl = prctl(PR_SME_SET_VL, vq * 16);
if (vl == -1)
ksft_exit_fail_msg("PR_SME_SET_VL failed: %s (%d)\n",
---
base-commit: 52a93d39b17dc7eb98b6aa3edb93943248e03b2f
change-id: 20230809-arm64-syscall-abi-perf-1e5876d161b2
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Replace the original fixed-size log buffer with a dynamically-
extending log.
Patch 1 provides the basic implementation. The following patches
add test cases, support for logging long strings, and an optimization
to the string formatting that is now more thoroughly testable.
Changes since v2:
- Fixed uninitialized string bug in get_concatenated_log().
- Moved get_concatenated_log() into first patch so that
kunit_log_newline_test() dumps the entire log on error.
- Moved kunit_log_frag_sized_line_test() to the correct point in
the chain, after the change that it depends on. Also log another
line after the long line to test that the log extends correctly.
- Added kunit_log_init_frag_test() to test kunit_init_log_frag()
instead of testing it as part of every other test.
Richard Fitzgerald (7):
kunit: Replace fixed-size log with dynamically-extending buffer
kunit: kunit-test: Add test cases for extending log buffer
kunit: Handle logging of lines longer than the fragment buffer size
kunit: kunit-test: Test logging a line that exactly fills a fragment
kunit: kunit-test: Add test cases for logging very long lines
kunit: kunit-test: Add test of logging only a newline
kunit: Don't waste first attempt to format string in
kunit_log_append()
include/kunit/test.h | 25 ++-
lib/kunit/debugfs.c | 65 ++++++--
lib/kunit/kunit-test.c | 339 ++++++++++++++++++++++++++++++++++++++++-
lib/kunit/test.c | 127 ++++++++++++---
4 files changed, 507 insertions(+), 49 deletions(-)
--
2.30.2