From: Mark Brown <broonie(a)kernel.org>
[ Upstream commit dbcf76390eb9a65d5d0c37b0cd57335218564e37 ]
The ftrace selftests do not currently produce KTAP output, they produce a
custom format much nicer for human consumption. This means that when run in
automated test systems we just get a single result for the suite as a whole
rather than recording results for individual test cases, making it harder
to look at the test data and masking things like inappropriate skips.
Address this by adding support for KTAP output to the ftracetest script and
providing a trivial wrapper which will be invoked by the kselftest runner
to generate output in this format by default, users using ftracetest
directly will continue to get the existing output.
This is not the most elegant solution but it is simple and effective. I
did consider implementing this by post processing the existing output
format but that felt more complex and likely to result in all output being
lost if something goes seriously wrong during the run which would not be
helpful. I did also consider just writing a separate runner script but
there's enough going on with things like the signal handling for that to
seem like it would be duplicating too much.
Acked-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Tested-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Shuah Khan <skhan(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
tools/testing/selftests/ftrace/Makefile | 3 +-
tools/testing/selftests/ftrace/ftracetest | 63 ++++++++++++++++++-
.../testing/selftests/ftrace/ftracetest-ktap | 8 +++
3 files changed, 70 insertions(+), 4 deletions(-)
create mode 100755 tools/testing/selftests/ftrace/ftracetest-ktap
diff --git a/tools/testing/selftests/ftrace/Makefile b/tools/testing/selftests/ftrace/Makefile
index d6e106fbce11c..a1e955d2de4cc 100644
--- a/tools/testing/selftests/ftrace/Makefile
+++ b/tools/testing/selftests/ftrace/Makefile
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
all:
-TEST_PROGS := ftracetest
+TEST_PROGS_EXTENDED := ftracetest
+TEST_PROGS := ftracetest-ktap
TEST_FILES := test.d settings
EXTRA_CLEAN := $(OUTPUT)/logs/*
diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index 8ec1922e974eb..9a73a110a8bfc 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -13,6 +13,7 @@ echo "Usage: ftracetest [options] [testcase(s)] [testcase-directory(s)]"
echo " Options:"
echo " -h|--help Show help message"
echo " -k|--keep Keep passed test logs"
+echo " -K|--ktap Output in KTAP format"
echo " -v|--verbose Increase verbosity of test messages"
echo " -vv Alias of -v -v (Show all results in stdout)"
echo " -vvv Alias of -v -v -v (Show all commands immediately)"
@@ -85,6 +86,10 @@ parse_opts() { # opts
KEEP_LOG=1
shift 1
;;
+ --ktap|-K)
+ KTAP=1
+ shift 1
+ ;;
--verbose|-v|-vv|-vvv)
if [ $VERBOSE -eq -1 ]; then
usage "--console can not use with --verbose"
@@ -178,6 +183,7 @@ TEST_DIR=$TOP_DIR/test.d
TEST_CASES=`find_testcases $TEST_DIR`
LOG_DIR=$TOP_DIR/logs/`date +%Y%m%d-%H%M%S`/
KEEP_LOG=0
+KTAP=0
DEBUG=0
VERBOSE=0
UNSUPPORTED_RESULT=0
@@ -229,7 +235,7 @@ prlog() { # messages
newline=
shift
fi
- printf "$*$newline"
+ [ "$KTAP" != "1" ] && printf "$*$newline"
[ "$LOG_FILE" ] && printf "$*$newline" | strip_esc >> $LOG_FILE
}
catlog() { #file
@@ -260,11 +266,11 @@ TOTAL_RESULT=0
INSTANCE=
CASENO=0
+CASENAME=
testcase() { # testfile
CASENO=$((CASENO+1))
- desc=`grep "^#[ \t]*description:" $1 | cut -f2- -d:`
- prlog -n "[$CASENO]$INSTANCE$desc"
+ CASENAME=`grep "^#[ \t]*description:" $1 | cut -f2- -d:`
}
checkreq() { # testfile
@@ -277,40 +283,68 @@ test_on_instance() { # testfile
grep -q "^#[ \t]*flags:.*instance" $1
}
+ktaptest() { # result comment
+ if [ "$KTAP" != "1" ]; then
+ return
+ fi
+
+ local result=
+ if [ "$1" = "1" ]; then
+ result="ok"
+ else
+ result="not ok"
+ fi
+ shift
+
+ local comment=$*
+ if [ "$comment" != "" ]; then
+ comment="# $comment"
+ fi
+
+ echo $CASENO $result $INSTANCE$CASENAME $comment
+}
+
eval_result() { # sigval
case $1 in
$PASS)
prlog " [${color_green}PASS${color_reset}]"
+ ktaptest 1
PASSED_CASES="$PASSED_CASES $CASENO"
return 0
;;
$FAIL)
prlog " [${color_red}FAIL${color_reset}]"
+ ktaptest 0
FAILED_CASES="$FAILED_CASES $CASENO"
return 1 # this is a bug.
;;
$UNRESOLVED)
prlog " [${color_blue}UNRESOLVED${color_reset}]"
+ ktaptest 0 UNRESOLVED
UNRESOLVED_CASES="$UNRESOLVED_CASES $CASENO"
return $UNRESOLVED_RESULT # depends on use case
;;
$UNTESTED)
prlog " [${color_blue}UNTESTED${color_reset}]"
+ ktaptest 1 SKIP
UNTESTED_CASES="$UNTESTED_CASES $CASENO"
return 0
;;
$UNSUPPORTED)
prlog " [${color_blue}UNSUPPORTED${color_reset}]"
+ ktaptest 1 SKIP
UNSUPPORTED_CASES="$UNSUPPORTED_CASES $CASENO"
return $UNSUPPORTED_RESULT # depends on use case
;;
$XFAIL)
prlog " [${color_green}XFAIL${color_reset}]"
+ ktaptest 1 XFAIL
XFAILED_CASES="$XFAILED_CASES $CASENO"
return 0
;;
*)
prlog " [${color_blue}UNDEFINED${color_reset}]"
+ ktaptest 0 error
UNDEFINED_CASES="$UNDEFINED_CASES $CASENO"
return 1 # this must be a test bug
;;
@@ -371,6 +405,7 @@ __run_test() { # testfile
run_test() { # testfile
local testname=`basename $1`
testcase $1
+ prlog -n "[$CASENO]$INSTANCE$CASENAME"
if [ ! -z "$LOG_FILE" ] ; then
local testlog=`mktemp $LOG_DIR/${CASENO}-${testname}-log.XXXXXX`
else
@@ -405,6 +440,17 @@ run_test() { # testfile
# load in the helper functions
. $TEST_DIR/functions
+if [ "$KTAP" = "1" ]; then
+ echo "TAP version 13"
+
+ casecount=`echo $TEST_CASES | wc -w`
+ for t in $TEST_CASES; do
+ test_on_instance $t || continue
+ casecount=$((casecount+1))
+ done
+ echo "1..${casecount}"
+fi
+
# Main loop
for t in $TEST_CASES; do
run_test $t
@@ -439,6 +485,17 @@ prlog "# of unsupported: " `echo $UNSUPPORTED_CASES | wc -w`
prlog "# of xfailed: " `echo $XFAILED_CASES | wc -w`
prlog "# of undefined(test bug): " `echo $UNDEFINED_CASES | wc -w`
+if [ "$KTAP" = "1" ]; then
+ echo -n "# Totals:"
+ echo -n " pass:"`echo $PASSED_CASES | wc -w`
+ echo -n " faii:"`echo $FAILED_CASES | wc -w`
+ echo -n " xfail:"`echo $XFAILED_CASES | wc -w`
+ echo -n " xpass:0"
+ echo -n " skip:"`echo $UNTESTED_CASES $UNSUPPORTED_CASES | wc -w`
+ echo -n " error:"`echo $UNRESOLVED_CASES $UNDEFINED_CASES | wc -w`
+ echo
+fi
+
cleanup
# if no error, return 0
diff --git a/tools/testing/selftests/ftrace/ftracetest-ktap b/tools/testing/selftests/ftrace/ftracetest-ktap
new file mode 100755
index 0000000000000..b3284679ef3af
--- /dev/null
+++ b/tools/testing/selftests/ftrace/ftracetest-ktap
@@ -0,0 +1,8 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# ftracetest-ktap: Wrapper to integrate ftracetest with the kselftest runner
+#
+# Copyright (C) Arm Ltd., 2023
+
+./ftracetest -K
--
2.39.2
From: Mark Brown <broonie(a)kernel.org>
[ Upstream commit dbcf76390eb9a65d5d0c37b0cd57335218564e37 ]
The ftrace selftests do not currently produce KTAP output, they produce a
custom format much nicer for human consumption. This means that when run in
automated test systems we just get a single result for the suite as a whole
rather than recording results for individual test cases, making it harder
to look at the test data and masking things like inappropriate skips.
Address this by adding support for KTAP output to the ftracetest script and
providing a trivial wrapper which will be invoked by the kselftest runner
to generate output in this format by default, users using ftracetest
directly will continue to get the existing output.
This is not the most elegant solution but it is simple and effective. I
did consider implementing this by post processing the existing output
format but that felt more complex and likely to result in all output being
lost if something goes seriously wrong during the run which would not be
helpful. I did also consider just writing a separate runner script but
there's enough going on with things like the signal handling for that to
seem like it would be duplicating too much.
Acked-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Tested-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Shuah Khan <skhan(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
tools/testing/selftests/ftrace/Makefile | 3 +-
tools/testing/selftests/ftrace/ftracetest | 63 ++++++++++++++++++-
.../testing/selftests/ftrace/ftracetest-ktap | 8 +++
3 files changed, 70 insertions(+), 4 deletions(-)
create mode 100755 tools/testing/selftests/ftrace/ftracetest-ktap
diff --git a/tools/testing/selftests/ftrace/Makefile b/tools/testing/selftests/ftrace/Makefile
index d6e106fbce11c..a1e955d2de4cc 100644
--- a/tools/testing/selftests/ftrace/Makefile
+++ b/tools/testing/selftests/ftrace/Makefile
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
all:
-TEST_PROGS := ftracetest
+TEST_PROGS_EXTENDED := ftracetest
+TEST_PROGS := ftracetest-ktap
TEST_FILES := test.d settings
EXTRA_CLEAN := $(OUTPUT)/logs/*
diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
index c3311c8c40890..2506621e75dfb 100755
--- a/tools/testing/selftests/ftrace/ftracetest
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -13,6 +13,7 @@ echo "Usage: ftracetest [options] [testcase(s)] [testcase-directory(s)]"
echo " Options:"
echo " -h|--help Show help message"
echo " -k|--keep Keep passed test logs"
+echo " -K|--ktap Output in KTAP format"
echo " -v|--verbose Increase verbosity of test messages"
echo " -vv Alias of -v -v (Show all results in stdout)"
echo " -vvv Alias of -v -v -v (Show all commands immediately)"
@@ -85,6 +86,10 @@ parse_opts() { # opts
KEEP_LOG=1
shift 1
;;
+ --ktap|-K)
+ KTAP=1
+ shift 1
+ ;;
--verbose|-v|-vv|-vvv)
if [ $VERBOSE -eq -1 ]; then
usage "--console can not use with --verbose"
@@ -178,6 +183,7 @@ TEST_DIR=$TOP_DIR/test.d
TEST_CASES=`find_testcases $TEST_DIR`
LOG_DIR=$TOP_DIR/logs/`date +%Y%m%d-%H%M%S`/
KEEP_LOG=0
+KTAP=0
DEBUG=0
VERBOSE=0
UNSUPPORTED_RESULT=0
@@ -229,7 +235,7 @@ prlog() { # messages
newline=
shift
fi
- printf "$*$newline"
+ [ "$KTAP" != "1" ] && printf "$*$newline"
[ "$LOG_FILE" ] && printf "$*$newline" | strip_esc >> $LOG_FILE
}
catlog() { #file
@@ -260,11 +266,11 @@ TOTAL_RESULT=0
INSTANCE=
CASENO=0
+CASENAME=
testcase() { # testfile
CASENO=$((CASENO+1))
- desc=`grep "^#[ \t]*description:" $1 | cut -f2- -d:`
- prlog -n "[$CASENO]$INSTANCE$desc"
+ CASENAME=`grep "^#[ \t]*description:" $1 | cut -f2- -d:`
}
checkreq() { # testfile
@@ -277,40 +283,68 @@ test_on_instance() { # testfile
grep -q "^#[ \t]*flags:.*instance" $1
}
+ktaptest() { # result comment
+ if [ "$KTAP" != "1" ]; then
+ return
+ fi
+
+ local result=
+ if [ "$1" = "1" ]; then
+ result="ok"
+ else
+ result="not ok"
+ fi
+ shift
+
+ local comment=$*
+ if [ "$comment" != "" ]; then
+ comment="# $comment"
+ fi
+
+ echo $CASENO $result $INSTANCE$CASENAME $comment
+}
+
eval_result() { # sigval
case $1 in
$PASS)
prlog " [${color_green}PASS${color_reset}]"
+ ktaptest 1
PASSED_CASES="$PASSED_CASES $CASENO"
return 0
;;
$FAIL)
prlog " [${color_red}FAIL${color_reset}]"
+ ktaptest 0
FAILED_CASES="$FAILED_CASES $CASENO"
return 1 # this is a bug.
;;
$UNRESOLVED)
prlog " [${color_blue}UNRESOLVED${color_reset}]"
+ ktaptest 0 UNRESOLVED
UNRESOLVED_CASES="$UNRESOLVED_CASES $CASENO"
return $UNRESOLVED_RESULT # depends on use case
;;
$UNTESTED)
prlog " [${color_blue}UNTESTED${color_reset}]"
+ ktaptest 1 SKIP
UNTESTED_CASES="$UNTESTED_CASES $CASENO"
return 0
;;
$UNSUPPORTED)
prlog " [${color_blue}UNSUPPORTED${color_reset}]"
+ ktaptest 1 SKIP
UNSUPPORTED_CASES="$UNSUPPORTED_CASES $CASENO"
return $UNSUPPORTED_RESULT # depends on use case
;;
$XFAIL)
prlog " [${color_green}XFAIL${color_reset}]"
+ ktaptest 1 XFAIL
XFAILED_CASES="$XFAILED_CASES $CASENO"
return 0
;;
*)
prlog " [${color_blue}UNDEFINED${color_reset}]"
+ ktaptest 0 error
UNDEFINED_CASES="$UNDEFINED_CASES $CASENO"
return 1 # this must be a test bug
;;
@@ -371,6 +405,7 @@ __run_test() { # testfile
run_test() { # testfile
local testname=`basename $1`
testcase $1
+ prlog -n "[$CASENO]$INSTANCE$CASENAME"
if [ ! -z "$LOG_FILE" ] ; then
local testlog=`mktemp $LOG_DIR/${CASENO}-${testname}-log.XXXXXX`
else
@@ -405,6 +440,17 @@ run_test() { # testfile
# load in the helper functions
. $TEST_DIR/functions
+if [ "$KTAP" = "1" ]; then
+ echo "TAP version 13"
+
+ casecount=`echo $TEST_CASES | wc -w`
+ for t in $TEST_CASES; do
+ test_on_instance $t || continue
+ casecount=$((casecount+1))
+ done
+ echo "1..${casecount}"
+fi
+
# Main loop
for t in $TEST_CASES; do
run_test $t
@@ -439,6 +485,17 @@ prlog "# of unsupported: " `echo $UNSUPPORTED_CASES | wc -w`
prlog "# of xfailed: " `echo $XFAILED_CASES | wc -w`
prlog "# of undefined(test bug): " `echo $UNDEFINED_CASES | wc -w`
+if [ "$KTAP" = "1" ]; then
+ echo -n "# Totals:"
+ echo -n " pass:"`echo $PASSED_CASES | wc -w`
+ echo -n " faii:"`echo $FAILED_CASES | wc -w`
+ echo -n " xfail:"`echo $XFAILED_CASES | wc -w`
+ echo -n " xpass:0"
+ echo -n " skip:"`echo $UNTESTED_CASES $UNSUPPORTED_CASES | wc -w`
+ echo -n " error:"`echo $UNRESOLVED_CASES $UNDEFINED_CASES | wc -w`
+ echo
+fi
+
cleanup
# if no error, return 0
diff --git a/tools/testing/selftests/ftrace/ftracetest-ktap b/tools/testing/selftests/ftrace/ftracetest-ktap
new file mode 100755
index 0000000000000..b3284679ef3af
--- /dev/null
+++ b/tools/testing/selftests/ftrace/ftracetest-ktap
@@ -0,0 +1,8 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# ftracetest-ktap: Wrapper to integrate ftracetest with the kselftest runner
+#
+# Copyright (C) Arm Ltd., 2023
+
+./ftracetest -K
--
2.39.2
From: Jinrong Liang <cloudliang(a)tencent.com>
From: Jinrong Liang <cloudliang(a)tencent.com>
Hi,
This patch set adds some tests to ensure consistent PMU performance event
filter behavior. Specifically, the patches aim to improve KVM's PMU event
filter by strengthening the test coverage, adding documentation, and making
other small changes.
The first patch replaces int with uint32_t for nevents to ensure consistency
and readability in the code. The second patch adds fixed_counter_bitmap to
create_pmu_event_filter() to support the use of the same creator to control
the use of guest fixed counters. The third patch adds test cases for
unsupported input values in PMU filter, including unsupported "action"
values, unsupported "flags" values, and unsupported "nevents" values. Also,
it tests setting non-existent fixed counters in the fixed bitmap doesn't
fail.
The fourth patch updates the documentation for KVM_SET_PMU_EVENT_FILTER ioctl
to include a detailed description of how fixed performance events are handled
in the pmu filter. The fifth patch adds tests to cover that pmu_event_filter
works as expected when applied to fixed performance counters, even if there
is no fixed counter exists. The sixth patch adds a test to ensure that setting
both generic and fixed performance event filters does not affect the consistency
of the fixed performance filter behavior in KVM. The seventh patch adds a test
to verify the behavior of the pmu event filter when an incomplete
kvm_pmu_event_filter structure is used.
These changes help to ensure that KVM's PMU event filter functions as expected
in all supported use cases. These patches have been tested and verified to
function properly.
Thanks for your review and feedback.
Sincerely,
Jinrong Liang
Previous:
https://lore.kernel.org/kvm/20230414110056.19665-1-cloudliang@tencent.com
v2:
- Wrap the code from the documentation in a block of code; (Bagas Sanjaya)
Jinrong Liang (7):
KVM: selftests: Replace int with uint32_t for nevents
KVM: selftests: Apply create_pmu_event_filter() to fixed ctrs
KVM: selftests: Test unavailable event filters are rejected
KVM: x86/pmu: Add documentation for fixed ctr on PMU filter
KVM: selftests: Check if pmu_event_filter meets expectations on fixed
ctrs
KVM: selftests: Check gp event filters without affecting fixed event
filters
KVM: selftests: Test pmu event filter with incompatible
kvm_pmu_event_filter
Documentation/virt/kvm/api.rst | 21 ++
.../kvm/x86_64/pmu_event_filter_test.c | 239 ++++++++++++++++--
2 files changed, 243 insertions(+), 17 deletions(-)
base-commit: a25497a280bbd7bbcc08c87ddb2b3909affc8402
--
2.31.1
This is the v7 of this series which tries to implement the fd-based KVM
guest private memory. The patches are based on latest kvm/queue branch
commit:
b9b71f43683a (kvm/queue) KVM: x86/mmu: Buffer nested MMU
split_desc_cache only by default capacity
Introduction
------------
In general this patch series introduce fd-based memslot which provides
guest memory through memory file descriptor fd[offset,size] instead of
hva/size. The fd can be created from a supported memory filesystem
like tmpfs/hugetlbfs etc. which we refer as memory backing store. KVM
and the the memory backing store exchange callbacks when such memslot
gets created. At runtime KVM will call into callbacks provided by the
backing store to get the pfn with the fd+offset. Memory backing store
will also call into KVM callbacks when userspace punch hole on the fd
to notify KVM to unmap secondary MMU page table entries.
Comparing to existing hva-based memslot, this new type of memslot allows
guest memory unmapped from host userspace like QEMU and even the kernel
itself, therefore reduce attack surface and prevent bugs.
Based on this fd-based memslot, we can build guest private memory that
is going to be used in confidential computing environments such as Intel
TDX and AMD SEV. When supported, the memory backing store can provide
more enforcement on the fd and KVM can use a single memslot to hold both
the private and shared part of the guest memory.
mm extension
---------------------
Introduces new MFD_INACCESSIBLE flag for memfd_create(), the file
created with these flags cannot read(), write() or mmap() etc via normal
MMU operations. The file content can only be used with the newly
introduced memfile_notifier extension.
The memfile_notifier extension provides two sets of callbacks for KVM to
interact with the memory backing store:
- memfile_notifier_ops: callbacks for memory backing store to notify
KVM when memory gets invalidated.
- backing store callbacks: callbacks for KVM to call into memory
backing store to request memory pages for guest private memory.
The memfile_notifier extension also provides APIs for memory backing
store to register/unregister itself and to trigger the notifier when the
bookmarked memory gets invalidated.
The patchset also introduces a new memfd seal F_SEAL_AUTO_ALLOCATE to
prevent double allocation caused by unintentional guest when we only
have a single side of the shared/private memfds effective.
memslot extension
-----------------
Add the private fd and the fd offset to existing 'shared' memslot so
that both private/shared guest memory can live in one single memslot.
A page in the memslot is either private or shared. Whether a guest page
is private or shared is maintained through reusing existing SEV ioctls
KVM_MEMORY_ENCRYPT_{UN,}REG_REGION.
Test
----
To test the new functionalities of this patch TDX patchset is needed.
Since TDX patchset has not been merged so I did two kinds of test:
- Regresion test on kvm/queue (this patchset)
Most new code are not covered. Code also in below repo:
https://github.com/chao-p/linux/tree/privmem-v7
- New Funational test on latest TDX code
The patch is rebased to latest TDX code and tested the new
funcationalities. See below repos:
Linux: https://github.com/chao-p/linux/tree/privmem-v7-tdx
QEMU: https://github.com/chao-p/qemu/tree/privmem-v7
An example QEMU command line for TDX test:
-object tdx-guest,id=tdx,debug=off,sept-ve-disable=off \
-machine confidential-guest-support=tdx \
-object memory-backend-memfd-private,id=ram1,size=${mem} \
-machine memory-backend=ram1
Changelog
----------
v7:
- Move the private/shared info from backing store to KVM.
- Introduce F_SEAL_AUTO_ALLOCATE to avoid double allocation.
- Rework on the sync mechanism between zap/page fault paths.
- Addressed other comments in v6.
v6:
- Re-organzied patch for both mm/KVM parts.
- Added flags for memfile_notifier so its consumers can state their
features and memory backing store can check against these flags.
- Put a backing store reference in the memfile_notifier and move pfn_ops
into backing store.
- Only support boot time backing store register.
- Overall KVM part improvement suggested by Sean and some others.
v5:
- Removed userspace visible F_SEAL_INACCESSIBLE, instead using an
in-kernel flag (SHM_F_INACCESSIBLE for shmem). Private fd can only
be created by MFD_INACCESSIBLE.
- Introduced new APIs for backing store to register itself to
memfile_notifier instead of direct function call.
- Added the accounting and restriction for MFD_INACCESSIBLE memory.
- Added KVM API doc for new memslot extensions and man page for the new
MFD_INACCESSIBLE flag.
- Removed the overlap check for mapping the same file+offset into
multiple gfns due to perf consideration, warned in document.
- Addressed other comments in v4.
v4:
- Decoupled the callbacks between KVM/mm from memfd and use new
name 'memfile_notifier'.
- Supported register multiple memslots to the same backing store.
- Added per-memslot pfn_ops instead of per-system.
- Reworked the invalidation part.
- Improved new KVM uAPIs (private memslot extension and memory
error) per Sean's suggestions.
- Addressed many other minor fixes for comments from v3.
v3:
- Added locking protection when calling
invalidate_page_range/fallocate callbacks.
- Changed memslot structure to keep use useraddr for shared memory.
- Re-organized F_SEAL_INACCESSIBLE and MEMFD_OPS.
- Added MFD_INACCESSIBLE flag to force F_SEAL_INACCESSIBLE.
- Commit message improvement.
- Many small fixes for comments from the last version.
Links to previous discussions
-----------------------------
[1] Original design proposal:
https://lkml.kernel.org/kvm/20210824005248.200037-1-seanjc@google.com/
[2] Updated proposal and RFC patch v1:
https://lkml.kernel.org/linux-fsdevel/20211111141352.26311-1-chao.p.peng@li…
[3] Patch v5: https://lkml.org/lkml/2022/5/19/861
Chao Peng (12):
mm: Add F_SEAL_AUTO_ALLOCATE seal to memfd
selftests/memfd: Add tests for F_SEAL_AUTO_ALLOCATE
mm: Introduce memfile_notifier
mm/memfd: Introduce MFD_INACCESSIBLE flag
KVM: Rename KVM_PRIVATE_MEM_SLOTS to KVM_INTERNAL_MEM_SLOTS
KVM: Use gfn instead of hva for mmu_notifier_retry
KVM: Rename mmu_notifier_*
KVM: Extend the memslot to support fd-based private memory
KVM: Add KVM_EXIT_MEMORY_FAULT exit
KVM: Register/unregister the guest private memory regions
KVM: Handle page fault for private memory
KVM: Enable and expose KVM_MEM_PRIVATE
Kirill A. Shutemov (1):
mm/shmem: Support memfile_notifier
Documentation/virt/kvm/api.rst | 77 +++++-
arch/arm64/kvm/mmu.c | 8 +-
arch/mips/include/asm/kvm_host.h | 2 +-
arch/mips/kvm/mmu.c | 10 +-
arch/powerpc/include/asm/kvm_book3s_64.h | 2 +-
arch/powerpc/kvm/book3s_64_mmu_host.c | 4 +-
arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 +-
arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 +-
arch/powerpc/kvm/book3s_hv_nested.c | 2 +-
arch/powerpc/kvm/book3s_hv_rm_mmu.c | 8 +-
arch/powerpc/kvm/e500_mmu_host.c | 4 +-
arch/riscv/kvm/mmu.c | 4 +-
arch/x86/include/asm/kvm_host.h | 3 +-
arch/x86/kvm/Kconfig | 3 +
arch/x86/kvm/mmu.h | 2 -
arch/x86/kvm/mmu/mmu.c | 74 +++++-
arch/x86/kvm/mmu/mmu_internal.h | 18 ++
arch/x86/kvm/mmu/mmutrace.h | 1 +
arch/x86/kvm/mmu/paging_tmpl.h | 4 +-
arch/x86/kvm/x86.c | 2 +-
include/linux/kvm_host.h | 105 +++++---
include/linux/memfile_notifier.h | 91 +++++++
include/linux/shmem_fs.h | 2 +
include/uapi/linux/fcntl.h | 1 +
include/uapi/linux/kvm.h | 37 +++
include/uapi/linux/memfd.h | 1 +
mm/Kconfig | 4 +
mm/Makefile | 1 +
mm/memfd.c | 18 +-
mm/memfile_notifier.c | 123 ++++++++++
mm/shmem.c | 125 +++++++++-
tools/testing/selftests/memfd/memfd_test.c | 166 +++++++++++++
virt/kvm/Kconfig | 3 +
virt/kvm/kvm_main.c | 272 ++++++++++++++++++---
virt/kvm/pfncache.c | 14 +-
35 files changed, 1074 insertions(+), 127 deletions(-)
create mode 100644 include/linux/memfile_notifier.h
create mode 100644 mm/memfile_notifier.c
--
2.25.1
Many uses of the KUnit resource system are intended to simply defer
calling a function until the test exits (be it due to success or
failure). The existing kunit_alloc_resource() function is often used for
this, but was awkward to use (requiring passing NULL init functions, etc),
and returned a resource without incrementing its reference count, which
-- while okay for this use-case -- could cause problems in others.
Instead, introduce a simple kunit_add_action() API: a simple function
(returning nothing, accepting a single void* argument) can be scheduled
to be called when the test exits. Deferred actions are called in the
opposite order to that which they were registered.
This mimics the devres API, devm_add_action(), and also provides
kunit_remove_action(), to cancel a deferred action, and
kunit_release_action() to trigger one early.
This is implemented as a resource under the hood, so the ordering
between resource cleanup and deferred functions is maintained.
Reviewed-by: Benjamin Berg <benjamin.berg(a)intel.com>
Reviewed-by: Maxime Ripard <maxime(a)cerno.tech>
Tested-by: Maxime Ripard <maxime(a)cerno.tech>
Signed-off-by: David Gow <davidgow(a)google.com>
---
No changes since v2:
https://lore.kernel.org/linux-kselftest/20230518083849.2631178-1-davidgow@g…
Changes since v1:
https://lore.kernel.org/linux-kselftest/20230421084226.2278282-2-davidgow@g…
- Some small documentation updates (Thanks Daniel)
- Reinstate a typedef for the action function.
- This time, it's called kunit_action_t
- Thanks Maxime!
Changes since RFC v2:
https://lore.kernel.org/linux-kselftest/20230331080411.981038-2-davidgow@go…
- Got rid of internal_gfp
- everything uses GFP_KERNEL now
- This includes kunit_kzalloc() and friends, which still allocate the
returned memory with the provided GFP, but use GFP_KERNEL for
internal bookkeeping data.
- Thanks Maxime & Benjamin!
- Got rid of cancellation tokens.
- kunit_add_action() now returns 0 on success, otherwise an error
- Note that this can quite easily lead to a memory leak, so look at
kunit_add_action_or_reset()
- Thanks Maxime & Benjamin!
- Added kunit_add_action_or_reset
- Matches devm_add_action_or_reset()
- Returns 0 on success.
- Thanks Maxime & Benjamin!
- Got rid of the kunit_defer_func_t typedef.
- I liked it, but it is probably pushing the boundaries of kernel
style.
- Use (void (*)(void *)) instead.
Changes since RFC v1:
https://lore.kernel.org/linux-kselftest/20230325043104.3761770-2-davidgow@g…
- Rename functions to better match the devm_* APIs. (Thanks Maxime)
- Embed the kunit_resource in struct kunit_action_ctx to avoid an extra
allocation (Thanks Benjamin)
- Use 'struct kunit_action_ctx' as the type for cancellation tokens
(Thanks Benjamin)
- Add tests.
---
include/kunit/resource.h | 92 +++++++++++++++++++++++++++++++++++++
lib/kunit/kunit-test.c | 88 ++++++++++++++++++++++++++++++++++-
lib/kunit/resource.c | 99 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 278 insertions(+), 1 deletion(-)
diff --git a/include/kunit/resource.h b/include/kunit/resource.h
index c0d88b318e90..b64eb783b1bc 100644
--- a/include/kunit/resource.h
+++ b/include/kunit/resource.h
@@ -387,4 +387,96 @@ static inline int kunit_destroy_named_resource(struct kunit *test,
*/
void kunit_remove_resource(struct kunit *test, struct kunit_resource *res);
+/* A 'deferred action' function to be used with kunit_add_action. */
+typedef void (kunit_action_t)(void *);
+
+/**
+ * kunit_add_action() - Call a function when the test ends.
+ * @test: Test case to associate the action with.
+ * @func: The function to run on test exit
+ * @ctx: Data passed into @func
+ *
+ * Defer the execution of a function until the test exits, either normally or
+ * due to a failure. @ctx is passed as additional context. All functions
+ * registered with kunit_add_action() will execute in the opposite order to that
+ * they were registered in.
+ *
+ * This is useful for cleaning up allocated memory and resources, as these
+ * functions are called even if the test aborts early due to, e.g., a failed
+ * assertion.
+ *
+ * See also: devm_add_action() for the devres equivalent.
+ *
+ * Returns:
+ * 0 on success, an error if the action could not be deferred.
+ */
+int kunit_add_action(struct kunit *test, kunit_action_t *action, void *ctx);
+
+/**
+ * kunit_add_action_or_reset() - Call a function when the test ends.
+ * @test: Test case to associate the action with.
+ * @func: The function to run on test exit
+ * @ctx: Data passed into @func
+ *
+ * Defer the execution of a function until the test exits, either normally or
+ * due to a failure. @ctx is passed as additional context. All functions
+ * registered with kunit_add_action() will execute in the opposite order to that
+ * they were registered in.
+ *
+ * This is useful for cleaning up allocated memory and resources, as these
+ * functions are called even if the test aborts early due to, e.g., a failed
+ * assertion.
+ *
+ * If the action cannot be created (e.g., due to the system being out of memory),
+ * then action(ctx) will be called immediately, and an error will be returned.
+ *
+ * See also: devm_add_action_or_reset() for the devres equivalent.
+ *
+ * Returns:
+ * 0 on success, an error if the action could not be deferred.
+ */
+int kunit_add_action_or_reset(struct kunit *test, kunit_action_t *action,
+ void *ctx);
+
+/**
+ * kunit_remove_action() - Cancel a matching deferred action.
+ * @test: Test case the action is associated with.
+ * @func: The deferred function to cancel.
+ * @ctx: The context passed to the deferred function to trigger.
+ *
+ * Prevent an action deferred via kunit_add_action() from executing when the
+ * test terminates.
+ *
+ * If the function/context pair was deferred multiple times, only the most
+ * recent one will be cancelled.
+ *
+ * See also: devm_remove_action() for the devres equivalent.
+ */
+void kunit_remove_action(struct kunit *test,
+ kunit_action_t *action,
+ void *ctx);
+
+/**
+ * kunit_release_action() - Run a matching action call immediately.
+ * @test: Test case the action is associated with.
+ * @func: The deferred function to trigger.
+ * @ctx: The context passed to the deferred function to trigger.
+ *
+ * Execute a function deferred via kunit_add_action()) immediately, rather than
+ * when the test ends.
+ *
+ * If the function/context pair was deferred multiple times, it will only be
+ * executed once here. The most recent deferral will no longer execute when
+ * the test ends.
+ *
+ * kunit_release_action(test, func, ctx);
+ * is equivalent to
+ * func(ctx);
+ * kunit_remove_action(test, func, ctx);
+ *
+ * See also: devm_release_action() for the devres equivalent.
+ */
+void kunit_release_action(struct kunit *test,
+ kunit_action_t *action,
+ void *ctx);
#endif /* _KUNIT_RESOURCE_H */
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index 42e44caa1bdd..83d8e90ca7a2 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -112,7 +112,7 @@ struct kunit_test_resource_context {
struct kunit test;
bool is_resource_initialized;
int allocate_order[2];
- int free_order[2];
+ int free_order[4];
};
static int fake_resource_init(struct kunit_resource *res, void *context)
@@ -403,6 +403,88 @@ static void kunit_resource_test_named(struct kunit *test)
KUNIT_EXPECT_TRUE(test, list_empty(&test->resources));
}
+static void increment_int(void *ctx)
+{
+ int *i = (int *)ctx;
+ (*i)++;
+}
+
+static void kunit_resource_test_action(struct kunit *test)
+{
+ int num_actions = 0;
+
+ kunit_add_action(test, increment_int, &num_actions);
+ KUNIT_EXPECT_EQ(test, num_actions, 0);
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 1);
+
+ /* Once we've cleaned up, the action queue is empty. */
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 1);
+
+ /* Check the same function can be deferred multiple times. */
+ kunit_add_action(test, increment_int, &num_actions);
+ kunit_add_action(test, increment_int, &num_actions);
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 3);
+}
+static void kunit_resource_test_remove_action(struct kunit *test)
+{
+ int num_actions = 0;
+
+ kunit_add_action(test, increment_int, &num_actions);
+ KUNIT_EXPECT_EQ(test, num_actions, 0);
+
+ kunit_remove_action(test, increment_int, &num_actions);
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 0);
+}
+static void kunit_resource_test_release_action(struct kunit *test)
+{
+ int num_actions = 0;
+
+ kunit_add_action(test, increment_int, &num_actions);
+ KUNIT_EXPECT_EQ(test, num_actions, 0);
+ /* Runs immediately on trigger. */
+ kunit_release_action(test, increment_int, &num_actions);
+ KUNIT_EXPECT_EQ(test, num_actions, 1);
+
+ /* Doesn't run again on test exit. */
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 1);
+}
+static void action_order_1(void *ctx)
+{
+ struct kunit_test_resource_context *res_ctx = (struct kunit_test_resource_context *)ctx;
+
+ KUNIT_RESOURCE_TEST_MARK_ORDER(res_ctx, free_order, 1);
+ kunit_log(KERN_INFO, current->kunit_test, "action_order_1");
+}
+static void action_order_2(void *ctx)
+{
+ struct kunit_test_resource_context *res_ctx = (struct kunit_test_resource_context *)ctx;
+
+ KUNIT_RESOURCE_TEST_MARK_ORDER(res_ctx, free_order, 2);
+ kunit_log(KERN_INFO, current->kunit_test, "action_order_2");
+}
+static void kunit_resource_test_action_ordering(struct kunit *test)
+{
+ struct kunit_test_resource_context *ctx = test->priv;
+
+ kunit_add_action(test, action_order_1, ctx);
+ kunit_add_action(test, action_order_2, ctx);
+ kunit_add_action(test, action_order_1, ctx);
+ kunit_add_action(test, action_order_2, ctx);
+ kunit_remove_action(test, action_order_1, ctx);
+ kunit_release_action(test, action_order_2, ctx);
+ kunit_cleanup(test);
+
+ /* [2 is triggered] [2], [(1 is cancelled)] [1] */
+ KUNIT_EXPECT_EQ(test, ctx->free_order[0], 2);
+ KUNIT_EXPECT_EQ(test, ctx->free_order[1], 2);
+ KUNIT_EXPECT_EQ(test, ctx->free_order[2], 1);
+}
+
static int kunit_resource_test_init(struct kunit *test)
{
struct kunit_test_resource_context *ctx =
@@ -434,6 +516,10 @@ static struct kunit_case kunit_resource_test_cases[] = {
KUNIT_CASE(kunit_resource_test_proper_free_ordering),
KUNIT_CASE(kunit_resource_test_static),
KUNIT_CASE(kunit_resource_test_named),
+ KUNIT_CASE(kunit_resource_test_action),
+ KUNIT_CASE(kunit_resource_test_remove_action),
+ KUNIT_CASE(kunit_resource_test_release_action),
+ KUNIT_CASE(kunit_resource_test_action_ordering),
{}
};
diff --git a/lib/kunit/resource.c b/lib/kunit/resource.c
index c414df922f34..f0209252b179 100644
--- a/lib/kunit/resource.c
+++ b/lib/kunit/resource.c
@@ -77,3 +77,102 @@ int kunit_destroy_resource(struct kunit *test, kunit_resource_match_t match,
return 0;
}
EXPORT_SYMBOL_GPL(kunit_destroy_resource);
+
+struct kunit_action_ctx {
+ struct kunit_resource res;
+ kunit_action_t *func;
+ void *ctx;
+};
+
+static void __kunit_action_free(struct kunit_resource *res)
+{
+ struct kunit_action_ctx *action_ctx = container_of(res, struct kunit_action_ctx, res);
+
+ action_ctx->func(action_ctx->ctx);
+}
+
+
+int kunit_add_action(struct kunit *test, void (*action)(void *), void *ctx)
+{
+ struct kunit_action_ctx *action_ctx;
+
+ KUNIT_ASSERT_NOT_NULL_MSG(test, action, "Tried to action a NULL function!");
+
+ action_ctx = kzalloc(sizeof(*action_ctx), GFP_KERNEL);
+ if (!action_ctx)
+ return -ENOMEM;
+
+ action_ctx->func = action;
+ action_ctx->ctx = ctx;
+
+ action_ctx->res.should_kfree = true;
+ /* As init is NULL, this cannot fail. */
+ __kunit_add_resource(test, NULL, __kunit_action_free, &action_ctx->res, action_ctx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kunit_add_action);
+
+int kunit_add_action_or_reset(struct kunit *test, void (*action)(void *),
+ void *ctx)
+{
+ int res = kunit_add_action(test, action, ctx);
+
+ if (res)
+ action(ctx);
+ return res;
+}
+EXPORT_SYMBOL_GPL(kunit_add_action_or_reset);
+
+static bool __kunit_action_match(struct kunit *test,
+ struct kunit_resource *res, void *match_data)
+{
+ struct kunit_action_ctx *match_ctx = (struct kunit_action_ctx *)match_data;
+ struct kunit_action_ctx *res_ctx = container_of(res, struct kunit_action_ctx, res);
+
+ /* Make sure this is a free function. */
+ if (res->free != __kunit_action_free)
+ return false;
+
+ /* Both the function and context data should match. */
+ return (match_ctx->func == res_ctx->func) && (match_ctx->ctx == res_ctx->ctx);
+}
+
+void kunit_remove_action(struct kunit *test,
+ kunit_action_t *action,
+ void *ctx)
+{
+ struct kunit_action_ctx match_ctx;
+ struct kunit_resource *res;
+
+ match_ctx.func = action;
+ match_ctx.ctx = ctx;
+
+ res = kunit_find_resource(test, __kunit_action_match, &match_ctx);
+ if (res) {
+ /* Remove the free function so we don't run the action. */
+ res->free = NULL;
+ kunit_remove_resource(test, res);
+ kunit_put_resource(res);
+ }
+}
+EXPORT_SYMBOL_GPL(kunit_remove_action);
+
+void kunit_release_action(struct kunit *test,
+ kunit_action_t *action,
+ void *ctx)
+{
+ struct kunit_action_ctx match_ctx;
+ struct kunit_resource *res;
+
+ match_ctx.func = action;
+ match_ctx.ctx = ctx;
+
+ res = kunit_find_resource(test, __kunit_action_match, &match_ctx);
+ if (res) {
+ kunit_remove_resource(test, res);
+ /* We have to put() this here, else free won't be called. */
+ kunit_put_resource(res);
+ }
+}
+EXPORT_SYMBOL_GPL(kunit_release_action);
--
2.41.0.rc0.172.g3f132b7071-goog
Many uses of the KUnit resource system are intended to simply defer
calling a function until the test exits (be it due to success or
failure). The existing kunit_alloc_resource() function is often used for
this, but was awkward to use (requiring passing NULL init functions, etc),
and returned a resource without incrementing its reference count, which
-- while okay for this use-case -- could cause problems in others.
Instead, introduce a simple kunit_add_action() API: a simple function
(returning nothing, accepting a single void* argument) can be scheduled
to be called when the test exits. Deferred actions are called in the
opposite order to that which they were registered.
This mimics the devres API, devm_add_action(), and also provides
kunit_remove_action(), to cancel a deferred action, and
kunit_release_action() to trigger one early.
This is implemented as a resource under the hood, so the ordering
between resource cleanup and deferred functions is maintained.
Reviewed-by: Benjamin Berg <benjamin.berg(a)intel.com>
Reviewed-by: Maxime Ripard <maxime(a)cerno.tech>
Tested-by: Maxime Ripard <maxime(a)cerno.tech>
Signed-off-by: David Gow <davidgow(a)google.com>
---
Changes since v1:
https://lore.kernel.org/linux-kselftest/20230421084226.2278282-2-davidgow@g…
- Some small documentation updates (Thanks Daniel)
- Reinstate a typedef for the action function.
- This time, it's called kunit_action_t
- Thanks Maxime!
Changes since RFC v2:
https://lore.kernel.org/linux-kselftest/20230331080411.981038-2-davidgow@go…
- Got rid of internal_gfp
- everything uses GFP_KERNEL now
- This includes kunit_kzalloc() and friends, which still allocate the
returned memory with the provided GFP, but use GFP_KERNEL for
internal bookkeeping data.
- Thanks Maxime & Benjamin!
- Got rid of cancellation tokens.
- kunit_add_action() now returns 0 on success, otherwise an error
- Note that this can quite easily lead to a memory leak, so look at
kunit_add_action_or_reset()
- Thanks Maxime & Benjamin!
- Added kunit_add_action_or_reset
- Matches devm_add_action_or_reset()
- Returns 0 on success.
- Thanks Maxime & Benjamin!
- Got rid of the kunit_defer_func_t typedef.
- I liked it, but it is probably pushing the boundaries of kernel
style.
- Use (void (*)(void *)) instead.
Changes since RFC v1:
https://lore.kernel.org/linux-kselftest/20230325043104.3761770-2-davidgow@g…
- Rename functions to better match the devm_* APIs. (Thanks Maxime)
- Embed the kunit_resource in struct kunit_action_ctx to avoid an extra
allocation (Thanks Benjamin)
- Use 'struct kunit_action_ctx' as the type for cancellation tokens
(Thanks Benjamin)
- Add tests.
---
include/kunit/resource.h | 92 +++++++++++++++++++++++++++++++++++++
lib/kunit/kunit-test.c | 88 ++++++++++++++++++++++++++++++++++-
lib/kunit/resource.c | 99 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 278 insertions(+), 1 deletion(-)
diff --git a/include/kunit/resource.h b/include/kunit/resource.h
index c0d88b318e90..b64eb783b1bc 100644
--- a/include/kunit/resource.h
+++ b/include/kunit/resource.h
@@ -387,4 +387,96 @@ static inline int kunit_destroy_named_resource(struct kunit *test,
*/
void kunit_remove_resource(struct kunit *test, struct kunit_resource *res);
+/* A 'deferred action' function to be used with kunit_add_action. */
+typedef void (kunit_action_t)(void *);
+
+/**
+ * kunit_add_action() - Call a function when the test ends.
+ * @test: Test case to associate the action with.
+ * @func: The function to run on test exit
+ * @ctx: Data passed into @func
+ *
+ * Defer the execution of a function until the test exits, either normally or
+ * due to a failure. @ctx is passed as additional context. All functions
+ * registered with kunit_add_action() will execute in the opposite order to that
+ * they were registered in.
+ *
+ * This is useful for cleaning up allocated memory and resources, as these
+ * functions are called even if the test aborts early due to, e.g., a failed
+ * assertion.
+ *
+ * See also: devm_add_action() for the devres equivalent.
+ *
+ * Returns:
+ * 0 on success, an error if the action could not be deferred.
+ */
+int kunit_add_action(struct kunit *test, kunit_action_t *action, void *ctx);
+
+/**
+ * kunit_add_action_or_reset() - Call a function when the test ends.
+ * @test: Test case to associate the action with.
+ * @func: The function to run on test exit
+ * @ctx: Data passed into @func
+ *
+ * Defer the execution of a function until the test exits, either normally or
+ * due to a failure. @ctx is passed as additional context. All functions
+ * registered with kunit_add_action() will execute in the opposite order to that
+ * they were registered in.
+ *
+ * This is useful for cleaning up allocated memory and resources, as these
+ * functions are called even if the test aborts early due to, e.g., a failed
+ * assertion.
+ *
+ * If the action cannot be created (e.g., due to the system being out of memory),
+ * then action(ctx) will be called immediately, and an error will be returned.
+ *
+ * See also: devm_add_action_or_reset() for the devres equivalent.
+ *
+ * Returns:
+ * 0 on success, an error if the action could not be deferred.
+ */
+int kunit_add_action_or_reset(struct kunit *test, kunit_action_t *action,
+ void *ctx);
+
+/**
+ * kunit_remove_action() - Cancel a matching deferred action.
+ * @test: Test case the action is associated with.
+ * @func: The deferred function to cancel.
+ * @ctx: The context passed to the deferred function to trigger.
+ *
+ * Prevent an action deferred via kunit_add_action() from executing when the
+ * test terminates.
+ *
+ * If the function/context pair was deferred multiple times, only the most
+ * recent one will be cancelled.
+ *
+ * See also: devm_remove_action() for the devres equivalent.
+ */
+void kunit_remove_action(struct kunit *test,
+ kunit_action_t *action,
+ void *ctx);
+
+/**
+ * kunit_release_action() - Run a matching action call immediately.
+ * @test: Test case the action is associated with.
+ * @func: The deferred function to trigger.
+ * @ctx: The context passed to the deferred function to trigger.
+ *
+ * Execute a function deferred via kunit_add_action()) immediately, rather than
+ * when the test ends.
+ *
+ * If the function/context pair was deferred multiple times, it will only be
+ * executed once here. The most recent deferral will no longer execute when
+ * the test ends.
+ *
+ * kunit_release_action(test, func, ctx);
+ * is equivalent to
+ * func(ctx);
+ * kunit_remove_action(test, func, ctx);
+ *
+ * See also: devm_release_action() for the devres equivalent.
+ */
+void kunit_release_action(struct kunit *test,
+ kunit_action_t *action,
+ void *ctx);
#endif /* _KUNIT_RESOURCE_H */
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index 42e44caa1bdd..83d8e90ca7a2 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -112,7 +112,7 @@ struct kunit_test_resource_context {
struct kunit test;
bool is_resource_initialized;
int allocate_order[2];
- int free_order[2];
+ int free_order[4];
};
static int fake_resource_init(struct kunit_resource *res, void *context)
@@ -403,6 +403,88 @@ static void kunit_resource_test_named(struct kunit *test)
KUNIT_EXPECT_TRUE(test, list_empty(&test->resources));
}
+static void increment_int(void *ctx)
+{
+ int *i = (int *)ctx;
+ (*i)++;
+}
+
+static void kunit_resource_test_action(struct kunit *test)
+{
+ int num_actions = 0;
+
+ kunit_add_action(test, increment_int, &num_actions);
+ KUNIT_EXPECT_EQ(test, num_actions, 0);
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 1);
+
+ /* Once we've cleaned up, the action queue is empty. */
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 1);
+
+ /* Check the same function can be deferred multiple times. */
+ kunit_add_action(test, increment_int, &num_actions);
+ kunit_add_action(test, increment_int, &num_actions);
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 3);
+}
+static void kunit_resource_test_remove_action(struct kunit *test)
+{
+ int num_actions = 0;
+
+ kunit_add_action(test, increment_int, &num_actions);
+ KUNIT_EXPECT_EQ(test, num_actions, 0);
+
+ kunit_remove_action(test, increment_int, &num_actions);
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 0);
+}
+static void kunit_resource_test_release_action(struct kunit *test)
+{
+ int num_actions = 0;
+
+ kunit_add_action(test, increment_int, &num_actions);
+ KUNIT_EXPECT_EQ(test, num_actions, 0);
+ /* Runs immediately on trigger. */
+ kunit_release_action(test, increment_int, &num_actions);
+ KUNIT_EXPECT_EQ(test, num_actions, 1);
+
+ /* Doesn't run again on test exit. */
+ kunit_cleanup(test);
+ KUNIT_EXPECT_EQ(test, num_actions, 1);
+}
+static void action_order_1(void *ctx)
+{
+ struct kunit_test_resource_context *res_ctx = (struct kunit_test_resource_context *)ctx;
+
+ KUNIT_RESOURCE_TEST_MARK_ORDER(res_ctx, free_order, 1);
+ kunit_log(KERN_INFO, current->kunit_test, "action_order_1");
+}
+static void action_order_2(void *ctx)
+{
+ struct kunit_test_resource_context *res_ctx = (struct kunit_test_resource_context *)ctx;
+
+ KUNIT_RESOURCE_TEST_MARK_ORDER(res_ctx, free_order, 2);
+ kunit_log(KERN_INFO, current->kunit_test, "action_order_2");
+}
+static void kunit_resource_test_action_ordering(struct kunit *test)
+{
+ struct kunit_test_resource_context *ctx = test->priv;
+
+ kunit_add_action(test, action_order_1, ctx);
+ kunit_add_action(test, action_order_2, ctx);
+ kunit_add_action(test, action_order_1, ctx);
+ kunit_add_action(test, action_order_2, ctx);
+ kunit_remove_action(test, action_order_1, ctx);
+ kunit_release_action(test, action_order_2, ctx);
+ kunit_cleanup(test);
+
+ /* [2 is triggered] [2], [(1 is cancelled)] [1] */
+ KUNIT_EXPECT_EQ(test, ctx->free_order[0], 2);
+ KUNIT_EXPECT_EQ(test, ctx->free_order[1], 2);
+ KUNIT_EXPECT_EQ(test, ctx->free_order[2], 1);
+}
+
static int kunit_resource_test_init(struct kunit *test)
{
struct kunit_test_resource_context *ctx =
@@ -434,6 +516,10 @@ static struct kunit_case kunit_resource_test_cases[] = {
KUNIT_CASE(kunit_resource_test_proper_free_ordering),
KUNIT_CASE(kunit_resource_test_static),
KUNIT_CASE(kunit_resource_test_named),
+ KUNIT_CASE(kunit_resource_test_action),
+ KUNIT_CASE(kunit_resource_test_remove_action),
+ KUNIT_CASE(kunit_resource_test_release_action),
+ KUNIT_CASE(kunit_resource_test_action_ordering),
{}
};
diff --git a/lib/kunit/resource.c b/lib/kunit/resource.c
index c414df922f34..f0209252b179 100644
--- a/lib/kunit/resource.c
+++ b/lib/kunit/resource.c
@@ -77,3 +77,102 @@ int kunit_destroy_resource(struct kunit *test, kunit_resource_match_t match,
return 0;
}
EXPORT_SYMBOL_GPL(kunit_destroy_resource);
+
+struct kunit_action_ctx {
+ struct kunit_resource res;
+ kunit_action_t *func;
+ void *ctx;
+};
+
+static void __kunit_action_free(struct kunit_resource *res)
+{
+ struct kunit_action_ctx *action_ctx = container_of(res, struct kunit_action_ctx, res);
+
+ action_ctx->func(action_ctx->ctx);
+}
+
+
+int kunit_add_action(struct kunit *test, void (*action)(void *), void *ctx)
+{
+ struct kunit_action_ctx *action_ctx;
+
+ KUNIT_ASSERT_NOT_NULL_MSG(test, action, "Tried to action a NULL function!");
+
+ action_ctx = kzalloc(sizeof(*action_ctx), GFP_KERNEL);
+ if (!action_ctx)
+ return -ENOMEM;
+
+ action_ctx->func = action;
+ action_ctx->ctx = ctx;
+
+ action_ctx->res.should_kfree = true;
+ /* As init is NULL, this cannot fail. */
+ __kunit_add_resource(test, NULL, __kunit_action_free, &action_ctx->res, action_ctx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kunit_add_action);
+
+int kunit_add_action_or_reset(struct kunit *test, void (*action)(void *),
+ void *ctx)
+{
+ int res = kunit_add_action(test, action, ctx);
+
+ if (res)
+ action(ctx);
+ return res;
+}
+EXPORT_SYMBOL_GPL(kunit_add_action_or_reset);
+
+static bool __kunit_action_match(struct kunit *test,
+ struct kunit_resource *res, void *match_data)
+{
+ struct kunit_action_ctx *match_ctx = (struct kunit_action_ctx *)match_data;
+ struct kunit_action_ctx *res_ctx = container_of(res, struct kunit_action_ctx, res);
+
+ /* Make sure this is a free function. */
+ if (res->free != __kunit_action_free)
+ return false;
+
+ /* Both the function and context data should match. */
+ return (match_ctx->func == res_ctx->func) && (match_ctx->ctx == res_ctx->ctx);
+}
+
+void kunit_remove_action(struct kunit *test,
+ kunit_action_t *action,
+ void *ctx)
+{
+ struct kunit_action_ctx match_ctx;
+ struct kunit_resource *res;
+
+ match_ctx.func = action;
+ match_ctx.ctx = ctx;
+
+ res = kunit_find_resource(test, __kunit_action_match, &match_ctx);
+ if (res) {
+ /* Remove the free function so we don't run the action. */
+ res->free = NULL;
+ kunit_remove_resource(test, res);
+ kunit_put_resource(res);
+ }
+}
+EXPORT_SYMBOL_GPL(kunit_remove_action);
+
+void kunit_release_action(struct kunit *test,
+ kunit_action_t *action,
+ void *ctx)
+{
+ struct kunit_action_ctx match_ctx;
+ struct kunit_resource *res;
+
+ match_ctx.func = action;
+ match_ctx.ctx = ctx;
+
+ res = kunit_find_resource(test, __kunit_action_match, &match_ctx);
+ if (res) {
+ kunit_remove_resource(test, res);
+ /* We have to put() this here, else free won't be called. */
+ kunit_put_resource(res);
+ }
+}
+EXPORT_SYMBOL_GPL(kunit_release_action);
--
2.40.1.698.g37aff9b760-goog
The basic idea here is to "simulate" memory poisoning for VMs. A VM
running on some host might encounter a memory error, after which some
page(s) are poisoned (i.e., future accesses SIGBUS). They expect that
once poisoned, pages can never become "un-poisoned". So, when we live
migrate the VM, we need to preserve the poisoned status of these pages.
When live migrating, we try to get the guest running on its new host as
quickly as possible. So, we start it running before all memory has been
copied, and before we're certain which pages should be poisoned or not.
So the basic way to use this new feature is:
- On the new host, the guest's memory is registered with userfaultfd, in
either MISSING or MINOR mode (doesn't really matter for this purpose).
- On any first access, we get a userfaultfd event. At this point we can
communicate with the old host to find out if the page was poisoned.
- If so, we can respond with a UFFDIO_SIGBUS - this places a swap marker
so any future accesses will SIGBUS. Because the pte is now "present",
future accesses won't generate more userfaultfd events, they'll just
SIGBUS directly.
UFFDIO_SIGBUS does not handle unmapping previously-present PTEs. This
isn't needed, because during live migration we want to intercept
all accesses with userfaultfd (not just writes, so WP mode isn't useful
for this). So whether minor or missing mode is being used (or both), the
PTE won't be present in any case, so handling that case isn't needed.
Signed-off-by: Axel Rasmussen <axelrasmussen(a)google.com>
---
fs/userfaultfd.c | 63 ++++++++++++++++++++++++++++++++
include/linux/swapops.h | 3 +-
include/linux/userfaultfd_k.h | 4 ++
include/uapi/linux/userfaultfd.h | 25 +++++++++++--
mm/memory.c | 4 ++
mm/userfaultfd.c | 62 ++++++++++++++++++++++++++++++-
6 files changed, 156 insertions(+), 5 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0fd96d6e39ce..edc2928dae2b 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1966,6 +1966,66 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
return ret;
}
+static inline int userfaultfd_sigbus(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_sigbus uffdio_sigbus;
+ struct uffdio_sigbus __user *user_uffdio_sigbus;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_sigbus = (struct uffdio_sigbus __user *)arg;
+
+ ret = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_sigbus, user_uffdio_sigbus,
+ /* don't copy the output fields */
+ sizeof(uffdio_sigbus) - (sizeof(__s64))))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_sigbus.range.start,
+ uffdio_sigbus.range.len);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ /* double check for wraparound just in case. */
+ if (uffdio_sigbus.range.start + uffdio_sigbus.range.len <=
+ uffdio_sigbus.range.start) {
+ goto out;
+ }
+ if (uffdio_sigbus.mode & ~UFFDIO_SIGBUS_MODE_DONTWAKE)
+ goto out;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_atomic_sigbus(ctx->mm, uffdio_sigbus.range.start,
+ uffdio_sigbus.range.len,
+ &ctx->mmap_changing, 0);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_sigbus->updated)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /* len == 0 would wake all */
+ BUG_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_sigbus.mode & UFFDIO_SIGBUS_MODE_DONTWAKE)) {
+ range.start = uffdio_sigbus.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_sigbus.range.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2067,6 +2127,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_CONTINUE:
ret = userfaultfd_continue(ctx, arg);
break;
+ case UFFDIO_SIGBUS:
+ ret = userfaultfd_sigbus(ctx, arg);
+ break;
}
return ret;
}
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 3a451b7afcb3..fa778a0ae730 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -405,7 +405,8 @@ typedef unsigned long pte_marker;
#define PTE_MARKER_UFFD_WP BIT(0)
#define PTE_MARKER_SWAPIN_ERROR BIT(1)
-#define PTE_MARKER_MASK (BIT(2) - 1)
+#define PTE_MARKER_UFFD_SIGBUS BIT(2)
+#define PTE_MARKER_MASK (BIT(3) - 1)
static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
{
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d78b01524349..6de1084939c5 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -46,6 +46,7 @@ enum mfill_atomic_mode {
MFILL_ATOMIC_COPY,
MFILL_ATOMIC_ZEROPAGE,
MFILL_ATOMIC_CONTINUE,
+ MFILL_ATOMIC_SIGBUS,
NR_MFILL_ATOMIC_MODES,
};
@@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long len, atomic_t *mmap_changing,
uffd_flags_t flags);
+extern ssize_t mfill_atomic_sigbus(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 66dd4cd277bd..616e33d3db97 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -39,7 +39,8 @@
UFFD_FEATURE_MINOR_SHMEM | \
UFFD_FEATURE_EXACT_ADDRESS | \
UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
- UFFD_FEATURE_WP_UNPOPULATED)
+ UFFD_FEATURE_WP_UNPOPULATED | \
+ UFFD_FEATURE_SIGBUS_IOCTL)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -49,12 +50,14 @@
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_ZEROPAGE | \
(__u64)1 << _UFFDIO_WRITEPROTECT | \
- (__u64)1 << _UFFDIO_CONTINUE)
+ (__u64)1 << _UFFDIO_CONTINUE | \
+ (__u64)1 << _UFFDIO_SIGBUS)
#define UFFD_API_RANGE_IOCTLS_BASIC \
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_WRITEPROTECT | \
(__u64)1 << _UFFDIO_CONTINUE | \
- (__u64)1 << _UFFDIO_WRITEPROTECT)
+ (__u64)1 << _UFFDIO_SIGBUS)
/*
* Valid ioctl command number range with this API is from 0x00 to
@@ -71,6 +74,7 @@
#define _UFFDIO_ZEROPAGE (0x04)
#define _UFFDIO_WRITEPROTECT (0x06)
#define _UFFDIO_CONTINUE (0x07)
+#define _UFFDIO_SIGBUS (0x08)
#define _UFFDIO_API (0x3F)
/* userfaultfd ioctl ids */
@@ -91,6 +95,8 @@
struct uffdio_writeprotect)
#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
struct uffdio_continue)
+#define UFFDIO_SIGBUS _IOWR(UFFDIO, _UFFDIO_SIGBUS, \
+ struct uffdio_sigbus)
/* read() structure */
struct uffd_msg {
@@ -225,6 +231,7 @@ struct uffdio_api {
#define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
+#define UFFD_FEATURE_SIGBUS_IOCTL (1<<14)
__u64 features;
__u64 ioctls;
@@ -321,6 +328,18 @@ struct uffdio_continue {
__s64 mapped;
};
+struct uffdio_sigbus {
+ struct uffdio_range range;
+#define UFFDIO_SIGBUS_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * Fields below here are written by the ioctl and must be at the end:
+ * the copy_from_user will not read past here.
+ */
+ __s64 updated;
+};
+
/*
* Flags for the userfaultfd(2) system call itself.
*/
diff --git a/mm/memory.c b/mm/memory.c
index f69fbc251198..e4b4207c2590 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3675,6 +3675,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
if (WARN_ON_ONCE(!marker))
return VM_FAULT_SIGBUS;
+ /* SIGBUS explicitly requested for this PTE. */
+ if (marker & PTE_MARKER_UFFD_SIGBUS)
+ return VM_FAULT_SIGBUS;
+
/* Higher priority than uffd-wp when data corrupted */
if (marker & PTE_MARKER_SWAPIN_ERROR)
return VM_FAULT_SIGBUS;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e97a0b4889fc..933587eebd5d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -278,6 +278,51 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
goto out;
}
+/* Handles UFFDIO_SIGBUS for all non-hugetlb VMAs. */
+static int mfill_atomic_pte_sigbus(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ uffd_flags_t flags)
+{
+ int ret;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+
+ _dst_pte = make_pte_marker(PTE_MARKER_UFFD_SIGBUS);
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+ if (vma_is_shmem(dst_vma)) {
+ struct inode *inode;
+ pgoff_t offset, max_off;
+
+ /* serialize against truncate with the page table lock */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_unlock;
+ }
+
+ ret = -EEXIST;
+ /*
+ * For now, we don't handle unmapping pages, so only support filling in
+ * none PTEs, or replacing PTE markers.
+ */
+ if (!pte_none_mostly(*dst_pte))
+ goto out_unlock;
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
+}
+
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
@@ -328,8 +373,12 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
* supported by hugetlb. A PMD_SIZE huge pages may exist as used
* by THP. Since we can not reliably insert a zero page, this
* feature is not supported.
+ *
+ * PTE marker handling for hugetlb is a bit special, so for now
+ * UFFDIO_SIGBUS is not supported.
*/
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE) ||
+ uffd_flags_mode_is(flags, MFILL_ATOMIC_SIGBUS)) {
mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -473,6 +522,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
return mfill_atomic_pte_continue(dst_pmd, dst_vma,
dst_addr, flags);
+ } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_SIGBUS)) {
+ return mfill_atomic_pte_sigbus(dst_pmd, dst_vma,
+ dst_addr, flags);
}
/*
@@ -694,6 +746,14 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
+ssize_t mfill_atomic_sigbus(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags)
+{
+ return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_SIGBUS));
+}
+
long uffd_wp_range(struct vm_area_struct *dst_vma,
unsigned long start, unsigned long len, bool enable_wp)
{
--
2.40.1.606.ga4b1b128d6-goog
*Changes in v15*
- Build fix (Add missed build fix in RESEND)
*Changes in v14*
- Fix build error caused by #ifdef added at last minute in some configs
*Changes in v13*
- Rebase on top of next-20230414
- Give-up on using uffd_wp_range() and write new helpers, flush tlb only
once
*Changes in v12*
- Update and other memory types to UFFD_FEATURE_WP_ASYNC
- Rebaase on top of next-20230406
- Review updates
*Changes in v11*
- Rebase on top of next-20230307
- Base patches on UFFD_FEATURE_WP_UNPOPULATED
- Do a lot of cosmetic changes and review updates
- Remove ENGAGE_WP + !GET operation as it can be performed with
UFFDIO_WRITEPROTECT
*Changes in v10*
- Add specific condition to return error if hugetlb is used with wp
async
- Move changes in tools/include/uapi/linux/fs.h to separate patch
- Add documentation
*Changes in v9:*
- Correct fault resolution for userfaultfd wp async
- Fix build warnings and errors which were happening on some configs
- Simplify pagemap ioctl's code
*Changes in v8:*
- Update uffd async wp implementation
- Improve PAGEMAP_IOCTL implementation
*Changes in v7:*
- Add uffd wp async
- Update the IOCTL to use uffd under the hood instead of soft-dirty
flags
*Motivation*
The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows
GetWriteWatch() syscall [1]. The GetWriteWatch{} retrieves the addresses of
the pages that are written to in a region of virtual memory.
This syscall is used in Windows applications and games etc. This syscall is
being emulated in pretty slow manner in userspace. Our purpose is to
enhance the kernel such that we translate it efficiently in a better way.
Currently some out of tree hack patches are being used to efficiently
emulate it in some kernels. We intend to replace those with these patches.
So the whole gaming on Linux can effectively get benefit from this. It
means there would be tons of users of this code.
CRIU use case [2] was mentioned by Andrei and Danylo:
> Use cases for migrating sparse VMAs are binaries sanitized with ASAN,
> MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of
> shadow memory [4]. Being able to migrate such binaries allows to highly
> reduce the amount of work needed to identify and fix post-migration
> crashes, which happen constantly.
Andrei's defines the following uses of this code:
* it is more granular and allows us to track changed pages more
effectively. The current interface can clear dirty bits for the entire
process only. In addition, reading info about pages is a separate
operation. It means we must freeze the process to read information
about all its pages, reset dirty bits, only then we can start dumping
pages. The information about pages becomes more and more outdated,
while we are processing pages. The new interface solves both these
downsides. First, it allows us to read pte bits and clear the
soft-dirty bit atomically. It means that CRIU will not need to freeze
processes to pre-dump their memory. Second, it clears soft-dirty bits
for a specified region of memory. It means CRIU will have actual info
about pages to the moment of dumping them.
* The new interface has to be much faster because basic page filtering
is happening in the kernel. With the old interface, we have to read
pagemap for each page.
*Implementation Evolution (Short Summary)*
From the definition of GetWriteWatch(), we feel like kernel's soft-dirty
feature can be used under the hood with some additions like:
* reset soft-dirty flag for only a specific region of memory instead of
clearing the flag for the entire process
* get and clear soft-dirty flag for a specific region atomically
So we decided to use ioctl on pagemap file to read or/and reset soft-dirty
flag. But using soft-dirty flag, sometimes we get extra pages which weren't
even written. They had become soft-dirty because of VMA merging and
VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were
able to by-pass this short coming by ignoring VM_SOFTDIRTY until David
reported that mprotect etc messes up the soft-dirty flag while ignoring
VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We
discussed if we can revert these patches. But we could not reach to any
conclusion. So at this point, I made couple of tries to solve this whole
VM_SOFTDIRTY issue by correcting the soft-dirty implementation:
* [7] Correct the bug fixed wrongly back in 2014. It had potential to cause
regression. We left it behind.
* [8] Keep a list of soft-dirty part of a VMA across splits and merges. I
got the reply don't increase the size of the VMA by 8 bytes.
At this point, we left soft-dirty considering it is too much delicate and
userfaultfd [9] seemed like the only way forward. From there onward, we
have been basing soft-dirty emulation on userfaultfd wp feature where
kernel resolves the faults itself when WP_ASYNC feature is used. It was
straight forward to add WP_ASYNC feature in userfautlfd. Now we get only
those pages dirty or written-to which are really written in reality. (PS
There is another WP_UNPOPULATED userfautfd feature is required which is
needed to avoid pre-faulting memory before write-protecting [9].)
All the different masks were added on the request of CRIU devs to create
interface more generic and better.
[1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-…
[2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com
[3] https://github.com/google/sanitizers
[4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit
[5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com
[6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/
[7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com
[10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com
* Original Cover letter from v8*
Hello,
Note:
Soft-dirty pages and pages which have been written-to are synonyms. As
kernel already has soft-dirty feature inside which we have given up to
use, we are using written-to terminology while using UFFD async WP under
the hood.
This IOCTL, PAGEMAP_SCAN on pagemap file can be used to get and/or clear
the info about page table entries. The following operations are
supported in this ioctl:
- Get the information if the pages have been written-to (PAGE_IS_WRITTEN),
file mapped (PAGE_IS_FILE), present (PAGE_IS_PRESENT) or swapped
(PAGE_IS_SWAPPED).
- Write-protect the pages (PAGEMAP_WP_ENGAGE) to start finding which
pages have been written-to.
- Find pages which have been written-to and write protect the pages
(atomic PAGE_IS_WRITTEN + PAGEMAP_WP_ENGAGE)
It is possible to find and clear soft-dirty pages entirely in userspace.
But it isn't efficient:
- The mprotect and SIGSEGV handler for bookkeeping
- The userfaultfd wp (synchronous) with the handler for bookkeeping
Some benchmarks can be seen here[1]. This series adds features that weren't
present earlier:
- There is no atomic get soft-dirty/Written-to status and clear present in
the kernel.
- The pages which have been written-to can not be found in accurate way.
(Kernel's soft-dirty PTE bit + sof_dirty VMA bit shows more soft-dirty
pages than there actually are.)
Historically, soft-dirty PTE bit tracking has been used in the CRIU
project. The procfs interface is enough for finding the soft-dirty bit
status and clearing the soft-dirty bit of all the pages of a process.
We have the use case where we need to track the soft-dirty PTE bit for
only specific pages on-demand. We need this tracking and clear mechanism
of a region of memory while the process is running to emulate the
getWriteWatch() syscall of Windows.
*(Moved to using UFFD instead of soft-dirtyi feature to find pages which
have been written-to from v7 patch series)*:
Stop using the soft-dirty flags for finding which pages have been
written to. It is too delicate and wrong as it shows more soft-dirty
pages than the actual soft-dirty pages. There is no interest in
correcting it [2][3] as this is how the feature was written years ago.
It shouldn't be updated to changed behaviour. Peter Xu has suggested
using the async version of the UFFD WP [4] as it is based inherently
on the PTEs.
So in this patch series, I've added a new mode to the UFFD which is
asynchronous version of the write protect. When this variant of the
UFFD WP is used, the page faults are resolved automatically by the
kernel. The pages which have been written-to can be found by reading
pagemap file (!PM_UFFD_WP). This feature can be used successfully to
find which pages have been written to from the time the pages were
write protected. This works just like the soft-dirty flag without
showing any extra pages which aren't soft-dirty in reality.
The information related to pages if the page is file mapped, present and
swapped is required for the CRIU project [5][6]. The addition of the
required mask, any mask, excluded mask and return masks are also required
for the CRIU project [5].
The IOCTL returns the addresses of the pages which match the specific
masks. The page addresses are returned in struct page_region in a compact
form. The max_pages is needed to support a use case where user only wants
to get a specific number of pages. So there is no need to find all the
pages of interest in the range when max_pages is specified. The IOCTL
returns when the maximum number of the pages are found. The max_pages is
optional. If max_pages is specified, it must be equal or greater than the
vec_size. This restriction is needed to handle worse case when one
page_region only contains info of one page and it cannot be compacted.
This is needed to emulate the Windows getWriteWatch() syscall.
The patch series include the detailed selftest which can be used as an
example for the uffd async wp test and PAGEMAP_IOCTL. It shows the
interface usages as well.
[1] https://lore.kernel.org/lkml/54d4c322-cd6e-eefd-b161-2af2b56aae24@collabora…
[2] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.…
[3] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.…
[4] https://lore.kernel.org/all/Y6Hc2d+7eTKs7AiH@x1n
[5] https://lore.kernel.org/all/YyiDg79flhWoMDZB@gmail.com/
[6] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com/
Regards,
Muhammad Usama Anjum
Muhammad Usama Anjum (4):
fs/proc/task_mmu: Implement IOCTL to get and optionally clear info
about PTEs
tools headers UAPI: Update linux/fs.h with the kernel sources
mm/pagemap: add documentation of PAGEMAP_SCAN IOCTL
selftests: mm: add pagemap ioctl tests
Peter Xu (1):
userfaultfd: UFFD_FEATURE_WP_ASYNC
Documentation/admin-guide/mm/pagemap.rst | 56 +
Documentation/admin-guide/mm/userfaultfd.rst | 35 +
fs/proc/task_mmu.c | 481 +++++++
fs/userfaultfd.c | 26 +-
include/linux/userfaultfd_k.h | 21 +-
include/uapi/linux/fs.h | 53 +
include/uapi/linux/userfaultfd.h | 9 +-
mm/hugetlb.c | 32 +-
mm/memory.c | 27 +-
tools/include/uapi/linux/fs.h | 53 +
tools/testing/selftests/mm/.gitignore | 1 +
tools/testing/selftests/mm/Makefile | 3 +-
tools/testing/selftests/mm/config | 1 +
tools/testing/selftests/mm/pagemap_ioctl.c | 1326 ++++++++++++++++++
tools/testing/selftests/mm/run_vmtests.sh | 4 +
15 files changed, 2105 insertions(+), 23 deletions(-)
create mode 100644 tools/testing/selftests/mm/pagemap_ioctl.c
mode change 100644 => 100755 tools/testing/selftests/mm/run_vmtests.sh
--
2.39.2
Hi,
On vanilla AlmaLinux 8.7 (CentOS fork) selftests/net/af_unix/diag_uid.c doesn't
compile out of the box, giving the errors:
make[2]: Entering directory '/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/net/af_unix'
gcc diag_uid.c -o /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/net/af_unix/diag_uid
diag_uid.c:36:16: error: ‘UDIAG_SHOW_UID’ undeclared here (not in a function); did you mean ‘UDIAG_SHOW_VFS’?
.udiag_show = UDIAG_SHOW_UID
^~~~~~~~~~~~~~
UDIAG_SHOW_VFS
In file included from diag_uid.c:17:
diag_uid.c: In function ‘render_response’:
diag_uid.c:128:28: error: ‘UNIX_DIAG_UID’ undeclared (first use in this function); did you mean ‘UNIX_DIAG_VFS’?
ASSERT_EQ(attr->rta_type, UNIX_DIAG_UID);
^~~~~~~~~~~~~
../../kselftest_harness.h:707:13: note: in definition of macro ‘__EXPECT’
__typeof__(_seen) __seen = (_seen); \
^~~~~
diag_uid.c:128:2: note: in expansion of macro ‘ASSERT_EQ’
ASSERT_EQ(attr->rta_type, UNIX_DIAG_UID);
^~~~~~~~~
diag_uid.c:128:28: note: each undeclared identifier is reported only once for each function it appears in
ASSERT_EQ(attr->rta_type, UNIX_DIAG_UID);
^~~~~~~~~~~~~
../../kselftest_harness.h:707:13: note: in definition of macro ‘__EXPECT’
__typeof__(_seen) __seen = (_seen); \
^~~~~
diag_uid.c:128:2: note: in expansion of macro ‘ASSERT_EQ’
ASSERT_EQ(attr->rta_type, UNIX_DIAG_UID);
^~~~~~~~~
make[2]: *** [../../lib.mk:147: /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/net/af_unix/diag_uid] Error 1
The correct value is in <uapi/linux/unix_diag.h>:
include/uapi/linux/unix_diag.h:23:#define UDIAG_SHOW_UID 0x00000040 /* show socket's UID */
The fix is as follows:
---
tools/testing/selftests/net/af_unix/diag_uid.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/tools/testing/selftests/net/af_unix/diag_uid.c b/tools/testing/selftests/net/af_unix/diag_uid.c
index 5b88f7129fea..66d75b646d35 100644
--- a/tools/testing/selftests/net/af_unix/diag_uid.c
+++ b/tools/testing/selftests/net/af_unix/diag_uid.c
@@ -16,6 +16,10 @@
#include "../../kselftest_harness.h"
+#ifndef UDIAG_SHOW_UID
+#define UDIAG_SHOW_UID 0x00000040 /* show socket's UID */
+#endif
+
FIXTURE(diag_uid)
{
int netlink_fd;
--
However, this patch reveals another undefined value:
make[2]: Entering directory '/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/net/af_unix'
gcc diag_uid.c -o /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/net/af_unix/diag_uid
In file included from diag_uid.c:17:
diag_uid.c: In function ‘render_response’:
diag_uid.c:132:28: error: ‘UNIX_DIAG_UID’ undeclared (first use in this function); did you mean ‘UNIX_DIAG_VFS’?
ASSERT_EQ(attr->rta_type, UNIX_DIAG_UID);
^~~~~~~~~~~~~
../../kselftest_harness.h:707:13: note: in definition of macro ‘__EXPECT’
__typeof__(_seen) __seen = (_seen); \
^~~~~
diag_uid.c:132:2: note: in expansion of macro ‘ASSERT_EQ’
ASSERT_EQ(attr->rta_type, UNIX_DIAG_UID);
^~~~~~~~~
diag_uid.c:132:28: note: each undeclared identifier is reported only once for each function it appears in
ASSERT_EQ(attr->rta_type, UNIX_DIAG_UID);
^~~~~~~~~~~~~
../../kselftest_harness.h:707:13: note: in definition of macro ‘__EXPECT’
__typeof__(_seen) __seen = (_seen); \
^~~~~
diag_uid.c:132:2: note: in expansion of macro ‘ASSERT_EQ’
ASSERT_EQ(attr->rta_type, UNIX_DIAG_UID);
^~~~~~~~~
make[2]: *** [../../lib.mk:147: /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/net/af_unix/diag_uid] Error 1
Apparently, AlmaLinux 8.7 lacks this enum UNIX_DIAG_UID:
diff -u /usr/include/linux/unix_diag.h include/uapi/linux/unix_diag.h
--- /usr/include/linux/unix_diag.h 2023-05-16 13:47:51.000000000 +0200
+++ include/uapi/linux/unix_diag.h 2022-10-12 07:35:58.253481367 +0200
@@ -20,6 +20,7 @@
#define UDIAG_SHOW_ICONS 0x00000008 /* show pending connections */
#define UDIAG_SHOW_RQLEN 0x00000010 /* show skb receive queue len */
#define UDIAG_SHOW_MEMINFO 0x00000020 /* show memory info of a socket */
+#define UDIAG_SHOW_UID 0x00000040 /* show socket's UID */
struct unix_diag_msg {
__u8 udiag_family;
@@ -40,6 +41,7 @@
UNIX_DIAG_RQLEN,
UNIX_DIAG_MEMINFO,
UNIX_DIAG_SHUTDOWN,
+ UNIX_DIAG_UID,
__UNIX_DIAG_MAX,
};
Now, this is a change in enums and there doesn't seem to an easy way out
here. (I think I saw an example, but I cannot recall which thread. I will do
more research.)
When I included
# gcc -I ../../../../include diag_uid.c
I've got the following error:
[marvin@pc-mtodorov linux_torvalds]$ cd tools/testing/selftests/net/af_unix/
[marvin@pc-mtodorov af_unix]$ gcc -I ../../../../../include diag_uid.c -o
/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/net/af_unix/diag_uid
In file included from ../../../../../include/linux/build_bug.h:5,
from ../../../../../include/linux/bits.h:21,
from ../../../../../include/linux/capability.h:18,
from ../../../../../include/linux/netlink.h:6,
from diag_uid.c:8:
../../../../../include/linux/compiler.h:246:10: fatal error: asm/rwonce.h: No such file or directory
#include <asm/rwonce.h>
^~~~~~~~~~~~~~
compilation terminated.
[marvin@pc-mtodorov af_unix]$
At this point I gave up, as it would be an overkill to change kernel system
header to make a test pass, and this probably wouldn't be accepted upsteam?
Hope this helps. (If we still want to build on CentOS/AlmaLinux/Rocky 8?)
Best regards,
Mirsad
--
Mirsad Goran Todorovac
Sistem inženjer
Grafički fakultet | Akademija likovnih umjetnosti
Sveučilište u Zagrebu
System engineer
Faculty of Graphic Arts | Academy of Fine Arts
University of Zagreb, Republic of Croatia