On Mon, Sep 27, 2021 at 09:37:58AM -0700, Luis Chamberlain wrote:
This extends test_sysfs with support for using the failure injection wait completion and knobs to force a few race conditions which demonstrates that kernfs active reference protection is sufficient for kobject / device protection at higher layers.
This adds 4 new tests which tries to remove the device attribute store operation in 4 different situations:
- at the start of kernfs_kernfs_fop_write_iter()
- before the of->mutex is held in kernfs_kernfs_fop_write_iter()
- after the of->mutex is held in kernfs_kernfs_fop_write_iter()
- after the kernfs node active reference is taken
A write fails in call cases except the last one, test number #32. There is a good explanation for this: *once* kernfs_get_active() gets called we have a guarantee that the kernfs entry cannot be removed. If kernfs_get_active() succeeds that entry cannot be removed and so anything trying to remove that entry will have to wait. It is perhaps not obvious but since a sysfs write will trigger eventually a kernfs_get_active() call, and *only* if this succeeds will the sysfs op be called, this and the fact that you cannot remove the kernfs entry while the kenfs entry is active implies that a module that created the respective sysfs / kernfs entry *cannot* possibly be removed during a sysfs operation. And test number 32 provides us with proof of this. If it were not true test #32 should crash.
No null dereferences are reproduced, even though this has been observed in some complex testing cases [0]. If this issue really exists we should have enough tools on the sysfs_test toolbox now to try to reproduce this easily without having to poke around other drivers. It very likley was the case that the issue reported [0] was possibly a side issue after the first bug which was zram specific. This is why it is important to isolate the issue and try to reproduce it in a generic form using the test_sysfs driver.
[0] https://lkml.kernel.org/r/20210623215007.862787-1-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain mcgrof@kernel.org
lib/Kconfig.debug | 3 + lib/test_sysfs.c | 31 +++++ tools/testing/selftests/sysfs/config | 3 + tools/testing/selftests/sysfs/sysfs.sh | 175 +++++++++++++++++++++++++ 4 files changed, 212 insertions(+)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a29b7d398c4e..176b822654e5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2358,6 +2358,9 @@ config TEST_SYSFS depends on SYSFS depends on NET depends on BLOCK
- select FAULT_INJECTION
- select FAULT_INJECTION_DEBUG_FS
- select FAIL_KERNFS_KNOBS
I don't like seeing "select" for user-configurable CONFIGs -- things tend to end up weird. This should simply be:
depends on FAIL_KERNFS_KNOBS
help This builds the "test_sysfs" module. This driver enables to test the sysfs file system safely without affecting production knobs which diff --git a/lib/test_sysfs.c b/lib/test_sysfs.c index 2043ca494af8..c6e62de61403 100644 --- a/lib/test_sysfs.c +++ b/lib/test_sysfs.c @@ -38,6 +38,11 @@ #include <linux/rtnetlink.h> #include <linux/genhd.h> #include <linux/blkdev.h> +#include <linux/kernfs.h>
+#ifdef CONFIG_FAIL_KERNFS_KNOBS
This isn't an optional config here (and following)?
+MODULE_IMPORT_NS(KERNFS_DEBUG_PRIVATE); +#endif static bool enable_lock; module_param(enable_lock, bool_enable_only, 0644); @@ -82,6 +87,13 @@ static bool enable_verbose_rmmod; module_param(enable_verbose_rmmod, bool_enable_only, 0644); MODULE_PARM_DESC(enable_verbose_rmmod, "enable verbose print messages on rmmod"); +#ifdef CONFIG_FAIL_KERNFS_KNOBS +static bool enable_completion_on_rmmod; +module_param(enable_completion_on_rmmod, bool_enable_only, 0644); +MODULE_PARM_DESC(enable_completion_on_rmmod,
"enable sending a kernfs completion on rmmod");
+#endif
static int sysfs_test_major; /** @@ -285,6 +297,12 @@ static ssize_t config_show(struct device *dev, "enable_verbose_writes:\t%s\n", enable_verbose_writes ? "true" : "false"); +#ifdef CONFIG_FAIL_KERNFS_KNOBS
- len += snprintf(buf+len, PAGE_SIZE - len,
"enable_completion_on_rmmod:\t%s\n",
enable_completion_on_rmmod ? "true" : "false");
+#endif
- test_dev_config_unlock(test_dev);
return len; @@ -904,10 +922,23 @@ static int __init test_sysfs_init(void) } module_init(test_sysfs_init); +#ifdef CONFIG_FAIL_KERNFS_KNOBS +/* The goal is to race our device removal with a pending kernfs -> store call */ +static void test_sysfs_kernfs_send_completion_rmmod(void) +{
- if (!enable_completion_on_rmmod)
return;
- complete(&kernfs_debug_wait_completion);
+} +#else +static inline void test_sysfs_kernfs_send_completion_rmmod(void) {} +#endif
static void __exit test_sysfs_exit(void) { if (enable_debugfs) debugfs_remove(debugfs_dir);
- test_sysfs_kernfs_send_completion_rmmod(); if (delay_rmmod_ms) msleep(delay_rmmod_ms); unregister_test_dev_sysfs(first_test_dev);
diff --git a/tools/testing/selftests/sysfs/config b/tools/testing/selftests/sysfs/config index 9196f452ecd5..2876a229f95b 100644 --- a/tools/testing/selftests/sysfs/config +++ b/tools/testing/selftests/sysfs/config @@ -1,2 +1,5 @@ CONFIG_SYSFS=m CONFIG_TEST_SYSFS=m +CONFIG_FAULT_INJECTION=y +CONFIG_FAULT_INJECTION_DEBUG_FS=y +CONFIG_FAIL_KERNFS_KNOBS=y diff --git a/tools/testing/selftests/sysfs/sysfs.sh b/tools/testing/selftests/sysfs/sysfs.sh index b3f4c2236c7f..f928635d0e35 100755 --- a/tools/testing/selftests/sysfs/sysfs.sh +++ b/tools/testing/selftests/sysfs/sysfs.sh @@ -62,6 +62,10 @@ ALL_TESTS="$ALL_TESTS 0025:1:1:test_dev_y:block" ALL_TESTS="$ALL_TESTS 0026:1:1:test_dev_y:block" ALL_TESTS="$ALL_TESTS 0027:1:0:test_dev_x:block" # deadlock test ALL_TESTS="$ALL_TESTS 0028:1:0:test_dev_x:block" # deadlock test with rntl_lock +ALL_TESTS="$ALL_TESTS 0029:1:1:test_dev_x:block" # kernfs race removal of store +ALL_TESTS="$ALL_TESTS 0030:1:1:test_dev_x:block" # kernfs race removal before mutex +ALL_TESTS="$ALL_TESTS 0031:1:1:test_dev_x:block" # kernfs race removal after mutex +ALL_TESTS="$ALL_TESTS 0032:1:1:test_dev_x:block" # kernfs race removal after active allow_user_defaults() { @@ -92,6 +96,9 @@ allow_user_defaults() if [ -z $SYSFS_DEBUGFS_DIR ]; then SYSFS_DEBUGFS_DIR="/sys/kernel/debug/test_sysfs" fi
- if [ -z $KERNFS_DEBUGFS_DIR ]; then
KERNFS_DEBUGFS_DIR="/sys/kernel/debug/kernfs"
- fi if [ -z $PAGE_SIZE ]; then PAGE_SIZE=$(getconf PAGESIZE) fi
@@ -167,6 +174,14 @@ modprobe_reset_enable_rtnl_lock_on_rmmod() unset FIRST_MODPROBE_ARGS } +modprobe_reset_enable_completion() +{
- FIRST_MODPROBE_ARGS="enable_completion_on_rmmod=1 enable_verbose_writes=1"
- FIRST_MODPROBE_ARGS="$FIRST_MODPROBE_ARGS enable_verbose_rmmod=1 delay_rmmod_ms=0"
- modprobe_reset
- unset FIRST_MODPROBE_ARGS
+}
load_req_mod() { modprobe_reset @@ -197,6 +212,63 @@ debugfs_reset_first_test_dev_ignore_errors() echo -n "1" >"$SYSFS_DEBUGFS_DIR"/reset_first_test_dev } +debugfs_kernfs_kernfs_fop_write_iter_exists() +{
- KNOB_DIR="${KERNFS_DEBUGFS_DIR}/config_fail_kernfs_fop_write_iter"
- if [[ ! -d $KNOB_DIR ]]; then
echo "kernfs debugfs does not exist $KNOB_DIR"
return 0;
- fi
- KNOB_DEBUGFS="${KERNFS_DEBUGFS_DIR}/fail_kernfs_fop_write_iter"
- if [[ ! -d $KNOB_DEBUGFS ]]; then
echo -n "kernfs debugfs for coniguring fail_kernfs_fop_write_iter "
echo "does not exist $KNOB_DIR"
return 0;
- fi
- return 1
+}
+debugfs_kernfs_kernfs_fop_write_iter_set_fail_once() +{
- KNOB_DEBUGFS="${KERNFS_DEBUGFS_DIR}/fail_kernfs_fop_write_iter"
- echo 1 > $KNOB_DEBUGFS/interval
- echo 100 > $KNOB_DEBUGFS/probability
- echo 0 > $KNOB_DEBUGFS/space
- # Disable verbose messages on the kernel ring buffer which may
- # confuse developers with a kernel panic.
- echo 0 > $KNOB_DEBUGFS/verbose
- # Fail only once
- echo 1 > $KNOB_DEBUGFS/times
+}
+debugfs_kernfs_kernfs_fop_write_iter_set_fail_never() +{
- KNOB_DEBUGFS="${KERNFS_DEBUGFS_DIR}/fail_kernfs_fop_write_iter"
- echo 0 > $KNOB_DEBUGFS/times
+}
+debugfs_kernfs_set_wait_ms() +{
- SLEEP_AFTER_WAIT_MS="${KERNFS_DEBUGFS_DIR}/sleep_after_wait_ms"
- echo $1 > $SLEEP_AFTER_WAIT_MS
+}
+debugfs_kernfs_disable_wait_kernfs_fop_write_iter() +{
- ENABLE_WAIT_KNOB="${KERNFS_DEBUGFS_DIR}/config_fail_kernfs_fop_write_iter/wait_"
- for KNOB in ${ENABLE_WAIT_KNOB}*; do
echo 0 > $KNOB
- done
+}
+debugfs_kernfs_enable_wait_kernfs_fop_write_iter() +{
- ENABLE_WAIT_KNOB="${KERNFS_DEBUGFS_DIR}/config_fail_kernfs_fop_write_iter/wait_$1"
- echo -n "1" > $ENABLE_WAIT_KNOB
- return $?
+}
set_orig() { if [[ ! -z $TARGET ]] && [[ ! -z $ORIG ]]; then @@ -972,6 +1044,105 @@ sysfs_test_0028() fi } +sysfs_race_kernfs_kernfs_fop_write_iter() +{
- TARGET="${DIR}/$(get_test_target $1)"
- WAIT_AT=$2
- EXPECT_WRITE_RETURNS=$3
- MSDELAY=$4
- modprobe_reset_enable_completion
- ORIG=$(cat "${TARGET}")
- TEST_STR=$(( $ORIG + 1 ))
- echo -n "Test racing removal of sysfs store op with kernfs $WAIT_AT ... "
- if debugfs_kernfs_kernfs_fop_write_iter_exists; then
echo -n "skipping test as CONFIG_FAIL_KERNFS_KNOBS "
echo " or CONFIG_FAULT_INJECTION_DEBUG_FS is disabled"
return $ksft_skip
- fi
- # Allow for failing the kernfs_kernfs_fop_write_iter call once,
- # we'll provide exact context shortly afterwards.
- debugfs_kernfs_kernfs_fop_write_iter_set_fail_once
- # First disable all waits
- debugfs_kernfs_disable_wait_kernfs_fop_write_iter
- # Enable a wait_for_completion(&kernfs_debug_wait_completion) at the
- # specified location inside the kernfs_fop_write_iter() routine
- debugfs_kernfs_enable_wait_kernfs_fop_write_iter $WAIT_AT
- # Configure kernfs so that after its wait_for_completion() it
- # will msleep() this amount of time and schedule(). We figure this
- # will be sufficient time to allow for our module removal to complete.
- debugfs_kernfs_set_wait_ms $MSDELAY
- # Now we trigger a kernfs write op, which will run kernfs_fop_write_iter,
- # but will wait until our driver sends a respective completion
- set_test_ignore_errors &
- write_pid=$!
- # At this point kernfs_fop_write_iter() hasn't run our op, its
- # waiting for our completion at the specified time $WAIT_AT.
- # We now remove our module which will send a
- # complete(&kernfs_debug_wait_completion) right before we deregister
- # our device and the sysfs device attributes are removed.
- #
- # After the completion is sent, the test_sysfs driver races with
- # kernfs to do the device deregistration with the kernfs msleep
- # and schedule(). This should mean we've forced trying to remove the
- # module prior to allowing kernfs to run our store operation. If the
- # race did happen we'll panic with a null dereference on the store op.
- #
- # If no race happens we should see no write operation triggered.
- modprobe -r $TEST_DRIVER > /dev/null 2>&1
- debugfs_kernfs_kernfs_fop_write_iter_set_fail_never
- wait $write_pid
- if [[ $? -eq $EXPECT_WRITE_RETURNS ]]; then
echo "ok"
- else
echo "FAIL" >&2
- fi
+}
+sysfs_test_0029() +{
- for delay in 0 2 4 8 16 32 64 128 246 512 1024; do
echo "Using delay-after-completion: $delay"
sysfs_race_kernfs_kernfs_fop_write_iter 0029 at_start 1 $delay
- done
+}
+sysfs_test_0030() +{
- for delay in 0 2 4 8 16 32 64 128 246 512 1024; do
echo "Using delay-after-completion: $delay"
sysfs_race_kernfs_kernfs_fop_write_iter 0030 before_mutex 1 $delay
- done
+}
+sysfs_test_0031() +{
- for delay in 0 2 4 8 16 32 64 128 246 512 1024; do
echo "Using delay-after-completion: $delay"
sysfs_race_kernfs_kernfs_fop_write_iter 0031 after_mutex 1 $delay
- done
+}
+# A write only succeeds *iff* a module removal happens *after* the +# kernfs active reference is obtained with kernfs_get_active(). +sysfs_test_0032() +{
- for delay in 0 2 4 8 16 32 64 128 246 512 1024; do
echo "Using delay-after-completion: $delay"
sysfs_race_kernfs_kernfs_fop_write_iter 0032 after_active 0 $delay
- done
+}
test_gen_desc() { echo -n "$1 x $(get_test_count $1)" @@ -1013,6 +1184,10 @@ list_tests() echo "$(test_gen_desc 0026) - block test writing y larger delay and resetting device" echo "$(test_gen_desc 0027) - test rmmod deadlock while writing x ... " echo "$(test_gen_desc 0028) - test rmmod deadlock using rtnl_lock while writing x ..."
- echo "$(test_gen_desc 0029) - racing removal of store op with kernfs at start"
- echo "$(test_gen_desc 0030) - racing removal of store op with kernfs before mutex"
- echo "$(test_gen_desc 0031) - racing removal of store op with kernfs after mutex"
- echo "$(test_gen_desc 0032) - racing removal of store op with kernfs after active"
} usage() -- 2.30.2