November 2023 - Linux-kselftest-mirror

[PATCH v3] selftests/ftrace: traceonoff_triggers: strip off names

by Yipeng Zou

The func_traceonoff_triggers.tc sometimes goes to fail on my board, Kunpeng-920. [root@localhost]# ./ftracetest ./test.d/ftrace/func_traceonoff_triggers.tc -l fail.log === Ftrace unit tests === [1] ftrace - test for function traceon/off triggers [FAIL] [2] (instance) ftrace - test for function traceon/off triggers [UNSUPPORTED] I look up the log, and it shows that the md5sum is different between csum1 and csum2. ++ cnt=611 ++ sleep .1 +++ cnt_trace +++ grep -v '^#' trace +++ wc -l ++ cnt2=611 ++ '[' 611 -ne 611 ']' +++ cat tracing_on ++ on=0 ++ '[' 0 '!=' 0 ']' +++ md5sum trace ++ csum1='76896aa74362fff66a6a5f3cf8a8a500 trace' ++ sleep .1 +++ md5sum trace ++ csum2='ee8625a21c058818fc26e45c1ed3f6de trace' ++ '[' '76896aa74362fff66a6a5f3cf8a8a500 trace' '!=' 'ee8625a21c058818fc26e45c1ed3f6de trace' ']' ++ fail 'Tracing file is still changing' ++ echo Tracing file is still changing Tracing file is still changing ++ exit_fail ++ exit 1 So I directly dump the trace file before md5sum, the diff shows that: [root@localhost]# diff trace_1.log trace_2.log -y --suppress-common-lines dockerd-12285 [036] d.... 18385.510290: sched_stat | <...>-12285 [036] d.... 18385.510290: sched_stat dockerd-12285 [036] d.... 18385.510291: sched_swit | <...>-12285 [036] d.... 18385.510291: sched_swit <...>-740 [044] d.... 18385.602859: sched_stat | kworker/44:1-740 [044] d.... 18385.602859: sched_stat <...>-740 [044] d.... 18385.602860: sched_swit | kworker/44:1-740 [044] d.... 18385.602860: sched_swit And we can see that <...> filed be filled with names. We can strip off the names there to fix that. After strip off the names: kworker/u257:0-12 [019] d..2. 2528.758910: sched_stat | -12 [019] d..2. 2528.758910: sched_stat_runtime: comm=k kworker/u257:0-12 [019] d..2. 2528.758912: sched_swit | -12 [019] d..2. 2528.758912: sched_switch: prev_comm=kw <idle>-0 [000] d.s5. 2528.762318: sched_waki | -0 [000] d.s5. 2528.762318: sched_waking: comm=sshd pi <idle>-0 [037] dNh2. 2528.762326: sched_wake | -0 [037] dNh2. 2528.762326: sched_wakeup: comm=sshd pi <idle>-0 [037] d..2. 2528.762334: sched_swit | -0 [037] d..2. 2528.762334: sched_switch: prev_comm=sw Fixes: d87b29179aa0 ("selftests: ftrace: Use md5sum to take less time of checking logs") Suggested-by: Steven Rostedt (Google) <rostedt(a)goodmis.org> Signed-off-by: Yipeng Zou <zouyipeng(a)huawei.com> Acked-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org> Reviewed-by: Steven Rostedt (Google) <rostedt(a)goodmis.org> --- .../ftrace/test.d/ftrace/func_traceonoff_triggers.tc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc index aee22289536b..1b57771dbfdf 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc @@ -90,9 +90,10 @@ if [ $on != "0" ]; then fail "Tracing is not off" fi -csum1=`md5sum trace` +# Cannot rely on names being around as they are only cached, strip them +csum1=`cat trace | sed -e 's/^ *[^ ]*$-[0-9][0-9]*$/\1/' | md5sum` sleep $SLEEP_TIME -csum2=`md5sum trace` +csum2=`cat trace | sed -e 's/^ *[^ ]*$-[0-9][0-9]*$/\1/' | md5sum` if [ "$csum1" != "$csum2" ]; then fail "Tracing file is still changing" -- 2.34.1

1 year, 1 month

2
2
0 0

[PATCH] KVM: selftests: Use TAP in the steal_time test

by Thomas Huth

For easier use of the tests in automation and for having some status information for the user while the test is running, let's provide some TAP output in this test. Signed-off-by: Thomas Huth <thuth(a)redhat.com> --- NB: This patch does not use the interface from kselftest_harness.h since it is not very suitable for the for-loop in this patch. tools/testing/selftests/kvm/steal_time.c | 46 ++++++++++++------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 171adfb2a6cb..aa6149eb9ea1 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -81,20 +81,18 @@ static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - int i; - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" steal: %lld\n", st->steal); - pr_info(" version: %d\n", st->version); - pr_info(" flags: %d\n", st->flags); - pr_info(" preempted: %d\n", st->preempted); - pr_info(" u8_pad: "); - for (i = 0; i < 3; ++i) - pr_info("%d", st->u8_pad[i]); - pr_info("\n pad: "); - for (i = 0; i < 11; ++i) - pr_info("%d", st->pad[i]); - pr_info("\n"); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" steal: %lld\n", st->steal); + ksft_print_msg(" version: %d\n", st->version); + ksft_print_msg(" flags: %d\n", st->flags); + ksft_print_msg(" preempted: %d\n", st->preempted); + ksft_print_msg(" u8_pad: %d %d %d\n", + st->u8_pad[0], st->u8_pad[1], st->u8_pad[2]); + ksft_print_msg(" pad: %d %d %d %d %d %d %d %d %d %d %d\n", + st->pad[0], st->pad[1], st->pad[2], st->pad[3], + st->pad[4], st->pad[5], st->pad[6], st->pad[7], + st->pad[8], st->pad[9], st->pad[10]); } #elif defined(__aarch64__) @@ -197,10 +195,10 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - pr_info("VCPU%d:\n", vcpu_idx); - pr_info(" rev: %d\n", st->rev); - pr_info(" attr: %d\n", st->attr); - pr_info(" st_time: %ld\n", st->st_time); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" rev: %d\n", st->rev); + ksft_print_msg(" attr: %d\n", st->attr); + ksft_print_msg(" st_time: %ld\n", st->st_time); } #endif @@ -267,7 +265,9 @@ int main(int ac, char **av) vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); + ksft_print_header(); TEST_REQUIRE(is_steal_time_supported(vcpus[0])); + ksft_set_plan(NR_VCPUS); /* Run test on each VCPU */ for (i = 0; i < NR_VCPUS; ++i) { @@ -308,14 +308,14 @@ int main(int ac, char **av) run_delay, stolen_time); if (verbose) { - pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i, - guest_stolen_time[i], stolen_time); - if (stolen_time == run_delay) - pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)"); - pr_info("\n"); + ksft_print_msg("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld%s\n", + i, guest_stolen_time[i], stolen_time, + stolen_time == run_delay ? + " (BONUS: guest test-stolen-time even exactly matches test-run_delay)" : ""); steal_time_dump(vm, i); } + ksft_test_result_pass("vcpu%d\n", i); } - return 0; + ksft_finished(); /* Print results and exit() accordingly */ } -- 2.41.0

1 year, 2 months

5
6
0 0

[PATCH 0/8] iommufd support pasid attach/replace

by Yi Liu

PASID (Process Address Space ID) is a PCIe extension to tag the DMA transactions out of a physical device, and most modern IOMMU hardware have supported PASID granular address translation. So a PASID-capable device can be attached to multiple hwpts (a.k.a. domains), each attachment is tagged with a pasid. This series first adds a missing iommu API to replace domain for a pasid, then adds iommufd APIs for device drivers to attach/replace/detach pasid to/from hwpt per userspace's request, and adds selftest to validate the iommufd APIs. pasid attach/replace is mandatory on Intel VT-d given the PASID table locates in the physical address space hence must be managed by the kernel, both for supporting vSVA and coming SIOV. But it's optional on ARM/AMD which allow configuring the PASID/CD table either in host physical address space or nested on top of an GPA address space. This series only add VT-d support as the minimal requirement. Complete code can be found in below link: https://github.com/yiliu1765/iommufd/tree/iommufd_pasid Change log: v1: - Implemnet iommu_replace_device_pasid() to fall back to the original domain if this replacement failed (Kevin) - Add check in do_attach() to check corressponding attach_fn per the pasid value. rfc: https://lore.kernel.org/linux-iommu/20230926092651.17041-1-yi.l.liu@intel.c… Regards, Yi Liu Kevin Tian (1): iommufd: Support attach/replace hwpt per pasid Lu Baolu (2): iommu: Introduce a replace API for device pasid iommu/vt-d: Add set_dev_pasid callback for nested domain Yi Liu (5): iommufd: replace attach_fn with a structure iommufd/selftest: Add set_dev_pasid and remove_dev_pasid in mock iommu iommufd/selftest: Add a helper to get test device iommufd/selftest: Add test ops to test pasid attach/detach iommufd/selftest: Add coverage for iommufd pasid attach/detach drivers/iommu/intel/nested.c | 47 +++++ drivers/iommu/iommu-priv.h | 2 + drivers/iommu/iommu.c | 82 ++++++-- drivers/iommu/iommufd/Makefile | 1 + drivers/iommu/iommufd/device.c | 50 +++-- drivers/iommu/iommufd/iommufd_private.h | 23 +++ drivers/iommu/iommufd/iommufd_test.h | 24 +++ drivers/iommu/iommufd/pasid.c | 138 ++++++++++++++ drivers/iommu/iommufd/selftest.c | 176 ++++++++++++++++-- include/linux/iommufd.h | 6 + tools/testing/selftests/iommu/iommufd.c | 172 +++++++++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 28 ++- tools/testing/selftests/iommu/iommufd_utils.h | 78 ++++++++ 13 files changed, 785 insertions(+), 42 deletions(-) create mode 100644 drivers/iommu/iommufd/pasid.c -- 2.34.1

1 year, 2 months

5
35
0 0

[PATCH v3 0/7] Split a folio to any lower order folios

by Zi Yan

From: Zi Yan <ziy(a)nvidia.com> Hi all, File folio supports any order and people would like to support flexible orders for anonymous folio[1] too. Currently, split_huge_page() only splits a huge page to order-0 pages, but splitting to orders higher than 0 is also useful. This patchset adds support for splitting a huge page to any lower order pages and uses it during file folio truncate operations. The patchset is on top of mm-everything-2023-03-27-21-20. Changelog === Since v2 --- 1. Fixed an issue in __split_page_owner() introduced during my rebase Since v1 --- 1. Changed split_page_memcg() and split_page_owner() parameter to use order 2. Used folio_test_pmd_mappable() in place of the equivalent code Details === * Patch 1 changes split_page_memcg() to use order instead of nr_pages * Patch 2 changes split_page_owner() to use order instead of nr_pages * Patch 3 and 4 add new_order parameter split_page_memcg() and split_page_owner() and prepare for upcoming changes. * Patch 5 adds split_huge_page_to_list_to_order() to split a huge page to any lower order. The original split_huge_page_to_list() calls split_huge_page_to_list_to_order() with new_order = 0. * Patch 6 uses split_huge_page_to_list_to_order() in large pagecache folio truncation instead of split the large folio all the way down to order-0. * Patch 7 adds a test API to debugfs and test cases in split_huge_page_test selftests. Comments and/or suggestions are welcome. [1] https://lore.kernel.org/linux-mm/Y%2FblF0GIunm+pRIC@casper.infradead.org/ Zi Yan (7): mm/memcg: use order instead of nr in split_page_memcg() mm/page_owner: use order instead of nr in split_page_owner() mm: memcg: make memcg huge page split support any order split. mm: page_owner: add support for splitting to any order in split page_owner. mm: thp: split huge page to any lower order pages. mm: truncate: split huge page cache page to a non-zero order if possible. mm: huge_memory: enable debugfs to split huge pages to any order. include/linux/huge_mm.h | 10 +- include/linux/memcontrol.h | 4 +- include/linux/page_owner.h | 10 +- mm/huge_memory.c | 137 ++++++++--- mm/memcontrol.c | 10 +- mm/page_alloc.c | 8 +- mm/page_owner.c | 8 +- mm/truncate.c | 21 +- .../selftests/mm/split_huge_page_test.c | 225 +++++++++++++++++- 9 files changed, 365 insertions(+), 68 deletions(-) -- 2.39.2

1 year, 3 months

6
22
0 0

[PATCH v2 0/7] Use TAP in some more x86 KVM selftests

by Thomas Huth

Here's a follow-up from my RFC series last year: https://lore.kernel.org/lkml/20221004093131.40392-1-thuth@redhat.com/T/ and from v1 earlier this year: https://lore.kernel.org/kvm/20230712075910.22480-1-thuth@redhat.com/ Basic idea of this series is now to use the kselftest_harness.h framework to get TAP output in the tests, so that it is easier for the user to see what is going on, and e.g. to be able to detect whether a certain test is part of the test binary or not (which is useful when tests get extended in the course of time). v2: - Dropped the "Rename the ASSERT_EQ macro" patch (already merged) - Split the fixes in the sync_regs_test into separate patches (see the first two patches) - Introduce the KVM_ONE_VCPU_TEST_SUITE() macro as suggested by Sean (see third patch) and use it in the following patches - Add a new patch to convert vmx_pmu_caps_test.c, too Thomas Huth (7): KVM: selftests: x86: sync_regs_test: Use vcpu_run() where appropriate KVM: selftests: x86: sync_regs_test: Get regs structure before modifying it KVM: selftests: Add a macro to define a test with one vcpu KVM: selftests: x86: Use TAP interface in the sync_regs test KVM: selftests: x86: Use TAP interface in the fix_hypercall test KVM: selftests: x86: Use TAP interface in the vmx_pmu_caps test KVM: selftests: x86: Use TAP interface in the userspace_msr_exit test .../selftests/kvm/include/kvm_test_harness.h | 35 +++++ .../selftests/kvm/x86_64/fix_hypercall_test.c | 27 ++-- .../selftests/kvm/x86_64/sync_regs_test.c | 121 +++++++++++++----- .../kvm/x86_64/userspace_msr_exit_test.c | 19 +-- .../selftests/kvm/x86_64/vmx_pmu_caps_test.c | 50 ++------ 5 files changed, 160 insertions(+), 92 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/kvm_test_harness.h -- 2.41.0

1 year, 4 months

2
9
0 0

[PATCH RFT v3 0/5] fork: Support shadow stacks in clone3()

by Mark Brown

The kernel has recently added support for shadow stacks, currently x86 only using their CET feature but both arm64 and RISC-V have equivalent features (GCS and Zicfiss respectively), I am actively working on GCS[1]. With shadow stacks the hardware maintains an additional stack containing only the return addresses for branch instructions which is not generally writeable by userspace and ensures that any returns are to the recorded addresses. This provides some protection against ROP attacks and making it easier to collect call stacks. These shadow stacks are allocated in the address space of the userspace process. Our API for shadow stacks does not currently offer userspace any flexiblity for managing the allocation of shadow stacks for newly created threads, instead the kernel allocates a new shadow stack with the same size as the normal stack whenever a thread is created with the feature enabled. The stacks allocated in this way are freed by the kernel when the thread exits or shadow stacks are disabled for the thread. This lack of flexibility and control isn't ideal, in the vast majority of cases the shadow stack will be over allocated and the implicit allocation and deallocation is not consistent with other interfaces. As far as I can tell the interface is done in this manner mainly because the shadow stack patches were in development since before clone3() was implemented. Since clone3() is readily extensible let's add support for specifying a shadow stack when creating a new thread or process in a similar manner to how the normal stack is specified, keeping the current implicit allocation behaviour if one is not specified either with clone3() or through the use of clone(). Unlike normal stacks only the shadow stack size is specified, similar issues to those that lead to the creation of map_shadow_stack() apply. Please note that the x86 portions of this code are build tested only, I don't appear to have a system that can run CET avaible to me, I have done testing with an integration into my pending work for GCS. There is some possibility that the arm64 implementation may require the use of clone3() and explicit userspace allocation of shadow stacks, this is still under discussion. A new architecture feature Kconfig option for shadow stacks is added as here, this was suggested as part of the review comments for the arm64 GCS series and since we need to detect if shadow stacks are supported it seemed sensible to roll it in here. [1] https://lore.kernel.org/r/20231009-arm64-gcs-v6-0-78e55deaa4dd@kernel.org/ Signed-off-by: Mark Brown <broonie(a)kernel.org> --- Changes in v3: - Rebase onto v6.7-rc2. - Remove stale shadow_stack in internal kargs. - If a shadow stack is specified unconditionally use it regardless of CLONE_ parameters. - Force enable shadow stacks in the selftest. - Update changelogs for RISC-V feature rename. - Link to v2: https://lore.kernel.org/r/20231114-clone3-shadow-stack-v2-0-b613f8681155@ke… Changes in v2: - Rebase onto v6.7-rc1. - Remove ability to provide preallocated shadow stack, just specify the desired size. - Link to v1: https://lore.kernel.org/r/20231023-clone3-shadow-stack-v1-0-d867d0b5d4d0@ke… --- Mark Brown (5): mm: Introduce ARCH_HAS_USER_SHADOW_STACK fork: Add shadow stack support to clone3() selftests/clone3: Factor more of main loop into test_clone3() selftests/clone3: Allow tests to flag if -E2BIG is a valid error code kselftest/clone3: Test shadow stack support arch/x86/Kconfig | 1 + arch/x86/include/asm/shstk.h | 11 +- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/shstk.c | 59 +++++-- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 2 +- include/linux/sched/task.h | 1 + include/uapi/linux/sched.h | 4 + kernel/fork.c | 22 ++- mm/Kconfig | 6 + tools/testing/selftests/clone3/clone3.c | 200 +++++++++++++++++----- tools/testing/selftests/clone3/clone3_selftests.h | 7 + 12 files changed, 250 insertions(+), 67 deletions(-) --- base-commit: 98b1cc82c4affc16f5598d4fa14b1858671b2263 change-id: 20231019-clone3-shadow-stack-15d40d2bf536 Best regards, -- Mark Brown <broonie(a)kernel.org>

1 year, 4 months

5
18
0 0

[PATCH v2] KVM: selftests: Fix the dirty_log_test semaphore imbalance

by Shaoqin Huang

When execute the dirty_log_test on some aarch64 machine, it sometimes trigger the ASSERT: ==== Test Assertion Failure ==== dirty_log_test.c:384: dirty_ring_vcpu_ring_full pid=14854 tid=14854 errno=22 - Invalid argument 1 0x00000000004033eb: dirty_ring_collect_dirty_pages at dirty_log_test.c:384 2 0x0000000000402d27: log_mode_collect_dirty_pages at dirty_log_test.c:505 3 (inlined by) run_test at dirty_log_test.c:802 4 0x0000000000403dc7: for_each_guest_mode at guest_modes.c:100 5 0x0000000000401dff: main at dirty_log_test.c:941 (discriminator 3) 6 0x0000ffff9be173c7: ?? ??:0 7 0x0000ffff9be1749f: ?? ??:0 8 0x000000000040206f: _start at ??:? Didn't continue vcpu even without ring full The dirty_log_test fails when execute the dirty-ring test, this is because the sem_vcpu_cont and the sem_vcpu_stop is non-zero value when execute the dirty_ring_collect_dirty_pages() function. When those two sem_t variables are non-zero, the dirty_ring_wait_vcpu() at the beginning of the dirty_ring_collect_dirty_pages() will not wait for the vcpu to stop, but continue to execute the following code. In this case, before vcpu stop, if the dirty_ring_vcpu_ring_full is true, and the dirty_ring_collect_dirty_pages() has passed the check for the dirty_ring_vcpu_ring_full but hasn't execute the check for the continued_vcpu, the vcpu stop, and set the dirty_ring_vcpu_ring_full to false. Then dirty_ring_collect_dirty_pages() will trigger the ASSERT. Why sem_vcpu_cont and sem_vcpu_stop can be non-zero value? It's because the dirty_ring_before_vcpu_join() execute the sem_post(&sem_vcpu_cont) at the end of each dirty-ring test. It can cause two cases: 1. sem_vcpu_cont be non-zero. When we set the host_quit to be true, the vcpu_worker directly see the host_quit to be true, it quit. So the log_mode_before_vcpu_join() function will set the sem_vcpu_cont to 1, since the vcpu_worker has quit, it won't consume it. 2. sem_vcpu_stop be non-zero. When we set the host_quit to be true, the vcpu_worker has entered the guest state, the next time it exit from guest state, it will set the sem_vcpu_stop to 1, and then see the host_quit, no one will consume the sem_vcpu_stop. When execute more and more dirty-ring tests, the sem_vcpu_cont and sem_vcpu_stop can be larger and larger, which makes many code paths don't wait for the sem_t. Thus finally cause the problem. To fix this problem, we can wait a while before set the host_quit to true, which gives the vcpu time to enter the guest state, so it will exit again. Then we can wait the vcpu to exit, and let it continue again, then the vcpu will see the host_quit. Thus the sem_vcpu_cont and sem_vcpu_stop will be both zero when test finished. Signed-off-by: Shaoqin Huang <shahuang(a)redhat.com> --- v1->v2: - Fix the real logic bug, not just fresh the context. v1: https://lore.kernel.org/all/20231116093536.22256-1-shahuang@redhat.com/ --- tools/testing/selftests/kvm/dirty_log_test.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 936f3a8d1b83..a6e0ff46a07c 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -417,7 +417,8 @@ static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) static void dirty_ring_before_vcpu_join(void) { - /* Kick another round of vcpu just to make sure it will quit */ + /* Wait vcpu exit, and let it continue to see the host_quit. */ + dirty_ring_wait_vcpu(); sem_post(&sem_vcpu_cont); } @@ -719,6 +720,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_vm *vm; unsigned long *bmap; uint32_t ring_buf_idx = 0; + int sem_val; if (!log_mode_supported()) { print_skip("Log mode '%s' not supported", @@ -726,6 +728,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) return; } + sem_getvalue(&sem_vcpu_stop, &sem_val); + assert(sem_val == 0); + sem_getvalue(&sem_vcpu_cont, &sem_val); + assert(sem_val == 0); + /* * We reserve page table for 2 times of extra dirty mem which * will definitely cover the original (1G+) test range. Here @@ -825,6 +832,13 @@ static void run_test(enum vm_guest_mode mode, void *arg) sync_global_to_guest(vm, iteration); } + /* + * + * Before we set the host_quit, let the vcpu has time to run, to make + * sure we consume the sem_vcpu_stop and the vcpu consume the + * sem_vcpu_cont, to keep the semaphore balance. + */ + usleep(p->interval * 1000); /* Tell the vcpu thread to quit */ host_quit = true; log_mode_before_vcpu_join(); -- 2.40.1

1 year, 4 months

2
4
0 0

[RFC PATCH v1 0/8] KVM: seftests: Support guest user mode execution and running

by Zeng Guang

This patch series give a proposal to support guest VM running in user mode and in canonical linear address organization as well. First design to parition the 64-bit canonical linear address space into two half parts belonging to user-mode and supervisor-mode respectively, similar as the organization of linear addresses used in linux OS. Currently the linear addresses use 48-bit canonical format in which bits 63:47 of the address are identical. Secondly setup page table mapping the same guest physical address of test code and data segment onto both user-mode and supervisor-mode address space. It allows guest in different runtime mode, i.e. user or supervisor, can run one code base in the corresponding linear address space. Also provide the runtime environment setup API for switching to user mode execution. Zeng Guang (8): KVM: selftests: x86: Fix bug in addr_arch_gva2gpa() KVM: selftests: x86: Support guest running on canonical linear-address organization KVM: selftests: Add virt_arch_ucall_prealloc() arch specific implementation KVM : selftests : Adapt selftest cases to kernel canonical linear address KVM: selftests: x86: Prepare setup for user mode support KVM: selftests: x86: Allow user to access user-mode address and I/O address space KVM: selftests: x86: Support vcpu run in user mode KVM: selftests: x86: Add KVM forced emulation prefix capability .../selftests/kvm/include/kvm_util_base.h | 20 ++- .../selftests/kvm/include/x86_64/processor.h | 48 ++++++- .../selftests/kvm/lib/aarch64/processor.c | 5 + tools/testing/selftests/kvm/lib/kvm_util.c | 6 +- .../selftests/kvm/lib/riscv/processor.c | 5 + .../selftests/kvm/lib/s390x/processor.c | 5 + .../testing/selftests/kvm/lib/ucall_common.c | 2 + .../selftests/kvm/lib/x86_64/processor.c | 117 ++++++++++++++---- .../selftests/kvm/set_memory_region_test.c | 13 +- .../testing/selftests/kvm/x86_64/debug_regs.c | 2 +- .../kvm/x86_64/userspace_msr_exit_test.c | 9 +- 11 files changed, 195 insertions(+), 37 deletions(-) -- 2.21.3

1 year, 4 months

2
11
0 0

[RFC PATCH 0/3] MAINTAINERS: Introduce V: field for required tests

by Nikolai Kondrashov

The idea of this RFC is to introduce a way to catalogue and document any tests that should be executed for changes to a subsystem, as well as to make checkpatch.pl require a tag in commit messages certifying they were, plus hopefully make it easier to discover and run them. This is following a discussion Veronika Kabatova started with a few (addressed) people at the LPC last year (IIRC), where there was a good deal of interest for something like this. Apart from implementing basic support (surely to be improved), two sample changes are added on top, adding a few test suites (roughly) based on what the maintainers described earlier. I'm definitely not qualified for describing them adequately, and don't have the time to dig deeper, but hopefully they could serve as illustrations, and shouldn't be merged as is. I would defer to maintainers of the corresponding subsystems and tests to describe their tests and requirements better. Although I would accept amendments too, if they prefer it that way. One bug I know that's definitely there is handling removed files. The scripts/get_maintainer.pl chokes on non-existing files, failing to output the required test suites (I'm sure there's a good reason, but I couldn't see it). My first idea is to only check for required tests upon encountering the '+++ <file>' line, and ignore the '/dev/null' file, but I hope the checkpatch.pl maintainers could recommend a better way. Anyway, tell me what you think, and I'll work on polishing this. Thank you! Nick --- Nikolai Kondrashov (3): MAINTAINERS: Introduce V: field for required tests MAINTAINERS: Require kvm-xfstests smoke for ext4 MAINTAINERS: Require kunit core tests for framework changes Documentation/process/submitting-patches.rst | 19 +++++ Documentation/process/tests.rst | 80 ++++++++++++++++++ MAINTAINERS | 8 ++ scripts/checkpatch.pl | 118 ++++++++++++++++++++++++++- scripts/get_maintainer.pl | 17 +++- scripts/parse-maintainers.pl | 3 +- 6 files changed, 241 insertions(+), 4 deletions(-) ---

1 year, 4 months

17
71
0 0

[PATCH v12] exec: Fix dead-lock in de_thread with ptrace_attach

by Bernd Edlinger

This introduces signal->exec_bprm, which is used to fix the case when at least one of the sibling threads is traced, and therefore the trace process may dead-lock in ptrace_attach, but de_thread will need to wait for the tracer to continue execution. The solution is to detect this situation and allow ptrace_attach to continue by temporarily releasing the cred_guard_mutex, while de_thread() is still waiting for traced zombies to be eventually released by the tracer. In the case of the thread group leader we only have to wait for the thread to become a zombie, which may also need co-operation from the tracer due to PTRACE_O_TRACEEXIT. When a tracer wants to ptrace_attach a task that already is in execve, we simply retry the ptrace_may_access check while temporarily installing the new credentials and dumpability which are about to be used after execve completes. If the ptrace_attach happens on a thread that is a sibling-thread of the thread doing execve, it is sufficient to check against the old credentials, as this thread will be waited for, before the new credentials are installed. Other threads die quickly since the cred_guard_mutex is released, but a deadly signal is already pending. In case the mutex_lock_killable misses the signal, the non-zero current->signal->exec_bprm makes sure they release the mutex immediately and return with -ERESTARTNOINTR. This means there is no API change, unlike the previous version of this patch which was discussed here: https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.d… See tools/testing/selftests/ptrace/vmaccess.c for a test case that gets fixed by this change. Note that since the test case was originally designed to test the ptrace_attach returning an error in this situation, the test expectation needed to be adjusted, to allow the API to succeed at the first attempt. Signed-off-by: Bernd Edlinger <bernd.edlinger(a)hotmail.de> --- fs/exec.c | 69 ++++++++++++++++------- fs/proc/base.c | 6 ++ include/linux/cred.h | 1 + include/linux/sched/signal.h | 18 ++++++ kernel/cred.c | 28 +++++++-- kernel/ptrace.c | 32 +++++++++++ kernel/seccomp.c | 12 +++- tools/testing/selftests/ptrace/vmaccess.c | 23 +++++--- 8 files changed, 155 insertions(+), 34 deletions(-) v10: Changes to previous version, make the PTRACE_ATTACH retun -EAGAIN, instead of execve return -ERESTARTSYS. Added some lessions learned to the description. v11: Check old and new credentials in PTRACE_ATTACH again without changing the API. Note: I got actually one response from an automatic checker to the v11 patch, https://lore.kernel.org/lkml/202107121344.wu68hEPF-lkp@intel.com/ which is complaining about: >> kernel/ptrace.c:425:26: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct cred const *old_cred @@ got struct cred const [noderef] __rcu *real_cred @@ 417 struct linux_binprm *bprm = task->signal->exec_bprm; 418 const struct cred *old_cred; 419 struct mm_struct *old_mm; 420 421 retval = down_write_killable(&task->signal->exec_update_lock); 422 if (retval) 423 goto unlock_creds; 424 task_lock(task); > 425 old_cred = task->real_cred; v12: Essentially identical to v11. - Fixed a minor merge conflict in linux v5.17, and fixed the above mentioned nit by adding __rcu to the declaration. - re-tested the patch with all linux versions from v5.11 to v6.6 v10 was an alternative approach which did imply an API change. But I would prefer to avoid such an API change. The difficult part is getting the right dumpability flags assigned before de_thread starts, hope you like this version. If not, the v10 is of course also acceptable. Thanks Bernd. diff --git a/fs/exec.c b/fs/exec.c index 2f2b0acec4f0..902d3b230485 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1041,11 +1041,13 @@ static int exec_mmap(struct mm_struct *mm) return 0; } -static int de_thread(struct task_struct *tsk) +static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; + struct task_struct *t = tsk; + bool unsafe_execve_in_progress = false; if (thread_group_empty(tsk)) goto no_thread_group; @@ -1068,6 +1070,19 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) sig->notify_count--; + while_each_thread(tsk, t) { + if (unlikely(t->ptrace) + && (t != tsk->group_leader || !t->exit_state)) + unsafe_execve_in_progress = true; + } + + if (unlikely(unsafe_execve_in_progress)) { + spin_unlock_irq(lock); + sig->exec_bprm = bprm; + mutex_unlock(&sig->cred_guard_mutex); + spin_lock_irq(lock); + } + while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); @@ -1158,6 +1173,11 @@ static int de_thread(struct task_struct *tsk) release_task(leader); } + if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + sig->group_exec_task = NULL; sig->notify_count = 0; @@ -1169,6 +1189,11 @@ static int de_thread(struct task_struct *tsk) return 0; killed: + if (unlikely(unsafe_execve_in_progress)) { + mutex_lock(&sig->cred_guard_mutex); + sig->exec_bprm = NULL; + } + /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; @@ -1253,6 +1278,24 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) return retval; + /* If the binary is not readable then enforce mm->dumpable=0 */ + would_dump(bprm, bprm->file); + if (bprm->have_execfd) + would_dump(bprm, bprm->executable); + + /* + * Figure out dumpability. Note that this checking only of current + * is wrong, but userspace depends on it. This should be testing + * bprm->secureexec instead. + */ + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || + is_dumpability_changed(current_cred(), bprm->cred) || + !(uid_eq(current_euid(), current_uid()) && + gid_eq(current_egid(), current_gid()))) + set_dumpable(bprm->mm, suid_dumpable); + else + set_dumpable(bprm->mm, SUID_DUMP_USER); + /* * Ensure all future errors are fatal. */ @@ -1261,7 +1304,7 @@ int begin_new_exec(struct linux_binprm * bprm) /* * Make this the only thread in the thread group. */ - retval = de_thread(me); + retval = de_thread(me, bprm); if (retval) goto out; @@ -1284,11 +1327,6 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out; - /* If the binary is not readable then enforce mm->dumpable=0 */ - would_dump(bprm, bprm->file); - if (bprm->have_execfd) - would_dump(bprm, bprm->executable); - /* * Release all of the old mmap stuff */ @@ -1350,18 +1388,6 @@ int begin_new_exec(struct linux_binprm * bprm) me->sas_ss_sp = me->sas_ss_size = 0; - /* - * Figure out dumpability. Note that this checking only of current - * is wrong, but userspace depends on it. This should be testing - * bprm->secureexec instead. - */ - if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || - !(uid_eq(current_euid(), current_uid()) && - gid_eq(current_egid(), current_gid()))) - set_dumpable(current->mm, suid_dumpable); - else - set_dumpable(current->mm, SUID_DUMP_USER); - perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true); @@ -1480,6 +1506,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c354..0da9adfadb48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2788,6 +2788,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free; + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + rv = -ERESTARTNOINTR; + goto out_free; + } + rv = security_setprocattr(PROC_I(inode)->op.lsm, file->f_path.dentry->d_name.name, page, count); diff --git a/include/linux/cred.h b/include/linux/cred.h index f923528d5cc4..b01e309f5686 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -159,6 +159,7 @@ extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); +extern bool is_dumpability_changed(const struct cred *, const struct cred *); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0014d3adaf84..14df7073a0a8 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -234,9 +234,27 @@ struct signal_struct { struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ + struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access + * against new credentials while + * de_thread is waiting for other + * traced threads to terminate. + * Set while de_thread is executing. + * The cred_guard_mutex is released + * after de_thread() has called + * zap_other_threads(), therefore + * a fatal signal is guaranteed to be + * already pending in the unlikely + * event, that + * current->signal->exec_bprm happens + * to be non-zero after the + * cred_guard_mutex was acquired. + */ + struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) + * Held while execve runs, except when + * a sibling thread is being traced. * Deprecated do not use in new code. * Use exec_update_lock instead. */ diff --git a/kernel/cred.c b/kernel/cred.c index 98cb4eca23fb..586cb6c7cf6b 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -433,6 +433,28 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) return false; } +/** + * is_dumpability_changed - Will changing creds from old to new + * affect the dumpability in commit_creds? + * + * Return: false - dumpability will not be changed in commit_creds. + * Return: true - dumpability will be changed to non-dumpable. + * + * @old: The old credentials + * @new: The new credentials + */ +bool is_dumpability_changed(const struct cred *old, const struct cred *new) +{ + if (!uid_eq(old->euid, new->euid) || + !gid_eq(old->egid, new->egid) || + !uid_eq(old->fsuid, new->fsuid) || + !gid_eq(old->fsgid, new->fsgid) || + !cred_cap_issubset(old, new)) + return true; + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -467,11 +489,7 @@ int commit_creds(struct cred *new) get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ - if (!uid_eq(old->euid, new->euid) || - !gid_eq(old->egid, new->egid) || - !uid_eq(old->fsuid, new->fsuid) || - !gid_eq(old->fsgid, new->fsgid) || - !cred_cap_issubset(old, new)) { + if (is_dumpability_changed(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 443057bee87c..eb1c450bb7d7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -20,6 +20,7 @@ #include <linux/pagemap.h> #include <linux/ptrace.h> #include <linux/security.h> +#include <linux/binfmts.h> #include <linux/signal.h> #include <linux/uio.h> #include <linux/audit.h> @@ -435,6 +436,28 @@ static int ptrace_attach(struct task_struct *task, long request, if (retval) goto unlock_creds; + if (unlikely(task->in_execve)) { + struct linux_binprm *bprm = task->signal->exec_bprm; + const struct cred __rcu *old_cred; + struct mm_struct *old_mm; + + retval = down_write_killable(&task->signal->exec_update_lock); + if (retval) + goto unlock_creds; + task_lock(task); + old_cred = task->real_cred; + old_mm = task->mm; + rcu_assign_pointer(task->real_cred, bprm->cred); + task->mm = bprm->mm; + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + rcu_assign_pointer(task->real_cred, old_cred); + task->mm = old_mm; + task_unlock(task); + up_write(&task->signal->exec_update_lock); + if (retval) + goto unlock_creds; + } + write_lock_irq(&tasklist_lock); retval = -EPERM; if (unlikely(task->exit_state)) @@ -508,6 +531,14 @@ static int ptrace_traceme(void) { int ret = -EPERM; + if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) + return -ERESTARTNOINTR; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + return -ERESTARTNOINTR; + } + write_lock_irq(&tasklist_lock); /* Are we already being traced? */ if (!current->ptrace) { @@ -523,6 +554,7 @@ static int ptrace_traceme(void) } } write_unlock_irq(&tasklist_lock); + mutex_unlock(&current->signal->cred_guard_mutex); return ret; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 255999ba9190..b29bbfa0b044 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1955,9 +1955,15 @@ static long seccomp_set_mode_filter(unsigned int flags, * Make sure we cannot change seccomp or nnp state via TSYNC * while another thread is in the middle of calling exec. */ - if (flags & SECCOMP_FILTER_FLAG_TSYNC && - mutex_lock_killable(&current->signal->cred_guard_mutex)) - goto out_put_fd; + if (flags & SECCOMP_FILTER_FLAG_TSYNC) { + if (mutex_lock_killable(&current->signal->cred_guard_mutex)) + goto out_put_fd; + + if (unlikely(current->signal->exec_bprm)) { + mutex_unlock(&current->signal->cred_guard_mutex); + goto out_put_fd; + } + } spin_lock_irq(&current->sighand->siglock); diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c index 4db327b44586..3b7d81fb99bb 100644 --- a/tools/testing/selftests/ptrace/vmaccess.c +++ b/tools/testing/selftests/ptrace/vmaccess.c @@ -39,8 +39,15 @@ TEST(vmaccess) f = open(mm, O_RDONLY); ASSERT_GE(f, 0); close(f); - f = kill(pid, SIGCONT); - ASSERT_EQ(f, 0); + f = waitpid(-1, NULL, 0); + ASSERT_NE(f, -1); + ASSERT_NE(f, 0); + ASSERT_NE(f, pid); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, pid); + f = waitpid(-1, NULL, 0); + ASSERT_EQ(f, -1); + ASSERT_EQ(errno, ECHILD); } TEST(attach) @@ -57,22 +64,24 @@ TEST(attach) sleep(1); k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); - ASSERT_EQ(errno, EAGAIN); - ASSERT_EQ(k, -1); + ASSERT_EQ(k, 0); k = waitpid(-1, &s, WNOHANG); ASSERT_NE(k, -1); ASSERT_NE(k, 0); ASSERT_NE(k, pid); ASSERT_EQ(WIFEXITED(s), 1); ASSERT_EQ(WEXITSTATUS(s), 0); - sleep(1); - k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGTRAP); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); ASSERT_EQ(WIFSTOPPED(s), 1); ASSERT_EQ(WSTOPSIG(s), SIGSTOP); - k = ptrace(PTRACE_DETACH, pid, 0L, 0L); + k = ptrace(PTRACE_CONT, pid, 0L, 0L); ASSERT_EQ(k, 0); k = waitpid(-1, &s, 0); ASSERT_EQ(k, pid); -- 2.39.2

1 year, 4 months

5
14
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-kselftest-mirror November 2023