Linux-kselftest-mirror April 2021

linux-kselftest-mirror@lists.linaro.org

92 participants
124 discussions

[PATCH -next] khugepaged: selftests: remove debug_cow

by Nanyong Sun

The debug_cow attribute had been removed since commit 4958e4d86ecb01 ("mm: thp: remove debug_cow switch"), so remove it in selftest code too, otherwise the khugepaged test will fail. Signed-off-by: Nanyong Sun <sunnanyong(a)huawei.com> --- tools/testing/selftests/vm/khugepaged.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 8b7582130..155120b67 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -86,7 +86,6 @@ struct settings { enum thp_enabled thp_enabled; enum thp_defrag thp_defrag; enum shmem_enabled shmem_enabled; - bool debug_cow; bool use_zero_page; struct khugepaged_settings khugepaged; }; @@ -95,7 +94,6 @@ static struct settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, .shmem_enabled = SHMEM_NEVER, - .debug_cow = 0, .use_zero_page = 0, .khugepaged = { .defrag = 1, @@ -268,7 +266,6 @@ static void write_settings(struct settings *settings) write_string("defrag", thp_defrag_strings[settings->thp_defrag]); write_string("shmem_enabled", shmem_enabled_strings[settings->shmem_enabled]); - write_num("debug_cow", settings->debug_cow); write_num("use_zero_page", settings->use_zero_page); write_num("khugepaged/defrag", khugepaged->defrag); @@ -304,7 +301,6 @@ static void save_settings(void) .thp_defrag = read_string("defrag", thp_defrag_strings), .shmem_enabled = read_string("shmem_enabled", shmem_enabled_strings), - .debug_cow = read_num("debug_cow"), .use_zero_page = read_num("use_zero_page"), }; saved_settings.khugepaged = (struct khugepaged_settings) { -- 2.25.1

4 years, 7 months

[PATCH v2 1/2] KVM: selftests: Keep track of memslots more efficiently

by Maciej S. Szmigiero

From: "Maciej S. Szmigiero" <maciej.szmigiero(a)oracle.com> The KVM selftest framework was using a simple list for keeping track of the memslots currently in use. This resulted in lookups and adding a single memslot being O(n), the later due to linear scanning of the existing memslot set to check for the presence of any conflicting entries. Before this change, benchmarking high count of memslots was more or less impossible as pretty much all the benchmark time was spent in the selftest framework code. We can simply use a rbtree for keeping track of both of gfn and hva. We don't need an interval tree for hva here as we can't have overlapping memslots because we allocate a completely new memory chunk for each new memslot. Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero(a)oracle.com> Reviewed-by: Andrew Jones <drjones(a)redhat.com> --- Changes from v1: * Add Andrew's Reviewed-by: tag tools/testing/selftests/kvm/Makefile | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 141 ++++++++++++++---- .../selftests/kvm/lib/kvm_util_internal.h | 15 +- tools/testing/selftests/kvm/lib/rbtree.c | 1 + 4 files changed, 124 insertions(+), 35 deletions(-) create mode 100644 tools/testing/selftests/kvm/lib/rbtree.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a6d61f451f88..c30a21c1d676 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -33,7 +33,7 @@ ifeq ($(ARCH),s390) UNAME_M := s390x endif -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c +LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e5fbf16f725b..69ee0a72c7d8 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -197,7 +197,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_ASSERT(vm != NULL, "Insufficient Memory"); INIT_LIST_HEAD(&vm->vcpus); - INIT_LIST_HEAD(&vm->userspace_mem_regions); + vm->regions.gpa_tree = RB_ROOT; + vm->regions.hva_tree = RB_ROOT; + hash_init(vm->regions.slot_hash); vm->mode = mode; vm->type = 0; @@ -349,13 +351,14 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, */ void kvm_vm_restart(struct kvm_vm *vmp, int perm) { + int ctr; struct userspace_mem_region *region; vm_open(vmp, perm); if (vmp->has_irqchip) vm_create_irqchip(vmp); - list_for_each_entry(region, &vmp->userspace_mem_regions, list) { + hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" @@ -418,14 +421,21 @@ uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) static struct userspace_mem_region * userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) { - struct userspace_mem_region *region; + struct rb_node *node; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + for (node = vm->regions.gpa_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, gpa_node); uint64_t existing_start = region->region.guest_phys_addr; uint64_t existing_end = region->region.guest_phys_addr + region->region.memory_size - 1; if (start <= existing_end && end >= existing_start) return region; + + if (start < existing_start) + node = node->rb_left; + else + node = node->rb_right; } return NULL; @@ -540,11 +550,16 @@ void kvm_vm_release(struct kvm_vm *vmp) } static void __vm_mem_region_delete(struct kvm_vm *vm, - struct userspace_mem_region *region) + struct userspace_mem_region *region, + bool unlink) { int ret; - list_del(&region->list); + if (unlink) { + rb_erase(&region->gpa_node, &vm->regions.gpa_tree); + rb_erase(&region->hva_node, &vm->regions.hva_tree); + hash_del(&region->slot_node); + } region->region.memory_size = 0; ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region); @@ -563,14 +578,16 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, */ void kvm_vm_free(struct kvm_vm *vmp) { - struct userspace_mem_region *region, *tmp; + int ctr; + struct hlist_node *node; + struct userspace_mem_region *region; if (vmp == NULL) return; /* Free userspace_mem_regions. */ - list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) - __vm_mem_region_delete(vmp, region); + hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) + __vm_mem_region_delete(vmp, region, false); /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); @@ -652,6 +669,57 @@ int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) return 0; } +static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), gpa_node); + parent = *cur; + if (region->region.guest_phys_addr < + cregion->region.guest_phys_addr) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->region.guest_phys_addr != + cregion->region.guest_phys_addr, + "Duplicate GPA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(&region->gpa_node, parent, cur); + rb_insert_color(&region->gpa_node, gpa_tree); +} + +static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), hva_node); + parent = *cur; + if (region->host_mem < cregion->host_mem) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->host_mem != + cregion->host_mem, + "Duplicate HVA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(&region->hva_node, parent, cur); + rb_insert_color(&region->hva_node, hva_tree); +} + /* * VM Userspace Memory Region Add * @@ -716,7 +784,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, (uint64_t) region->region.memory_size); /* Confirm no region with the requested slot already exists. */ - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + slot) { if (region->region.slot != slot) continue; @@ -796,8 +865,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, ret, errno, slot, flags, guest_paddr, (uint64_t) region->region.memory_size); - /* Add to linked-list of memory regions. */ - list_add(&region->list, &vm->userspace_mem_regions); + /* Add to quick lookup data structures */ + vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); + vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); + hash_add(vm->regions.slot_hash, &region->slot_node, slot); } /* @@ -820,10 +891,10 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + memslot) if (region->region.slot == memslot) return region; - } fprintf(stderr, "No mem region with the requested slot found,\n" " requested slot: %u\n", memslot); @@ -908,7 +979,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) */ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) { - __vm_mem_region_delete(vm, memslot2region(vm, slot)); + __vm_mem_region_delete(vm, memslot2region(vm, slot), true); } /* @@ -1180,16 +1251,14 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((gpa >= region->region.guest_phys_addr) - && (gpa <= (region->region.guest_phys_addr - + region->region.memory_size - 1))) - return (void *) ((uintptr_t) region->host_mem - + (gpa - region->region.guest_phys_addr)); + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) { + TEST_FAIL("No vm physical memory at 0x%lx", gpa); + return NULL; } - TEST_FAIL("No vm physical memory at 0x%lx", gpa); - return NULL; + return (void *)((uintptr_t)region->host_mem + + (gpa - region->region.guest_phys_addr)); } /* @@ -1211,15 +1280,22 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) */ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) { - struct userspace_mem_region *region; + struct rb_node *node; + + for (node = vm->regions.hva_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, hva_node); + + if (hva >= region->host_mem) { + if (hva <= (region->host_mem + + region->region.memory_size - 1)) + return (vm_paddr_t)((uintptr_t) + region->region.guest_phys_addr + + (hva - (uintptr_t)region->host_mem)); - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((hva >= region->host_mem) - && (hva <= (region->host_mem - + region->region.memory_size - 1))) - return (vm_paddr_t) ((uintptr_t) - region->region.guest_phys_addr - + (hva - (uintptr_t) region->host_mem)); + node = node->rb_right; + } else + node = node->rb_left; } TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); @@ -1745,6 +1821,7 @@ int _kvm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) */ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { + int ctr; struct userspace_mem_region *region; struct vcpu *vcpu; @@ -1752,7 +1829,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); fprintf(stream, "%*sMem Regions:\n", indent, ""); - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) { fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " "host_virt: %p\n", indent + 2, "", (uint64_t) region->region.guest_phys_addr, diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 34465dc562d8..af310110602b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -8,6 +8,9 @@ #ifndef SELFTEST_KVM_UTIL_INTERNAL_H #define SELFTEST_KVM_UTIL_INTERNAL_H +#include "linux/hashtable.h" +#include "linux/rbtree.h" + #include "sparsebit.h" #define KVM_DEV_PATH "/dev/kvm" @@ -20,7 +23,9 @@ struct userspace_mem_region { void *host_mem; void *mmap_start; size_t mmap_size; - struct list_head list; + struct rb_node gpa_node; + struct rb_node hva_node; + struct hlist_node slot_node; }; struct vcpu { @@ -33,6 +38,12 @@ struct vcpu { uint32_t dirty_gfns_count; }; +struct userspace_mem_regions { + struct rb_root gpa_tree; + struct rb_root hva_tree; + DECLARE_HASHTABLE(slot_hash, 9); +}; + struct kvm_vm { int mode; unsigned long type; @@ -45,7 +56,7 @@ struct kvm_vm { unsigned int va_bits; uint64_t max_gfn; struct list_head vcpus; - struct list_head userspace_mem_regions; + struct userspace_mem_regions regions; struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; bool has_irqchip; diff --git a/tools/testing/selftests/kvm/lib/rbtree.c b/tools/testing/selftests/kvm/lib/rbtree.c new file mode 100644 index 000000000000..a703f0194ea3 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/rbtree.c @@ -0,0 +1 @@ +#include "../../../../lib/rbtree.c"

4 years, 7 months

[PATCH v5 00/10] userfaultfd: add minor fault handling for shmem

by Axel Rasmussen

Base ==== This series is based on (and therefore should apply cleanly to) the tag "v5.12-rc7-mmots-2021-04-11-20-49", additionally with Peter's selftest cleanup series applied first: https://lore.kernel.org/patchwork/cover/1412450/ Changelog ========= v4->v5: - Picked up {Reviewed,Acked}-by's. - Fix cleanup in error path in shmem_mcopy_atomic_pte(). [Hugh, Peter] - Mention switching to lru_cache_add() in the commit message of 9/10. [Hugh] - Split + reorder commits, so now we 1) implement the faulting path, 2) implement the CONTINUE ioctl, and 3) advertise the feature. Squash the documentation update into step (3). [Hugh, Peter] - Reorder install_pte() cleanup to come before selftest changes. [Hugh] v3->v4: - Fix handling of the shmem private mcopy case. Previously, I had (incorrectly) assumed that !vma_is_anonymous() was equivalent to "the page will be in the page cache". But, in this case we have an optimization where we allocate a new *anonymous* page. So, use a new "bool page_in_cache" instead, which checks if page->mapping is set. Correct several places with this new check. [Hugh] - Fix calling mm_counter() before page_add_..._rmap(). [Hugh] - When modifying shmem_mcopy_atomic_pte() to use the new install_pte() helper, just use lru_cache_add_inactive_or_unevictable(), no need to branch and maybe use lru_cache_add(). [Hugh] - De-pluralize mcopy_atomic_install_pte(s). [Hugh] - Make "writable" a bool, and initialize consistently. [Hugh] v2->v3: - Picked up {Reviewed,Acked}-by's. - Reorder commits: introduce CONTINUE before MINOR registration. [Hugh, Peter] - Don't try to {unlock,put}_page an xarray value in shmem_getpage_gfp. [Hugh] - Move enum mcopy_atomic_mode forward declare out of CONFIG_HUGETLB_PAGE. [Hugh] - Keep mistakenly removed UFFD_USER_MODE_ONLY in selftest. [Peter] - Cleanup context management in self test (make clear implicit, remove unneeded return values now that we have err()). [Peter] - Correct dst_pte argument to dst_pmd in shmem_mcopy_atomic_pte macro. [Hugh] - Mention the new shmem support feature in documentation. [Hugh] v1->v2: - Pick up Reviewed-by's. - Don't swapin page when a minor fault occurs. Notice that it needs to be swapped in, and just immediately fire the minor fault. Let a future CONTINUE deal with swapping in the page. [Peter] - Clarify comment about i_size checks in mm/userfaultfd.c. [Peter] - Only forward declare once (out of #ifdef) in hugetlb.h. [Peter] Changes since [2]: - Squash the fixes ([2]) in with the original series ([1]). This makes reviewing easier, as we no longer have to sift through deltas undoing what we had done before. [Hugh, Peter] - Modify shmem_mcopy_atomic_pte() to use the new mcopy_atomic_install_ptes() helper, reducing code duplication. [Hugh] - Properly trigger handle_userfault() in the shmem_swapin_page() case. [Hugh] - Use shmem_getpage() instead of find_lock_page() to lookup the existing page in for continue. This properly deals with swapped-out pages. [Hugh] - Unconditionally pte_mkdirty() for anon memory (as before). [Peter] - Don't include userfaultfd_k.h in either hugetlb.h or shmem_fs.h. [Hugh] - Add comment for UFFD_FEATURE_MINOR_SHMEM (to match _HUGETLBFS). [Hugh] - Fix some small cleanup issues (parens, reworded conditionals, reduced plumbing of some parameters, simplify labels/gotos, ...). [Hugh, Peter] Overview ======== See the series which added minor faults for hugetlbfs [3] for a detailed overview of minor fault handling in general. This series adds the same support for shmem-backed areas. This series is structured as follows: - Commits 1 and 2 are cleanups. - Commits 3 and 4 implement the new feature (minor fault handling for shmem). - Commit 5 advertises that the feature is now available since at this point it's fully implemented. - Commit 6 is a final cleanup, modifying an existing code path to re-use a new helper we've introduced. - Commits 7, 8, 9, 10 update the userfaultfd selftest to exercise the feature. Use Case ======== In some cases it is useful to have VM memory backed by tmpfs instead of hugetlbfs. So, this feature will be used to support the same VM live migration use case described in my original series. Additionally, Android folks (Lokesh Gidra <lokeshgidra(a)google.com>) hope to optimize the Android Runtime garbage collector using this feature: "The plan is to use userfaultfd for concurrently compacting the heap. With this feature, the heap can be shared-mapped at another location where the GC-thread(s) could continue the compaction operation without the need to invoke userfault ioctl(UFFDIO_COPY) each time. OTOH, if and when Java threads get faults on the heap, UFFDIO_CONTINUE can be used to resume execution. Furthermore, this feature enables updating references in the 'non-moving' portion of the heap efficiently. Without this feature, uneccessary page copying (ioctl(UFFDIO_COPY)) would be required." [1] https://lore.kernel.org/patchwork/cover/1388144/ [2] https://lore.kernel.org/patchwork/patch/1408161/ [3] https://lore.kernel.org/linux-fsdevel/20210301222728.176417-1-axelrasmussen… Axel Rasmussen (10): userfaultfd/hugetlbfs: avoid including userfaultfd_k.h in hugetlb.h userfaultfd/shmem: combine shmem_{mcopy_atomic,mfill_zeropage}_pte userfaultfd/shmem: support minor fault registration for shmem userfaultfd/shmem: support UFFDIO_CONTINUE for shmem userfaultfd/shmem: advertise shmem minor fault support userfaultfd/shmem: modify shmem_mcopy_atomic_pte to use install_pte() userfaultfd/selftests: use memfd_create for shmem test type userfaultfd/selftests: create alias mappings in the shmem test userfaultfd/selftests: reinitialize test context in each test userfaultfd/selftests: exercise minor fault handling shmem support Documentation/admin-guide/mm/userfaultfd.rst | 3 +- fs/userfaultfd.c | 6 +- include/linux/hugetlb.h | 4 +- include/linux/shmem_fs.h | 17 +- include/linux/userfaultfd_k.h | 5 + include/uapi/linux/userfaultfd.h | 7 +- mm/hugetlb.c | 1 + mm/memory.c | 8 +- mm/shmem.c | 110 +++----- mm/userfaultfd.c | 175 ++++++++---- tools/testing/selftests/vm/userfaultfd.c | 274 ++++++++++++------- 11 files changed, 360 insertions(+), 250 deletions(-) -- 2.31.1.498.g6c1eba8ee3d-goog

4 years, 7 months

[PATCH] [v2] selftests: powerpc: Remove unneeded variables

by Wan Jiabing

Fix coccicheck warning: ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:539:5-7: Unneeded variable: "rc". Return "0" on line 562 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:567:5-7: Unneeded variable: "rc". Return "0" on line 580 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:585:5-7: Unneeded variable: "rc". Return "0" on line 594 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:600:5-7: Unneeded variable: "rc". Return "0" on line 611 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:416:5-7: Unneeded variable: "rc". Return "0" on line 470 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:475:5-7: Unneeded variable: "rc". Return "0" on line 485 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:490:5-7: Unneeded variable: "rc". Return "0" on line 506 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:511:5-7: Unneeded variable: "rc". Return "0" on line 534 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:331:5-7: Unneeded variable: "rc". Return "0" on line 344 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:349:5-7: Unneeded variable: "rc". Return "0" on line 360 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:365:5-7: Unneeded variable: "rc". Return "0" on line 392 ./tools/testing/selftests/powerpc/alignment/alignment_handler.c:397:5-7: Unneeded variable: "rc". Return "0" on line 411 Signed-off-by: Wan Jiabing <wanjiabing(a)vivo.com> --- Changelog: v2: - Modify the subject line. --- .../powerpc/alignment/alignment_handler.c | 48 +++++-------------- 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/tools/testing/selftests/powerpc/alignment/alignment_handler.c b/tools/testing/selftests/powerpc/alignment/alignment_handler.c index c25cf7cd45e9..48bfb7b36d84 100644 --- a/tools/testing/selftests/powerpc/alignment/alignment_handler.c +++ b/tools/testing/selftests/powerpc/alignment/alignment_handler.c @@ -328,8 +328,6 @@ static bool can_open_cifile(void) int test_alignment_handler_vsx_206(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06)); @@ -341,13 +339,11 @@ int test_alignment_handler_vsx_206(void) STORE_VSX_XFORM_TEST(stxvd2x); STORE_VSX_XFORM_TEST(stxvw4x); STORE_VSX_XFORM_TEST(stxsdx); - return rc; + return 0; } int test_alignment_handler_vsx_207(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07)); @@ -357,13 +353,11 @@ int test_alignment_handler_vsx_207(void) LOAD_VSX_XFORM_TEST(lxsiwzx); STORE_VSX_XFORM_TEST(stxsspx); STORE_VSX_XFORM_TEST(stxsiwx); - return rc; + return 0; } int test_alignment_handler_vsx_300(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00)); @@ -389,13 +383,11 @@ int test_alignment_handler_vsx_300(void) STORE_VSX_XFORM_TEST(stxvx); STORE_VSX_XFORM_TEST(stxvl); STORE_VSX_XFORM_TEST(stxvll); - return rc; + return 0; } int test_alignment_handler_vsx_prefix(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1)); @@ -408,13 +400,11 @@ int test_alignment_handler_vsx_prefix(void) STORE_VSX_8LS_PREFIX_TEST(PSTXSSP, 0); STORE_VSX_8LS_PREFIX_TEST(PSTXV0, 0); STORE_VSX_8LS_PREFIX_TEST(PSTXV1, 1); - return rc; + return 0; } int test_alignment_handler_integer(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); printf("Integer\n"); @@ -467,13 +457,11 @@ int test_alignment_handler_integer(void) STORE_DFORM_TEST(stmw); #endif - return rc; + return 0; } int test_alignment_handler_integer_206(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06)); @@ -482,13 +470,11 @@ int test_alignment_handler_integer_206(void) LOAD_XFORM_TEST(ldbrx); STORE_XFORM_TEST(stdbrx); - return rc; + return 0; } int test_alignment_handler_integer_prefix(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1)); @@ -503,13 +489,11 @@ int test_alignment_handler_integer_prefix(void) STORE_MLS_PREFIX_TEST(PSTH); STORE_MLS_PREFIX_TEST(PSTW); STORE_8LS_PREFIX_TEST(PSTD); - return rc; + return 0; } int test_alignment_handler_vmx(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap(PPC_FEATURE_HAS_ALTIVEC)); @@ -531,13 +515,11 @@ int test_alignment_handler_vmx(void) STORE_VMX_XFORM_TEST(stvehx); STORE_VMX_XFORM_TEST(stvewx); STORE_VMX_XFORM_TEST(stvxl); - return rc; + return 0; } int test_alignment_handler_fp(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); printf("Floating point\n"); @@ -559,13 +541,11 @@ int test_alignment_handler_fp(void) STORE_FLOAT_XFORM_TEST(stfsux); STORE_FLOAT_XFORM_TEST(stfiwx); - return rc; + return 0; } int test_alignment_handler_fp_205(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_05)); @@ -577,13 +557,11 @@ int test_alignment_handler_fp_205(void) STORE_FLOAT_DFORM_TEST(stfdp); STORE_FLOAT_XFORM_TEST(stfdpx); - return rc; + return 0; } int test_alignment_handler_fp_206(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06)); @@ -591,14 +569,12 @@ int test_alignment_handler_fp_206(void) LOAD_FLOAT_XFORM_TEST(lfiwzx); - return rc; + return 0; } int test_alignment_handler_fp_prefix(void) { - int rc = 0; - SKIP_IF(!can_open_cifile()); SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1)); @@ -608,7 +584,7 @@ int test_alignment_handler_fp_prefix(void) LOAD_FLOAT_MLS_PREFIX_TEST(PLFD); STORE_FLOAT_MLS_PREFIX_TEST(PSTFS); STORE_FLOAT_MLS_PREFIX_TEST(PSTFD); - return rc; + return 0; } void usage(char *prog) -- 2.30.2

4 years, 8 months

[PATCH v3 0/4] KVM statistics data fd-based binary interface

by Jing Zhang

This patchset provides a file descriptor for every VM and VCPU to read KVM statistics data in binary format. It is meant to provide a lightweight, flexible, scalable and efficient lock-free solution for user space telemetry applications to pull the statistics data periodically for large scale systems. The pulling frequency could be as high as a few times per second. In this patchset, every statistics data are treated to have some attributes as below: * architecture dependent or common * VM statistics data or VCPU statistics data * type: cumulative, instantaneous, * unit: none for simple counter, nanosecond, microsecond, millisecond, second, Byte, KiByte, MiByte, GiByte. Clock Cycles Since no lock/synchronization is used, the consistency between all the statistics data is not guaranteed. That means not all statistics data are read out at the exact same time, since the statistics date are still being updated by KVM subsystems while they are read out. --- * v2 -> v3 - Rebase to kvm/queue, commit edf408f5257b ("KVM: avoid "deadlock" between install_new_memslots and MMU notifier") - Resolve some nitpicks about format * v1 -> v2 - Use ARRAY_SIZE to count the number of stats descriptors - Fix missing `size` field initialization in macro STATS_DESC [1] https://lore.kernel.org/kvm/20210402224359.2297157-1-jingzhangos@google.com [2] https://lore.kernel.org/kvm/20210415151741.1607806-1-jingzhangos@google.com --- Jing Zhang (4): KVM: stats: Separate common stats from architecture specific ones KVM: stats: Add fd-based API to read binary stats data KVM: stats: Add documentation for statistics data binary interface KVM: selftests: Add selftest for KVM statistics data binary interface Documentation/virt/kvm/api.rst | 171 ++++++++ arch/arm64/include/asm/kvm_host.h | 9 +- arch/arm64/kvm/guest.c | 42 +- arch/mips/include/asm/kvm_host.h | 9 +- arch/mips/kvm/mips.c | 67 +++- arch/powerpc/include/asm/kvm_host.h | 9 +- arch/powerpc/kvm/book3s.c | 68 +++- arch/powerpc/kvm/book3s_hv.c | 12 +- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/book3s_pr_papr.c | 2 +- arch/powerpc/kvm/booke.c | 63 ++- arch/s390/include/asm/kvm_host.h | 9 +- arch/s390/kvm/kvm-s390.c | 133 ++++++- arch/x86/include/asm/kvm_host.h | 9 +- arch/x86/kvm/x86.c | 71 +++- include/linux/kvm_host.h | 132 ++++++- include/linux/kvm_types.h | 12 + include/uapi/linux/kvm.h | 50 +++ tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 3 + .../testing/selftests/kvm/include/kvm_util.h | 3 + .../selftests/kvm/kvm_bin_form_stats.c | 370 ++++++++++++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 11 + virt/kvm/kvm_main.c | 237 ++++++++++- 24 files changed, 1405 insertions(+), 90 deletions(-) create mode 100644 tools/testing/selftests/kvm/kvm_bin_form_stats.c base-commit: edf408f5257ba39e63781b820528e1ce1ec0f543 -- 2.31.1.498.g6c1eba8ee3d-goog

4 years, 8 months

[PATCH v5 0/2] CPU-Idle latency selftest framework

by Pratik R. Sampat

Changelog RFC v4 --> PATCH v5: 1. Added a CPU online check prior to parsing the CPU topology to avoid parsing topologies for CPUs unavailable for the latency test 2. Added comment describing the selftest in cpuidle.sh As I have made changes to cpuidle.sh's working, hence dropping "Reviewed-by" from Doug Smythies for the second patch, while retaining it for the first patch. RFC v4: https://lkml.org/lkml/2021/4/12/99 --- A kernel module + userspace driver to estimate the wakeup latency caused by going into stop states. The motivation behind this program is to find significant deviations behind advertised latency and residency values. The patchset measures latencies for two kinds of events. IPIs and Timers As this is a software-only mechanism, there will additional latencies of the kernel-firmware-hardware interactions. To account for that, the program also measures a baseline latency on a 100 percent loaded CPU and the latencies achieved must be in view relative to that. To achieve this, we introduce a kernel module and expose its control knobs through the debugfs interface that the selftests can engage with. The kernel module provides the following interfaces within /sys/kernel/debug/latency_test/ for, IPI test: ipi_cpu_dest = Destination CPU for the IPI ipi_cpu_src = Origin of the IPI ipi_latency_ns = Measured latency time in ns Timeout test: timeout_cpu_src = CPU on which the timer to be queued timeout_expected_ns = Timer duration timeout_diff_ns = Difference of actual duration vs expected timer Sample output on a POWER9 system is as follows: # --IPI Latency Test--- # Baseline Average IPI latency(ns): 3114 # Observed Average IPI latency(ns) - State0: 3265 # Observed Average IPI latency(ns) - State1: 3507 # Observed Average IPI latency(ns) - State2: 3739 # Observed Average IPI latency(ns) - State3: 3807 # Observed Average IPI latency(ns) - State4: 17070 # Observed Average IPI latency(ns) - State5: 1038174 # Observed Average IPI latency(ns) - State6: 1068784 # # --Timeout Latency Test-- # Baseline Average timeout diff(ns): 1420 # Observed Average timeout diff(ns) - State0: 1640 # Observed Average timeout diff(ns) - State1: 1764 # Observed Average timeout diff(ns) - State2: 1715 # Observed Average timeout diff(ns) - State3: 1845 # Observed Average timeout diff(ns) - State4: 16581 # Observed Average timeout diff(ns) - State5: 939977 # Observed Average timeout diff(ns) - State6: 1073024 Things to keep in mind: 1. This kernel module + bash driver does not guarantee idleness on a core when the IPI and the Timer is armed. It only invokes sleep and hopes that the core is idle once the IPI/Timer is invoked onto it. Hence this program must be run on a completely idle system for best results 2. Even on a completely idle system, there maybe book-keeping tasks or jitter tasks that can run on the core we want idle. This can create outliers in the latency measurement. Thankfully, these outliers should be large enough to easily weed them out. 3. A userspace only selftest variant was also sent out as RFC based on suggestions over the previous patchset to simply the kernel complexeity. However, a userspace only approach had more noise in the latency measurement due to userspace-kernel interactions which led to run to run variance and a lesser accurate test. Another downside of the nature of a userspace program is that it takes orders of magnitude longer to complete a full system test compared to the kernel framework. RFC patch: https://lkml.org/lkml/2020/9/2/356 4. For Intel Systems, the Timer based latencies don't exactly give out the measure of idle latencies. This is because of a hardware optimization mechanism that pre-arms a CPU when a timer is set to wakeup. That doesn't make this metric useless for Intel systems, it just means that is measuring IPI/Timer responding latency rather than idle wakeup latencies. (Source: https://lkml.org/lkml/2020/9/2/610) For solution to this problem, a hardware based latency analyzer is devised by Artem Bityutskiy from Intel. https://youtu.be/Opk92aQyvt0?t=8266 https://intel.github.io/wult/ Pratik R. Sampat (2): cpuidle: Extract IPI based and timer based wakeup latency from idle states selftest/cpuidle: Add support for cpuidle latency measurement drivers/cpuidle/Makefile | 1 + drivers/cpuidle/test-cpuidle_latency.c | 157 ++++++++ lib/Kconfig.debug | 10 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/cpuidle/Makefile | 6 + tools/testing/selftests/cpuidle/cpuidle.sh | 414 +++++++++++++++++++++ tools/testing/selftests/cpuidle/settings | 2 + 7 files changed, 591 insertions(+) create mode 100644 drivers/cpuidle/test-cpuidle_latency.c create mode 100644 tools/testing/selftests/cpuidle/Makefile create mode 100755 tools/testing/selftests/cpuidle/cpuidle.sh create mode 100644 tools/testing/selftests/cpuidle/settings -- 2.17.1

4 years, 8 months

[PATCH v4 0/4] KVM statistics data fd-based binary interface

by Jing Zhang

This patchset provides a file descriptor for every VM and VCPU to read KVM statistics data in binary format. It is meant to provide a lightweight, flexible, scalable and efficient lock-free solution for user space telemetry applications to pull the statistics data periodically for large scale systems. The pulling frequency could be as high as a few times per second. In this patchset, every statistics data are treated to have some attributes as below: * architecture dependent or common * VM statistics data or VCPU statistics data * type: cumulative, instantaneous, * unit: none for simple counter, nanosecond, microsecond, millisecond, second, Byte, KiByte, MiByte, GiByte. Clock Cycles Since no lock/synchronization is used, the consistency between all the statistics data is not guaranteed. That means not all statistics data are read out at the exact same time, since the statistics date are still being updated by KVM subsystems while they are read out. --- * v3 -> v4 - Rebase to kvm/queue, commit 9f242010c3b4 ("KVM: avoid "deadlock" between install_new_memslots and MMU notifier") - Use C-stype comments in the whole patch - Fix wrong count for x86 VCPU stats descriptors - Fix KVM stats data size counting and validity check in selftest * v2 -> v3 - Rebase to kvm/queue, commit edf408f5257b ("KVM: avoid "deadlock" between install_new_memslots and MMU notifier") - Resolve some nitpicks about format * v1 -> v2 - Use ARRAY_SIZE to count the number of stats descriptors - Fix missing `size` field initialization in macro STATS_DESC [1] https://lore.kernel.org/kvm/20210402224359.2297157-1-jingzhangos@google.com [2] https://lore.kernel.org/kvm/20210415151741.1607806-1-jingzhangos@google.com [3] https://lore.kernel.org/kvm/20210423181727.596466-1-jingzhangos@google.com --- Jing Zhang (4): KVM: stats: Separate common stats from architecture specific ones KVM: stats: Add fd-based API to read binary stats data KVM: stats: Add documentation for statistics data binary interface KVM: selftests: Add selftest for KVM statistics data binary interface Documentation/virt/kvm/api.rst | 171 ++++++++ arch/arm64/include/asm/kvm_host.h | 9 +- arch/arm64/kvm/guest.c | 42 +- arch/mips/include/asm/kvm_host.h | 9 +- arch/mips/kvm/mips.c | 67 ++- arch/powerpc/include/asm/kvm_host.h | 9 +- arch/powerpc/kvm/book3s.c | 68 +++- arch/powerpc/kvm/book3s_hv.c | 12 +- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/book3s_pr_papr.c | 2 +- arch/powerpc/kvm/booke.c | 63 ++- arch/s390/include/asm/kvm_host.h | 9 +- arch/s390/kvm/kvm-s390.c | 133 +++++- arch/x86/include/asm/kvm_host.h | 9 +- arch/x86/kvm/x86.c | 71 +++- include/linux/kvm_host.h | 132 +++++- include/linux/kvm_types.h | 12 + include/uapi/linux/kvm.h | 50 +++ tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 3 + .../testing/selftests/kvm/include/kvm_util.h | 3 + .../selftests/kvm/kvm_bin_form_stats.c | 380 ++++++++++++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 11 + virt/kvm/kvm_main.c | 237 ++++++++++- 24 files changed, 1415 insertions(+), 90 deletions(-) create mode 100644 tools/testing/selftests/kvm/kvm_bin_form_stats.c base-commit: 9f242010c3b46e63bc62f08fff42cef992d3801b -- 2.31.1.527.g47e6f16901-goog

4 years, 8 months

[PATCH v18 0/9] mm: introduce memfd_secret system call to create "secret" memory areas

by Mike Rapoport

From: Mike Rapoport <rppt(a)linux.ibm.com> Hi, @Andrew, this is based on v5.12-rc1, I can rebase whatever way you prefer. This is an implementation of "secret" mappings backed by a file descriptor. The file descriptor backing secret memory mappings is created using a dedicated memfd_secret system call The desired protection mode for the memory is configured using flags parameter of the system call. The mmap() of the file descriptor created with memfd_secret() will create a "secret" memory mapping. The pages in that mapping will be marked as not present in the direct map and will be present only in the page table of the owning mm. Although normally Linux userspace mappings are protected from other users, such secret mappings are useful for environments where a hostile tenant is trying to trick the kernel into giving them access to other tenants mappings. Additionally, in the future the secret mappings may be used as a mean to protect guest memory in a virtual machine host. For demonstration of secret memory usage we've created a userspace library https://git.kernel.org/pub/scm/linux/kernel/git/jejb/secret-memory-preloade… that does two things: the first is act as a preloader for openssl to redirect all the OPENSSL_malloc calls to secret memory meaning any secret keys get automatically protected this way and the other thing it does is expose the API to the user who needs it. We anticipate that a lot of the use cases would be like the openssl one: many toolkits that deal with secret keys already have special handling for the memory to try to give them greater protection, so this would simply be pluggable into the toolkits without any need for user application modification. Hiding secret memory mappings behind an anonymous file allows usage of the page cache for tracking pages allocated for the "secret" mappings as well as using address_space_operations for e.g. page migration callbacks. The anonymous file may be also used implicitly, like hugetlb files, to implement mmap(MAP_SECRET) and use the secret memory areas with "native" mm ABIs in the future. Removing of the pages from the direct map may cause its fragmentation on architectures that use large pages to map the physical memory which affects the system performance. However, the original Kconfig text for CONFIG_DIRECT_GBPAGES said that gigabyte pages in the direct map "... can improve the kernel's performance a tiny bit ..." (commit 00d1c5e05736 ("x86: add gbpages switches")) and the recent report [1] showed that "... although 1G mappings are a good default choice, there is no compelling evidence that it must be the only choice". Hence, it is sufficient to have secretmem disabled by default with the ability of a system administrator to enable it at boot time. In addition, there is also a long term goal to improve management of the direct map. [1] https://lore.kernel.org/linux-mm/213b4567-46ce-f116-9cdf-bbd0c884eb3c@linux… v18: * rebase on v5.12-rc1 * merge kfence fix into the original patch * massage commit message of the patch introducing the memfd_secret syscall v17: https://lore.kernel.org/lkml/20210208084920.2884-1-rppt@kernel.org * Remove pool of large pages backing secretmem allocations, per Michal Hocko * Add secretmem pages to unevictable LRU, per Michal Hocko * Use GFP_HIGHUSER as secretmem mapping mask, per Michal Hocko * Make secretmem an opt-in feature that is disabled by default v16: https://lore.kernel.org/lkml/20210121122723.3446-1-rppt@kernel.org * Fix memory leak intorduced in v15 * Clean the data left from previous page user before handing the page to the userspace v15: https://lore.kernel.org/lkml/20210120180612.1058-1-rppt@kernel.org * Add riscv/Kconfig update to disable set_memory operations for nommu builds (patch 3) * Update the code around add_to_page_cache() per Matthew's comments (patches 6,7) * Add fixups for build/checkpatch errors discovered by CI systems v14: https://lore.kernel.org/lkml/20201203062949.5484-1-rppt@kernel.org * Finally s/mod_node_page_state/mod_lruvec_page_state/ v13: https://lore.kernel.org/lkml/20201201074559.27742-1-rppt@kernel.org * Added Reviewed-by, thanks Catalin and David * s/mod_node_page_state/mod_lruvec_page_state/ as Shakeel suggested Older history: v12: https://lore.kernel.org/lkml/20201125092208.12544-1-rppt@kernel.org v11: https://lore.kernel.org/lkml/20201124092556.12009-1-rppt@kernel.org v10: https://lore.kernel.org/lkml/20201123095432.5860-1-rppt@kernel.org v9: https://lore.kernel.org/lkml/20201117162932.13649-1-rppt@kernel.org v8: https://lore.kernel.org/lkml/20201110151444.20662-1-rppt@kernel.org v7: https://lore.kernel.org/lkml/20201026083752.13267-1-rppt@kernel.org v6: https://lore.kernel.org/lkml/20200924132904.1391-1-rppt@kernel.org v5: https://lore.kernel.org/lkml/20200916073539.3552-1-rppt@kernel.org v4: https://lore.kernel.org/lkml/20200818141554.13945-1-rppt@kernel.org v3: https://lore.kernel.org/lkml/20200804095035.18778-1-rppt@kernel.org v2: https://lore.kernel.org/lkml/20200727162935.31714-1-rppt@kernel.org v1: https://lore.kernel.org/lkml/20200720092435.17469-1-rppt@kernel.org rfc-v2: https://lore.kernel.org/lkml/20200706172051.19465-1-rppt@kernel.org/ rfc-v1: https://lore.kernel.org/lkml/20200130162340.GA14232@rapoport-lnx/ rfc-v0: https://lore.kernel.org/lkml/1572171452-7958-1-git-send-email-rppt@kernel.o… Mike Rapoport (9): mm: add definition of PMD_PAGE_ORDER mmap: make mlock_future_check() global riscv/Kconfig: make direct map manipulation options depend on MMU set_memory: allow set_direct_map_*_noflush() for multiple pages set_memory: allow querying whether set_direct_map_*() is actually enabled mm: introduce memfd_secret system call to create "secret" memory areas PM: hibernate: disable when there are active secretmem users arch, mm: wire up memfd_secret system call where relevant secretmem: test: add basic selftest for memfd_secret(2) arch/arm64/include/asm/Kbuild | 1 - arch/arm64/include/asm/cacheflush.h | 6 - arch/arm64/include/asm/kfence.h | 2 +- arch/arm64/include/asm/set_memory.h | 17 ++ arch/arm64/include/uapi/asm/unistd.h | 1 + arch/arm64/kernel/machine_kexec.c | 1 + arch/arm64/mm/mmu.c | 6 +- arch/arm64/mm/pageattr.c | 23 +- arch/riscv/Kconfig | 4 +- arch/riscv/include/asm/set_memory.h | 4 +- arch/riscv/include/asm/unistd.h | 1 + arch/riscv/mm/pageattr.c | 8 +- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/x86/include/asm/set_memory.h | 4 +- arch/x86/mm/pat/set_memory.c | 8 +- fs/dax.c | 11 +- include/linux/pgtable.h | 3 + include/linux/secretmem.h | 30 +++ include/linux/set_memory.h | 16 +- include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 6 +- include/uapi/linux/magic.h | 1 + kernel/power/hibernate.c | 5 +- kernel/power/snapshot.c | 4 +- kernel/sys_ni.c | 2 + mm/Kconfig | 3 + mm/Makefile | 1 + mm/gup.c | 10 + mm/internal.h | 3 + mm/mlock.c | 3 +- mm/mmap.c | 5 +- mm/secretmem.c | 261 +++++++++++++++++++ mm/vmalloc.c | 5 +- scripts/checksyscalls.sh | 4 + tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 3 +- tools/testing/selftests/vm/memfd_secret.c | 296 ++++++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 17 ++ 39 files changed, 726 insertions(+), 53 deletions(-) create mode 100644 arch/arm64/include/asm/set_memory.h create mode 100644 include/linux/secretmem.h create mode 100644 mm/secretmem.c create mode 100644 tools/testing/selftests/vm/memfd_secret.c -- 2.28.0

4 years, 8 months

[PATCH v3 0/2] secretmem: optimize page_is_secretmem()

by Mike Rapoport

From: Mike Rapoport <rppt(a)linux.ibm.com> Hi, This is an updated version of page_is_secretmem() changes. This is based on v5.12-rc7-mmots-2021-04-15-16-28. @Andrew, please let me know if you'd like me to rebase it differently or resend the entire set. v3: * add missing put_compound_head() if we are to return NULL from gup_page_range(), thanks David. * add unlikely() to test for page_is_secretmem. v2: * move the check for secretmem page in gup_pte_range after we get a reference to the page, per Matthew. Mike Rapoport (2): secretmem/gup: don't check if page is secretmem without reference secretmem: optimize page_is_secretmem() include/linux/secretmem.h | 26 +++++++++++++++++++++++++- mm/gup.c | 6 +++--- mm/secretmem.c | 12 +----------- 3 files changed, 29 insertions(+), 15 deletions(-) -- 2.28.0 Mike Rapoport (2): secretmem/gup: don't check if page is secretmem without reference secretmem: optimize page_is_secretmem() include/linux/secretmem.h | 26 +++++++++++++++++++++++++- mm/gup.c | 8 +++++--- mm/secretmem.c | 12 +----------- 3 files changed, 31 insertions(+), 15 deletions(-) -- 2.28.0

4 years, 8 months

[PATCH v3 00/13] Add futex2 syscalls

by André Almeida

Hi, This patch series introduces the futex2 syscalls. * What happened to the current futex()? For some years now, developers have been trying to add new features to futex, but maintainers have been reluctant to accept then, given the multiplexed interface full of legacy features and tricky to do big changes. Some problems that people tried to address with patchsets are: NUMA-awareness[0], smaller sized futexes[1], wait on multiple futexes[2]. NUMA, for instance, just doesn't fit the current API in a reasonable way. Considering that, it's not possible to merge new features into the current futex. ** The NUMA problem At the current implementation, all futex kernel side infrastructure is stored on a single node. Given that, all futex() calls issued by processors that aren't located on that node will have a memory access penalty when doing it. ** The 32bit sized futex problem Embedded systems or anything with memory constrains would benefit of using smaller sizes for the futex userspace integer. Also, a mutex implementation can be done using just three values, so 8 bits is enough for various scenarios. ** The wait on multiple problem The use case lies in the Wine implementation of the Windows NT interface WaitMultipleObjects. This Windows API function allows a thread to sleep waiting on the first of a set of event sources (mutexes, timers, signal, console input, etc) to signal. Considering this is a primitive synchronization operation for Windows applications, being able to quickly signal events on the producer side, and quickly go to sleep on the consumer side is essential for good performance of those running over Wine. [0] https://lore.kernel.org/lkml/20160505204230.932454245@linutronix.de/ [1] https://lore.kernel.org/lkml/20191221155659.3159-2-malteskarupke@web.de/ [2] https://lore.kernel.org/lkml/20200213214525.183689-1-andrealmeid@collabora.… * The solution As proposed by Peter Zijlstra and Florian Weimer[3], a new interface is required to solve this, which must be designed with those features in mind. futex2() is that interface. As opposed to the current multiplexed interface, the new one should have one syscall per operation. This will allow the maintainability of the API if it gets extended, and will help users with type checking of arguments. In particular, the new interface is extended to support the ability to wait on any of a list of futexes at a time, which could be seen as a vectored extension of the FUTEX_WAIT semantics. [3] https://lore.kernel.org/lkml/20200303120050.GC2596@hirez.programming.kicks-… * The interface The new interface can be seen in details in the following patches, but this is a high level summary of what the interface can do: - Supports wake/wait semantics, as in futex() - Supports requeue operations, similarly as FUTEX_CMP_REQUEUE, but with individual flags for each address - Supports waiting for a vector of futexes, using a new syscall named futex_waitv() - Supports variable sized futexes (8bits, 16bits, 32bits and 64bits) - Supports NUMA-awareness operations, where the user can specify on which memory node would like to operate * Implementation The internal implementation follows a similar design to the original futex. Given that we want to replicate the same external behavior of current futex, this should be somewhat expected. For some functions, like the init and the code to get a shared key, I literally copied code and comments from kernel/futex.c. I decided to do so instead of exposing the original function as a public function since in that way we can freely modify our implementation if required, without any impact on old futex. Also, the comments precisely describes the details and corner cases of the implementation. Each patch contains a brief description of implementation, but patch 6 "docs: locking: futex2: Add documentation" adds a more complete document about it. * The patchset This patchset can be also found at my git tree: https://gitlab.collabora.com/tonyk/linux/-/tree/futex2-dev - Patch 1: Implements wait/wake, and the basics foundations of futex2 - Patches 2-4: Implement the remaining features (shared, waitv, requeue). - Patch 5: Adds the x86_x32 ABI handling. I kept it in a separated patch since I'm not sure if x86_x32 is still a thing, or if it should return -ENOSYS. - Patch 6: Add a documentation file which details the interface and the internal implementation. - Patches 7-13: Selftests for all operations along with perf support for futex2. - Patch 14: While working on porting glibc for futex2, I found out that there's a futex_wake() call at the user thread exit path, if that thread was created with clone(..., CLONE_CHILD_SETTID, ...). In order to make pthreads work with futex2, it was required to add this patch. Note that this is more a proof-of-concept of what we will need to do in future, rather than part of the interface and shouldn't be merged as it is. * Testing: This patchset provides selftests for each operation and their flags. Along with that, the following work was done: ** Stability To stress the interface in "real world scenarios": - glibc[4]: nptl's low level locking was modified to use futex2 API (except for robust and PI things). All relevant nptl/ tests passed. - Wine[5]: Proton/Wine was modified in order to use futex2() for the emulation of Windows NT sync mechanisms based on futex, called "fsync". Triple-A games with huge CPU's loads and tons of parallel jobs worked as expected when compared with the previous FUTEX_WAIT_MULTIPLE implementation at futex(). Some games issue 42k futex2() calls per second. - Full GNU/Linux distro: I installed the modified glibc in my host machine, so all pthread's programs would use futex2(). After tweaking systemd[6] to allow futex2() calls at seccomp, everything worked as expected (web browsers do some syscall sandboxing and need some configuration as well). - perf: The perf benchmarks tests can also be used to stress the interface, and they can be found in this patchset. ** Performance - For comparing futex() and futex2() performance, I used the artificial benchmarks implemented at perf (wake, wake-parallel, hash and requeue). The setup was 200 runs for each test and using 8, 80, 800, 8000 for the number of threads, Note that for this test, I'm not using patch 14 ("kernel: Enable waitpid() for futex2") , for reasons explained at "The patchset" section. - For the first three ones, I measured an average of 4% gain in performance. This is not a big step, but it shows that the new interface is at least comparable in performance with the current one. - For requeue, I measured an average of 21% decrease in performance compared to the original futex implementation. This is expected given the new design with individual flags. The performance trade-offs are explained at patch 4 ("futex2: Implement requeue operation"). [4] https://gitlab.collabora.com/tonyk/glibc/-/tree/futex2 [5] https://gitlab.collabora.com/tonyk/wine/-/tree/proton_5.13 [6] https://gitlab.collabora.com/tonyk/systemd * FAQ ** "Where's the code for NUMA and FUTEX_8/16/64?" The current code is already complex enough to take some time for review, so I believe it's better to split that work out to a future iteration of this patchset. Besides that, this RFC is the core part of the infrastructure, and the following features will not pose big design changes to it, the work will be more about wiring up the flags and modifying some functions. ** "Where's the PI/robust stuff?" As said by Peter Zijlstra at [3], all those new features are related to the "simple" futex interface, that doesn't use PI or robust. Do we want to have this complexity at futex2() and if so, should it be part of this patchset or can it be future work? Thanks, André * Changelog Changes from v2: - API now supports 64bit futexes, in addition to 8, 16 and 32. - This API change will break the glibc[4] and Proton[5] ports for now. - Refactored futex2_wait and futex2_waitv selftests v2: https://lore.kernel.org/lkml/20210304004219.134051-1-andrealmeid@collabora.… Changes from v1: - Unified futex_set_timer_and_wait and __futex_wait code - Dropped _carefull from linked list function calls - Fixed typos on docs patch - uAPI flags are now added as features are introduced, instead of all flags in patch 1 - Removed struct futex_single_waiter in favor of an anon struct v1: https://lore.kernel.org/lkml/20210215152404.250281-1-andrealmeid@collabora.… André Almeida (13): futex2: Implement wait and wake functions futex2: Add support for shared futexes futex2: Implement vectorized wait futex2: Implement requeue operation futex2: Add compatibility entry point for x86_x32 ABI docs: locking: futex2: Add documentation selftests: futex2: Add wake/wait test selftests: futex2: Add timeout test selftests: futex2: Add wouldblock test selftests: futex2: Add waitv test selftests: futex2: Add requeue test perf bench: Add futex2 benchmark tests kernel: Enable waitpid() for futex2 Documentation/locking/futex2.rst | 198 +++ Documentation/locking/index.rst | 1 + MAINTAINERS | 2 +- arch/arm/tools/syscall.tbl | 4 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 8 + arch/x86/entry/syscalls/syscall_32.tbl | 4 + arch/x86/entry/syscalls/syscall_64.tbl | 4 + fs/inode.c | 1 + include/linux/compat.h | 26 + include/linux/fs.h | 1 + include/linux/syscalls.h | 17 + include/uapi/asm-generic/unistd.h | 14 +- include/uapi/linux/futex.h | 31 + init/Kconfig | 7 + kernel/Makefile | 1 + kernel/fork.c | 2 + kernel/futex2.c | 1252 +++++++++++++++++ kernel/sys_ni.c | 9 + tools/arch/x86/include/asm/unistd_64.h | 12 + tools/include/uapi/asm-generic/unistd.h | 11 +- .../arch/x86/entry/syscalls/syscall_64.tbl | 4 + tools/perf/bench/bench.h | 4 + tools/perf/bench/futex-hash.c | 24 +- tools/perf/bench/futex-requeue.c | 57 +- tools/perf/bench/futex-wake-parallel.c | 41 +- tools/perf/bench/futex-wake.c | 37 +- tools/perf/bench/futex.h | 47 + tools/perf/builtin-bench.c | 18 +- .../selftests/futex/functional/.gitignore | 3 + .../selftests/futex/functional/Makefile | 6 +- .../futex/functional/futex2_requeue.c | 164 +++ .../selftests/futex/functional/futex2_wait.c | 195 +++ .../selftests/futex/functional/futex2_waitv.c | 154 ++ .../futex/functional/futex_wait_timeout.c | 58 +- .../futex/functional/futex_wait_wouldblock.c | 33 +- .../testing/selftests/futex/functional/run.sh | 6 + .../selftests/futex/include/futex2test.h | 112 ++ 38 files changed, 2518 insertions(+), 52 deletions(-) create mode 100644 Documentation/locking/futex2.rst create mode 100644 kernel/futex2.c create mode 100644 tools/testing/selftests/futex/functional/futex2_requeue.c create mode 100644 tools/testing/selftests/futex/functional/futex2_wait.c create mode 100644 tools/testing/selftests/futex/functional/futex2_waitv.c create mode 100644 tools/testing/selftests/futex/include/futex2test.h -- 2.31.1

4 years, 8 months

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-kselftest-mirror April 2021