The perf subsystem today unifies various tracing and monitoring
features, from both software and hardware sources. One benefit of the
perf subsystem is that events are automatically inherited by child
tasks, which enables process-wide event monitoring with low overhead.
By default, perf events are non-intrusive and do not affect the
behaviour of the tasks being monitored.
For certain use-cases, however, it makes sense to leverage the
generality of the perf events subsystem and optionally allow the tasks
being monitored to receive signals on events they are interested in.
This patch series adds the option to synchronously signal user space on
events.
To better support process-wide synchronous self-monitoring, without
events propagating to children that do not share the current process's
shared environment, two prerequisite patches are added: one to
optionally restrict inheritance to CLONE_THREAD children, and one to
optionally remove events on exec (without affecting the parent).
Examples of how to use these features can be found in the tests added at
the end of the series. In addition to the tests added, the series has
also been subjected to syzkaller fuzzing (focus on 'kernel/events/'
coverage).
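As a condensed illustration (the kselftests remain the authoritative
examples), a monitored process might install a SIGTRAP handler roughly as
below. TRAP_PERF and si_perf are the siginfo additions named in the patch
titles; since libc headers do not know about them yet, the real tests carry
their own copy of the kernel's siginfo_t, so the handler here only touches
standard fields and mentions the new ones in comments:

#include <signal.h>
#include <string.h>

static volatile sig_atomic_t nr_events;
static void *volatile last_addr;

static void sigtrap_handler(int sig, siginfo_t *info, void *ucontext)
{
        /*
         * The series sets si_code to TRAP_PERF and fills si_addr (and the
         * new si_perf field) with information about the triggering event,
         * e.g. the breakpoint address. Keep the handler minimal and
         * async-signal-safe: just record what happened.
         */
        nr_events++;
        last_addr = info->si_addr;
}

static void install_sigtrap_handler(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = sigtrap_handler;
        sa.sa_flags = SA_SIGINFO;
        sigaction(SIGTRAP, &sa, NULL);
}

A sketch of opening a matching event follows the first motivation items
below.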
Motivation and Example Uses
---------------------------
1. Our immediate motivation is low-overhead sampling-based race
detection for user space [1]. By using perf_event_open() at
process initialization, we can create hardware
breakpoint/watchpoint events that are propagated automatically
to all threads in a process. As far as we are aware, today no
existing kernel facility (such as ptrace) allows us to set up
process-wide watchpoints with minimal overheads (that are
comparable to mprotect() of whole pages).
2. Other low-overhead error detectors that rely on detecting
accesses to certain memory locations or code, process-wide and
also only in a specific set of subtasks or threads.
[1] https://llvm.org/devmtg/2020-09/slides/Morehouse-GWP-Tsan.pdf
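To make the race-detection use-case concrete, here is a hedged sketch of
opening such a process-wide watchpoint. PERF_TYPE_BREAKPOINT and the bp_*
fields are existing uapi; sigtrap, inherit_thread and remove_on_exec are the
new attribute bits named after the patch titles, and their exact uapi layout
is defined in the series:

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long long watched;       /* location the detector wants to watch */

static int open_watchpoint(void)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.type           = PERF_TYPE_BREAKPOINT;
        attr.size           = sizeof(attr);
        attr.bp_type        = HW_BREAKPOINT_W;
        attr.bp_addr        = (unsigned long)&watched;
        attr.bp_len         = HW_BREAKPOINT_LEN_8;
        attr.sample_period  = 1;
        attr.exclude_kernel = 1;
        attr.exclude_hv     = 1;
        attr.inherit        = 1;   /* propagate to children ...           */
        attr.inherit_thread = 1;   /* ... but only CLONE_THREAD children  */
        attr.remove_on_exec = 1;   /* drop the event across exec          */
        attr.sigtrap        = 1;   /* synchronously SIGTRAP the thread    */

        /* pid == 0, cpu == -1: this thread (and, via inherit, its threads). */
        return syscall(__NR_perf_event_open, &attr, 0, -1, -1,
                       PERF_FLAG_FD_CLOEXEC);
}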
Other use-case ideas we found interesting; they are listed only to
illustrate the range of potential and to further motivate the utility
(we're sure there are more):
3. Code hot patching without a full stop-the-world. Specifically, by
setting a code breakpoint at the entry to the patched routine, then
sending signals to threads and checking that they are not in the
routine, but without stopping them further. If any of the threads
enters the routine, it receives SIGTRAP and pauses.
4. Safepoints without mprotect(). Some Java implementations use
"load from a known memory location" as a safepoint. When threads
need to be stopped, the page containing the location is
mprotect()ed and threads get a signal. This could be replaced with
a watchpoint, which does not require a whole page nor DTLB
shootdowns.
5. Threads receiving signals on performance events to
throttle/unthrottle themselves.
6. Tracking data flow globally.
Changelog
---------
v3:
* Add patch "perf: Rework perf_event_exit_event()" to beginning of
series, courtesy of Peter Zijlstra.
* Rework "perf: Add support for event removal on exec" based on
the added "perf: Rework perf_event_exit_event()".
* Fix kselftests to work with more recent libc, due to the way it forces
using the kernel's own siginfo_t.
* Add basic perf-tool built-in test.
v2/RFC: https://lkml.kernel.org/r/20210310104139.679618-1-elver@google.com
* Patch "Support only inheriting events if cloned with CLONE_THREAD"
added to series.
* Patch "Add support for event removal on exec" added to series.
* Patch "Add kselftest for process-wide sigtrap handling" added to
series.
* Patch "Add kselftest for remove_on_exec" added to series.
* Implicitly restrict inheriting events if sigtrap is set but the child
was cloned with CLONE_CLEAR_SIGHAND, because it is not generally safe to
keep sending SIGTRAP if the child cleared all signal handlers.
* Various minor fixes (see details in patches).
v1/RFC: https://lkml.kernel.org/r/20210223143426.2412737-1-elver@google.com
Pre-series: The discussion at [2] led to the changes in this series. The
approach taken in "Add support for SIGTRAP on perf events" to trigger
the signal was suggested by Peter Zijlstra in [3].
[2] https://lore.kernel.org/lkml/CACT4Y+YPrXGw+AtESxAgPyZ84TYkNZdP0xpocX2jwVAbZ…
[3] https://lore.kernel.org/lkml/YBv3rAT566k+6zjg@hirez.programming.kicks-ass.n…
Marco Elver (10):
perf: Apply PERF_EVENT_IOC_MODIFY_ATTRIBUTES to children
perf: Support only inheriting events if cloned with CLONE_THREAD
perf: Add support for event removal on exec
signal: Introduce TRAP_PERF si_code and si_perf to siginfo
perf: Add support for SIGTRAP on perf events
perf: Add breakpoint information to siginfo on SIGTRAP
selftests/perf_events: Add kselftest for process-wide sigtrap handling
selftests/perf_events: Add kselftest for remove_on_exec
tools headers uapi: Sync tools/include/uapi/linux/perf_event.h
perf test: Add basic stress test for sigtrap handling
Peter Zijlstra (1):
perf: Rework perf_event_exit_event()
arch/m68k/kernel/signal.c | 3 +
arch/x86/kernel/signal_compat.c | 5 +-
fs/signalfd.c | 4 +
include/linux/compat.h | 2 +
include/linux/perf_event.h | 6 +-
include/linux/signal.h | 1 +
include/uapi/asm-generic/siginfo.h | 6 +-
include/uapi/linux/perf_event.h | 5 +-
include/uapi/linux/signalfd.h | 4 +-
kernel/events/core.c | 297 +++++++++++++-----
kernel/fork.c | 2 +-
kernel/signal.c | 11 +
tools/include/uapi/linux/perf_event.h | 5 +-
tools/perf/tests/Build | 1 +
tools/perf/tests/builtin-test.c | 5 +
tools/perf/tests/sigtrap.c | 148 +++++++++
tools/perf/tests/tests.h | 1 +
.../testing/selftests/perf_events/.gitignore | 3 +
tools/testing/selftests/perf_events/Makefile | 6 +
tools/testing/selftests/perf_events/config | 1 +
.../selftests/perf_events/remove_on_exec.c | 260 +++++++++++++++
tools/testing/selftests/perf_events/settings | 1 +
.../selftests/perf_events/sigtrap_threads.c | 206 ++++++++++++
23 files changed, 896 insertions(+), 87 deletions(-)
create mode 100644 tools/perf/tests/sigtrap.c
create mode 100644 tools/testing/selftests/perf_events/.gitignore
create mode 100644 tools/testing/selftests/perf_events/Makefile
create mode 100644 tools/testing/selftests/perf_events/config
create mode 100644 tools/testing/selftests/perf_events/remove_on_exec.c
create mode 100644 tools/testing/selftests/perf_events/settings
create mode 100644 tools/testing/selftests/perf_events/sigtrap_threads.c
--
2.31.0.291.g576ba9dcdaf-goog
Previously, we shared too much of the code with COPY and ZEROPAGE, so we
manipulated things in various invalid ways:
- Previously, we unconditionally called shmem_inode_acct_block. In the
continue case, we're looking up an existing page which would have been
accounted for properly when it was allocated. So doing it twice
results in double-counting, and eventually leaking.
- Previously, we made the pte writable whenever the VMA was writable.
However, for continue, consider this case:
1. A tmpfs file was created
2. The non-UFFD-registered side mmap()-s with MAP_SHARED
3. The UFFD-registered side mmap()-s with MAP_PRIVATE
In this case, even though the UFFD-registered VMA may be writable, we
still want CoW behavior. So, check for this case and don't make the
pte writable.
- The initial pgoff / max_off check isn't necessary, so we can skip past
it. The second one seems likely to be unnecessary too, but keep it
just in case. Modify both checks to use pgoff, as offset is equivalent
and not needed.
- Previously, we unconditionally called ClearPageDirty() in the error
path. In the continue case though, since this is an existing page, it
might have already been dirty before we started touching it. It's very
problematic to clear the bit incorrectly, but not a problem to leave
it - so, just omit the ClearPageDirty() entirely.
- Previously, we unconditionally removed the page from the page cache in
the error path. But in the continue case, we didn't add it - it was
already there because the page is present in some second
(non-UFFD-registered) mapping. So, removing it is invalid.
Because the error handling issues are easy to exercise in the selftest,
make a small modification there to do so.
Finally, refactor shmem_mcopy_atomic_pte a bit. By this point, we've
added a lot of "if (!is_continue)"-s everywhere. It's cleaner to just
check for that mode first thing, and then "goto" down to where the parts
we actually want are. This leaves the code in between cleaner.
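For readers less familiar with the minor-fault flow this function serves,
here is a hedged user-space sketch of the double-mapping setup described
above (error checking and userfaultfd feature negotiation are largely
elided; UFFD_FEATURE_MINOR_SHMEM follows the naming of the shmem
minor-fault series, and the selftest below remains the real reference):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        int memfd = memfd_create("area", 0);

        ftruncate(memfd, page);

        /* Mapping 1: not UFFD-registered, shared, and already populated,
         * so the page sits in the shmem page cache. */
        char *shared = mmap(NULL, page, PROT_READ | PROT_WRITE,
                            MAP_SHARED, memfd, 0);
        shared[0] = 1;

        /* Mapping 2: UFFD-registered in MINOR mode. MAP_PRIVATE here is
         * exactly the CoW case discussed above: the installed PTE must
         * not be made writable even though the VMA is. */
        char *priv = mmap(NULL, page, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE, memfd, 0);

        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
        struct uffdio_api api = {
                .api = UFFD_API,
                .features = UFFD_FEATURE_MINOR_SHMEM,
        };
        ioctl(uffd, UFFDIO_API, &api);

        struct uffdio_register reg = {
                .range = { .start = (unsigned long)priv, .len = page },
                .mode  = UFFDIO_REGISTER_MODE_MINOR,
        };
        ioctl(uffd, UFFDIO_REGISTER, &reg);

        /* On a minor fault, nothing is copied; the kernel is only asked
         * to install PTEs for the page already in the page cache -- the
         * path handled by shmem_mcopy_atomic_pte() with is_continue. */
        struct uffdio_continue cont = {
                .range = { .start = (unsigned long)priv, .len = page },
        };
        ioctl(uffd, UFFDIO_CONTINUE, &cont);
        return 0;
}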
Changes since v2:
- Drop the ClearPageDirty() entirely, instead of trying to remember the
old value.
- Modify both pgoff / max_off checks to use pgoff. It's equivalent to
offset, but offset wasn't initialized until the first check (which
we're skipping).
- Keep the second pgoff / max_off check in the continue case.
Changes since v1:
- Refactor to skip ahead with goto, instead of adding several more
"if (!is_continue)".
- Fix unconditional ClearPageDirty().
- Don't pte_mkwrite() when is_continue && !VM_SHARED.
Fixes: 00da60b9d0a0 ("userfaultfd: support minor fault handling for shmem")
Signed-off-by: Axel Rasmussen <axelrasmussen(a)google.com>
---
mm/shmem.c | 60 +++++++++++++-----------
tools/testing/selftests/vm/userfaultfd.c | 12 +++++
2 files changed, 44 insertions(+), 28 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index d2e0e81b7d2e..fbcce850a16e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2377,18 +2377,22 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct page *page;
pte_t _dst_pte, *dst_pte;
int ret;
- pgoff_t offset, max_off;
-
- ret = -ENOMEM;
- if (!shmem_inode_acct_block(inode, 1))
- goto out;
+ pgoff_t max_off;
+ int writable;
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, pgoff);
if (!page)
- goto out_unacct_blocks;
- } else if (!*pagep) {
+ goto out;
+ goto install_ptes;
+ }
+
+ ret = -ENOMEM;
+ if (!shmem_inode_acct_block(inode, 1))
+ goto out;
+
+ if (!*pagep) {
page = shmem_alloc_page(gfp, info, pgoff);
if (!page)
goto out_unacct_blocks;
@@ -2415,30 +2419,29 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
*pagep = NULL;
}
- if (!is_continue) {
- VM_BUG_ON(PageSwapBacked(page));
- VM_BUG_ON(PageLocked(page));
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- __SetPageUptodate(page);
- }
+ VM_BUG_ON(PageSwapBacked(page));
+ VM_BUG_ON(PageLocked(page));
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ __SetPageUptodate(page);
ret = -EFAULT;
- offset = linear_page_index(dst_vma, dst_addr);
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
+ if (unlikely(pgoff >= max_off))
goto out_release;
- /* If page wasn't already in the page cache, add it. */
- if (!is_continue) {
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK, dst_mm);
- if (ret)
- goto out_release;
- }
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
+ gfp & GFP_RECLAIM_MASK, dst_mm);
+ if (ret)
+ goto out_release;
+install_ptes:
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
+ /* For CONTINUE on a non-shared VMA, don't pte_mkwrite for CoW. */
+ writable = is_continue && !(dst_vma->vm_flags & VM_SHARED)
+ ? 0
+ : dst_vma->vm_flags & VM_WRITE;
+ if (writable)
_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
else {
/*
@@ -2455,7 +2458,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
ret = -EFAULT;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
+ if (unlikely(pgoff >= max_off))
goto out_release_unlock;
ret = -EEXIST;
@@ -2485,13 +2488,14 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
return ret;
out_release_unlock:
pte_unmap_unlock(dst_pte, ptl);
- ClearPageDirty(page);
- delete_from_page_cache(page);
+ if (!is_continue)
+ delete_from_page_cache(page);
out_release:
unlock_page(page);
put_page(page);
out_unacct_blocks:
- shmem_inode_unacct_blocks(inode, 1);
+ if (!is_continue)
+ shmem_inode_unacct_blocks(inode, 1);
goto out;
}
#endif /* CONFIG_USERFAULTFD */
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index f6c86b036d0f..d8541a59dae5 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -485,6 +485,7 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
static void continue_range(int ufd, __u64 start, __u64 len)
{
struct uffdio_continue req;
+ int ret;
req.range.start = start;
req.range.len = len;
@@ -493,6 +494,17 @@ static void continue_range(int ufd, __u64 start, __u64 len)
if (ioctl(ufd, UFFDIO_CONTINUE, &req))
err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
(uint64_t)start);
+
+ /*
+ * Error handling within the kernel for continue is subtly different
+ * from copy or zeropage, so it may be a source of bugs. Trigger an
+ * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
+ */
+ req.mapped = 0;
+ ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
+ if (ret >= 0 || req.mapped != -EEXIST)
+ err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
+ ret, req.mapped);
}
static void *locking_thread(void *arg)
--
2.31.0.291.g576ba9dcdaf-goog
From: Ira Weiny <ira.weiny(a)intel.com>
Introduce a new page protection mechanism for supervisor pages, Protection Key
Supervisor (PKS).
Generally PKS enables protections on 'domains' of supervisor pages to limit
supervisor mode access to pages beyond the normal paging protections. PKS
works in a similar fashion to user space pkeys, PKU. As with PKU, supervisor
pkeys are checked in addition to normal paging protections, and accesses or
writes can be disabled via an MSR update, without TLB flushes, when
permissions change.
Also like PKU, a page mapping is assigned to a domain by setting pkey bits in
the page table entry for that mapping.
Access is controlled through a PKRS register which is updated via WRMSR/RDMSR.
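As an illustration of what "updated via WRMSR" means in practice, a sketch
of the value computation follows; the layout mirrors PKRU with two bits per
key, the helper name is illustrative rather than the series' API, and
MSR_IA32_PKRS is the MSR definition the series adds to msr-index.h:

/* Two bits per pkey, in the same layout as PKRU. */
#define PKR_AD_BIT        0x1   /* Access Disable */
#define PKR_WD_BIT        0x2   /* Write Disable  */
#define PKR_BITS_PER_PKEY 2

static inline u32 pkrs_set_key(u32 pkrs, int pkey, u32 flags)
{
        u32 shift = pkey * PKR_BITS_PER_PKEY;

        pkrs &= ~(0x3u << shift);               /* clear the old AD/WD pair */
        return pkrs | ((flags & 0x3u) << shift);
}

/*
 * The new value takes effect with a plain WRMSR to the PKRS MSR; no page
 * table change or TLB flush is involved, e.g. (rdpkrs() illustrative):
 *
 *     wrmsrl(MSR_IA32_PKRS, pkrs_set_key(rdpkrs(), pkey, PKR_AD_BIT));
 */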
XSAVE is not supported for the PKRS MSR. Therefore the implementation
saves/restores the MSR across context switches and during exceptions. Nested
exceptions are supported by each exception getting a new PKS state.
For consistent behavior with current paging protections, pkey 0 is reserved and
configured to allow full access via the pkey mechanism, thus preserving the
default paging protections on mappings with the default pkey value of 0.
Other keys (1-15) are allocated by an allocator, which prepares us for key
contention from day one. Kernel users should be prepared for the allocator to
fail either because of key exhaustion or due to PKS not being supported on the
CPU instance.
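A hedged sketch of how an in-kernel user might drive this is shown below;
the function names (pks_key_alloc(), pks_mk_noaccess(), pks_mk_readwrite(),
PAGE_KERNEL_PKEY()) follow the series' API patch and documentation and may
differ in detail, and the flags argument to pks_key_alloc() is the one
mentioned in the changelog:

static int my_pkey = -1;

static int my_feature_init(void)
{
        my_pkey = pks_key_alloc("my_feature", 0);
        if (my_pkey < 0)
                return my_pkey; /* key exhaustion, or CPU lacks PKS */

        /*
         * Pages belonging to the domain are mapped with the pkey encoded
         * in their PTEs, e.g. by using a pgprot of
         * PAGE_KERNEL_PKEY(my_pkey) when the mapping is created.
         */

        /* Default state: no supervisor access at all. */
        pks_mk_noaccess(my_pkey);
        return 0;
}

static void my_feature_write(u8 *addr, u8 val)
{
        /* Open the domain only around the legitimate access. */
        pks_mk_readwrite(my_pkey);
        *addr = val;
        pks_mk_noaccess(my_pkey);
}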
The following are key attributes of PKS.
1) Fast switching of permissions
1a) Prevents access without page table manipulations
1b) No TLB flushes required
2) Works on a per thread basis
PKS is available with 4- and 5-level paging. Like the user-space pkeys (PKU),
it consumes 4 bits from the PTE to store the pkey within the entry.
All code to support PKS is configured via ARCH_ENABLE_SUPERVISOR_PKEYS, which
is designed to be turned on only when a user of PKS is configured in the
kernel. Those users must depend on ARCH_HAS_SUPERVISOR_PKEYS so that they
build properly on architectures which do not yet support PKS.
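In Kconfig terms, a hypothetical user (MY_FEATURE is a placeholder symbol)
would wire that up roughly the way the test code in the series does:

config MY_FEATURE
        bool "My PKS-protected feature"
        depends on ARCH_HAS_SUPERVISOR_PKEYS
        select ARCH_ENABLE_SUPERVISOR_PKEYS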
Originally this series was submitted as part of a large patch set which
converted the kmap call sites.[1]
Many follow-on discussions revealed a few problems, the first of which was
that some callers leak a kmap mapping across threads rather than containing
it to a critical section. Attempts were made to see if these 'global kmaps'
could be supported.[2] However, supporting global kmaps had many problems.
Work is being done in parallel on converting as many kmap calls as possible
to the new kmap_local_page().[3]
Changes from V3 [4]
Add ARCH_ENABLE_SUPERVISOR_PKEYS config which is selected by kernel
users to add the functionality to the core. However, they should only
select this if ARCH_HAS_SUPERVISOR_PKEYS is available.
Clean up test code for context switching
Adjust for extended_pt_regs
Reduce output unless --debug is specified
Address internal review comments from Dan Williams and Dave Hansen
Help with macros and assembly coding
Change names of various functions
Clean up documentation
Move all #ifdefery into header files.
Clean up cover letter.
Make extended_pt_regs handling a macro rather than coding
around every call to C
Add macros for PKS shift/mask
New patch: x86/pks: Add additional PKEY helper macros
Preserve pkrs_cache as static when PKS_TEST is not configured
Remove unnecessary pr_* prints
Clarify pks_key_alloc flags parameter
Change CONFIG_PKS_TESTING to CONFIG_PKS_TEST
Clean up test code separation from main code in fault.c
Remove module boilerplate from test code
Clean up all commit messages
Address comments from Thomas Gleixner
Provide a warning and fallback to no protection if a global
mapping is requested.
Fix context switch. Fix where pks_sched_in() is called.
Fix test to actually do a context switch
Remove unnecessary noinstr's
From Andy Lutomirski
Use extended_pt_regs idea to stash pks values on the stack
Drop patches 5/10 and 7/10
And use extended_pt_regs to print pkey info on fault
Adjust tests
Comments from Randy Dunlap:
Fix grammatical errors in doc
Clean up kernel docs
Rebase to 5.12
[1] https://lore.kernel.org/lkml/20201009195033.3208459-1-ira.weiny@intel.com/
[2] https://lore.kernel.org/lkml/87mtycqcjf.fsf@nanos.tec.linutronix.de/
[3] https://lore.kernel.org/lkml/20210128061503.1496847-1-ira.weiny@intel.com/
    https://lore.kernel.org/lkml/20210210062221.3023586-1-ira.weiny@intel.com/
    https://lore.kernel.org/lkml/20210205170030.856723-1-ira.weiny@intel.com/
    https://lore.kernel.org/lkml/20210217024826.3466046-1-ira.weiny@intel.com/
[4] https://lore.kernel.org/lkml/20201106232908.364581-1-ira.weiny@intel.com/
Fenghua Yu (1):
x86/pks: Add PKS kernel API
Ira Weiny (9):
x86/pkeys: Create pkeys_common.h
x86/fpu: Refactor arch_set_user_pkey_access() for PKS support
x86/pks: Add additional PKEY helper macros
x86/pks: Add PKS defines and Kconfig options
x86/pks: Add PKS setup code
x86/fault: Adjust WARN_ON for PKey fault
x86/pks: Preserve the PKRS MSR on context switch
x86/entry: Preserve PKRS MSR across exceptions
x86/pks: Add PKS test code
Documentation/core-api/protection-keys.rst | 111 +++-
arch/x86/Kconfig | 1 +
arch/x86/entry/calling.h | 26 +
arch/x86/entry/common.c | 58 ++
arch/x86/entry/entry_64.S | 22 +-
arch/x86/entry/entry_64_compat.S | 6 +-
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/disabled-features.h | 8 +-
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/include/asm/pgtable.h | 10 +-
arch/x86/include/asm/pgtable_types.h | 12 +
arch/x86/include/asm/pkeys.h | 4 +
arch/x86/include/asm/pkeys_common.h | 34 +
arch/x86/include/asm/pks.h | 54 ++
arch/x86/include/asm/processor-flags.h | 2 +
arch/x86/include/asm/processor.h | 43 +-
arch/x86/include/uapi/asm/processor-flags.h | 2 +
arch/x86/kernel/cpu/common.c | 2 +
arch/x86/kernel/fpu/xstate.c | 22 +-
arch/x86/kernel/head_64.S | 7 +-
arch/x86/kernel/process.c | 3 +
arch/x86/kernel/process_64.c | 2 +
arch/x86/mm/fault.c | 27 +-
arch/x86/mm/pkeys.c | 218 +++++-
include/linux/pgtable.h | 4 +
include/linux/pkeys.h | 34 +
kernel/entry/common.c | 14 +-
lib/Kconfig.debug | 11 +
lib/Makefile | 3 +
lib/pks/Makefile | 3 +
lib/pks/pks_test.c | 693 ++++++++++++++++++++
mm/Kconfig | 5 +
tools/testing/selftests/x86/Makefile | 3 +-
tools/testing/selftests/x86/test_pks.c | 150 +++++
34 files changed, 1519 insertions(+), 77 deletions(-)
create mode 100644 arch/x86/include/asm/pkeys_common.h
create mode 100644 arch/x86/include/asm/pks.h
create mode 100644 lib/pks/Makefile
create mode 100644 lib/pks/pks_test.c
create mode 100644 tools/testing/selftests/x86/test_pks.c
--
2.28.0.rc0.12.gb6a658bd00c9
If a signed number field starts with a '-' the field width must be > 1,
or unlimited, to allow at least one digit after the '-'.
This patch adds a check for this: if a signed field starts with '-' and
field_width == 1, the scan stops without converting anything further.
It is ok for a signed number field to have a field width of 1 if it
starts with a digit. In that case the single digit can be converted.
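For illustration, the effect on a caller of the kernel's sscanf() (the
return value is the number of successful conversions):

int v, n;

n = sscanf("-7", "%1d", &v);  /* width 1: only '-' fits, no digit can follow,
                               * so the scan stops here and n == 0 */
n = sscanf("-7", "%2d", &v);  /* width 2: sign plus one digit; n == 1, v == -7 */
n = sscanf("7",  "%1d", &v);  /* width 1 starting with a digit is fine;
                               * n == 1, v == 7 */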
Signed-off-by: Richard Fitzgerald <rf(a)opensource.cirrus.com>
Reviewed-by: Petr Mladek <pmladek(a)suse.com>
Acked-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
---
lib/vsprintf.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 41ddc353ebb8..f78651e9b030 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -3466,8 +3466,12 @@ int vsscanf(const char *buf, const char *fmt, va_list args)
str = skip_spaces(str);
digit = *str;
- if (is_sign && digit == '-')
+ if (is_sign && digit == '-') {
+ if (field_width == 1)
+ break;
+
digit = *(str + 1);
+ }
if (!digit
|| (base == 16 && !isxdigit(digit))
--
2.20.1
Previously, we shared too much of the code with COPY and ZEROPAGE, so we
manipulated things in various invalid ways:
- Previously, we unconditionally called shmem_inode_acct_block. In the
continue case, we're looking up an existing page which would have been
accounted for properly when it was allocated. So doing it twice
results in double-counting, and eventually leaking.
- Previously, we made the pte writable whenever the VMA was writable.
However, for continue, consider this case:
1. A tmpfs file was created
2. The non-UFFD-registered side mmap()-s with MAP_SHARED
3. The UFFD-registered side mmap()-s with MAP_PRIVATE
In this case, even though the UFFD-registered VMA may be writable, we
still want CoW behavior. So, check for this case and don't make the
pte writable.
- The offset / max_off checking doesn't necessarily hurt anything, but
it's not needed in the CONTINUE case, so skip it.
- Previously, we unconditionally called ClearPageDirty() in the error
path. In the continue case though, since this is an existing page, it
might have already been dirty before we started touching it. So,
remember whether or not it was dirty before we set_page_dirty(), and
only clear the bit if it wasn't dirty before.
- Previously, we unconditionally removed the page from the page cache in
the error path. But in the continue case, we didn't add it - it was
already there because the page is present in some second
(non-UFFD-registered) mapping. So, removing it is invalid.
Because the error handling issues are easy to exercise in the selftest,
make a small modification there to do so.
Finally, refactor shmem_mcopy_atomic_pte a bit. By this point, we've
added a lot of "if (!is_continue)"-s everywhere. It's cleaner to just
check for that mode first thing, and then "goto" down to where the parts
we actually want are. This leaves the code in between cleaner.
Changes since v1:
- Refactor to skip ahead with goto, instead of adding several more
"if (!is_continue)".
- Fix unconditional ClearPageDirty().
- Don't pte_mkwrite() when is_continue && !VM_SHARED.
Fixes: 00da60b9d0a0 ("userfaultfd: support minor fault handling for shmem")
Signed-off-by: Axel Rasmussen <axelrasmussen(a)google.com>
---
mm/shmem.c | 67 ++++++++++++++----------
tools/testing/selftests/vm/userfaultfd.c | 12 +++++
2 files changed, 51 insertions(+), 28 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index d2e0e81b7d2e..8ab1f1f29987 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2378,17 +2378,22 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
pte_t _dst_pte, *dst_pte;
int ret;
pgoff_t offset, max_off;
-
- ret = -ENOMEM;
- if (!shmem_inode_acct_block(inode, 1))
- goto out;
+ int writable;
+ bool was_dirty;
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, pgoff);
if (!page)
- goto out_unacct_blocks;
- } else if (!*pagep) {
+ goto out;
+ goto install_ptes;
+ }
+
+ ret = -ENOMEM;
+ if (!shmem_inode_acct_block(inode, 1))
+ goto out;
+
+ if (!*pagep) {
page = shmem_alloc_page(gfp, info, pgoff);
if (!page)
goto out_unacct_blocks;
@@ -2415,13 +2420,11 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
*pagep = NULL;
}
- if (!is_continue) {
- VM_BUG_ON(PageSwapBacked(page));
- VM_BUG_ON(PageLocked(page));
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- __SetPageUptodate(page);
- }
+ VM_BUG_ON(PageSwapBacked(page));
+ VM_BUG_ON(PageLocked(page));
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ __SetPageUptodate(page);
ret = -EFAULT;
offset = linear_page_index(dst_vma, dst_addr);
@@ -2429,16 +2432,18 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
if (unlikely(offset >= max_off))
goto out_release;
- /* If page wasn't already in the page cache, add it. */
- if (!is_continue) {
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK, dst_mm);
- if (ret)
- goto out_release;
- }
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
+ gfp & GFP_RECLAIM_MASK, dst_mm);
+ if (ret)
+ goto out_release;
+install_ptes:
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
+ /* For CONTINUE on a non-shared VMA, don't pte_mkwrite for CoW. */
+ writable = is_continue && !(dst_vma->vm_flags & VM_SHARED)
+ ? 0
+ : dst_vma->vm_flags & VM_WRITE;
+ if (writable)
_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
else {
/*
@@ -2448,15 +2453,18 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
* unconditionally before unlock_page(), but doing it
* only if VM_WRITE is not set is faster.
*/
+ was_dirty = PageDirty(page);
set_page_dirty(page);
}
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
- ret = -EFAULT;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
- goto out_release_unlock;
+ if (!is_continue) {
+ ret = -EFAULT;
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
+ goto out_release_unlock;
+ }
ret = -EEXIST;
if (!pte_none(*dst_pte))
@@ -2485,13 +2493,16 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
return ret;
out_release_unlock:
pte_unmap_unlock(dst_pte, ptl);
- ClearPageDirty(page);
- delete_from_page_cache(page);
+ if (!was_dirty)
+ ClearPageDirty(page);
+ if (!is_continue)
+ delete_from_page_cache(page);
out_release:
unlock_page(page);
put_page(page);
out_unacct_blocks:
- shmem_inode_unacct_blocks(inode, 1);
+ if (!is_continue)
+ shmem_inode_unacct_blocks(inode, 1);
goto out;
}
#endif /* CONFIG_USERFAULTFD */
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index f6c86b036d0f..d8541a59dae5 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -485,6 +485,7 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
static void continue_range(int ufd, __u64 start, __u64 len)
{
struct uffdio_continue req;
+ int ret;
req.range.start = start;
req.range.len = len;
@@ -493,6 +494,17 @@ static void continue_range(int ufd, __u64 start, __u64 len)
if (ioctl(ufd, UFFDIO_CONTINUE, &req))
err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
(uint64_t)start);
+
+ /*
+ * Error handling within the kernel for continue is subtly different
+ * from copy or zeropage, so it may be a source of bugs. Trigger an
+ * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
+ */
+ req.mapped = 0;
+ ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
+ if (ret >= 0 || req.mapped != -EEXIST)
+ err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
+ ret, req.mapped);
}
static void *locking_thread(void *arg)
--
2.31.0.291.g576ba9dcdaf-goog