July 2024 - Linux-stable-mirror

[PATCH] mm: fix maxnode for mbind(), set_mempolicy() and migrate_pages()

by Jerome Glisse

Because of the maxnode bug there is no way to bind or migrate_pages to the last node in a multi-node NUMA system unless you lie about maxnodes when making the mbind, set_mempolicy or migrate_pages syscall. The manpages for those syscalls describe maxnodes as the number of bits in the node bitmap ("bit mask of nodes containing up to maxnode bits"). Thus if maxnode is n then we expect to have an n-bit bitmap, which means that the mask of valid bits is ((1 << n) - 1). The get_nodes() decrement leads to the mask being ((1 << (n - 1)) - 1). The three syscalls use a common helper, get_nodes(), and the first thing this helper does is decrement maxnode by 1, which leads to using n-1 bits in the provided mask of nodes (see get_bitmap(), a helper function to get_nodes()). This leads to two bugs: either the last node in the bitmap provided will not be used in any of the three syscalls, or the syscalls will error out and return EINVAL if the only bit set in the bitmap was the last bit in the mask of nodes (which is ignored because of the bug, and an empty mask of nodes is an invalid argument). I am surprised this bug was never caught ... it has been in the kernel since forever. People can use the following function to detect if the kernel has the bug: bool kernel_has_maxnodes_bug(void) { unsigned long nodemask = 1; bool has_bug; long res; res = set_mempolicy(MPOL_BIND, &nodemask, 1); has_bug = res && (errno == EINVAL); set_mempolicy(MPOL_DEFAULT, NULL, 0); return has_bug; } You can test it with any of the three programs below: gcc mbind.c -o mbind -lnuma gcc set_mempolicy.c -o set_mempolicy -lnuma gcc migrate_pages.c -o migrate_pages -lnuma The first argument is maxnode, the second argument is the bit index to set in the mask of nodes (0 sets the first bit, 1 the second bit, ...). 
./mbind 2 1 & sleep 2 && numastat -n -p `pidof mbind` && fg ./set_mempolicy 2 1 & sleep 2 && numastat -n -p `pidof set_mempolicy` && fg ./migrate_pages 2 1 & sleep 2 && numastat -n -p `pidof migrate_pages` && fg mbind.c %< ---------------------------------------------------------- void *anon_mem(size_t size) { void *ret; ret = mmap(NULL, size, PROT_READ| PROT_WRITE, MAP_PRIVATE| MAP_ANON, -1, 0); return ret == MAP_FAILED ? NULL : ret; } unsigned long mround(unsigned long v, unsigned long m) { if (m == 0) { return v; } return v + m - (v % m); } void bitmap_set(void *_bitmap, unsigned long b) { uint8_t *bitmap = _bitmap; bitmap[b >> 3] |= (1 << (b & 7)); } int main(int argc, char *argv[]) { unsigned long *nodemask, maxnode, node, i; size_t bytes; int8_t *mem; long res; if (argv[1] == NULL || argv[2] == NULL) { printf("missing argument: %s maxnodes node\n", argv[0]); return -1; } maxnode = atoi(argv[1]); node = atoi(argv[2]); bytes = mround(mround(maxnode, 8) >> 3, sizeof(unsigned long)); nodemask = calloc(bytes, 1); mem = anon_mem(NPAGES << 12); if (!mem || !nodemask) { return -1; } // Try to bind memory to node bitmap_set(nodemask, node); res = mbind(mem, NPAGES << 12, MPOL_BIND, nodemask, maxnode, 0); if (res) { printf("mbind(mem, NPAGES << 12, MPOL_BIND, " "nodemask, %d, 0) failed with %d\n", maxnode, errno); return -1; } // Write something to breakup from the zero page for (unsigned i = 0; i < NPAGES; i++) { mem[i << 12] = i + 1; } // Allow numastats to gather statistics getchar(); return 0; } set_mempolicy %< ---------------------------------------------------- void *anon_mem(size_t size) { void *ret; ret = mmap(NULL, size, PROT_READ| PROT_WRITE, MAP_PRIVATE| MAP_ANON, -1, 0); return ret == MAP_FAILED ? 
NULL : ret; } unsigned long mround(unsigned long v, unsigned long m) { if (m == 0) { return v; } return v + m - (v % m); } void bitmap_set(void *_bitmap, unsigned long b) { uint8_t *bitmap = _bitmap; bitmap[b >> 3] |= (1 << (b & 7)); } int main(int argc, char *argv[]) { unsigned long *nodemask, maxnode, node, i; size_t bytes; int8_t *mem; long res; if (argv[1] == NULL || argv[2] == NULL) { printf("missing argument: %s maxnodes node\n", argv[0]); return -1; } maxnode = atoi(argv[1]); node = atoi(argv[2]); // bind memory to node 0 ... i = 1; res = set_mempolicy(MPOL_BIND, i, 2); if (res) { printf("set_mempolicy(MPOL_BIND, []=1, %d) " "failed with %d\n", maxnode, errno); return -1; } bytes = mround(mround(maxnode, 8) >> 3, sizeof(unsigned long)); nodemask = calloc(bytes, 1); mem = anon_mem(NPAGES << 12); if (!mem || !nodemask) { return -1; } // Try to bind memory to node bitmap_set(nodemask, node); res = set_mempolicy(MPOL_BIND, nodemask, maxnode); if (res) { printf("set_mempolicy(MPOL_BIND, nodemask, %d) " "failed with %d\n", maxnode, errno); return -1; } // Write something to breakup from the zero page for (unsigned i = 0; i < NPAGES; i++) { mem[i << 12] = i + 1; } // Allow numastats to gather statistics getchar(); return 0; } migrate_pages %< ---------------------------------------------------- void *anon_mem(size_t size) { void *ret; ret = mmap(NULL, size, PROT_READ| PROT_WRITE, MAP_PRIVATE| MAP_ANON, -1, 0); return ret == MAP_FAILED ? 
NULL : ret; } unsigned long mround(unsigned long v, unsigned long m) { if (m == 0) { return v; } return v + m - (v % m); } void bitmap_set(void *_bitmap, unsigned long b) { uint8_t *bitmap = _bitmap; bitmap[b >> 3] |= (1 << (b & 7)); } int main(int argc, char *argv[]) { unsigned long *old_nodes, *new_nodes, maxnode, node, i; size_t bytes; int8_t *mem; long res; if (argv[1] == NULL || argv[2] == NULL) { printf("missing argument: %s maxnodes node\n", argv[0]); return -1; } maxnode = atoi(argv[1]); node = atoi(argv[2]); // bind memory to node 0 ... i = 1; res = set_mempolicy(MPOL_BIND, &i, 2); if (res) { printf("set_mempolicy(MPOL_BIND, []=1, %d) " "failed with %d\n", maxnode, errno); return -1; } bytes = mround(mround(maxnode, 8) >> 3, sizeof(unsigned long)); old_nodes = calloc(bytes, 1); new_nodes = calloc(bytes, 1); mem = anon_mem(NPAGES << 12); if (!mem || !new_nodes || !old_nodes) { return -1; } // Write something to breakup from the zero page for (unsigned i = 0; i < NPAGES; i++) { mem[i << 12] = i + 1; } // Try to bind memory to node bitmap_set(old_nodes, 0); bitmap_set(new_nodes, node); res = migrate_pages(getpid(), maxnode, old_nodes, new_nodes); if (res) { printf("migrate_pages(pid, %d, old_nodes, " "new_nodes) failed with %d\n", maxnode, errno); return -1; } // Allow numastats to gather statistics getchar(); return 0; } Signed-off-by: Jérôme Glisse <jglisse(a)google.com> To: Andrew Morton <akpm(a)linux-foundation.org> To: linux-mm(a)kvack.org Cc: linux-kernel(a)vger.kernel.org Cc: stable(a)vger.kernel.org --- mm/mempolicy.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aec756ae5637..658e5366d266 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1434,7 +1434,6 @@ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { - --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; -- 
2.45.2.1089.g2a221341d9-goog

7 months, 2 weeks

4
8
0 0

+ crash-fix-x86_32-crash-memory-reserve-dead-loop-bug-at-high.patch added to mm-nonmm-unstable branch

by Andrew Morton

The patch titled Subject: crash: fix x86_32 crash memory reserve dead loop bug at high has been added to the -mm mm-nonmm-unstable branch. Its filename is crash-fix-x86_32-crash-memory-reserve-dead-loop-bug-at-high.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche… This patch will later appear in the mm-nonmm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Jinjie Ruan <ruanjinjie(a)huawei.com> Subject: crash: fix x86_32 crash memory reserve dead loop bug at high Date: Thu, 18 Jul 2024 11:54:43 +0800 On x86_32 Qemu machine with 1GB memory, the cmdline "crashkernel=512M" will also cause system stall as below: ACPI: Reserving FACP table memory at [mem 0x3ffe18b8-0x3ffe192b] ACPI: Reserving DSDT table memory at [mem 0x3ffe0040-0x3ffe18b7] ACPI: Reserving FACS table memory at [mem 0x3ffe0000-0x3ffe003f] ACPI: Reserving APIC table memory at [mem 0x3ffe192c-0x3ffe19bb] ACPI: Reserving HPET table memory at [mem 0x3ffe19bc-0x3ffe19f3] ACPI: Reserving WAET table memory at [mem 0x3ffe19f4-0x3ffe1a1b] 143MB HIGHMEM available. 879MB LOWMEM available. 
mapped low ram: 0 - 36ffe000 low ram: 0 - 36ffe000 (stall here) The reason is that CRASH_ADDR_LOW_MAX is equal to CRASH_ADDR_HIGH_MAX on x86_32: the first "low" crash kernel memory reservation for 512M fails, then it goes into the "retry" loop and never comes out, as below (consider CRASH_ADDR_LOW_MAX = CRASH_ADDR_HIGH_MAX = 512M): -> reserve_crashkernel_generic() and high is false -> alloc at [0, 0x20000000] fail -> alloc at [0x20000000, 0x20000000] fail, repeatedly (because CRASH_ADDR_LOW_MAX = CRASH_ADDR_HIGH_MAX). Fix it by skipping meaningless calls of memblock_phys_alloc_range() with `start = end`. After this patch, the retry dead loop is avoided and the info below is printed: cannot allocate crashkernel (size:0x20000000) With this, applying the generic crashkernel reservation to 32bit systems will be ready. Link: https://lkml.kernel.org/r/20240718035444.2977105-3-ruanjinjie@huawei.com Fixes: 9c08a2a139fe ("x86: kdump: use generic interface to simplify crashkernel reservation code") Signed-off-by: Jinjie Ruan <ruanjinjie(a)huawei.com> Signed-off-by: Baoquan He <bhe(a)redhat.com> Tested-by: Jinjie Ruan <ruanjinjie(a)huawei.com> Cc: Albert Ou <aou(a)eecs.berkeley.edu> Cc: Andrew Davis <afd(a)ti.com> Cc: Arnd Bergmann <arnd(a)arndb.de> Cc: Borislav Petkov <bp(a)alien8.de> Cc: Catalin Marinas <catalin.marinas(a)arm.com> Cc: Chen Jiahao <chenjiahao16(a)huawei.com> Cc: Dave Hansen <dave.hansen(a)linux.intel.com> Cc: Dave Young <dyoung(a)redhat.com> Cc: Eric DeVolder <eric.devolder(a)oracle.com> Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> Cc: Hari Bathini <hbathini(a)linux.ibm.com> Cc: Helge Deller <deller(a)gmx.de> Cc: "H. 
Peter Anvin" <hpa(a)zytor.com> Cc: Ingo Molnar <mingo(a)redhat.com> Cc: Javier Martinez Canillas <javierm(a)redhat.com> Cc: Linus Walleij <linus.walleij(a)linaro.org> Cc: Palmer Dabbelt <palmer(a)dabbelt.com> Cc: Paul Walmsley <paul.walmsley(a)sifive.com> Cc: Rob Herring <robh(a)kernel.org> Cc: Russell King <linux(a)armlinux.org.uk> Cc: Thomas Gleixner <tglx(a)linutronix.de> Cc: Vivek Goyal <vgoyal(a)redhat.com> Cc: Will Deacon <will(a)kernel.org> Cc: Zhen Lei <thunder.leizhen(a)huawei.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- kernel/crash_reserve.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) --- a/kernel/crash_reserve.c~crash-fix-x86_32-crash-memory-reserve-dead-loop-bug-at-high +++ a/kernel/crash_reserve.c @@ -413,7 +413,8 @@ retry: search_end = CRASH_ADDR_HIGH_MAX; search_base = CRASH_ADDR_LOW_MAX; crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - goto retry; + if (search_base != search_end) + goto retry; } /* _ Patches currently in -mm which might be from ruanjinjie(a)huawei.com are crash-fix-x86_32-crash-memory-reserve-dead-loop-bug.patch crash-fix-x86_32-crash-memory-reserve-dead-loop-bug-at-high.patch arm-use-generic-interface-to-simplify-crashkernel-reservation.patch

7 months, 2 weeks

1
0
0 0

[PATCH v2 4.19 0/4] ext4: improve delalloc buffer write performance

by WangYuli

Changes since v1: Fixed some formatting errors to make the patchset less confusing. A patchset from linux-5.15 that can significantly improve ext4 fs read and write performance should be backported to 4.19. Unixbench test results for linux-4.19.318 on Phytium D2000 CPU are shown below. Test cmd: (Phytium D2000 only has 8 cores) ./Run fs -c 8 Before this patch set: File Copy 1024 bufsize 2000 maxblocks 1124181 File Copy 256 bufsize 500 maxblocks 281885 File Copy 4096 bufsize 8000 maxblocks 3383785 File Read 1024 bufsize 2000 maxblocks 8702173 File Read 256 bufsize 500 maxblocks 3869384 File Read 4096 bufsize 8000 maxblocks 13043151 File Write 1024 bufsize 2000 maxblocks 1107185 File Write 256 bufsize 500 maxblocks 270493 File Write 4096 bufsize 8000 maxblocks 4018084 After this patch set: File Copy 1024 bufsize 2000 maxblocks 2026206 File Copy 256 bufsize 500 maxblocks 829534 File Copy 4096 bufsize 8000 maxblocks 4066659 File Read 1024 bufsize 2000 maxblocks 8877219 File Read 256 bufsize 500 maxblocks 3997445 File Read 4096 bufsize 8000 maxblocks 13179885 File Write 1024 bufsize 2000 maxblocks 4256929 File Write 256 bufsize 500 maxblocks 1305320 File Write 4096 bufsize 8000 maxblocks 10721052 We can observe a quantum leap in the test results as a consequence of applying this patchset. Link: https://lore.kernel.org/all/20210716122024.1105856-1-yi.zhang@huawei.com/ Original description: This patchset aims to improve buffer write performance with delalloc. The first patch reduces unnecessary updates of i_disksize, the second and third patches refactor the inline data write procedure and also make some small fixes, and the last patch improves performance by removing all unnecessary journal handles in the delalloc write procedure. After this patch set, we get a significant performance improvement. Below is the Unixbench comparison data tested on my machine with 'Intel Xeon Gold 5120' CPU and nvme SSD backend. 
Test cmd: ./Run -c 56 -i 3 fstime fsbuffer fsdisk Before this patch set: System Benchmarks Partial Index BASELINE RESULT INDEX File Copy 1024 bufsize 2000 maxblocks 3960.0 422965.0 1068.1 File Copy 256 bufsize 500 maxblocks 1655.0 105077.0 634.9 File Copy 4096 bufsize 8000 maxblocks 5800.0 1429092.0 2464.0 ======== System Benchmarks Index Score (Partial Only) 1186.6 After this patch set: System Benchmarks Partial Index BASELINE RESULT INDEX File Copy 1024 bufsize 2000 maxblocks 3960.0 732716.0 1850.3 File Copy 256 bufsize 500 maxblocks 1655.0 184940.0 1117.5 File Copy 4096 bufsize 8000 maxblocks 5800.0 2427152.0 4184.7 ======== System Benchmarks Index Score (Partial Only) 2053.0 Zhang Yi (4): ext4: check and update i_disksize properly ext4: correct the error path of ext4_write_inline_data_end() ext4: factor out write end code of inline file ext4: drop unnecessary journal handle in delalloc write fs/ext4/ext4.h | 3 - fs/ext4/inline.c | 120 ++++++++++++++++++------------------- fs/ext4/inode.c | 150 ++++++++++++----------------------------------- 3 files changed, 99 insertions(+), 174 deletions(-) -- 2.31.1

7 months, 2 weeks

2
4
0 0

+ mm-page_alloc-fix-pcp-count-race-between-drain_pages_zone-vs-__rmqueue_pcplist.patch added to mm-hotfixes-unstable branch

by Andrew Morton

The patch titled Subject: mm/page_alloc: fix pcp->count race between drain_pages_zone() vs __rmqueue_pcplist() has been added to the -mm mm-hotfixes-unstable branch. Its filename is mm-page_alloc-fix-pcp-count-race-between-drain_pages_zone-vs-__rmqueue_pcplist.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche… This patch will later appear in the mm-hotfixes-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Li Zhijian <lizhijian(a)fujitsu.com> Subject: mm/page_alloc: fix pcp->count race between drain_pages_zone() vs __rmqueue_pcplist() Date: Tue, 23 Jul 2024 14:44:28 +0800 It's expected that no page should be left in pcp_list after calling zone_pcp_disable() in offline_pages(). Previously, it's observed that offline_pages() gets stuck [1] due to some pages remaining in pcp_list. Cause: There is a race condition between drain_pages_zone() and __rmqueue_pcplist() involving the pcp->count variable. See below scenario: CPU0 CPU1 ---------------- --------------- spin_lock(&pcp->lock); __rmqueue_pcplist() { zone_pcp_disable() { /* list is empty */ if (list_empty(list)) { /* add pages to pcp_list */ alloced = rmqueue_bulk() mutex_lock(&pcp_batch_high_lock) ... 
__drain_all_pages() { drain_pages_zone() { /* read pcp->count, it's 0 here */ count = READ_ONCE(pcp->count) /* 0 means nothing to drain */ /* update pcp->count */ pcp->count += alloced << order; ... ... spin_unlock(&pcp->lock); In this case, after calling zone_pcp_disable() though, there are still some pages in pcp_list. And these pages in pcp_list are neither movable nor isolated, offline_pages() gets stuck as a result. Solution: Expand the scope of the pcp->lock to also protect pcp->count in drain_pages_zone(), to ensure no pages are left in the pcp list after zone_pcp_disable() [1] https://lore.kernel.org/linux-mm/6a07125f-e720-404c-b2f9-e55f3f166e85@fujit… Link: https://lkml.kernel.org/r/20240723064428.1179519-1-lizhijian@fujitsu.com Fixes: 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock") Signed-off-by: Li Zhijian <lizhijian(a)fujitsu.com> Reported-by: Yao Xingtao <yaoxt.fnst(a)fujitsu.com> Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz> Cc: David Hildenbrand <david(a)redhat.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/page_alloc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) --- a/mm/page_alloc.c~mm-page_alloc-fix-pcp-count-race-between-drain_pages_zone-vs-__rmqueue_pcplist +++ a/mm/page_alloc.c @@ -2343,16 +2343,20 @@ void drain_zone_pages(struct zone *zone, static void drain_pages_zone(unsigned int cpu, struct zone *zone) { struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - int count = READ_ONCE(pcp->count); - - while (count) { - int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); - count -= to_drain; + int count; + do { spin_lock(&pcp->lock); - free_pcppages_bulk(zone, to_drain, pcp, 0); + count = pcp->count; + if (count) { + int to_drain = min(count, + pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX); + + free_pcppages_bulk(zone, to_drain, pcp, 0); + count -= to_drain; + } spin_unlock(&pcp->lock); - } + } while (count); } /* _ 
Patches currently in -mm which might be from lizhijian(a)fujitsu.com are mm-page_alloc-fix-pcp-count-race-between-drain_pages_zone-vs-__rmqueue_pcplist.patch

7 months, 2 weeks

1
0
0 0

+ scripts-gdb-fix-lx-mounts-command-error.patch added to mm-nonmm-unstable branch

by Andrew Morton

The patch titled Subject: scripts/gdb: fix lx-mounts command error has been added to the -mm mm-nonmm-unstable branch. Its filename is scripts-gdb-fix-lx-mounts-command-error.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche… This patch will later appear in the mm-nonmm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com> Subject: scripts/gdb: fix lx-mounts command error Date: Tue, 23 Jul 2024 14:48:59 +0800 (gdb) lx-mounts mount super_block devname pathname fstype options Python Exception <class 'gdb.error'>: There is no member named list. Error occurred in Python: There is no member named list. We encounter the above issue after commit 2eea9ce4310d ("mounts: keep list of mounts in an rbtree"). That commit moved the mounts from a list into an rbtree, so we now use the rbtree to iterate over all mount information instead. 
Link: https://lkml.kernel.org/r/20240723064902.124154-4-kuan-ying.lee@canonical.c… Fixes: 2eea9ce4310d ("mounts: keep list of mounts in an rbtree") Signed-off-by: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com> Cc: Jan Kiszka <jan.kiszka(a)siemens.com> Cc: Kieran Bingham <kbingham(a)kernel.org> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- scripts/gdb/linux/proc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- a/scripts/gdb/linux/proc.py~scripts-gdb-fix-lx-mounts-command-error +++ a/scripts/gdb/linux/proc.py @@ -18,6 +18,7 @@ from linux import utils from linux import tasks from linux import lists from linux import vfs +from linux import rbtree from struct import * @@ -172,8 +173,7 @@ values of that process namespace""" gdb.write("{:^18} {:^15} {:>9} {} {} options\n".format( "mount", "super_block", "devname", "pathname", "fstype")) - for mnt in lists.list_for_each_entry(namespace['list'], - mount_ptr_type, "mnt_list"): + for mnt in rbtree.rb_inorder_for_each_entry(namespace['mounts'], mount_ptr_type, "mnt_node"): devname = mnt['mnt_devname'].string() devname = devname if devname else "none" _ Patches currently in -mm which might be from kuan-ying.lee(a)canonical.com are scripts-gdb-fix-timerlist-parsing-issue.patch scripts-gdb-add-iteration-function-for-rbtree.patch scripts-gdb-fix-lx-mounts-command-error.patch scripts-gdb-add-lx-stack_depot_lookup-command.patch scripts-gdb-add-lx-kasan_mem_to_shadow-command.patch

7 months, 2 weeks

1
0
0 0

+ scripts-gdb-add-iteration-function-for-rbtree.patch added to mm-nonmm-unstable branch

by Andrew Morton

The patch titled Subject: scripts/gdb: add iteration function for rbtree has been added to the -mm mm-nonmm-unstable branch. Its filename is scripts-gdb-add-iteration-function-for-rbtree.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche… This patch will later appear in the mm-nonmm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com> Subject: scripts/gdb: add iteration function for rbtree Date: Tue, 23 Jul 2024 14:48:58 +0800 Add inorder iteration function for rbtree usage. This is a preparation patch for the next patch to fix the gdb mounts issue. 
Link: https://lkml.kernel.org/r/20240723064902.124154-3-kuan-ying.lee@canonical.c… Fixes: 2eea9ce4310d ("mounts: keep list of mounts in an rbtree") Signed-off-by: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com> Cc: Jan Kiszka <jan.kiszka(a)siemens.com> Cc: Kieran Bingham <kbingham(a)kernel.org> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- scripts/gdb/linux/rbtree.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) --- a/scripts/gdb/linux/rbtree.py~scripts-gdb-add-iteration-function-for-rbtree +++ a/scripts/gdb/linux/rbtree.py @@ -9,6 +9,18 @@ from linux import utils rb_root_type = utils.CachedType("struct rb_root") rb_node_type = utils.CachedType("struct rb_node") +def rb_inorder_for_each(root): + def inorder(node): + if node: + yield from inorder(node['rb_left']) + yield node + yield from inorder(node['rb_right']) + + yield from inorder(root['rb_node']) + +def rb_inorder_for_each_entry(root, gdbtype, member): + for node in rb_inorder_for_each(root): + yield utils.container_of(node, gdbtype, member) def rb_first(root): if root.type == rb_root_type.get_type(): _ Patches currently in -mm which might be from kuan-ying.lee(a)canonical.com are scripts-gdb-fix-timerlist-parsing-issue.patch scripts-gdb-add-iteration-function-for-rbtree.patch scripts-gdb-fix-lx-mounts-command-error.patch scripts-gdb-add-lx-stack_depot_lookup-command.patch scripts-gdb-add-lx-kasan_mem_to_shadow-command.patch

7 months, 2 weeks

1
0
0 0

+ scripts-gdb-fix-timerlist-parsing-issue.patch added to mm-nonmm-unstable branch

by Andrew Morton

The patch titled Subject: scripts/gdb: fix timerlist parsing issue has been added to the -mm mm-nonmm-unstable branch. Its filename is scripts-gdb-fix-timerlist-parsing-issue.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche… This patch will later appear in the mm-nonmm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com> Subject: scripts/gdb: fix timerlist parsing issue Date: Tue, 23 Jul 2024 14:48:57 +0800 Patch series "Fix some GDB command error and add some GDB commands", v3. Fix some GDB command errors and add some useful GDB commands. This patch (of 5): Commit a478ffb2ae23 ("tick: Move individual bit features to debuggable mask accesses") and commit 7988e5ae2be7 ("tick: Split nohz and highres features from nohz_mode") moved 'tick_stopped' and 'nohz_mode' to the flags field, which breaks the gdb lx-timerlist command: (gdb) lx-timerlist Python Exception <class 'gdb.error'>: There is no member named nohz_mode. Error occurred in Python: There is no member named nohz_mode. (gdb) lx-timerlist Python Exception <class 'gdb.error'>: There is no member named tick_stopped. Error occurred in Python: There is no member named tick_stopped. Read 'tick_stopped' and 'nohz_mode' from the flags field instead. 
Link: https://lkml.kernel.org/r/20240723064902.124154-1-kuan-ying.lee@canonical.c… Link: https://lkml.kernel.org/r/20240723064902.124154-2-kuan-ying.lee@canonical.c… Fixes: a478ffb2ae23 ("tick: Move individual bit features to debuggable mask accesses") Fixes: 7988e5ae2be7 ("tick: Split nohz and highres features from nohz_mode") Signed-off-by: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com> Cc: Jan Kiszka <jan.kiszka(a)siemens.com> Cc: Kieran Bingham <kbingham(a)kernel.org> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- scripts/gdb/linux/timerlist.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) --- a/scripts/gdb/linux/timerlist.py~scripts-gdb-fix-timerlist-parsing-issue +++ a/scripts/gdb/linux/timerlist.py @@ -87,21 +87,22 @@ def print_cpu(hrtimer_bases, cpu, max_cl text += "\n" if constants.LX_CONFIG_TICK_ONESHOT: - fmts = [(" .{} : {}", 'nohz_mode'), - (" .{} : {} nsecs", 'last_tick'), - (" .{} : {}", 'tick_stopped'), - (" .{} : {}", 'idle_jiffies'), - (" .{} : {}", 'idle_calls'), - (" .{} : {}", 'idle_sleeps'), - (" .{} : {} nsecs", 'idle_entrytime'), - (" .{} : {} nsecs", 'idle_waketime'), - (" .{} : {} nsecs", 'idle_exittime'), - (" .{} : {} nsecs", 'idle_sleeptime'), - (" .{}: {} nsecs", 'iowait_sleeptime'), - (" .{} : {}", 'last_jiffies'), - (" .{} : {}", 'next_timer'), - (" .{} : {} nsecs", 'idle_expires')] - text += "\n".join([s.format(f, ts[f]) for s, f in fmts]) + TS_FLAG_STOPPED = 1 << 1 + TS_FLAG_NOHZ = 1 << 4 + text += f" .{'nohz':15s}: {int(bool(ts['flags'] & TS_FLAG_NOHZ))}\n" + text += f" .{'last_tick':15s}: {ts['last_tick']}\n" + text += f" .{'tick_stopped':15s}: {int(bool(ts['flags'] & TS_FLAG_STOPPED))}\n" + text += f" .{'idle_jiffies':15s}: {ts['idle_jiffies']}\n" + text += f" .{'idle_calls':15s}: {ts['idle_calls']}\n" + text += f" .{'idle_sleeps':15s}: {ts['idle_sleeps']}\n" + text += f" .{'idle_entrytime':15s}: {ts['idle_entrytime']} nsecs\n" + text += f" 
.{'idle_waketime':15s}: {ts['idle_waketime']} nsecs\n" + text += f" .{'idle_exittime':15s}: {ts['idle_exittime']} nsecs\n" + text += f" .{'idle_sleeptime':15s}: {ts['idle_sleeptime']} nsecs\n" + text += f" .{'iowait_sleeptime':15s}: {ts['iowait_sleeptime']} nsecs\n" + text += f" .{'last_jiffies':15s}: {ts['last_jiffies']}\n" + text += f" .{'next_timer':15s}: {ts['next_timer']}\n" + text += f" .{'idle_expires':15s}: {ts['idle_expires']} nsecs\n" text += "\njiffies: {}\n".format(jiffies) text += "\n" _ Patches currently in -mm which might be from kuan-ying.lee(a)canonical.com are scripts-gdb-fix-timerlist-parsing-issue.patch scripts-gdb-add-iteration-function-for-rbtree.patch scripts-gdb-fix-lx-mounts-command-error.patch scripts-gdb-add-lx-stack_depot_lookup-command.patch scripts-gdb-add-lx-kasan_mem_to_shadow-command.patch

7 months, 2 weeks

1
0
0 0

[PATCH net 1/2] ice: Add a per-VF limit on number of FDIR filters

by Tony Nguyen

From: Ahmed Zaki <ahmed.zaki(a)intel.com> While the iavf driver adds a s/w limit (128) on the number of FDIR filters that the VF can request, a malicious VF driver can request more than that and exhaust the resources for other VFs. Add a similar limit in ice. CC: stable(a)vger.kernel.org Fixes: 1f7ea1cd6a37 ("ice: Enable FDIR Configure for AVF") Reviewed-by: Przemek Kitszel <przemyslaw.kitszel(a)intel.com> Suggested-by: Sridhar Samudrala <sridhar.samudrala(a)intel.com> Signed-off-by: Ahmed Zaki <ahmed.zaki(a)intel.com> Reviewed-by: Wojciech Drewek <wojciech.drewek(a)intel.com> Tested-by: Rafal Romanowski <rafal.romanowski(a)intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen(a)intel.com> --- .../net/ethernet/intel/ice/ice_ethtool_fdir.c | 2 +- drivers/net/ethernet/intel/ice/ice_fdir.h | 3 +++ .../net/ethernet/intel/ice/ice_virtchnl_fdir.c | 16 ++++++++++++++++ .../net/ethernet/intel/ice/ice_virtchnl_fdir.h | 1 + 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c index e3cab8e98f52..5412eff8ef23 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c @@ -534,7 +534,7 @@ ice_parse_rx_flow_user_data(struct ethtool_rx_flow_spec *fsp, * * Returns the number of available flow director filters to this VSI */ -static int ice_fdir_num_avail_fltr(struct ice_hw *hw, struct ice_vsi *vsi) +int ice_fdir_num_avail_fltr(struct ice_hw *hw, struct ice_vsi *vsi) { u16 vsi_num = ice_get_hw_vsi_num(hw, vsi->idx); u16 num_guar; diff --git a/drivers/net/ethernet/intel/ice/ice_fdir.h b/drivers/net/ethernet/intel/ice/ice_fdir.h index 021ecbac7848..ab5b118daa2d 100644 --- a/drivers/net/ethernet/intel/ice/ice_fdir.h +++ b/drivers/net/ethernet/intel/ice/ice_fdir.h @@ -207,6 +207,8 @@ struct ice_fdir_base_pkt { const u8 *tun_pkt; }; +struct ice_vsi; + int ice_alloc_fd_res_cntr(struct ice_hw *hw, u16 *cntr_id); int 
ice_free_fd_res_cntr(struct ice_hw *hw, u16 cntr_id); int ice_alloc_fd_guar_item(struct ice_hw *hw, u16 *cntr_id, u16 num_fltr); @@ -218,6 +220,7 @@ int ice_fdir_get_gen_prgm_pkt(struct ice_hw *hw, struct ice_fdir_fltr *input, u8 *pkt, bool frag, bool tun); int ice_get_fdir_cnt_all(struct ice_hw *hw); +int ice_fdir_num_avail_fltr(struct ice_hw *hw, struct ice_vsi *vsi); bool ice_fdir_is_dup_fltr(struct ice_hw *hw, struct ice_fdir_fltr *input); bool ice_fdir_has_frag(enum ice_fltr_ptype flow); struct ice_fdir_fltr * diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c index 8e4ff3af86c6..b4feb0927687 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c @@ -536,6 +536,8 @@ static void ice_vc_fdir_reset_cnt_all(struct ice_vf_fdir *fdir) fdir->fdir_fltr_cnt[flow][0] = 0; fdir->fdir_fltr_cnt[flow][1] = 0; } + + fdir->fdir_fltr_cnt_total = 0; } /** @@ -1560,6 +1562,7 @@ ice_vc_add_fdir_fltr_post(struct ice_vf *vf, struct ice_vf_fdir_ctx *ctx, resp->status = status; resp->flow_id = conf->flow_id; vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]++; + vf->fdir.fdir_fltr_cnt_total++; ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, (u8 *)resp, len); @@ -1624,6 +1627,7 @@ ice_vc_del_fdir_fltr_post(struct ice_vf *vf, struct ice_vf_fdir_ctx *ctx, resp->status = status; ice_vc_fdir_remove_entry(vf, conf, conf->flow_id); vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]--; + vf->fdir.fdir_fltr_cnt_total--; ret = ice_vc_send_msg_to_vf(vf, ctx->v_opcode, v_ret, (u8 *)resp, len); @@ -1790,6 +1794,7 @@ int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg) struct virtchnl_fdir_add *stat = NULL; struct virtchnl_fdir_fltr_conf *conf; enum virtchnl_status_code v_ret; + struct ice_vsi *vf_vsi; struct device *dev; struct ice_pf *pf; int is_tun = 0; @@ -1798,6 +1803,17 @@ int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg) pf = vf->pf; dev = 
ice_pf_to_dev(pf); + vf_vsi = ice_get_vf_vsi(vf); + +#define ICE_VF_MAX_FDIR_FILTERS 128 + if (!ice_fdir_num_avail_fltr(&pf->hw, vf_vsi) || + vf->fdir.fdir_fltr_cnt_total >= ICE_VF_MAX_FDIR_FILTERS) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "Max number of FDIR filters for VF %d is reached\n", + vf->vf_id); + goto err_exit; + } + ret = ice_vc_fdir_param_check(vf, fltr->vsi_id); if (ret) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h index c5bcc8d7481c..ac6dcab454b4 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h @@ -29,6 +29,7 @@ struct ice_vf_fdir_ctx { struct ice_vf_fdir { u16 fdir_fltr_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX]; int prof_entry_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX]; + u16 fdir_fltr_cnt_total; struct ice_fd_hw_prof **fdir_prof; struct idr fdir_rule_idr; -- 2.41.0

7 months, 2 weeks

1
0
0 0

Re: [PATCH 6.9 000/163] 6.9.11-rc1 review

by Ronald Warsow

Hi Greg, no regressions here on x86_64 (RKL, Intel 11th Gen. CPU). Thanks. Tested-by: Ronald Warsow <rwarsow(a)gmx.de>

7 months, 2 weeks

1
0
0 0

[PATCH] clk: samsung: fix getting Exynos4 fin_pll rate from external clocks

by Krzysztof Kozlowski

Commit 0dc83ad8bfc9 ("clk: samsung: Don't register clkdev lookup for the fixed rate clocks") claimed registering clkdev lookup is not necessary anymore, but that was not entirely true: Exynos4210/4212/4412 clock code still relied on it to get the clock rate of xxti or xusbxti external clocks. Drop that requirement by accessing already registered clk_hw when looking up the xxti/xusbxti rate. Reported-by: Artur Weber <aweber.kernel(a)gmail.com> Closes: https://lore.kernel.org/all/6227c1fb-d769-462a-b79b-abcc15d3db8e@gmail.com/ Fixes: 0dc83ad8bfc9 ("clk: samsung: Don't register clkdev lookup for the fixed rate clocks") Cc: <stable(a)vger.kernel.org> Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org> --- drivers/clk/samsung/clk-exynos4.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/clk/samsung/clk-exynos4.c b/drivers/clk/samsung/clk-exynos4.c index a026ccca7315..28945b6b0ee1 100644 --- a/drivers/clk/samsung/clk-exynos4.c +++ b/drivers/clk/samsung/clk-exynos4.c @@ -1040,19 +1040,20 @@ static unsigned long __init exynos4_get_xom(void) static void __init exynos4_clk_register_finpll(struct samsung_clk_provider *ctx) { struct samsung_fixed_rate_clock fclk; - struct clk *clk; - unsigned long finpll_f = 24000000; + unsigned long finpll_f; + unsigned int parent; char *parent_name; unsigned int xom = exynos4_get_xom(); parent_name = xom & 1 ? "xusbxti" : "xxti"; - clk = clk_get(NULL, parent_name); - if (IS_ERR(clk)) { + parent = xom & 1 ? CLK_XUSBXTI : CLK_XXTI; + + finpll_f = clk_hw_get_rate(ctx->clk_data.hws[parent]); + if (!finpll_f) { pr_err("%s: failed to lookup parent clock %s, assuming " "fin_pll clock frequency is 24MHz\n", __func__, parent_name); - } else { - finpll_f = clk_get_rate(clk); + finpll_f = 24000000; } fclk.id = CLK_FIN_PLL; -- 2.43.0

7 months, 2 weeks

3
2
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror July 2024