February 2025 - Linux-stable-mirror

[PATCH] hfs/hfsplus: fix slab-out-of-bounds in hfs_bnode_read_key

by Vasiliy Kovalev

Syzbot reported an issue in hfs subsystem: BUG: KASAN: slab-out-of-bounds in memcpy_from_page include/linux/highmem.h:423 [inline] BUG: KASAN: slab-out-of-bounds in hfs_bnode_read fs/hfs/bnode.c:35 [inline] BUG: KASAN: slab-out-of-bounds in hfs_bnode_read_key+0x314/0x450 fs/hfs/bnode.c:70 Write of size 94 at addr ffff8880123cd100 by task syz-executor237/5102 Call Trace: <TASK> __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 kasan_check_range+0x282/0x290 mm/kasan/generic.c:189 __asan_memcpy+0x40/0x70 mm/kasan/shadow.c:106 memcpy_from_page include/linux/highmem.h:423 [inline] hfs_bnode_read fs/hfs/bnode.c:35 [inline] hfs_bnode_read_key+0x314/0x450 fs/hfs/bnode.c:70 hfs_brec_insert+0x7f3/0xbd0 fs/hfs/brec.c:159 hfs_cat_create+0x41d/0xa50 fs/hfs/catalog.c:118 hfs_mkdir+0x6c/0xe0 fs/hfs/dir.c:232 vfs_mkdir+0x2f9/0x4f0 fs/namei.c:4257 do_mkdirat+0x264/0x3a0 fs/namei.c:4280 __do_sys_mkdir fs/namei.c:4300 [inline] __se_sys_mkdir fs/namei.c:4298 [inline] __x64_sys_mkdir+0x6c/0x80 fs/namei.c:4298 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fbdd6057a99 Add a check for key length in hfs_bnode_read_key to prevent out-of-bounds memory access. If the key length is invalid, the key buffer is cleared, improving stability and reliability. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+5f3a973ed3dfb85a6683(a)syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=5f3a973ed3dfb85a6683 Cc: stable(a)vger.kernel.org Signed-off-by: Vasiliy Kovalev <kovalev(a)altlinux.org> --- fs/hfs/bnode.c | 6 ++++++ fs/hfsplus/bnode.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 6add6ebfef8967..cb823a8a6ba960 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) else key_len = tree->max_key_len + 1; + if (key_len > sizeof(hfs_btree_key) || key_len < 1) { + memset(key, 0, sizeof(hfs_btree_key)); + pr_err("hfs: Invalid key length: %d\n", key_len); + return; + } + hfs_bnode_read(node, key, off, key_len); } diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 87974d5e679156..079ea80534f7de 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) else key_len = tree->max_key_len + 2; + if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) { + memset(key, 0, sizeof(hfsplus_btree_key)); + pr_err("hfsplus: Invalid key length: %d\n", key_len); + return; + } + hfs_bnode_read(node, key, off, key_len); } -- 2.33.8

2 months, 2 weeks

10
19
0 0

[PATCH 1/2] PCI/MSI: Add MSIX option to write to ENTRY_DATA before any reads

by dullfire＠yahoo.com

From: Jonathan Currier <dullfire(a)yahoo.com> Commit 7d5ec3d36123 ("PCI/MSI: Mask all unused MSI-X entries") introduces a readl() from ENTRY_VECTOR_CTRL before the writel() to ENTRY_DATA. This is correct, however some hardware, like the Sun Neptune chips, the niu module, will cause an error and/or fatal trap if any MSIX table entry is read before the corresponding ENTRY_DATA field is written to. This patch adds an optional early writel() in msix_prepare_msi_desc(). Cc: stable(a)vger.kernel.org Signed-off-by: Jonathan Currier <dullfire(a)yahoo.com> --- drivers/pci/msi/msi.c | 2 ++ include/linux/pci.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 3a45879d85db..50d87fb5e37f 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -611,6 +611,8 @@ void msix_prepare_msi_desc(struct pci_dev *dev, struct msi_desc *desc) if (desc->pci.msi_attrib.can_mask) { void __iomem *addr = pci_msix_desc_addr(desc); + if (dev->dev_flags & PCI_DEV_FLAGS_MSIX_TOUCH_ENTRY_DATA_FIRST) + writel(0, addr + PCI_MSIX_ENTRY_DATA); desc->pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); } } diff --git a/include/linux/pci.h b/include/linux/pci.h index 37d97bef060f..b8b95b58d522 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -245,6 +245,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 11), /* Device does honor MSI masking despite saying otherwise */ PCI_DEV_FLAGS_HAS_MSI_MASKING = (__force pci_dev_flags_t) (1 << 12), + /* Device requires write to PCI_MSIX_ENTRY_DATA before any MSIX reads */ + PCI_DEV_FLAGS_MSIX_TOUCH_ENTRY_DATA_FIRST = (__force pci_dev_flags_t) (1 << 13), }; enum pci_irq_reroute_variant { -- 2.45.2

2 months, 2 weeks

5
7
0 0

[PATCH 2/2] net/niu: niu requires MSIX ENTRY_DATA fields touch before entry reads

by dullfire＠yahoo.com

From: Jonathan Currier <dullfire(a)yahoo.com> Fix niu_try_msix() to not cause a fatal trap on sparc systems. Set PCI_DEV_FLAGS_MSIX_TOUCH_ENTRY_DATA_FIRST on the struct pci_dev to work around a bug in the hardware or firmware. For each vector entry in the msix table, niu chips will cause a fatal trap if any registers in that entry are read before that entry's ENTRY_DATA register is written to. Testing indicates writes to other registers are not sufficient to prevent the fatal trap, however the value does not appear to matter. This only needs to happen once after power up, so simply rebooting into a kernel lacking this fix will NOT cause the trap. NON-RESUMABLE ERROR: Reporting on cpu 64 NON-RESUMABLE ERROR: TPC [0x00000000005f6900] <msix_prepare_msi_desc+0x90/0xa0> NON-RESUMABLE ERROR: RAW [4010000000000016:00000e37f93e32ff:0000000202000080:ffffffffffffffff NON-RESUMABLE ERROR: 0000000800000000:0000000000000000:0000000000000000:0000000000000000] NON-RESUMABLE ERROR: handle [0x4010000000000016] stick [0x00000e37f93e32ff] NON-RESUMABLE ERROR: type [precise nonresumable] NON-RESUMABLE ERROR: attrs [0x02000080] < ASI sp-faulted priv > NON-RESUMABLE ERROR: raddr [0xffffffffffffffff] NON-RESUMABLE ERROR: insn effective address [0x000000c50020000c] NON-RESUMABLE ERROR: size [0x8] NON-RESUMABLE ERROR: asi [0x00] CPU: 64 UID: 0 PID: 745 Comm: kworker/64:1 Not tainted 6.11.5 #63 Workqueue: events work_for_cpu_fn TSTATE: 0000000011001602 TPC: 00000000005f6900 TNPC: 00000000005f6904 Y: 00000000 Not tainted TPC: <msix_prepare_msi_desc+0x90/0xa0> g0: 00000000000002e9 g1: 000000000000000c g2: 000000c50020000c g3: 0000000000000100 g4: ffff8000470307c0 g5: ffff800fec5be000 g6: ffff800047a08000 g7: 0000000000000000 o0: ffff800014feb000 o1: ffff800047a0b620 o2: 0000000000000011 o3: ffff800047a0b620 o4: 0000000000000080 o5: 0000000000000011 sp: ffff800047a0ad51 ret_pc: 00000000005f7128 RPC: <__pci_enable_msix_range+0x3cc/0x460> l0: 000000000000000d l1: 000000000000c01f l2: 
ffff800014feb0a8 l3: 0000000000000020 l4: 000000000000c000 l5: 0000000000000001 l6: 0000000020000000 l7: ffff800047a0b734 i0: ffff800014feb000 i1: ffff800047a0b730 i2: 0000000000000001 i3: 000000000000000d i4: 0000000000000000 i5: 0000000000000000 i6: ffff800047a0ae81 i7: 00000000101888b0 I7: <niu_try_msix.constprop.0+0xc0/0x130 [niu]> Call Trace: [<00000000101888b0>] niu_try_msix.constprop.0+0xc0/0x130 [niu] [<000000001018f840>] niu_get_invariants+0x183c/0x207c [niu] [<00000000101902fc>] niu_pci_init_one+0x27c/0x2fc [niu] [<00000000005ef3e4>] local_pci_probe+0x28/0x74 [<0000000000469240>] work_for_cpu_fn+0x8/0x1c [<000000000046b008>] process_scheduled_works+0x144/0x210 [<000000000046b518>] worker_thread+0x13c/0x1c0 [<00000000004710e0>] kthread+0xb8/0xc8 [<00000000004060c8>] ret_from_fork+0x1c/0x2c [<0000000000000000>] 0x0 Kernel panic - not syncing: Non-resumable error. Fixes: 7d5ec3d36123 ("PCI/MSI: Mask all unused MSI-X entries") Cc: stable(a)vger.kernel.org Signed-off-by: Jonathan Currier <dullfire(a)yahoo.com> --- drivers/net/ethernet/sun/niu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 41a27ae58ced..f5449b73b9a7 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -9058,6 +9058,8 @@ static void niu_try_msix(struct niu *np, u8 *ldg_num_map) msi_vec[i].entry = i; } + pdev->dev_flags |= PCI_DEV_FLAGS_MSIX_TOUCH_ENTRY_DATA_FIRST; + num_irqs = pci_enable_msix_range(pdev, msi_vec, 1, num_irqs); if (num_irqs < 0) { np->flags &= ~NIU_FLAGS_MSIX; -- 2.45.2

2 months, 2 weeks

2
1
0 0

[PATCH V4] mm/gup: Clear the LRU flag of a page before adding to LRU batch

by yangge1116＠126.com

From: yangge <yangge1116(a)126.com> If a large amount of CMA memory is configured in the system (for example, the CMA memory accounts for 50% of the system memory), starting a virtual machine with device passthrough will call pin_user_pages_remote(..., FOLL_LONGTERM, ...) to pin memory. Normally, if a page is present and in the CMA area, pin_user_pages_remote() will migrate the page from the CMA area to a non-CMA area because of the FOLL_LONGTERM flag. But the current code causes the migration to fail due to unexpected page refcounts, and eventually causes the virtual machine to fail to start. When a page is added to an LRU batch, its refcount increases by one; removing the page from the LRU batch decreases it by one. Page migration requires that the page is not referenced by anything other than its page mappings. Before migrating a page, we should try to drain the page from the LRU batch in case the page is in it. However, folio_test_lru() is not sufficient to tell whether the page is in an LRU batch or not, and if the page is in an LRU batch, the migration will fail. To solve the problem above, we modify the logic of adding to an LRU batch. Before adding a page to an LRU batch, we clear the LRU flag of the page so that we can check whether the page is in an LRU batch with folio_test_lru(page). This is quite valuable, because we likely don't want to blindly drain the LRU batch simply because there is some unexpected reference on a page, as described above. This change makes the LRU flag of a page invisible for longer, which may impact some programs. For example, as long as a page is on an LRU batch, we cannot isolate it, and we cannot check if it's an LRU page. Further, a page can now only be on exactly one LRU batch. This doesn't seem to matter much, because when a new page is allocated from buddy and added to the LRU batch, or is isolated, its LRU flag may also be invisible for a long time.
Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Cc: <stable(a)vger.kernel.org> Signed-off-by: yangge <yangge1116(a)126.com> --- mm/swap.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) V4: Adjust commit message according to David's comments V3: Add fixes tag V2: Adjust code and commit message according to David's comments diff --git a/mm/swap.c b/mm/swap.c index dc205bd..9caf6b0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -211,10 +211,6 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - /* block memcg migration while the folio moves between lru */ - if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) - continue; - folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); @@ -255,11 +251,16 @@ static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) void folio_rotate_reclaimable(struct folio *folio) { if (!folio_test_locked(folio) && !folio_test_dirty(folio) && - !folio_test_unevictable(folio) && folio_test_lru(folio)) { + !folio_test_unevictable(folio)) { struct folio_batch *fbatch; unsigned long flags; folio_get(folio); + if (!folio_test_clear_lru(folio)) { + folio_put(folio); + return; + } + local_lock_irqsave(&lru_rotate.lock, flags); fbatch = this_cpu_ptr(&lru_rotate.fbatch); folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); @@ -352,11 +353,15 @@ static void folio_activate_drain(int cpu) void folio_activate(struct folio *folio) { - if (folio_test_lru(folio) && !folio_test_active(folio) && - !folio_test_unevictable(folio)) { + if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); + if (!folio_test_clear_lru(folio)) { + folio_put(folio); + return; + } + local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.activate); 
folio_batch_add_and_move(fbatch, folio, folio_activate_fn); @@ -700,6 +705,11 @@ void deactivate_file_folio(struct folio *folio) return; folio_get(folio); + if (!folio_test_clear_lru(folio)) { + folio_put(folio); + return; + } + local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); @@ -716,11 +726,16 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_lru(folio) && !folio_test_unevictable(folio) && - (folio_test_active(folio) || lru_gen_enabled())) { + if (!folio_test_unevictable(folio) && (folio_test_active(folio) || + lru_gen_enabled())) { struct folio_batch *fbatch; folio_get(folio); + if (!folio_test_clear_lru(folio)) { + folio_put(folio); + return; + } + local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); @@ -737,12 +752,16 @@ void folio_deactivate(struct folio *folio) */ void folio_mark_lazyfree(struct folio *folio) { - if (folio_test_lru(folio) && folio_test_anon(folio) && - folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && - !folio_test_unevictable(folio)) { + if (folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); + if (!folio_test_clear_lru(folio)) { + folio_put(folio); + return; + } + local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree); folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); -- 2.7.4

2 months, 3 weeks

3
9
0 0

[PATCH 6.6 000/152] 6.6.79-rc1 review

by Greg Kroah-Hartman

This is the start of the stable review cycle for the 6.6.79 release. There are 152 patches in this series, all will be posted as a response to this one. If anyone has any issues with these being applied, please let me know. Responses should be made by Fri, 21 Feb 2025 08:25:11 +0000. Anything received after that time might be too late. The whole patch series can be found in one patch at: https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.6.79-rc1… or in the git tree and branch at: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.6.y and the diffstat can be found below. thanks, greg k-h ------------- Pseudo-Shortlog of commits: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> Linux 6.6.79-rc1 Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> Revert "vfio/platform: check the bounds of read/write syscalls" David Woodhouse <dwmw(a)amazon.co.uk> x86/i8253: Disable PIT timer 0 when not in use Michal Luczaj <mhal(a)rbox.co> vsock: Orphan socket after transport release Michal Luczaj <mhal(a)rbox.co> vsock: Keep the binding until socket destruction Pavel Begunkov <asml.silence(a)gmail.com> io_uring/kbuf: reallocate buf lists on upgrade Vicki Pfau <vi(a)endrift.com> HID: hid-steam: Don't use cancel_delayed_work_sync in IRQ context Ivan Kokshaysky <ink(a)unseen.parts> alpha: replace hardcoded stack offsets with autogenerated ones Zhaoyang Huang <zhaoyang.huang(a)unisoc.com> mm: gup: fix infinite loop within __get_longterm_locked Marc Zyngier <maz(a)kernel.org> arm64: Filter out SVE hwcaps when FEAT_SVE isn't implemented Yu Kuai <yukuai3(a)huawei.com> md/md-bitmap: move bitmap_{start, end}write to md upper layer Yu Kuai <yukuai3(a)huawei.com> md/raid5: implement pers->bitmap_sector() Yu Kuai <yukuai3(a)huawei.com> md: add a new callback pers->bitmap_sector() Yu Kuai <yukuai3(a)huawei.com> md/md-bitmap: remove the last parameter for bimtap_ops->endwrite() Yu Kuai <yukuai3(a)huawei.com> md/md-bitmap: factor behind write counters 
out from bitmap_{start/end}write() Benjamin Marzinski <bmarzins(a)redhat.com> md/raid5: recheck if reshape has finished with device_lock held Hangbin Liu <liuhangbin(a)gmail.com> selftests: rtnetlink: update netdevsim ipsec output format Hangbin Liu <liuhangbin(a)gmail.com> netdevsim: print human readable IP address Alex Hung <alex.hung(a)amd.com> drm/amd/display: Pass non-null to dcn20_validate_apply_pipe_split_flags Srinivasan Shanmugam <srinivasan.shanmugam(a)amd.com> drm/amd/display: Add null check for head_pipe in dcn201_acquire_free_pipe_for_layer Andrew Cooper <andrew.cooper3(a)citrix.com> x86/static-call: Remove early_boot_irqs_disabled check to fix Xen PVH dom0 Christian Gmeiner <cgmeiner(a)igalia.com> drm/v3d: Stop active perfmon if it is being destroyed Tomi Valkeinen <tomi.valkeinen+renesas(a)ideasonboard.com> drm/rcar-du: dsi: Fix PHY lock bit check Devarsh Thakkar <devarsht(a)ti.com> drm/tidss: Clear the interrupt status for interrupts being disabled Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com> drm/tidss: Fix issue in irq handling causing irq-flood issue Eric Dumazet <edumazet(a)google.com> ipv6: mcast: add RCU protection to mld_newpack() Eric Dumazet <edumazet(a)google.com> ipv6: mcast: extend RCU protection in igmp6_send() Eric Dumazet <edumazet(a)google.com> ndisc: extend RCU protection in ndisc_send_skb() Eric Dumazet <edumazet(a)google.com> openvswitch: use RCU protection in ovs_vport_cmd_fill_info() Eric Dumazet <edumazet(a)google.com> arp: use RCU protection in arp_xmit() Eric Dumazet <edumazet(a)google.com> neighbour: use RCU protection in __neigh_notify() Li Zetao <lizetao1(a)huawei.com> neighbour: delete redundant judgment statements Eric Dumazet <edumazet(a)google.com> ndisc: use RCU protection in ndisc_alloc_skb() Vicki Pfau <vi(a)endrift.com> HID: hid-steam: Move hidraw input (un)registering to work Vicki Pfau <vi(a)endrift.com> HID: hid-steam: Make sure rumble work is canceled on removal Max Maisel <mmm-1(a)posteo.net> HID: 
hid-steam: Add Deck IMU support Dan Carpenter <dan.carpenter(a)linaro.org> HID: hid-steam: Fix cleanup in probe() Dan Carpenter <dan.carpenter(a)linaro.org> HID: hid-steam: remove pointless error message Vicki Pfau <vi(a)endrift.com> HID: hid-steam: Add gamepad-only mode switched to by holding options Vicki Pfau <vi(a)endrift.com> HID: hid-steam: Update list of identifiers from SDL Vicki Pfau <vi(a)endrift.com> HID: hid-steam: Clean up locking Vicki Pfau <vi(a)endrift.com> HID: hid-steam: Disable watchdog instead of using a heartbeat Vicki Pfau <vi(a)endrift.com> HID: hid-steam: Avoid overwriting smoothing parameter Eric Dumazet <edumazet(a)google.com> ipv6: icmp: convert to dev_net_rcu() Eric Dumazet <edumazet(a)google.com> ipv6: use RCU protection in ip6_default_advmss() Eric Dumazet <edumazet(a)google.com> flow_dissector: use RCU protection to fetch dev_net() Eric Dumazet <edumazet(a)google.com> ipv4: icmp: convert to dev_net_rcu() Eric Dumazet <edumazet(a)google.com> ipv4: use RCU protection in __ip_rt_update_pmtu() Vladimir Vdovin <deliran(a)verdict.gg> net: ipv4: Cache pmtu for all packet paths if multipath enabled Eric Dumazet <edumazet(a)google.com> ipv4: use RCU protection in inet_select_addr() Eric Dumazet <edumazet(a)google.com> ipv4: use RCU protection in rt_is_expired() Eric Dumazet <edumazet(a)google.com> ipv4: use RCU protection in ipv4_default_advmss() Eric Dumazet <edumazet(a)google.com> net: add dev_net_rcu() helper Jiri Pirko <jiri(a)resnulli.us> net: treat possible_net_t net pointer as an RCU one and add read_pnet_rcu() Eric Dumazet <edumazet(a)google.com> ipv4: add RCU protection to ip4_dst_hoplimit() Waiman Long <longman(a)redhat.com> clocksource: Use migrate_disable() to avoid calling get_random_u32() in atomic context Waiman Long <longman(a)redhat.com> clocksource: Use pr_info() for "Checking clocksource synchronization" message Filipe Manana <fdmanana(a)suse.com> btrfs: fix hole expansion when writing at an offset beyond EOF Wentao Liang 
<vulab(a)iscas.ac.cn> mlxsw: Add return value check for mlxsw_sp_port_get_stats_raw() Song Yoong Siang <yoong.siang.song(a)intel.com> igc: Set buffer type for empty frames in igc_init_empty_frame Andy-ld Lu <andy-ld.lu(a)mediatek.com> mmc: mtk-sd: Fix register settings for hs400(es) mode Nathan Chancellor <nathan(a)kernel.org> arm64: Handle .ARM.attributes section in linker scripts Jiasheng Jiang <jiashengjiangcool(a)gmail.com> regmap-irq: Add missing kfree() Varadarajan Narayanan <quic_varada(a)quicinc.com> regulator: qcom_smd: Add l2, l5 sub-node to mp5496 regulator Jann Horn <jannh(a)google.com> partitions: mac: fix handling of bogus partition table Wentao Liang <vulab(a)iscas.ac.cn> gpio: stmpe: Check return value of stmpe_reg_read in stmpe_gpio_irq_sync_unlock Mario Limonciello <mario.limonciello(a)amd.com> gpiolib: acpi: Add a quirk for Acer Nitro ANV14 Ivan Kokshaysky <ink(a)unseen.parts> alpha: align stack for page fault and user unaligned trap handlers John Keeping <jkeeping(a)inmusicbrands.com> serial: 8250: Fix fifo underflow on flush Andy Shevchenko <andriy.shevchenko(a)linux.intel.com> serial: port: Always update ->iotype in __uart_read_properties() Andy Shevchenko <andriy.shevchenko(a)linux.intel.com> serial: port: Assign ->iotype correctly when ->iobase is set Shakeel Butt <shakeel.butt(a)linux.dev> cgroup: fix race between fork and cgroup.kill Ard Biesheuvel <ardb(a)kernel.org> efi: Avoid cold plugged memory for placing the kernel Thomas Weißschuh <thomas.weissschuh(a)linutronix.de> kbuild: userprogs: fix bitsize and target detection on clang Aditya Kumar Singh <aditya.kumar.singh(a)oss.qualcomm.com> wifi: ath12k: fix handling of 6 GHz rules Ivan Kokshaysky <ink(a)unseen.parts> alpha: make stack 16-byte aligned (most cases) Vincent Mailhol <mailhol.vincent(a)wanadoo.fr> can: etas_es58x: fix potential NULL pointer dereference on udev->serial Alexander Hölzl <alexander.hoelzl(a)gmx.net> can: j1939: j1939_sk_send_loop(): fix unable to send messages 
with data length zero Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org> can: c_can: fix unbalanced runtime PM disable in error path Fedor Pchelkin <pchelkin(a)ispras.ru> can: ctucanfd: handle skb allocation failure Johan Hovold <johan(a)kernel.org> USB: serial: option: drop MeiG Smart defines Fabio Porcedda <fabio.porcedda(a)gmail.com> USB: serial: option: fix Telit Cinterion FN990A name Fabio Porcedda <fabio.porcedda(a)gmail.com> USB: serial: option: add Telit Cinterion FN990B compositions Chester A. Unal <chester.a.unal(a)arinc9.com> USB: serial: option: add MeiG Smart SLM828 Jann Horn <jannh(a)google.com> usb: cdc-acm: Fix handling of oversized fragments Jann Horn <jannh(a)google.com> usb: cdc-acm: Check control transfer buffer size before access Marek Vasut <marek.vasut+renesas(a)mailbox.org> USB: cdc-acm: Fill in Renesas R-Car D3 USB Download mode quirk Alan Stern <stern(a)rowland.harvard.edu> USB: hub: Ignore non-compliant devices with too many configs or interfaces John Keeping <jkeeping(a)inmusicbrands.com> usb: gadget: f_midi: fix MIDI Streaming descriptor lengths Mathias Nyman <mathias.nyman(a)linux.intel.com> USB: Add USB_QUIRK_NO_LPM quirk for sony xperia xz1 smartphone Lei Huang <huanglei(a)kylinos.cn> USB: quirks: add USB_QUIRK_NO_LPM quirk for Teclast dist Stefan Eichenberger <stefan.eichenberger(a)toradex.com> usb: core: fix pipe creation for get_bMaxPacketSize0 Huacai Chen <chenhuacai(a)kernel.org> USB: pci-quirks: Fix HCCPARAMS register error for LS7A EHCI Fabrice Gasnier <fabrice.gasnier(a)foss.st.com> usb: dwc2: gadget: remove of_node reference upon udc_stop Guo Ren <guoren(a)kernel.org> usb: gadget: udc: renesas_usb3: Fix compiler warning Elson Roy Serrao <quic_eserrao(a)quicinc.com> usb: roles: set switch registered flag early on Selvarasu Ganesan <selvarasu.g(a)samsung.com> usb: dwc3: Fix timeout issue during controller enter/exit from halt state Selvarasu Ganesan <selvarasu.g(a)samsung.com> usb: gadget: f_midi: Fixing wMaxPacketSize 
exceeded issue during MIDI bind retries Sean Christopherson <seanjc(a)google.com> perf/x86/intel: Ensure LBRs are disabled when a CPU is starting Sean Christopherson <seanjc(a)google.com> KVM: nSVM: Enter guest mode before initializing nested NPT MMU Sean Christopherson <seanjc(a)google.com> KVM: x86: Reject Hyper-V's SEND_IPI hypercalls if local APIC isn't in-kernel Jiang Liu <gerry(a)linux.alibaba.com> drm/amdgpu: avoid buffer overflow attach in smu_sys_set_pp_table() Sven Eckelmann <sven(a)narfation.org> batman-adv: Drop unmanaged ELP metric worker Sven Eckelmann <sven(a)narfation.org> batman-adv: Ignore neighbor throughput metrics in error case Andy Strohman <andrew(a)andrewstrohman.com> batman-adv: fix panic during interface removal Hans de Goede <hdegoede(a)redhat.com> ASoC: Intel: bytcr_rt5640: Add DMI quirk for Vexia Edu Atla 10 tablet 5V Mike Marshall <hubcap(a)omnibond.com> orangefs: fix a oob in orangefs_debug_write Rik van Riel <riel(a)fb.com> x86/mm/tlb: Only trim the mm_cpumask once a second Hans de Goede <hdegoede(a)redhat.com> ACPI: x86: Add skip i2c clients quirk for Vexia EDU ATLA 10 tablet 5V Koichiro Den <koichiro.den(a)canonical.com> selftests: gpio: gpio-sim: Fix missing chip disablements Maksym Planeta <maksym(a)exostellar.io> Grab mm lock before grabbing pt lock Zichen Xie <zichenxie0106(a)gmail.com> NFS: Fix potential buffer overflowin nfs_sysfs_link_rpc_client() Ramesh Thomas <ramesh.thomas(a)intel.com> vfio/pci: Enable iowrite64 and ioread64 for vfio pci Tomas Glozar <tglozar(a)redhat.com> rtla/timerlat_top: Abort event processing on second signal Tomas Glozar <tglozar(a)redhat.com> rtla/timerlat_hist: Abort event processing on second signal Guixin Liu <kanie(a)linux.alibaba.com> scsi: ufs: bsg: Set bsg_queue to NULL after removal Rakesh Babu Saladi <Saladi.Rakeshbabu(a)microchip.com> PCI: switchtec: Add Microchip PCI100X device IDs Takashi Iwai <tiwai(a)suse.de> PCI/DPC: Quirk PIO log size for Intel Raptor Lake-P Edward Adam Davis 
<eadavis(a)qq.com> media: vidtv: Fix a null-ptr-deref in vidtv_mux_stop_thread Isaac Scott <isaac.scott(a)ideasonboard.com> media: uvcvideo: Add Kurokesu C1 PRO camera Isaac Scott <isaac.scott(a)ideasonboard.com> media: uvcvideo: Add new quirk definition for the Sonix Technology Co. 292a camera Isaac Scott <isaac.scott(a)ideasonboard.com> media: uvcvideo: Implement dual stream quirk to fix loss of usb packets Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com> media: i2c: ds90ub953: Add error handling for i2c reads/writes Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com> media: i2c: ds90ub913: Add error handling to ub913_hw_init() Arnd Bergmann <arnd(a)arndb.de> media: cxd2841er: fix 64-bit division on gcc-9 Kartik Rajput <kkartik(a)nvidia.com> soc/tegra: fuse: Update Tegra234 nvmem keepout list Aaro Koskinen <aaro.koskinen(a)iki.fi> fbdev: omap: use threaded IRQ for LCD DMA Michael Margolin <mrgolin(a)amazon.com> RDMA/efa: Reset device on probe failure Masahiro Yamada <masahiroy(a)kernel.org> tools: fix annoying "mkdir -p ..." 
logs when building tools in parallel Andy Shevchenko <andriy.shevchenko(a)linux.intel.com> gpiolib: Fix crash on error in gpiochip_get_ngpios() Jens Axboe <axboe(a)kernel.dk> block: cleanup and fix batch completion adding conditions Juergen Gross <jgross(a)suse.com> x86/xen: allow larger contiguous memory regions in PV guests Juergen Gross <jgross(a)suse.com> xen/swiotlb: relax alignment requirements Jiang Liu <gerry(a)linux.alibaba.com> drm/amdgpu: bail out when failed to load fw in psp_init_cap_microcode() Artur Weber <aweber.kernel(a)gmail.com> gpio: bcm-kona: Add missing newline to dev_err format string Artur Weber <aweber.kernel(a)gmail.com> gpio: bcm-kona: Make sure GPIO bits are unlocked when requesting IRQ Artur Weber <aweber.kernel(a)gmail.com> gpio: bcm-kona: Fix GPIO lock/unlock for banks above bank 0 Krzysztof Karas <krzysztof.karas(a)intel.com> drm/i915/selftests: avoid using uninitialized context Muhammad Adeel <Muhammad.Adeel(a)ibm.com> cgroup: Remove steal time from usage_usec Radu Rendec <rrendec(a)redhat.com> arm64: cacheinfo: Avoid out-of-bounds write to cacheinfo array Eric Dumazet <edumazet(a)google.com> team: better TEAM_OPTION_TYPE_STRING validation Yuli Wang <wangyuli(a)uniontech.com> LoongArch: csum: Fix OoB access in IP checksum code for negative lengths Marco Crivellari <marco.crivellari(a)suse.com> LoongArch: Fix idle VS timer enqueue Eric Dumazet <edumazet(a)google.com> vxlan: check vxlan_vnigroup_init() return value Eric Dumazet <edumazet(a)google.com> vrf: use RCU protection in l3mdev_l3_out() Eric Dumazet <edumazet(a)google.com> ndisc: ndisc_send_redirect() must use dev_get_by_index_rcu() Murad Masimov <m.masimov(a)mt-integration.ru> ax25: Fix refcount leak caused by setting SO_BINDTODEVICE sockopt Kunihiko Hayashi <hayashi.kunihiko(a)socionext.com> spi: sn-f-ospi: Fix division by zero Tulio Fernandes <tuliomf09(a)gmail.com> HID: hid-thrustmaster: fix stack-out-of-bounds read in usb_check_int_endpoints() Charles Han 
<hanchunchao(a)inspur.com> HID: multitouch: Add NULL check in mt_input_configured Andy Shevchenko <andriy.shevchenko(a)linux.intel.com> pinctrl: cy8c95x0: Respect IRQ trigger settings from firmware Dai Ngo <dai.ngo(a)oracle.com> NFSD: fix hang in nfsd4_shutdown_callback Li Lingfeng <lilingfeng3(a)huawei.com> nfsd: clear acl_access/acl_default after releasing them ------------- Diffstat: Documentation/arch/arm64/elf_hwcaps.rst | 36 +- .../bindings/regulator/qcom,smd-rpm-regulator.yaml | 2 +- Makefile | 17 +- arch/alpha/include/uapi/asm/ptrace.h | 2 + arch/alpha/kernel/asm-offsets.c | 2 + arch/alpha/kernel/entry.S | 24 +- arch/alpha/kernel/traps.c | 2 +- arch/alpha/mm/fault.c | 4 +- arch/arm64/kernel/cacheinfo.c | 12 +- arch/arm64/kernel/cpufeature.c | 38 +- arch/arm64/kernel/vdso/vdso.lds.S | 1 + arch/arm64/kernel/vmlinux.lds.S | 1 + arch/loongarch/kernel/genex.S | 28 +- arch/loongarch/kernel/idle.c | 3 +- arch/loongarch/kernel/reset.c | 6 +- arch/loongarch/lib/csum.c | 2 +- arch/x86/events/intel/core.c | 5 +- arch/x86/include/asm/mmu.h | 2 + arch/x86/include/asm/mmu_context.h | 1 + arch/x86/include/asm/msr-index.h | 3 +- arch/x86/include/asm/tlbflush.h | 1 + arch/x86/kernel/i8253.c | 11 +- arch/x86/kernel/static_call.c | 1 - arch/x86/kvm/hyperv.c | 6 +- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/nested.c | 10 +- arch/x86/mm/tlb.c | 35 +- arch/x86/xen/mmu_pv.c | 75 ++- block/partitions/mac.c | 18 +- drivers/acpi/x86/utils.c | 13 + drivers/base/regmap/regmap-irq.c | 2 + drivers/clocksource/i8253.c | 13 +- drivers/firmware/efi/efi.c | 6 +- drivers/firmware/efi/libstub/randomalloc.c | 3 + drivers/firmware/efi/libstub/relocate.c | 3 + drivers/gpio/gpio-bcm-kona.c | 71 +- drivers/gpio/gpio-stmpe.c | 15 +- drivers/gpio/gpiolib-acpi.c | 14 + drivers/gpio/gpiolib.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +- .../gpu/drm/amd/display/dc/dcn20/dcn20_resource.c | 3 +- .../drm/amd/display/dc/dcn201/dcn201_resource.c | 4 +- 
.../gpu/drm/amd/display/dc/dcn21/dcn21_resource.c | 3 +- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 3 +- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 4 +- drivers/gpu/drm/renesas/rcar-du/rcar_mipi_dsi.c | 2 +- .../gpu/drm/renesas/rcar-du/rcar_mipi_dsi_regs.h | 1 - drivers/gpu/drm/tidss/tidss_dispc.c | 22 +- drivers/gpu/drm/v3d/v3d_perfmon.c | 5 + drivers/hid/hid-multitouch.c | 5 +- drivers/hid/hid-steam.c | 738 ++++++++++++++++----- drivers/hid/hid-thrustmaster.c | 2 +- drivers/infiniband/hw/efa/efa_main.c | 9 +- drivers/md/md-bitmap.c | 75 ++- drivers/md/md-bitmap.h | 6 +- drivers/md/md.c | 26 + drivers/md/md.h | 5 + drivers/md/raid1.c | 35 +- drivers/md/raid1.h | 1 - drivers/md/raid10.c | 26 +- drivers/md/raid10.h | 1 - drivers/md/raid5-cache.c | 4 - drivers/md/raid5.c | 174 ++--- drivers/md/raid5.h | 4 - drivers/media/dvb-frontends/cxd2841er.c | 8 +- drivers/media/i2c/ds90ub913.c | 25 +- drivers/media/i2c/ds90ub953.c | 46 +- drivers/media/test-drivers/vidtv/vidtv_bridge.c | 8 +- drivers/media/usb/uvc/uvc_driver.c | 18 + drivers/media/usb/uvc/uvc_video.c | 27 +- drivers/media/usb/uvc/uvcvideo.h | 1 + drivers/mmc/host/mtk-sd.c | 31 +- drivers/net/can/c_can/c_can_platform.c | 5 +- drivers/net/can/ctucanfd/ctucanfd_base.c | 10 +- drivers/net/can/usb/etas_es58x/es58x_devlink.c | 6 +- drivers/net/ethernet/intel/igc/igc_main.c | 1 + .../net/ethernet/mellanox/mlxsw/spectrum_ethtool.c | 4 +- drivers/net/netdevsim/ipsec.c | 12 +- drivers/net/team/team.c | 4 +- drivers/net/vxlan/vxlan_core.c | 7 +- drivers/net/wireless/ath/ath12k/wmi.c | 61 +- drivers/net/wireless/ath/ath12k/wmi.h | 1 - drivers/pci/quirks.c | 12 + drivers/pci/switch/switchtec.c | 26 + drivers/pinctrl/pinctrl-cy8c95x0.c | 2 +- drivers/soc/tegra/fuse/fuse-tegra30.c | 17 +- drivers/spi/spi-sn-f-ospi.c | 3 + drivers/tty/serial/8250/8250.h | 2 + drivers/tty/serial/8250/8250_dma.c | 16 + drivers/tty/serial/8250/8250_port.c | 9 + drivers/tty/serial/serial_port.c | 5 +- drivers/ufs/core/ufs_bsg.c | 1 + 
drivers/usb/class/cdc-acm.c | 28 +- drivers/usb/core/hub.c | 14 +- drivers/usb/core/quirks.c | 6 + drivers/usb/dwc2/gadget.c | 1 + drivers/usb/dwc3/gadget.c | 34 + drivers/usb/gadget/function/f_midi.c | 17 +- drivers/usb/gadget/udc/renesas_usb3.c | 2 +- drivers/usb/host/pci-quirks.c | 9 + drivers/usb/roles/class.c | 5 +- drivers/usb/serial/option.c | 49 +- drivers/vfio/pci/vfio_pci_rdwr.c | 1 + drivers/vfio/platform/vfio_platform_common.c | 10 - drivers/video/fbdev/omap/lcd_dma.c | 4 +- drivers/xen/swiotlb-xen.c | 20 +- fs/btrfs/file.c | 4 +- fs/nfs/sysfs.c | 6 +- fs/nfsd/nfs2acl.c | 2 + fs/nfsd/nfs3acl.c | 2 + fs/nfsd/nfs4callback.c | 7 +- fs/orangefs/orangefs-debugfs.c | 4 +- include/linux/blk-mq.h | 18 +- include/linux/cgroup-defs.h | 6 +- include/linux/efi.h | 1 + include/linux/i8253.h | 1 + include/linux/netdevice.h | 6 + include/linux/sched/task.h | 1 + include/net/l3mdev.h | 2 + include/net/net_namespace.h | 15 +- include/net/route.h | 9 +- io_uring/kbuf.c | 15 +- kernel/cgroup/cgroup.c | 20 +- kernel/cgroup/rstat.c | 1 - kernel/time/clocksource.c | 9 +- mm/gup.c | 14 +- net/ax25/af_ax25.c | 11 + net/batman-adv/bat_v.c | 2 - net/batman-adv/bat_v_elp.c | 122 +++- net/batman-adv/bat_v_elp.h | 2 - net/batman-adv/types.h | 3 - net/can/j1939/socket.c | 4 +- net/can/j1939/transport.c | 5 +- net/core/flow_dissector.c | 21 +- net/core/neighbour.c | 11 +- net/ipv4/arp.c | 4 +- net/ipv4/devinet.c | 3 +- net/ipv4/icmp.c | 31 +- net/ipv4/route.c | 39 +- net/ipv6/icmp.c | 42 +- net/ipv6/mcast.c | 45 +- net/ipv6/ndisc.c | 28 +- net/ipv6/route.c | 7 +- net/openvswitch/datapath.c | 12 +- net/vmw_vsock/af_vsock.c | 12 +- sound/soc/intel/boards/bytcr_rt5640.c | 17 +- tools/testing/selftests/gpio/gpio-sim.sh | 31 +- tools/testing/selftests/net/pmtu.sh | 112 +++- tools/testing/selftests/net/rtnetlink.sh | 4 +- tools/tracing/rtla/src/timerlat_hist.c | 8 + tools/tracing/rtla/src/timerlat_top.c | 8 + 151 files changed, 2108 insertions(+), 846 deletions(-)

2 months, 3 weeks

13
165
0 0

[PATCH v3] sched/rt: Fix race in push_rt_task

by Harshit Agarwal

Overview ======== When a CPU chooses to call push_rt_task and picks a task to push to another CPU's runqueue then it will call the find_lock_lowest_rq method which would take a double lock on both CPUs' runqueues. If one of the locks isn't readily available, it may lead to dropping the current runqueue lock and reacquiring both the locks at once. During this window it is possible that the task is already migrated and is running on some other CPU. These cases are already handled. However, if the task is migrated and has already been executed and another CPU is now trying to wake it up (ttwu) such that it is queued again on the runqueue (on_rq is 1) and also if the task was run by the same CPU, then the current checks will pass even though the task was migrated out and is no longer in the pushable tasks list. Crashes ======= This bug resulted in quite a few flavors of crashes triggering kernel panics with various crash signatures such as assert failures, page faults, null pointer dereferences, and queue corruption errors all coming from the scheduler itself. Some of the crashes: -> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO) Call Trace: ? __die_body+0x1a/0x60 ? die+0x2a/0x50 ? do_trap+0x85/0x100 ? pick_next_task_rt+0x6e/0x1d0 ? do_error_trap+0x64/0xa0 ? pick_next_task_rt+0x6e/0x1d0 ? exc_invalid_op+0x4c/0x60 ? pick_next_task_rt+0x6e/0x1d0 ? asm_exc_invalid_op+0x12/0x20 ? pick_next_task_rt+0x6e/0x1d0 __schedule+0x5cb/0x790 ? update_ts_time_stats+0x55/0x70 schedule_idle+0x1e/0x40 do_idle+0x15e/0x200 cpu_startup_entry+0x19/0x20 start_secondary+0x117/0x160 secondary_startup_64_no_verify+0xb0/0xbb -> BUG: kernel NULL pointer dereference, address: 00000000000000c0 Call Trace: ? __die_body+0x1a/0x60 ? no_context+0x183/0x350 ? __warn+0x8a/0xe0 ? exc_page_fault+0x3d6/0x520 ? asm_exc_page_fault+0x1e/0x30 ? pick_next_task_rt+0xb5/0x1d0 ? pick_next_task_rt+0x8c/0x1d0 __schedule+0x583/0x7e0 ? 
update_ts_time_stats+0x55/0x70 schedule_idle+0x1e/0x40 do_idle+0x15e/0x200 cpu_startup_entry+0x19/0x20 start_secondary+0x117/0x160 secondary_startup_64_no_verify+0xb0/0xbb -> BUG: unable to handle page fault for address: ffff9464daea5900 kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p)) -> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running) Call Trace: ? __die_body+0x1a/0x60 ? die+0x2a/0x50 ? do_trap+0x85/0x100 ? dequeue_top_rt_rq+0xa2/0xb0 ? do_error_trap+0x64/0xa0 ? dequeue_top_rt_rq+0xa2/0xb0 ? exc_invalid_op+0x4c/0x60 ? dequeue_top_rt_rq+0xa2/0xb0 ? asm_exc_invalid_op+0x12/0x20 ? dequeue_top_rt_rq+0xa2/0xb0 dequeue_rt_entity+0x1f/0x70 dequeue_task_rt+0x2d/0x70 __schedule+0x1a8/0x7e0 ? blk_finish_plug+0x25/0x40 schedule+0x3c/0xb0 futex_wait_queue_me+0xb6/0x120 futex_wait+0xd9/0x240 do_futex+0x344/0xa90 ? get_mm_exe_file+0x30/0x60 ? audit_exe_compare+0x58/0x70 ? audit_filter_rules.constprop.26+0x65e/0x1220 __x64_sys_futex+0x148/0x1f0 do_syscall_64+0x30/0x80 entry_SYSCALL_64_after_hwframe+0x62/0xc7 -> BUG: unable to handle page fault for address: ffff8cf3608bc2c0 Call Trace: ? __die_body+0x1a/0x60 ? no_context+0x183/0x350 ? spurious_kernel_fault+0x171/0x1c0 ? exc_page_fault+0x3b6/0x520 ? plist_check_list+0x15/0x40 ? plist_check_list+0x2e/0x40 ? asm_exc_page_fault+0x1e/0x30 ? _cond_resched+0x15/0x30 ? futex_wait_queue_me+0xc8/0x120 ? futex_wait+0xd9/0x240 ? try_to_wake_up+0x1b8/0x490 ? futex_wake+0x78/0x160 ? do_futex+0xcd/0xa90 ? plist_check_list+0x15/0x40 ? plist_check_list+0x2e/0x40 ? plist_del+0x6a/0xd0 ? plist_check_list+0x15/0x40 ? plist_check_list+0x2e/0x40 ? dequeue_pushable_task+0x20/0x70 ? __schedule+0x382/0x7e0 ? asm_sysvec_reschedule_ipi+0xa/0x20 ? schedule+0x3c/0xb0 ? exit_to_user_mode_prepare+0x9e/0x150 ? irqentry_exit_to_user_mode+0x5/0x30 ? asm_sysvec_reschedule_ipi+0x12/0x20 Above are some of the common examples of the crashes that were observed due to this issue. 
Details ======= Let's look at the following scenario to understand this race. 1) CPU A enters push_rt_task a) CPU A has chosen next_task = task p. b) CPU A calls find_lock_lowest_rq(Task p, CPU Z’s rq). c) CPU A identifies CPU X as a destination CPU (X < Z). d) CPU A enters double_lock_balance(CPU Z’s rq, CPU X’s rq). e) Since X is lower than Z, CPU A unlocks CPU Z’s rq. Someone else has locked CPU X’s rq, and thus, CPU A must wait. 2) At CPU Z a) Previous task has completed execution and thus, CPU Z enters schedule, locks its own rq after CPU A releases it. b) CPU Z dequeues previous task and begins executing task p. c) CPU Z unlocks its rq. d) Task p yields the CPU (e.g. by doing IO or waiting to acquire a lock) which triggers the schedule function on CPU Z. e) CPU Z enters schedule again, locks its own rq, and dequeues task p. f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq. 3) At CPU B a) CPU B enters try_to_wake_up with input task p. b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates p.state = WAKING. c) CPU B via select_task_rq determines CPU Y as the target CPU. 4) The race a) CPU A acquires CPU X’s lock and relocks CPU Z. b) CPU A reads task p.cpu = Z and incorrectly concludes task p is still on CPU Z. c) CPU A failed to notice task p had been dequeued from CPU Z while CPU A was waiting for locks in double_lock_balance. If CPU A knew that task p had been dequeued, it would return NULL forcing push_rt_task to give up task p's migration. d) CPU B updates task p.cpu = Y and calls ttwu_queue. e) CPU B locks CPU Y’s rq. CPU B enqueues task p onto Y and sets task p.on_rq = 1. f) CPU B unlocks CPU Y, triggering memory synchronization. g) CPU A reads task p.on_rq = 1, cementing its assumption that task p has not migrated. h) CPU A decides to migrate p to CPU X. This leads to A dequeuing p from Y's queue and various crashes down the line. Solution ======== The solution here is fairly simple. 
After obtaining the lock (at 4a), the check is enhanced to make sure that the task is still at the head of the pushable tasks list. If not, then it is anyway not suitable for being pushed out. Testing ======= The fix is tested on a cluster of 3 nodes, where the panics due to this are hit every couple of days. A fix similar to this was deployed on such a cluster and was stable for more than 30 days. Co-developed-by: Jon Kohler <jon(a)nutanix.com> Signed-off-by: Jon Kohler <jon(a)nutanix.com> Co-developed-by: Gauri Patwardhan <gauri.patwardhan(a)nutanix.com> Signed-off-by: Gauri Patwardhan <gauri.patwardhan(a)nutanix.com> Co-developed-by: Rahul Chunduru <rahul.chunduru(a)nutanix.com> Signed-off-by: Rahul Chunduru <rahul.chunduru(a)nutanix.com> Signed-off-by: Harshit Agarwal <harshit(a)nutanix.com> Tested-by: Will Ton <william.ton(a)nutanix.com> Reviewed-by: Steven Rostedt (Google) <rostedt(a)goodmis.org> Cc: stable(a)vger.kernel.org --- Changes in v2: - As per Steve's suggestion, removed some checks that are done after obtaining the lock that are no longer needed with the addition of new check. - Moved up is_migration_disabled check. - Link to v1: https://lore.kernel.org/lkml/20250211054646.23987-1-harshit@nutanix.com/ Changes in v3: - Updated commit message to add stable maintainers and reviewed-by tag. 
- Link to v2: https://lore.kernel.org/lkml/20250214170844.201692-1-harshit@nutanix.com/ --- kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4b8e33c615b1..4762dd3f50c5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1885,6 +1885,27 @@ static int find_lowest_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!rt_task(p)); + + return p; +} + /* Will lock the rq it finds */ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { @@ -1915,18 +1936,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) /* * We had to unlock the run queue. In * the mean time, task could have - * migrated already or had its affinity changed. - * Also make sure that it wasn't scheduled on its rq. + * migrated already or had its affinity changed, + * therefore check if the task is still at the + * head of the pushable tasks list. * It is possible the task was scheduled, set * "migrate_disabled" and then got preempted, so we must * check the task migration disable flag here too. 
*/ - if (unlikely(task_rq(task) != rq || + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !rt_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + task != pick_next_pushable_task(rq))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1946,27 +1965,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(task_current_donor(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!rt_task(p)); - - return p; -} - /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task -- 2.22.3

2 months, 3 weeks

6
11
0 0

[REGRESSION] Chrome and VSCode breakage with the commit b9b588f22a0c

by Takashi Iwai

[ resent due to a wrong address for regression reporting, sorry! ] Hi, we received a bug report showing a regression on the 6.13.1 kernel against 6.13.0. The symptom is that Chrome and VSCode stopped working with Gnome Scaling, as reported on the openSUSE Tumbleweed bug tracker https://bugzilla.suse.com/show_bug.cgi?id=1236943 Quoting from there: """ I use the latest TW on Gnome with a 4K display and 150% scaling. Everything has been working fine, but recently both Chrome and VSCode (installed from official non-openSUSE channels) stopped working with Scaling. .... I am using VSCode with: `--enable-features=UseOzonePlatform --enable-features=WaylandWindowDecorations --ozone-platform-hint=auto` and for Chrome, I select `Preferred Ozone platform` == `Wayland`. """ Surprisingly, the bisection pointed to the backport of the commit b9b588f22a0c049a14885399e27625635ae6ef91 ("libfs: Use d_children list to iterate simple_offset directories"). Indeed, the revert of this patch on the latest 6.13.4 was confirmed to fix the issue. Also, the reporter verified that the latest 6.14-rc release is still affected, too. For now I have no concrete idea how the patch could break the behavior of a graphical application like the above. Let us know if you need something for debugging. (Or, easiest of all, join the bugzilla entry and ask there; or open another bug report wherever you like.) BTW, I'll be traveling tomorrow, so my reply will be delayed. thanks, Takashi #regzbot introduced: b9b588f22a0c049a14885399e27625635ae6ef91 #regzbot monitor: https://bugzilla.suse.com/show_bug.cgi?id=1236943

2 months, 3 weeks

6
29
0 0

[PATCH 1/2] drm/i915: Schedule the HPD poll init work on an unbound workqueue

by Imre Deak

Disabling HPD polling from i915_hpd_poll_init_work() involves probing all display connectors explicitly to account for lost hotplug interrupts. On some platforms (mostly pre-ICL) with HDMI connectors the I2C EDID bit-banging using udelay() triggers in turn the workqueue: i915_hpd_poll_init_work [i915] hogged CPU for >10000us 4 times, consider switching to WQ_UNBOUND warning. Fix the above by scheduling i915_hpd_poll_init_work() on a WQ_UNBOUND workqueue. It's ok to use a system WQ, since i915_hpd_poll_init_work() is properly flushed in intel_hpd_cancel_work(). The connector probing from drm_mode_config::output_poll_work resulting in the same warning is fixed by the next patch. Cc: Tejun Heo <tj(a)kernel.org> Cc: Heiner Kallweit <hkallweit1(a)gmail.com> CC: stable(a)vger.kernel.org # 6.5 Suggested-by: Tejun Heo <tj(a)kernel.org> Suggested-by: Heiner Kallweit <hkallweit1(a)gmail.com> Reported-by: Heiner Kallweit <hkallweit1(a)gmail.com> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/9245 Link: https://lore.kernel.org/all/f7e21caa-e98d-e5b5-932a-fe12d27fde9b@gmail.com Signed-off-by: Imre Deak <imre.deak(a)intel.com> --- drivers/gpu/drm/i915/display/intel_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_hotplug.c b/drivers/gpu/drm/i915/display/intel_hotplug.c index e8562f6f8bb44..accc2fec562a0 100644 --- a/drivers/gpu/drm/i915/display/intel_hotplug.c +++ b/drivers/gpu/drm/i915/display/intel_hotplug.c @@ -774,7 +774,7 @@ void intel_hpd_poll_enable(struct drm_i915_private *dev_priv) * As well, there's no issue if we race here since we always reschedule * this worker anyway */ - queue_work(dev_priv->unordered_wq, + queue_work(system_unbound_wq, &dev_priv->display.hotplug.poll_init_work); } @@ -803,7 +803,7 @@ void intel_hpd_poll_disable(struct drm_i915_private *dev_priv) return; WRITE_ONCE(dev_priv->display.hotplug.poll_enabled, false); - queue_work(dev_priv->unordered_wq, + 
queue_work(system_unbound_wq, &dev_priv->display.hotplug.poll_init_work); } -- 2.37.2

2 months, 3 weeks

4
5
0 0

[PATCH] dmaengine: ti: k3-udma: Add missing locking

by Ronald Wahl

From: Ronald Wahl <ronald.wahl(a)legrand.com> Recent kernels complain about a missing lock in k3-udma.c when the lock validator is enabled: [ 4.128073] WARNING: CPU: 0 PID: 746 at drivers/dma/ti/../virt-dma.h:169 udma_start.isra.0+0x34/0x238 [ 4.137352] CPU: 0 UID: 0 PID: 746 Comm: kworker/0:3 Not tainted 6.12.9-arm64 #28 [ 4.144867] Hardware name: pp-v12 (DT) [ 4.148648] Workqueue: events udma_check_tx_completion [ 4.153841] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 4.160834] pc : udma_start.isra.0+0x34/0x238 [ 4.165227] lr : udma_start.isra.0+0x30/0x238 [ 4.169618] sp : ffffffc083cabcf0 [ 4.172963] x29: ffffffc083cabcf0 x28: 0000000000000000 x27: ffffff800001b005 [ 4.180167] x26: ffffffc0812f0000 x25: 0000000000000000 x24: 0000000000000000 [ 4.187370] x23: 0000000000000001 x22: 00000000e21eabe9 x21: ffffff8000fa0670 [ 4.194571] x20: ffffff8001b6bf00 x19: ffffff8000fa0430 x18: ffffffc083b95030 [ 4.201773] x17: 0000000000000000 x16: 00000000f0000000 x15: 0000000000000048 [ 4.208976] x14: 0000000000000048 x13: 0000000000000000 x12: 0000000000000001 [ 4.216179] x11: ffffffc08151a240 x10: 0000000000003ea1 x9 : ffffffc08046ab68 [ 4.223381] x8 : ffffffc083cabac0 x7 : ffffffc081df3718 x6 : 0000000000029fc8 [ 4.230583] x5 : ffffffc0817ee6d8 x4 : 0000000000000bc0 x3 : 0000000000000000 [ 4.237784] x2 : 0000000000000000 x1 : 00000000001fffff x0 : 0000000000000000 [ 4.244986] Call trace: [ 4.247463] udma_start.isra.0+0x34/0x238 [ 4.251509] udma_check_tx_completion+0xd0/0xdc [ 4.256076] process_one_work+0x244/0x3fc [ 4.260129] process_scheduled_works+0x6c/0x74 [ 4.264610] worker_thread+0x150/0x1dc [ 4.268398] kthread+0xd8/0xe8 [ 4.271492] ret_from_fork+0x10/0x20 [ 4.275107] irq event stamp: 220 [ 4.278363] hardirqs last enabled at (219): [<ffffffc080a27c7c>] _raw_spin_unlock_irq+0x38/0x50 [ 4.287183] hardirqs last disabled at (220): [<ffffffc080a1c154>] el1_dbg+0x24/0x50 [ 4.294879] softirqs last enabled at (182): [<ffffffc080037e68>] 
handle_softirqs+0x1c0/0x3cc [ 4.303437] softirqs last disabled at (177): [<ffffffc080010170>] __do_softirq+0x1c/0x28 [ 4.311559] ---[ end trace 0000000000000000 ]--- This commit adds the missing locking. Fixes: 25dcb5dd7b7c ("dmaengine: ti: New driver for K3 UDMA") Cc: Peter Ujfalusi <peter.ujfalusi(a)gmail.com> Cc: Vignesh Raghavendra <vigneshr(a)ti.com> Cc: Vinod Koul <vkoul(a)kernel.org> Cc: dmaengine(a)vger.kernel.org Cc: stable(a)vger.kernel.org Signed-off-by: Ronald Wahl <ronald.wahl(a)legrand.com> --- drivers/dma/ti/k3-udma.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index b3f27b3f9209..b9e497e8134b 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -1091,8 +1091,11 @@ static void udma_check_tx_completion(struct work_struct *work) u32 residue_diff; ktime_t time_diff; unsigned long delay; + unsigned long flags; while (1) { + spin_lock_irqsave(&uc->vc.lock, flags); + if (uc->desc) { /* Get previous residue and time stamp */ residue_diff = uc->tx_drain.residue; @@ -1127,6 +1130,8 @@ static void udma_check_tx_completion(struct work_struct *work) break; } + spin_unlock_irqrestore(&uc->vc.lock, flags); + usleep_range(ktime_to_us(delay), ktime_to_us(delay) + 10); continue; @@ -1143,6 +1148,8 @@ static void udma_check_tx_completion(struct work_struct *work) break; } + + spin_unlock_irqrestore(&uc->vc.lock, flags); } static irqreturn_t udma_ring_irq_handler(int irq, void *data) -- 2.48.0

2 months, 4 weeks

3
2
0 0

+ x86-vmemmap-use-direct-mapped-va-instead-of-vmemmap-based-va.patch added to mm-hotfixes-unstable branch

by Andrew Morton

The patch titled Subject: x86/vmemmap: use direct-mapped VA instead of vmemmap-based VA has been added to the -mm mm-hotfixes-unstable branch. Its filename is x86-vmemmap-use-direct-mapped-va-instead-of-vmemmap-based-va.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche… This patch will later appear in the mm-hotfixes-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Gwan-gyeong Mun <gwan-gyeong.mun(a)intel.com> Subject: x86/vmemmap: use direct-mapped VA instead of vmemmap-based VA Date: Mon, 17 Feb 2025 13:41:33 +0200 Address an Oops issue seen when testing loading of the XE GPU driver module after applying the GPU SVM and Xe SVM patch series[1] and the Dept patch series[2]. The issue occurs when loading the xe driver via modprobe [3], which adds a struct page for device memory via devm_memremap_pages(). When a process triggers the addition of a struct page to vmemmap (e.g. hot-plug), the page table entry for the newly added vmemmap-based virtual address is updated first in init_mm's page table and then synchronized later. If the vmemmap-based virtual address is accessed through the process's page table before this sync, a page fault will occur. 
This patch translates vmemmap-based virtual address to direct-mapped virtual address and use it, if the current top-level page table is not init_mm's page table when accessing a vmemmap-based virtual address before this sync. [1] https://lore.kernel.org/dri-devel/20250213021112.1228481-1-matthew.brost@in… [2] https://lore.kernel.org/lkml/20240508094726.35754-1-byungchul@sk.com/ [3] [ 49.103630] xe 0000:00:04.0: [drm] Available VRAM: 0x0000000800000000, 0x00000002fb800000 [ 49.116710] BUG: unable to handle page fault for address: ffffeb3ff1200000 [ 49.117175] #PF: supervisor write access in kernel mode [ 49.117511] #PF: error_code(0x0002) - not-present page [ 49.117835] PGD 0 P4D 0 [ 49.118015] Oops: Oops: 0002 [#1] PREEMPT SMP NOPTI [ 49.118366] CPU: 3 UID: 0 PID: 302 Comm: modprobe Tainted: G W 6.13.0-drm-tip-test+ #62 [ 49.118976] Tainted: [W]=WARN [ 49.119179] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 49.119710] RIP: 0010:vmemmap_set_pmd+0xff/0x230 [ 49.120011] Code: 77 22 02 a9 ff ff 1f 00 74 58 48 8b 3d 62 77 22 02 48 85 ff 0f 85 9a 00 00 00 48 8d 7d 08 48 89 e9 31 c0 48 89 ea 48 83 e7 f8 <48> c7 45 00 00 00 00 00 48 29 f9 48 c7 45 48 00 00 00 00 83 c1 50 [ 49.121158] RSP: 0018:ffffc900016d37a8 EFLAGS: 00010282 [ 49.121502] RAX: 0000000000000000 RBX: ffff888164000000 RCX: ffffeb3ff1200000 [ 49.121966] RDX: ffffeb3ff1200000 RSI: 80000000000001e3 RDI: ffffeb3ff1200008 [ 49.122499] RBP: ffffeb3ff1200000 R08: ffffeb3ff1280000 R09: 0000000000000000 [ 49.123032] R10: ffff88817b94dc48 R11: 0000000000000003 R12: ffffeb3ff1280000 [ 49.123566] R13: 0000000000000000 R14: ffff88817b94dc48 R15: 8000000163e001e3 [ 49.124096] FS: 00007f53ae71d740(0000) GS:ffff88843fd80000(0000) knlGS:0000000000000000 [ 49.124698] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 49.125129] CR2: ffffeb3ff1200000 CR3: 000000017c7d2000 CR4: 0000000000750ef0 [ 49.125662] PKRU: 55555554 [ 49.125880] Call Trace: [ 49.126078] <TASK> [ 49.126252] ? 
__die_body.cold+0x19/0x26 [ 49.126509] ? page_fault_oops+0xa2/0x240 [ 49.126736] ? preempt_count_add+0x47/0xa0 [ 49.126968] ? search_module_extables+0x4a/0x80 [ 49.127224] ? exc_page_fault+0x206/0x230 [ 49.127454] ? asm_exc_page_fault+0x22/0x30 [ 49.127691] ? vmemmap_set_pmd+0xff/0x230 [ 49.127919] vmemmap_populate_hugepages+0x176/0x180 [ 49.128194] vmemmap_populate+0x34/0x80 [ 49.128416] __populate_section_memmap+0x41/0x90 [ 49.128676] sparse_add_section+0x121/0x3e0 [ 49.128914] __add_pages+0xba/0x150 [ 49.129116] add_pages+0x1d/0x70 [ 49.129305] memremap_pages+0x3dc/0x810 [ 49.129529] devm_memremap_pages+0x1c/0x60 [ 49.129762] xe_devm_add+0x8b/0x100 [xe] [ 49.130072] xe_tile_init_noalloc+0x6a/0x70 [xe] [ 49.130408] xe_device_probe+0x48c/0x740 [xe] [ 49.130714] ? __pfx___drmm_mutex_release+0x10/0x10 [ 49.130982] ? __drmm_add_action+0x85/0xd0 [ 49.131208] ? __pfx___drmm_mutex_release+0x10/0x10 [ 49.131478] xe_pci_probe+0x7ef/0xd90 [xe] [ 49.131777] ? _raw_spin_unlock_irqrestore+0x66/0x90 [ 49.132049] ? lockdep_hardirqs_on+0xba/0x140 [ 49.132290] pci_device_probe+0x99/0x110 [ 49.132510] really_probe+0xdb/0x340 [ 49.132710] ? pm_runtime_barrier+0x50/0x90 [ 49.132941] ? __pfx___driver_attach+0x10/0x10 [ 49.133190] __driver_probe_device+0x78/0x110 [ 49.133433] driver_probe_device+0x1f/0xa0 [ 49.133661] __driver_attach+0xba/0x1c0 [ 49.133874] bus_for_each_dev+0x7a/0xd0 [ 49.134089] bus_add_driver+0x114/0x200 [ 49.134302] driver_register+0x6e/0xc0 [ 49.134515] xe_init+0x1e/0x50 [xe] [ 49.134827] ? __pfx_xe_init+0x10/0x10 [xe] [ 49.134926] xe 0000:00:04.0: [drm:process_one_work] GT1: GuC CT safe-mode canceled [ 49.135112] do_one_initcall+0x5b/0x2b0 [ 49.135734] ? rcu_is_watching+0xd/0x40 [ 49.135995] ? 
__kmalloc_cache_noprof+0x231/0x310 [ 49.136315] do_init_module+0x60/0x210 [ 49.136572] init_module_from_file+0x86/0xc0 [ 49.136863] idempotent_init_module+0x12b/0x340 [ 49.137156] __x64_sys_finit_module+0x61/0xc0 [ 49.137437] do_syscall_64+0x69/0x140 [ 49.137681] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 49.137953] RIP: 0033:0x7f53ae1261fd [ 49.138153] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e3 fa 0c 00 f7 d8 64 89 01 48 [ 49.139117] RSP: 002b:00007ffd0e9021e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 49.139525] RAX: ffffffffffffffda RBX: 000055c02951ee50 RCX: 00007f53ae1261fd [ 49.139905] RDX: 0000000000000000 RSI: 000055bfff125478 RDI: 0000000000000010 [ 49.140282] RBP: 000055bfff125478 R08: 00007f53ae1f6b20 R09: 00007ffd0e902230 [ 49.140663] R10: 000055c029522000 R11: 0000000000000246 R12: 0000000000040000 [ 49.141040] R13: 000055c02951ef80 R14: 0000000000000000 R15: 000055c029521fc0 [ 49.141424] </TASK> [ 49.141552] Modules linked in: xe(+) drm_ttm_helper gpu_sched drm_suballoc_helper drm_gpuvm drm_exec drm_gpusvm i2c_algo_bit drm_buddy video wmi ttm drm_display_helper drm_kms_helper crct10dif_pclmul crc32_pclmul i2c_piix4 e1000 ghash_clmulni_intel i2c_smbus fuse [ 49.142824] CR2: ffffeb3ff1200000 [ 49.143010] ---[ end trace 0000000000000000 ]--- [ 49.143268] RIP: 0010:vmemmap_set_pmd+0xff/0x230 [ 49.143523] Code: 77 22 02 a9 ff ff 1f 00 74 58 48 8b 3d 62 77 22 02 48 85 ff 0f 85 9a 00 00 00 48 8d 7d 08 48 89 e9 31 c0 48 89 ea 48 83 e7 f8 <48> c7 45 00 00 00 00 00 48 29 f9 48 c7 45 48 00 00 00 00 83 c1 50 [ 49.144489] RSP: 0018:ffffc900016d37a8 EFLAGS: 00010282 [ 49.144775] RAX: 0000000000000000 RBX: ffff888164000000 RCX: ffffeb3ff1200000 [ 49.145154] RDX: ffffeb3ff1200000 RSI: 80000000000001e3 RDI: ffffeb3ff1200008 [ 49.145536] RBP: ffffeb3ff1200000 R08: ffffeb3ff1280000 R09: 0000000000000000 [ 49.145914] R10: ffff88817b94dc48 
R11: 0000000000000003 R12: ffffeb3ff1280000 [ 49.146292] R13: 0000000000000000 R14: ffff88817b94dc48 R15: 8000000163e001e3 [ 49.146671] FS: 00007f53ae71d740(0000) GS:ffff88843fd80000(0000) knlGS:0000000000000000 [ 49.147097] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 49.147407] CR2: ffffeb3ff1200000 CR3: 000000017c7d2000 CR4: 0000000000750ef0 [ 49.147786] PKRU: 55555554 [ 49.147941] note: modprobe[302] exited with irqs disabled When a process leads the addition of a struct page to vmemmap (e.g. hot-plug), the page table update for the newly added vmemmap-based virtual address is updated first in init_mm's page table and then synchronized later. If the vmemmap-based virtual address is accessed through the process's page table before this sync, a page fault will occur. This translates vmemmap-based virtual address to direct-mapped virtual address and use it, if the current top-level page table is not init_mm's page table when accessing a vmemmap-based virtual address before this sync. 
Link: https://lkml.kernel.org/r/20250217114133.400063-2-gwan-gyeong.mun@intel.com Fixes: faf1c0008a33 ("x86/vmemmap: optimize for consecutive sections in partial populated PMDs") Signed-off-by: Gwan-gyeong Mun <gwan-gyeong.mun(a)intel.com> Cc: Oscar Salvador <osalvador(a)suse.de> Cc: Hyeonggon Yoo <42.hyeyoo(a)gmail.com> Cc: Byungchul Park <byungchul(a)sk.com> Cc: Dave Hansen <dave.hansen(a)linux.intel.com> Cc: Andy Lutomirski <luto(a)kernel.org> Cc: Peter Zijlstra <peterz(a)infradead.org> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- arch/x86/mm/init_64.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) --- a/arch/x86/mm/init_64.c~x86-vmemmap-use-direct-mapped-va-instead-of-vmemmap-based-va +++ a/arch/x86/mm/init_64.c @@ -844,6 +844,17 @@ void __init paging_init(void) */ static unsigned long unused_pmd_start __meminitdata; +static void * __meminit vmemmap_va_to_kaddr(unsigned long vmemmap_va) +{ + void *kaddr = (void *)vmemmap_va; + pgd_t *pgd = __va(read_cr3_pa()); + + if (init_mm.pgd != pgd) + kaddr = __va(slow_virt_to_phys(kaddr)); + + return kaddr; +} + static void __meminit vmemmap_flush_unused_pmd(void) { if (!unused_pmd_start) @@ -851,7 +862,7 @@ static void __meminit vmemmap_flush_unus /* * Clears (unused_pmd_start, PMD_END] */ - memset((void *)unused_pmd_start, PAGE_UNUSED, + memset(vmemmap_va_to_kaddr(unused_pmd_start), PAGE_UNUSED, ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start); unused_pmd_start = 0; } @@ -882,7 +893,7 @@ static void __meminit __vmemmap_use_sub_ * case the first memmap never gets initialized e.g., because the memory * block never gets onlined). 
*/ - memset((void *)start, 0, sizeof(struct page)); + memset(vmemmap_va_to_kaddr(start), 0, sizeof(struct page)); } static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end) @@ -924,7 +935,7 @@ static void __meminit vmemmap_use_new_su * Mark with PAGE_UNUSED the unused parts of the new memmap range */ if (!IS_ALIGNED(start, PMD_SIZE)) - memset((void *)page, PAGE_UNUSED, start - page); + memset(vmemmap_va_to_kaddr(page), PAGE_UNUSED, start - page); /* * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of _ Patches currently in -mm which might be from gwan-gyeong.mun(a)intel.com are x86-vmemmap-use-direct-mapped-va-instead-of-vmemmap-based-va.patch

3 months

3
5
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror February 2025