- Linux-stable-mirror - lists.linaro.org

[PATCH 6.12 01/49] xfrm: delete x->tunnel as we delete x

by Greg Kroah-Hartman

6.12-stable review patch. If anyone has any objections, please let me know. ------------------ From: Sabrina Dubroca <sd(a)queasysnail.net> [ Upstream commit b441cf3f8c4b8576639d20c8eb4aa32917602ecd ] The ipcomp fallback tunnels currently get deleted (from the various lists and hashtables) as the last user state that needed that fallback is destroyed (not deleted). If a reference to that user state still exists, the fallback state will remain on the hashtables/lists, triggering the WARN in xfrm_state_fini. Because of those remaining references, the fix in commit f75a2804da39 ("xfrm: destroy xfrm_state synchronously on net exit path") is not complete. We recently fixed one such situation in TCP due to deferred freeing of skbs (commit 9b6412e6979f ("tcp: drop secpath at the same time as we currently drop dst")). This can also happen due to IP reassembly: skbs with a secpath remain on the reassembly queue until netns destruction. If we can't guarantee that the queues are flushed by the time xfrm_state_fini runs, there may still be references to a (user) xfrm_state, preventing the timely deletion of the corresponding fallback state. Instead of chasing each instance of skbs holding a secpath one by one, this patch fixes the issue directly within xfrm, by deleting the fallback state as soon as the last user state depending on it has been deleted. Destruction will still happen when the final reference is dropped. A separate lockdep class for the fallback state is required since we're going to lock x->tunnel while x is locked. 
Fixes: 9d4139c76905 ("netns xfrm: per-netns xfrm_state_all list") Signed-off-by: Sabrina Dubroca <sd(a)queasysnail.net> Signed-off-by: Steffen Klassert <steffen.klassert(a)secunet.com> Signed-off-by: Sasha Levin <sashal(a)kernel.org> --- include/net/xfrm.h | 1 - net/ipv4/ipcomp.c | 2 ++ net/ipv6/ipcomp6.c | 2 ++ net/ipv6/xfrm6_tunnel.c | 2 +- net/xfrm/xfrm_ipcomp.c | 1 - net/xfrm/xfrm_state.c | 19 ++++++++----------- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index caaff61601a07..d51204041bf7d 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -424,7 +424,6 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); -void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { struct module *owner; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 5a4fb2539b08b..9a45aed508d19 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -54,6 +54,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info) } /* We always hold one tunnel user reference to indicate a tunnel */ +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -62,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPIP; t->id.spi = x->props.saddr.a4; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 72d4858dec18a..8607569de34f3 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -71,6 +71,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -79,6 +80,7 @@ 
static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPV6; t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index bf140ef781c1f..7fd8bc08e6eb1 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,8 +334,8 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_flush_gc(); xfrm_state_flush(net, 0, false, true); + xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 9c0fa0e1786a2..f2e70e918f114 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -315,7 +315,6 @@ void ipcomp_destroy(struct xfrm_state *x) struct ipcomp_data *ipcd = x->data; if (!ipcd) return; - xfrm_state_delete_tunnel(x); mutex_lock(&ipcomp_resource_mutex); ipcomp_free_data(ipcd); mutex_unlock(&ipcomp_resource_mutex); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f8cb033f102ed..e4500d481e26b 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -748,6 +748,7 @@ void __xfrm_state_destroy(struct xfrm_state *x, bool sync) } EXPORT_SYMBOL(__xfrm_state_destroy); +static void xfrm_state_delete_tunnel(struct xfrm_state *x); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -775,6 +776,8 @@ int __xfrm_state_delete(struct xfrm_state *x) xfrm_dev_state_delete(x); + xfrm_state_delete_tunnel(x); + /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. 
@@ -878,10 +881,7 @@ int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync) err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); - if (sync) - xfrm_state_put_sync(x); - else - xfrm_state_put(x); + xfrm_state_put(x); if (!err) cnt++; @@ -3008,20 +3008,17 @@ void xfrm_flush_gc(void) } EXPORT_SYMBOL(xfrm_flush_gc); -/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ -void xfrm_state_delete_tunnel(struct xfrm_state *x) +static void xfrm_state_delete_tunnel(struct xfrm_state *x) { if (x->tunnel) { struct xfrm_state *t = x->tunnel; - if (atomic_read(&t->tunnel_users) == 2) + if (atomic_dec_return(&t->tunnel_users) == 1) xfrm_state_delete(t); - atomic_dec(&t->tunnel_users); - xfrm_state_put_sync(t); + xfrm_state_put(t); x->tunnel = NULL; } } -EXPORT_SYMBOL(xfrm_state_delete_tunnel); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { @@ -3221,8 +3218,8 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - flush_work(&xfrm_state_gc_work); xfrm_state_flush(net, 0, false, true); + flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); -- 2.51.0

8 hours, 23 minutes

1
0
0 0

Re: Patch "dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope" has been added to the 6.17-stable tree

by Nathan Chancellor

On Sun, Dec 07, 2025 at 10:07:49PM -0500, Sasha Levin wrote: > This is a note to let you know that I've just added the patch titled > > dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope > > to the 6.17-stable tree which can be found at: > http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… > > The filename of the patch is: > dma-mapping-allow-use-of-dma_bit_mask-64-in-global-s.patch > and it can be found in the queue-6.17 subdirectory. > > If you, or anyone else, feels it should not be added to the stable tree, > please let <stable(a)vger.kernel.org> know about it. > > > > commit 51a1912d46ceb70602de50c167e440966b9836f1 > Author: James Clark <james.clark(a)linaro.org> > Date: Thu Oct 30 14:05:27 2025 +0000 > > dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope > > [ Upstream commit a50f7456f853ec3a6f07cbe1d16ad8a8b2501320 ] This change has a pending bug fix, consider waiting to apply this to the stable trees. https://lore.kernel.org/20251207184756.97904-1-johannes.goede@oss.qualcomm.… Cheers, Nathan

8 hours, 27 minutes

2
1
0 0

[PATCH AUTOSEL 6.18-5.10] serial: sprd: Return -EPROBE_DEFER when uart clock is not ready

by Sasha Levin

From: Wenhua Lin <Wenhua.Lin(a)unisoc.com> [ Upstream commit 29e8a0c587e328ed458380a45d6028adf64d7487 ] In sprd_clk_init(), when devm_clk_get() returns -EPROBE_DEFER for either uart or source clock, we should propagate the error instead of just warning and continuing with NULL clocks. Currently the driver only emits a warning when clock acquisition fails and proceeds with NULL clock pointers. This can lead to issues later when the clocks are actually needed. More importantly, when the clock provider is not ready yet and returns -EPROBE_DEFER, we should return this error to allow deferred probing. This change adds explicit checks for -EPROBE_DEFER after both: 1. devm_clk_get(uport->dev, uart) 2. devm_clk_get(uport->dev, source) When -EPROBE_DEFER is encountered, the function now returns -EPROBE_DEFER to let the driver framework retry probing later when the clock dependencies are resolved. Signed-off-by: Wenhua Lin <Wenhua.Lin(a)unisoc.com> Link: https://patch.msgid.link/20251022030840.956589-1-Wenhua.Lin@unisoc.com Reviewed-by: Cixi Geng <cixi.geng(a)linux.dev> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> Signed-off-by: Sasha Levin <sashal(a)kernel.org> --- LLM Generated explanations, may be completely bogus: ## Commit Analysis: serial: sprd: Return -EPROBE_DEFER when uart clock is not ready ### 1. COMMIT MESSAGE ANALYSIS **Subject**: Fixes incorrect handling of `-EPROBE_DEFER` error from `devm_clk_get()` **Key observations**: - No `Cc: stable(a)vger.kernel.org` tag present - No `Fixes:` tag pointing to when the bug was introduced - Has `Reviewed-by:` from Cixi Geng - Accepted by Greg Kroah-Hartman (serial subsystem maintainer) ### 2. 
CODE CHANGE ANALYSIS The change adds two checks in `sprd_clk_init()`: ```c clk_uart = devm_clk_get(uport->dev, "uart"); if (IS_ERR(clk_uart)) { + if (PTR_ERR(clk_uart) == -EPROBE_DEFER) + return -EPROBE_DEFER; dev_warn(...); clk_uart = NULL; } clk_parent = devm_clk_get(uport->dev, "source"); if (IS_ERR(clk_parent)) { + if (PTR_ERR(clk_parent) == -EPROBE_DEFER) + return -EPROBE_DEFER; dev_warn(...); clk_parent = NULL; } ``` **Technical bug mechanism**: When clock providers aren't ready yet, `devm_clk_get()` returns `-EPROBE_DEFER`. The existing code ignores this error, sets the clock pointer to NULL, and continues. This bypasses the kernel's deferred probing mechanism which exists precisely to handle this dependency ordering scenario. **Existing pattern**: The function already has identical handling for the "enable" clock (visible in the context lines). This fix makes the handling consistent for all three clocks. ### 3. CLASSIFICATION **Type**: Bug fix (not a feature addition) This fixes incorrect error handling. The `-EPROBE_DEFER` mechanism is a fundamental kernel feature for handling driver load order dependencies. Not propagating this error is a bug. ### 4. SCOPE AND RISK ASSESSMENT - **Lines changed**: 6 lines added (two 3-line checks) - **Files touched**: 1 file (`drivers/tty/serial/sprd_serial.c`) - **Complexity**: Very low - follows identical pattern already in the same function - **Subsystem**: Hardware-specific serial driver for Spreadtrum/Unisoc UARTs **Risk**: Very low - Pattern is already established and proven in the same function - Only affects error handling during probe - No changes to normal operation when clocks are available - Worst case: probe failure happens earlier/more explicitly ### 5. 
USER IMPACT **Affected users**: Users of Spreadtrum/Unisoc UART hardware (embedded devices, some Android phones) **Severity**: Medium-High for affected users - without this fix, the serial port may not work correctly on systems where clock providers load after the serial driver. This is common in embedded systems with device tree-based configurations. **Real bug**: This is a practical issue in probe ordering scenarios. The driver would proceed with NULL clocks instead of waiting for dependencies to be ready. ### 6. STABILITY INDICATORS - Reviewed-by tag indicates code review - Maintainer accepted the change - Simple, straightforward change following existing code patterns - No complex logic introduced ### 7. DEPENDENCY CHECK - **Dependencies**: None - self-contained fix - **Code existence in stable**: The sprd_serial driver has existed since ~2015 (commit 3e1f2029a4b40), so it's present in all active stable trees ### SUMMARY **What it fixes**: Incorrect handling of `-EPROBE_DEFER` from `devm_clk_get()` for two clocks ("uart" and "source"), causing the driver to proceed with NULL clocks instead of deferring probe when clock providers aren't ready. **Stable kernel criteria**: - ✅ Obviously correct (follows identical pattern already in function) - ✅ Fixes a real bug (broken deferred probing) - ✅ Small and contained (6 lines, 1 file) - ✅ No new features - ✅ Low risk of regression **Concerns**: - No explicit `Cc: stable` tag (author/maintainer didn't flag for stable) - Relatively niche driver (Spreadtrum hardware) **Risk vs Benefit**: The fix is minimal risk (identical pattern to existing code) and addresses a real bug that could leave serial hardware non-functional on embedded systems. The benefit outweighs the minimal risk. The lack of `Cc: stable` tag is notable but not determinative - this is a straightforward bug fix that meets all stable criteria. 
The fix is small, obviously correct, and addresses a real probe ordering issue that embedded system users could encounter. **YES** drivers/tty/serial/sprd_serial.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/tty/serial/sprd_serial.c b/drivers/tty/serial/sprd_serial.c index 8c9366321f8e7..092755f356836 100644 --- a/drivers/tty/serial/sprd_serial.c +++ b/drivers/tty/serial/sprd_serial.c @@ -1133,6 +1133,9 @@ static int sprd_clk_init(struct uart_port *uport) clk_uart = devm_clk_get(uport->dev, "uart"); if (IS_ERR(clk_uart)) { + if (PTR_ERR(clk_uart) == -EPROBE_DEFER) + return -EPROBE_DEFER; + dev_warn(uport->dev, "uart%d can't get uart clock\n", uport->line); clk_uart = NULL; @@ -1140,6 +1143,9 @@ static int sprd_clk_init(struct uart_port *uport) clk_parent = devm_clk_get(uport->dev, "source"); if (IS_ERR(clk_parent)) { + if (PTR_ERR(clk_parent) == -EPROBE_DEFER) + return -EPROBE_DEFER; + dev_warn(uport->dev, "uart%d can't get source clock\n", uport->line); clk_parent = NULL; -- 2.51.0

9 hours, 18 minutes

1
7
0 0

[PATCH v2] media: dw2102: validate I2C messages in su3000_i2c_transfer()

by Dharanitharan R

syzbot reports a general protection fault caused by su3000_i2c_transfer() dereferencing msg->buf without validating the message length or buffer pointer. Although i2c-dev blocks zero-length messages, malformed I²C messages can still reach the driver through the DVB USB subsystem. Add strict validation of each message to prevent NULL-pointer dereferences. Reported-by: syzbot+d99f3a288cc7d8ef60fb(a)syzkaller.appspotmail.com Fixes: 0e148a522b84 ("media: dw2102: Don't translate i2c read into write") Closes: https://syzkaller.appspot.com/bug?extid=d99f3a288cc7d8ef60fb Cc: stable(a)vger.kernel.org Signed-off-by: Dharanitharan R <dharanitharan725(a)gmail.com> --- drivers/media/usb/dvb-usb/dw2102.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/media/usb/dvb-usb/dw2102.c b/drivers/media/usb/dvb-usb/dw2102.c index 4fecf2f965e9..0dd210ea16f3 100644 --- a/drivers/media/usb/dvb-usb/dw2102.c +++ b/drivers/media/usb/dvb-usb/dw2102.c @@ -733,6 +733,36 @@ static int su3000_i2c_transfer(struct i2c_adapter *adap, struct i2c_msg msg[], return -EAGAIN; } + /* Validate incoming I²C messages */ + if (!msg || num <= 0) { + mutex_unlock(&d->data_mutex); + mutex_unlock(&d->i2c_mutex); + return -EINVAL; + } + + for (j = 0; j < num; j++) { + /* msg buffer must exist */ + if (!msg[j].buf) { + mutex_unlock(&d->data_mutex); + mutex_unlock(&d->i2c_mutex); + return -EINVAL; + } + + /* zero or negative length is invalid */ + if (msg[j].len <= 0) { + mutex_unlock(&d->data_mutex); + mutex_unlock(&d->i2c_mutex); + return -EINVAL; + } + + /* protect against unreasonable sizes */ + if (msg[j].len > 256) { + mutex_unlock(&d->data_mutex); + mutex_unlock(&d->i2c_mutex); + return -EOPNOTSUPP; + } + } + j = 0; while (j < num) { switch (msg[j].addr) { -- 2.43.0

9 hours, 57 minutes

1
0
0 0

[PATCH] drm/nouveau/dispnv50: Don't call drm_atomic_get_crtc_state() in prepare_fb

by Lyude Paul

Since we recently started warning about uses of this function after the atomic check phase completes, we've started getting warnings about this in nouveau. It appears a misplaced drm_atomic_get_crtc_state() call has been hiding in our .prepare_fb callback for a while. So, fix this by adding a new nv50_head_atom_get_new() function and use that in our .prepare_fb callback instead. Signed-off-by: Lyude Paul <lyude(a)redhat.com> Fixes: 1590700d94ac ("drm/nouveau/kms/nv50-: split each resource type into their own source files") Cc: <stable(a)vger.kernel.org> # v4.18+ Signed-off-by: Lyude Paul <lyude(a)redhat.com> --- drivers/gpu/drm/nouveau/dispnv50/atom.h | 13 +++++++++++++ drivers/gpu/drm/nouveau/dispnv50/wndw.c | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/dispnv50/atom.h b/drivers/gpu/drm/nouveau/dispnv50/atom.h index 93f8f4f645784..85b7cf70d13c4 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/atom.h +++ b/drivers/gpu/drm/nouveau/dispnv50/atom.h @@ -152,8 +152,21 @@ static inline struct nv50_head_atom * nv50_head_atom_get(struct drm_atomic_state *state, struct drm_crtc *crtc) { struct drm_crtc_state *statec = drm_atomic_get_crtc_state(state, crtc); + if (IS_ERR(statec)) return (void *)statec; + + return nv50_head_atom(statec); +} + +static inline struct nv50_head_atom * +nv50_head_atom_get_new(struct drm_atomic_state *state, struct drm_crtc *crtc) +{ + struct drm_crtc_state *statec = drm_atomic_get_new_crtc_state(state, crtc); + + if (IS_ERR(statec)) + return (void*)statec; + return nv50_head_atom(statec); } diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c index ef9e410babbfb..9a2c20fce0f3e 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c @@ -583,7 +583,7 @@ nv50_wndw_prepare_fb(struct drm_plane *plane, struct drm_plane_state *state) asyw->image.offset[0] = nvbo->offset; if (wndw->func->prepare) { - asyh = 
nv50_head_atom_get(asyw->state.state, asyw->state.crtc); + asyh = nv50_head_atom_get_new(asyw->state.state, asyw->state.crtc); if (IS_ERR(asyh)) return PTR_ERR(asyh); -- 2.52.0

10 hours, 52 minutes

3
2
0 0

[PATCH v2] powerpc: Add reloc_offset() to font bitmap pointer used for bootx_printf()

by Finn Thain

Since Linux v6.7, booting using BootX on an Old World PowerMac produces an early crash. Stan Johnson writes, "the symptoms are that the screen goes blank and the backlight stays on, and the system freezes (Linux doesn't boot)." Further testing revealed that the failure can be avoided by disabling CONFIG_BOOTX_TEXT. Bisection revealed that the regression was caused by a change to the font bitmap pointer that's used when btext_init() begins painting characters on the display, early in the boot process. Christophe Leroy explains, "before kernel text is relocated to its final location ... data is addressed with an offset which is added to the Global Offset Table (GOT) entries at the start of bootx_init() by function reloc_got2(). But the pointers that are located inside a structure are not referenced in the GOT and are therefore not updated by reloc_got2(). It is therefore needed to apply the offset manually by using PTRRELOC() macro." Cc: Cedar Maxwell <cedarmaxwell(a)mac.com> Cc: Stan Johnson <userm57(a)yahoo.com> Cc: "Dr. David Alan Gilbert" <linux(a)treblig.org> Cc: Benjamin Herrenschmidt <benh(a)kernel.crashing.org> Cc: stable(a)vger.kernel.org Link: https://lists.debian.org/debian-powerpc/2025/10/msg00111.html Link: https://lore.kernel.org/linuxppc-dev/d81ddca8-c5ee-d583-d579-02b19ed95301@y… Reported-by: Cedar Maxwell <cedarmaxwell(a)mac.com> Closes: https://lists.debian.org/debian-powerpc/2025/09/msg00031.html Bisected-by: Stan Johnson <userm57(a)yahoo.com> Tested-by: Stan Johnson <userm57(a)yahoo.com> Fixes: 0ebc7feae79a ("powerpc: Use shared font data") Suggested-by: Christophe Leroy <christophe.leroy(a)csgroup.eu> Signed-off-by: Finn Thain <fthain(a)linux-m68k.org> --- Changed since v1: - Improved commit log entry to better explain the need for PTRRELOC(). 
--- arch/powerpc/kernel/btext.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 7f63f1cdc6c3..ca00c4824e31 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -20,6 +20,7 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/udbg.h> +#include <asm/setup.h> #define NO_SCROLL @@ -463,7 +464,7 @@ static noinline void draw_byte(unsigned char c, long locX, long locY) { unsigned char *base = calc_base(locX << 3, locY << 4); unsigned int font_index = c * 16; - const unsigned char *font = font_sun_8x16.data + font_index; + const unsigned char *font = PTRRELOC(font_sun_8x16.data) + font_index; int rb = dispDeviceRowBytes; rmci_maybe_on(); -- 2.49.1

11 hours, 54 minutes

3
3
0 0

[PATCH AUTOSEL 6.18-6.17] functionfs: fix the open/removal races

by Sasha Levin

From: Al Viro <viro(a)zeniv.linux.org.uk> [ Upstream commit e5bf5ee266633cb18fff6f98f0b7d59a62819eee ] ffs_epfile_open() can race with removal, ending up with file->private_data pointing to freed object. There is a total count of opened files on functionfs (both ep0 and dynamic ones) and when it hits zero, dynamic files get removed. Unfortunately, that removal can happen while another thread is in ffs_epfile_open(), but has not incremented the count yet. In that case open will succeed, leaving us with UAF on any subsequent read() or write(). The root cause is that ffs->opened is misused; atomic_dec_and_test() vs. atomic_add_return() is not a good idea, when object remains visible all along. To untangle that * serialize openers on ffs->mutex (both for ep0 and for dynamic files) * have dynamic ones use atomic_inc_not_zero() and fail if we had zero ->opened; in that case the file we are opening is doomed. * have the inodes of dynamic files marked on removal (from the callback of simple_recursive_removal()) - clear ->i_private there. * have open of dynamic ones verify they hadn't been already removed, along with checking that state is FFS_ACTIVE. Reviewed-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk> Signed-off-by: Sasha Levin <sashal(a)kernel.org> --- LLM Generated explanations, may be completely bogus: ## Summary ### What the Commit Fixes This commit fixes a **Use-After-Free (UAF)** vulnerability in USB gadget functionfs. The race condition occurs between `ffs_epfile_open()` and file removal: 1. Thread A is in `ffs_epfile_open()` but hasn't incremented `ffs->opened` yet 2. Thread B closes the last file handle, triggering removal (since `opened` is 0) 3. Thread A's open succeeds with `file->private_data` pointing to freed memory 4. 
Subsequent `read()`/`write()` operations cause UAF ### Fix Mechanism The fix implements proper synchronization: - Serializes openers using `ffs->mutex` - Uses `atomic_inc_not_zero()` to fail if counter already zero - Uses `smp_load_acquire()`/`smp_store_release()` for memory ordering - Clears `i_private` during removal via `simple_recursive_removal()` callback - Uses `file->private_data` instead of `inode->i_private` in release path ### Stable Kernel Criteria Assessment | Criterion | Assessment | |-----------|------------| | Obviously correct | ✅ Uses standard kernel primitives, proper locking patterns | | Fixes real bug | ✅ UAF vulnerability, security-relevant | | Important issue | ✅ Security bug, potential for crashes/exploitation | | Small and contained | ✅ Single file, +43/-10 lines, localized changes | | No new features | ✅ Pure bug fix, no new APIs | ### Risk vs Benefit **Benefits:** - Fixes serious UAF vulnerability - USB gadget functionfs used in Android, embedded systems - Reviewed by Greg Kroah-Hartman (USB maintainer, stable maintainer) - Written by Al Viro (highly respected kernel developer) **Risks:** - Moderate complexity (changes locking behavior) - Recent commit (November 2025), limited mainline soak time - No explicit `Cc: stable(a)vger.kernel.org` tag ### Dependencies - `ffs_mutex_lock()` - exists in functionfs since early versions - `simple_recursive_removal()` with callback - available since ~5.x kernels - Standard kernel APIs (`atomic_inc_not_zero`, memory barriers) - universally available ### Concerns 1. **No Fixes: tag** - Makes it harder to determine which stable trees need this fix 2. **No Cc: stable tag** - May indicate maintainers wanted soak time, or an oversight given Greg KH reviewed it 3. **Backport effort** - May need adjustment for older stable trees depending on functionfs evolution ### Conclusion This is a legitimate UAF security fix that affects real-world users (Android, embedded USB gadgets). 
Despite moderate complexity, the fix: - Addresses a serious vulnerability class (UAF) - Uses correct synchronization patterns - Has been reviewed by the appropriate maintainer who also maintains stable trees - Is self-contained with no feature additions The lack of explicit stable tags appears to be an oversight given the security nature of the bug and Greg KH's review. UAF vulnerabilities typically warrant expedited backporting. **YES** drivers/usb/gadget/function/f_fs.c | 53 ++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 47cfbe41fdff8..69f6e3c0f7e00 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -640,13 +640,22 @@ static ssize_t ffs_ep0_read(struct file *file, char __user *buf, static int ffs_ep0_open(struct inode *inode, struct file *file) { - struct ffs_data *ffs = inode->i_private; + struct ffs_data *ffs = inode->i_sb->s_fs_info; + int ret; - if (ffs->state == FFS_CLOSING) - return -EBUSY; + /* Acquire mutex */ + ret = ffs_mutex_lock(&ffs->mutex, file->f_flags & O_NONBLOCK); + if (ret < 0) + return ret; - file->private_data = ffs; ffs_data_opened(ffs); + if (ffs->state == FFS_CLOSING) { + ffs_data_closed(ffs); + mutex_unlock(&ffs->mutex); + return -EBUSY; + } + mutex_unlock(&ffs->mutex); + file->private_data = ffs; return stream_open(inode, file); } @@ -1193,14 +1202,33 @@ static ssize_t ffs_epfile_io(struct file *file, struct ffs_io_data *io_data) static int ffs_epfile_open(struct inode *inode, struct file *file) { - struct ffs_epfile *epfile = inode->i_private; + struct ffs_data *ffs = inode->i_sb->s_fs_info; + struct ffs_epfile *epfile; + int ret; - if (WARN_ON(epfile->ffs->state != FFS_ACTIVE)) + /* Acquire mutex */ + ret = ffs_mutex_lock(&ffs->mutex, file->f_flags & O_NONBLOCK); + if (ret < 0) + return ret; + + if (!atomic_inc_not_zero(&ffs->opened)) { + mutex_unlock(&ffs->mutex); + 
return -ENODEV; + } + /* + * we want the state to be FFS_ACTIVE; FFS_ACTIVE alone is + * not enough, though - we might have been through FFS_CLOSING + * and back to FFS_ACTIVE, with our file already removed. + */ + epfile = smp_load_acquire(&inode->i_private); + if (unlikely(ffs->state != FFS_ACTIVE || !epfile)) { + mutex_unlock(&ffs->mutex); + ffs_data_closed(ffs); return -ENODEV; + } + mutex_unlock(&ffs->mutex); file->private_data = epfile; - ffs_data_opened(epfile->ffs); - return stream_open(inode, file); } @@ -1332,7 +1360,7 @@ static void ffs_dmabuf_put(struct dma_buf_attachment *attach) static int ffs_epfile_release(struct inode *inode, struct file *file) { - struct ffs_epfile *epfile = inode->i_private; + struct ffs_epfile *epfile = file->private_data; struct ffs_dmabuf_priv *priv, *tmp; struct ffs_data *ffs = epfile->ffs; @@ -2352,6 +2380,11 @@ static int ffs_epfiles_create(struct ffs_data *ffs) return 0; } +static void clear_one(struct dentry *dentry) +{ + smp_store_release(&dentry->d_inode->i_private, NULL); +} + static void ffs_epfiles_destroy(struct ffs_epfile *epfiles, unsigned count) { struct ffs_epfile *epfile = epfiles; @@ -2359,7 +2392,7 @@ static void ffs_epfiles_destroy(struct ffs_epfile *epfiles, unsigned count) for (; count; --count, ++epfile) { BUG_ON(mutex_is_locked(&epfile->mutex)); if (epfile->dentry) { - simple_recursive_removal(epfile->dentry, NULL); + simple_recursive_removal(epfile->dentry, clear_one); epfile->dentry = NULL; } } -- 2.51.0

12 hours, 4 minutes

1
19
0 0

回复:6.12.50 regression: netem: cannot mix duplicating netems with other netems in tree.

by zyc zyc

Hello, Resending my last email without HTML. ---- zyc zyc <zyc199902(a)zohomail.cn> 在 Sat, 2025-11-29 18:57:01 写到：--- > Hello, maintainer > > I would like to report what appears to be a regression in the 6.12.50 kernel release related to netem. > It rejects our configuration with the message: > Error: netem: cannot mix duplicating netems with other netems in tree. > > This breaks setups that previously worked correctly for many years. > > > Our team uses multiple netem qdiscs in the same HTB branch, arranged in a parallel fashion using a prio fan-out. Each branch of the prio qdisc has its own distinct netem instance with different duplication characteristics. > > This is used to emulate our production conditions where a single logical path fans out into two downstream segments, for example: > > two ECMP next hops with different misbehaviour characteristics, or > > > an HA firewall cluster where only one node is replaying frames, or > > > two LAG / ToR paths where one path intermittently duplicates packets. > > > In our environments, only a subset of flows are affected, and different downstream devices may cause different styles of duplication. > This regression breaks existing automated tests, training environments, and network simulation pipelines. > > I would be happy to provide our reproducer if needed. > > Thank you for your time and for maintaining the Linux kernel. > > > > Best regards, > zyc > > >

12 hours, 22 minutes

3
6
0 0

[PATCH v2] pmdomain: imx: Fix reference count leak in imx_gpc_probe()

by Wentao Liang

of_get_child_by_name() returns a node pointer with refcount incremented, so we should use the __free() attribute to manage the pgc_node reference. This ensures automatic of_node_put() cleanup when pgc_node goes out of scope, eliminating the need for explicit error handling paths and avoiding reference count leaks. Fixes: 721cabf6c660 ("soc: imx: move PGC handling to a new GPC driver") Cc: stable(a)vger.kernel.org Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn> --- Change in V2: - Use __free() attribute instead of explicit of_node_put() calls --- drivers/pmdomain/imx/gpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pmdomain/imx/gpc.c b/drivers/pmdomain/imx/gpc.c index f18c7e6e75dd..89d5d68c055d 100644 --- a/drivers/pmdomain/imx/gpc.c +++ b/drivers/pmdomain/imx/gpc.c @@ -403,7 +403,7 @@ static int imx_gpc_old_dt_init(struct device *dev, struct regmap *regmap, static int imx_gpc_probe(struct platform_device *pdev) { const struct imx_gpc_dt_data *of_id_data = device_get_match_data(&pdev->dev); - struct device_node *pgc_node; + struct device_node *pgc_node __free(pgc_node); struct regmap *regmap; void __iomem *base; int ret; -- 2.34.1

12 hours, 31 minutes

3
2
0 0

[PATCH] media: cx23885: Add missing unmap in snd_cx23885_hw_params()

by Haoxiang Li

In the error path, add cx23885_alsa_dma_unmap() to release the resource acquired by cx23885_alsa_dma_map(). Fixes: 9529a4b0cf49 ("[media] cx23885: drop videobuf abuse in cx23885-alsa") Cc: stable(a)vger.kernel.org Signed-off-by: Haoxiang Li <lihaoxiang(a)isrc.iscas.ac.cn> --- drivers/media/pci/cx23885/cx23885-alsa.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/pci/cx23885/cx23885-alsa.c b/drivers/media/pci/cx23885/cx23885-alsa.c index 25dc8d4dc5b7..717fc6c9ef21 100644 --- a/drivers/media/pci/cx23885/cx23885-alsa.c +++ b/drivers/media/pci/cx23885/cx23885-alsa.c @@ -392,8 +392,10 @@ static int snd_cx23885_hw_params(struct snd_pcm_substream *substream, ret = cx23885_risc_databuffer(chip->pci, &buf->risc, buf->sglist, chip->period_size, chip->num_periods, 1); - if (ret < 0) + if (ret < 0) { + cx23885_alsa_dma_unmap(chip); goto error; + } /* Loop back to start of program */ buf->risc.jmp[0] = cpu_to_le32(RISC_JUMP|RISC_IRQ1|RISC_CNT_INC); -- 2.25.1

12 hours, 51 minutes

1
0
0 0

[PATCH] media: cx88: Add missing unmap in snd_cx88_hw_params()

by Haoxiang Li

In error path, add cx88_alsa_dma_unmap() to release resource acquired by cx88_alsa_dma_map(). Fixes: b2c75abde0de ("[media] cx88: drop videobuf abuse in cx88-alsa") Cc: stable(a)vger.kernel.org Signed-off-by: Haoxiang Li <lihaoxiang(a)isrc.iscas.ac.cn> --- drivers/media/pci/cx88/cx88-alsa.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/media/pci/cx88/cx88-alsa.c b/drivers/media/pci/cx88/cx88-alsa.c index 29fb1311e443..4e574d8390b4 100644 --- a/drivers/media/pci/cx88/cx88-alsa.c +++ b/drivers/media/pci/cx88/cx88-alsa.c @@ -483,8 +483,10 @@ static int snd_cx88_hw_params(struct snd_pcm_substream *substream, ret = cx88_risc_databuffer(chip->pci, &buf->risc, buf->sglist, chip->period_size, chip->num_periods, 1); - if (ret < 0) + if (ret < 0) { + cx88_alsa_dma_unmap(chip); goto error; + } /* Loop back to start of program */ buf->risc.jmp[0] = cpu_to_le32(RISC_JUMP | RISC_IRQ1 | RISC_CNT_INC); -- 2.25.1

12 hours, 59 minutes

1
0
0 0

[PATCH] media: cx25821: Add missing unmap in snd_cx25821_hw_params()

by Haoxiang Li

In error path, add cx25821_alsa_dma_unmap() to release the resource acquired by cx25821_alsa_dma_map() Fixes: b2c75abde0de ("[media] cx88: drop videobuf abuse in cx88-alsa") Cc: stable(a)vger.kernel.org Signed-off-by: Haoxiang Li <lihaoxiang(a)isrc.iscas.ac.cn> --- drivers/media/pci/cx25821/cx25821-alsa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/pci/cx25821/cx25821-alsa.c b/drivers/media/pci/cx25821/cx25821-alsa.c index a42f0c03a7ca..f463365163b7 100644 --- a/drivers/media/pci/cx25821/cx25821-alsa.c +++ b/drivers/media/pci/cx25821/cx25821-alsa.c @@ -535,6 +535,7 @@ static int snd_cx25821_hw_params(struct snd_pcm_substream *substream, chip->period_size, chip->num_periods, 1); if (ret < 0) { pr_info("DEBUG: ERROR after cx25821_risc_databuffer_audio()\n"); + cx25821_alsa_dma_unmap(chip); goto error; } -- 2.25.1

13 hours, 7 minutes

1
0
0 0

[PATCH 1/2] scsi: sd: fix write_same(16/10) to enable sector size > PAGE_SIZE

by sw.prabhu6＠gmail.com

From: Swarna Prabhu <sw.prabhu6(a)gmail.com> The WRITE SAME(16) and WRITE SAME(10) scsi commands uses a page from a dedicated mempool('sd_page_pool') for its payload. This pool was initialized to allocate single pages, which was sufficient as long as the device sector size did not exceed the PAGE_SIZE. Given that block layer now supports block size upto 64K ie beyond PAGE_SIZE, adapt sd_set_special_bvec() to accommodate that. With the above fix, enable sector sizes > PAGE_SIZE in scsi sd driver. Cc: stable(a)vger.kernel.org Signed-off-by: Swarna Prabhu <s.prabhu(a)samsung.com> Co-developed-by: Pankaj Raghav <p.raghav(a)samsung.com> Signed-off-by: Pankaj Raghav <p.raghav(a)samsung.com> --- Note: We are allocating pages of order aligned to BLK_MAX_BLOCK_SIZE for the mempool page allocator 'sd_page_pool' all the time. This is because we only know that a bigger sector size device is attached at sd_probe and it might be too late to reallocate mempool with order >0. drivers/scsi/sd.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 0252d3f6bed1..17b5c1589eb2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -892,14 +892,24 @@ static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, (logical_block_size >> SECTOR_SHIFT); } -static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) +static void *sd_set_special_bvec(struct scsi_cmnd *cmd, unsigned int data_len) { struct page *page; + struct request *rq = scsi_cmd_to_rq(cmd); + struct scsi_device *sdp = cmd->device; + unsigned sector_size = sdp->sector_size; + unsigned int nr_pages = DIV_ROUND_UP(sector_size, PAGE_SIZE); + int n = 0; page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!page) return NULL; - clear_highpage(page); + + do { + clear_highpage(page + n); + n++; + } while (n < nr_pages); + bvec_set_page(&rq->special_vec, page, data_len, 0); rq->rq_flags |= RQF_SPECIAL_PAYLOAD; 
return bvec_virt(&rq->special_vec); @@ -915,7 +925,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) unsigned int data_len = 24; char *buf; - buf = sd_set_special_bvec(rq, data_len); + buf = sd_set_special_bvec(cmd, data_len); if (!buf) return BLK_STS_RESOURCE; @@ -1004,7 +1014,7 @@ static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; - if (!sd_set_special_bvec(rq, data_len)) + if (!sd_set_special_bvec(cmd, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 16; @@ -1031,7 +1041,7 @@ static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; - if (!sd_set_special_bvec(rq, data_len)) + if (!sd_set_special_bvec(cmd, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 10; @@ -2880,10 +2890,7 @@ sd_read_capacity(struct scsi_disk *sdkp, struct queue_limits *lim, "assuming 512.\n"); } - if (sector_size != 512 && - sector_size != 1024 && - sector_size != 2048 && - sector_size != 4096) { + if (blk_validate_block_size(sector_size)) { sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n", sector_size); /* @@ -4368,7 +4375,7 @@ static int __init init_sd(void) if (err) goto err_out; - sd_page_pool = mempool_create_page_pool(SD_MEMPOOL_SIZE, 0); + sd_page_pool = mempool_create_page_pool(SD_MEMPOOL_SIZE, get_order(BLK_MAX_BLOCK_SIZE)); if (!sd_page_pool) { printk(KERN_ERR "sd: can't init discard page pool\n"); err = -ENOMEM; -- 2.51.0

14 hours

2
1
0 0

[PATCH v2] x86/elf: Fix core dump truncation on CPUs with no extended xfeatures

by yongxin.liu＠windriver.com

From: Yongxin Liu <yongxin.liu(a)windriver.com> Zero can be a valid value of num_records. For example, on Intel Atom x6425RE, only x87 and SSE are supported (features 0, 1), and fpu_user_cfg.max_features is 3. The for_each_extended_xfeature() loop only iterates feature 2, which is not enabled, so num_records = 0. This is valid and should not cause core dump failure. The issue is that dump_xsave_layout_desc() returns 0 for both genuine errors (dump_emit() failure) and valid cases (no extended features). Use negative return values for errors and only abort on genuine failures. Cc: stable(a)vger.kernel.org Fixes: ba386777a30b ("x86/elf: Add a new FPU buffer layout info to x86 core files") Signed-off-by: Yongxin Liu <yongxin.liu(a)windriver.com> --- V2: Keep error checking but use negative value for genuine error V1: Remove error checking entirely --- arch/x86/kernel/fpu/xstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 48113c5193aa..76153dfb58c9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1946,7 +1946,7 @@ static int dump_xsave_layout_desc(struct coredump_params *cprm) }; if (!dump_emit(cprm, &xc, sizeof(xc))) - return 0; + return -1; num_records++; } @@ -1984,7 +1984,7 @@ int elf_coredump_extra_notes_write(struct coredump_params *cprm) return 1; num_records = dump_xsave_layout_desc(cprm); - if (!num_records) + if (num_records < 0) return 1; /* Total size should be equal to the number of records */ -- 2.46.2

15 hours, 51 minutes

1
0
0 0

[PATCH] x86/elf: Fix core dump truncation on CPUs with no extended xfeatures

by yongxin.liu＠windriver.com

From: Yongxin Liu <yongxin.liu(a)windriver.com> Zero can be a valid value of num_records. For example, on Intel Atom x6425RE, only x87 and SSE are supported (features 0, 1), and fpu_user_cfg.max_features is 3. The for_each_extended_xfeature() loop only iterates feature 2, which is not enabled, so num_records = 0. This is valid and should not cause core dump failure. The size check already validates consistency: if num_records = 0, then en.n_descsz = 0, so the check passes. Cc: stable(a)vger.kernel.org Fixes: ba386777a30b ("x86/elf: Add a new FPU buffer layout info to x86 core files") Signed-off-by: Yongxin Liu <yongxin.liu(a)windriver.com> --- arch/x86/kernel/fpu/xstate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 48113c5193aa..b1dd30eb21a8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1984,8 +1984,6 @@ int elf_coredump_extra_notes_write(struct coredump_params *cprm) return 1; num_records = dump_xsave_layout_desc(cprm); - if (!num_records) - return 1; /* Total size should be equal to the number of records */ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) -- 2.46.2

15 hours, 58 minutes

3
2
0 0

[PATCH v5 1/6] drm/amdgpu: Fix gfx9 update PTE mtype flag

by Philip Yang

Fix copy&paste error, that should have been an assignment instead of an or, otherwise MTYPE_UC 0x3 can not be updated to MTYPE_RW 0x1. CC stables. cc: stable(a)vger.kernel.org Signed-off-by: Philip Yang <Philip.Yang(a)amd.com> Reviewed-by: Christian König <christian.koenig(a)amd.com> --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 97a04e3171f2..205c34eb8d11 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1204,16 +1204,16 @@ static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev, *flags = AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_NC); break; case AMDGPU_VM_MTYPE_WC: - *flags |= AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_WC); + *flags = AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_WC); break; case AMDGPU_VM_MTYPE_RW: - *flags |= AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_RW); + *flags = AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_RW); break; case AMDGPU_VM_MTYPE_CC: - *flags |= AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_CC); + *flags = AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_CC); break; case AMDGPU_VM_MTYPE_UC: - *flags |= AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_UC); + *flags = AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_UC); break; } -- 2.50.1

16 hours, 10 minutes

1
0
0 0

[PATCH] lib/crypto: riscv: Depend on RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS

by Eric Biggers

Replace the RISCV_ISA_V dependency of the RISC-V crypto code with RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS, which implies RISCV_ISA_V as well as vector unaligned accesses being efficient. This is necessary because this code assumes that vector unaligned accesses are supported and are efficient. (It does so to avoid having to use lots of extra vsetvli instructions to switch the element width back and forth between 8 and either 32 or 64.) This was omitted from the code originally just because the RISC-V kernel support for detecting this feature didn't exist yet. Support has now been added, but it's fragmented into per-CPU runtime detection, a command-line parameter, and a kconfig option. The kconfig option is the only reasonable way to do it, though, so let's just rely on that. Fixes: eb24af5d7a05 ("crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS}") Fixes: bb54668837a0 ("crypto: riscv - add vector crypto accelerated ChaCha20") Fixes: 600a3853dfa0 ("crypto: riscv - add vector crypto accelerated GHASH") Fixes: 8c8e40470ffe ("crypto: riscv - add vector crypto accelerated SHA-{256,224}") Fixes: b3415925a08b ("crypto: riscv - add vector crypto accelerated SHA-{512,384}") Fixes: 563a5255afa2 ("crypto: riscv - add vector crypto accelerated SM3") Fixes: b8d06352bbf3 ("crypto: riscv - add vector crypto accelerated SM4") Cc: stable(a)vger.kernel.org Signed-off-by: Eric Biggers <ebiggers(a)kernel.org> --- arch/riscv/crypto/Kconfig | 12 ++++++++---- lib/crypto/Kconfig | 9 ++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index a75d6325607b..14c5acb935e9 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -2,11 +2,12 @@ menu "Accelerated Cryptographic Algorithms for CPU (riscv)" config CRYPTO_AES_RISCV64 tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && 
TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_ALGAPI select CRYPTO_LIB_AES select CRYPTO_SKCIPHER help Block cipher: AES cipher algorithms @@ -18,21 +19,23 @@ config CRYPTO_AES_RISCV64 - Zvkb vector crypto extension (CTR) - Zvkg vector crypto extension (XTS) config CRYPTO_GHASH_RISCV64 tristate "Hash functions: GHASH" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_GCM help GCM GHASH function (NIST SP 800-38D) Architecture: riscv64 using: - Zvkg vector crypto extension config CRYPTO_SM3_RISCV64 tristate "Hash functions: SM3 (ShangMi 3)" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_HASH select CRYPTO_LIB_SM3 help SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012) @@ -40,11 +43,12 @@ config CRYPTO_SM3_RISCV64 - Zvksh vector crypto extension - Zvkb vector crypto extension config CRYPTO_SM4_RISCV64 tristate "Ciphers: SM4 (ShangMi 4)" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_ALGAPI select CRYPTO_SM4 help SM4 block cipher algorithm (OSCCA GB/T 32907-2016, ISO/IEC 18033-3:2010/Amd 1:2021) diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index a3647352bff6..6871a41e5069 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -59,11 +59,12 @@ config CRYPTO_LIB_CHACHA_ARCH depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN default y if ARM default y if ARM64 && KERNEL_MODE_NEON default y if MIPS && CPU_MIPS32_R2 default y if PPC64 && CPU_LITTLE_ENDIAN && VSX - default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + 
RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS default y if S390 default y if X86_64 config CRYPTO_LIB_CURVE25519 tristate @@ -182,11 +183,12 @@ config CRYPTO_LIB_SHA256_ARCH depends on CRYPTO_LIB_SHA256 && !UML default y if ARM && !CPU_V7M default y if ARM64 default y if MIPS && CPU_CAVIUM_OCTEON default y if PPC && SPE - default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS default y if S390 default y if SPARC64 default y if X86_64 config CRYPTO_LIB_SHA512 @@ -200,11 +202,12 @@ config CRYPTO_LIB_SHA512_ARCH bool depends on CRYPTO_LIB_SHA512 && !UML default y if ARM && !CPU_V7M default y if ARM64 default y if MIPS && CPU_CAVIUM_OCTEON - default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS default y if S390 default y if SPARC64 default y if X86_64 config CRYPTO_LIB_SHA3 base-commit: 43dfc13ca972988e620a6edb72956981b75ab6b0 -- 2.52.0

16 hours, 41 minutes

2
2
0 0

[PATCH 1/2] drm/xe: Use generic_handle_irq_safe inside heci gsc irq handler

by Maarten Lankhorst

This makes the irq handler safe on PREEMPT-RT too. This is similar to the i915 commit 8cadce97bf26 ("drm/i915/gsc: mei interrupt top half should be in irq disabled context"). Fixes: 87a4c85d3a3e ("drm/xe/gsc: add gsc device support") Cc: <stable(a)vger.kernel.org> # v6.8+ Signed-off-by: Maarten Lankhorst <dev(a)lankhorst.se> --- drivers/gpu/drm/xe/xe_heci_gsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.c b/drivers/gpu/drm/xe/xe_heci_gsc.c index 2b3d49dd394c0..495cdd4f948d5 100644 --- a/drivers/gpu/drm/xe/xe_heci_gsc.c +++ b/drivers/gpu/drm/xe/xe_heci_gsc.c @@ -223,7 +223,7 @@ void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir) if (xe->heci_gsc.irq < 0) return; - ret = generic_handle_irq(xe->heci_gsc.irq); + ret = generic_handle_irq_safe(xe->heci_gsc.irq); if (ret) drm_err_ratelimited(&xe->drm, "error handling GSC irq: %d\n", ret); } @@ -243,7 +243,7 @@ void xe_heci_csc_irq_handler(struct xe_device *xe, u32 iir) if (xe->heci_gsc.irq < 0) return; - ret = generic_handle_irq(xe->heci_gsc.irq); + ret = generic_handle_irq_safe(xe->heci_gsc.irq); if (ret) drm_err_ratelimited(&xe->drm, "error handling GSC irq: %d\n", ret); } -- 2.51.0

17 hours, 14 minutes

2
1
0 0

[PATCH sched_ext/for-6.19-fixes] sched_ext: Fix bypass depth leak on scx_enable() failure

by Tejun Heo

scx_enable() calls scx_bypass(true) to initialize in bypass mode and then scx_bypass(false) on success to exit. If scx_enable() fails during task initialization - e.g. scx_cgroup_init() or scx_init_task() returns an error - it jumps to err_disable while bypass is still active. scx_disable_workfn() then calls scx_bypass(true/false) for its own bypass, leaving the bypass depth at 1 instead of 0. This causes the system to remain permanently in bypass mode after a failed scx_enable(). Failures after task initialization is complete - e.g. scx_tryset_enable_state() at the end - already call scx_bypass(false) before reaching the error path and are not affected. This only affects a subset of failure modes. Fix it by tracking whether scx_enable() called scx_bypass(true) in a bool and having scx_disable_workfn() call an extra scx_bypass(false) to clear it. This is a temporary measure as the bypass depth will be moved into the sched instance, which will make this tracking unnecessary. Fixes: 8c2090c504e9 ("sched_ext: Initialize in bypass mode") Cc: stable(a)vger.kernel.org # v6.12+ Reported-by: Chris Mason <clm(a)meta.com> Signed-off-by: Tejun Heo <tj(a)kernel.org> --- kernel/sched/ext.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -41,6 +41,13 @@ static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); +/* + * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass + * depth on enable failure. Will be removed when bypass depth is moved into the + * sched instance. 
+ */ +static bool scx_bypassed_for_enable; + static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); @@ -4318,6 +4325,11 @@ static void scx_disable_workfn(struct kt scx_dsp_max_batch = 0; free_kick_syncs(); + if (scx_bypassed_for_enable) { + scx_bypassed_for_enable = false; + scx_bypass(false); + } + mutex_unlock(&scx_enable_mutex); WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); @@ -4970,6 +4982,7 @@ static int scx_enable(struct sched_ext_o * Init in bypass mode to guarantee forward progress. */ scx_bypass(true); + scx_bypassed_for_enable = true; for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) @@ -5067,6 +5080,7 @@ static int scx_enable(struct sched_ext_o scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); + scx_bypassed_for_enable = false; scx_bypass(false); if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { -- tejun

17 hours, 17 minutes

4
3
0 0

[PATCH v6 0/9] Error recovery for vfio-pci devices on s390x

by Farhan Ali

Hi, This Linux kernel patch series introduces support for error recovery for passthrough PCI devices on System Z (s390x). Background ---------- For PCI devices on s390x an operating system receives platform specific error events from firmware rather than through AER. Today, for passthrough/userspace devices, we don't attempt any error recovery and ignore any error events for the devices. The passthrough/userspace devices are managed by the vfio-pci driver. The driver does register error handling callbacks (error_detected), and on an error triggers an eventfd to userspace. But we need a mechanism to notify userspace (QEMU/guest/userspace drivers) about the error event. Proposal -------- We can expose this error information (currently only the PCI Error Code) via a device feature. Userspace can then obtain the error information via VFIO_DEVICE_FEATURE ioctl and take appropriate actions such as driving a device reset. This is how a typical flow for passthrough devices to a VM would work: For passthrough devices to a VM, the driver bound to the device on the host is vfio-pci. vfio-pci driver does support the error_detected() callback (vfio_pci_core_aer_err_detected()), and on a PCI error s390x recovery code on the host will call the vfio-pci error_detected() callback. The vfio-pci error_detected() callback will notify userspace/QEMU via an eventfd, and return PCI_ERS_RESULT_CAN_RECOVER. At this point the s390x error recovery on the host will skip any further action (see patch 6) and let userspace drive the error recovery. Once userspace/QEMU is notified, it then injects this error into the VM so device drivers in the VM can take recovery actions. For example for a passthrough NVMe device, the VM's OS NVMe driver will access the device. At this point the VM's NVMe driver's error_detected() will drive the recovery by returning PCI_ERS_RESULT_NEED_RESET, and the s390x error recovery in the VM's OS will try to do a reset. 
Resets are privileged operations and so the VM will need intervention from QEMU to perform the reset. QEMU will invoke the VFIO_DEVICE_RESET ioctl to now notify the host that the VM is requesting a reset of the device. The vfio-pci driver on the host will then perform the reset on the device to recover it. Thanks Farhan ChangeLog --------- v5 series https://lore.kernel.org/all/20251113183502.2388-1-alifm@linux.ibm.com/ v5 -> v6 - Rebase on 6.18 + Lukas's PCI: Universal error recoverability of devices series (https://lore.kernel.org/all/cover.1763483367.git.lukas@wunner.de/) - Re-work config space accessibility check to pci_dev_save_and_disable() (patch 3). This avoids saving the config space, in the reset path, if the device's config space is corrupted or inaccessible. v4 series https://lore.kernel.org/all/20250924171628.826-1-alifm@linux.ibm.com/ v4 -> v5 - Rebase on 6.18-rc5 - Move bug fixes to the beginning of the series (patch 1 and 2). These patches were posted as a separate fixes series https://lore.kernel.org/all/a14936ac-47d6-461b-816f-0fd66f869b0f@linux.ibm.… - Add matching pci_put_dev() for pci_get_slot() (patch 6). v3 series https://lore.kernel.org/all/20250911183307.1910-1-alifm@linux.ibm.com/ v3 -> v4 - Remove warn messages for each PCI capability not restored (patch 1) - Check PCI_COMMAND and PCI_STATUS register for error value instead of device id (patch 1) - Fix kernel crash in patch 3 - Added reviewed by tags - Address comments from Niklas's (patches 4, 5, 7) - Fix compilation error non s390x system (patch 8) - Explicitly align struct vfio_device_feature_zpci_err (patch 8) v2 series https://lore.kernel.org/all/20250825171226.1602-1-alifm@linux.ibm.com/ v2 -> v3 - Patch 1 avoids saving any config space state if the device is in error (suggested by Alex) - Patch 2 adds additional check only for FLR reset to try other function reset method (suggested by Alex). - Patch 3 fixes a bug in s390 for resetting PCI devices with multiple functions. 
Creates a new flag pci_slot to allow per function slot. - Patch 4 fixes a bug in s390 for resource to bus address translation. - Rebase on 6.17-rc5 v1 series https://lore.kernel.org/all/20250813170821.1115-1-alifm@linux.ibm.com/ v1 -> v2 - Patches 1 and 2 add some additional checks for FLR/PM reset to try other function reset method (suggested by Alex). - Patch 3 fixes a bug in s390 for resetting PCI devices with multiple functions. - Patch 7 adds a new device feature for zPCI devices for the VFIO_DEVICE_FEATURE ioctl. The ioctl is used by userspace to retrieve any PCI error information for the device (suggested by Alex). - Patch 8 adds a reset_done() callback for the vfio-pci driver, to restore the state of the device after a reset. - Patch 9 removes the pcie check for triggering VFIO_PCI_ERR_IRQ_INDEX. Farhan Ali (9): PCI: Allow per function PCI slots s390/pci: Add architecture specific resource/bus address translation PCI: Avoid saving config space state if inaccessible PCI: Add additional checks for flr reset s390/pci: Update the logic for detecting passthrough device s390/pci: Store PCI error information for passthrough devices vfio-pci/zdev: Add a device feature for error information vfio: Add a reset_done callback for vfio-pci driver vfio: Remove the pcie check for VFIO_PCI_ERR_IRQ_INDEX arch/s390/include/asm/pci.h | 29 ++++++++ arch/s390/pci/pci.c | 75 +++++++++++++++++++++ arch/s390/pci/pci_event.c | 107 +++++++++++++++++------------- drivers/pci/host-bridge.c | 4 +- drivers/pci/pci.c | 19 +++++- drivers/pci/slot.c | 25 ++++++- drivers/vfio/pci/vfio_pci_core.c | 20 ++++-- drivers/vfio/pci/vfio_pci_intrs.c | 3 +- drivers/vfio/pci/vfio_pci_priv.h | 9 +++ drivers/vfio/pci/vfio_pci_zdev.c | 45 ++++++++++++- include/linux/pci.h | 1 + include/uapi/linux/vfio.h | 15 +++++ 12 files changed, 291 insertions(+), 61 deletions(-) -- 2.43.0

19 hours, 39 minutes

3
13
0 0

¿Cuánto cuesta una mala contratación?

by Valeria Pérez

¿Cuánto cuesta una mala contratación? body { margin: 0; padding: 0; font-family: Arial, Helvetica, sans-serif; font-size: 14px; color: #333; background-color: #ffffff; } table { border-spacing: 0; width: 100%; max-width: 600px; margin: auto; } td { padding: 12px 20px; } a { color: #1a73e8; text-decoration: none; } .footer { font-size: 12px; color: #888888; text-align: center; } Una mala contratación cuesta 3X el salario. Evítalo con datos, no percepciones. Hola, , ¿Sabías que una mala contratación cuesta hasta 3 veces el salario anual? El 74% de empresas admite haber contratado a la persona equivocada. El motivo: decisiones basadas en percepciones, no en datos objetivos. PsicoSmart te ayuda a evaluar talento con precisión: 31 pruebas psicométricas validadas para medir liderazgo, honestidad e inteligencia 2,500+ exámenes técnicos especializados por industria Verificación de identidad con captura fotográfica automática Resultados en minutos, accesible desde cualquier dispositivo Reduce hasta 60% el riesgo de error en selección. ¿Quieres una demostración gratuita? Responde este correo y te contacto en menos de 24 horas. Saludos, -------------- Atte.: Valeria Pérez Ciudad de México: (55) 5018 0565 WhatsApp: +52 33 1607 2089 Si no deseas recibir más correos, haz clic aquí para darte de baja. Para remover su dirección de esta lista haga <a href="https://s1.arrobamail.com/unsuscribe.php?id=yiwtsrewiswqtrseup">click aquí</a>

19 hours, 42 minutes

1
0
0 0

[PATCH v2 04/13] KVM: nSVM: Fix consistency checks for NP_ENABLE

by Yosry Ahmed

KVM currently fails a nested VMRUN and injects VMEXIT_INVALID (aka SVM_EXIT_ERR) if L1 sets NP_ENABLE and the host does not support NPTs. At first glance, it seems like the check should actually be for guest_cpu_cap_has(X86_FEATURE_NPT) instead, as it is possible for the host to support NPTs but the guest CPUID to not advertise it. However, the consistency check is not architectural to begin with. The APM does not mention VMEXIT_INVALID if NP_ENABLE is set on a processor that does not have X86_FEATURE_NPT. Hence, NP_ENABLE should be ignored if X86_FEATURE_NPT is not available for L1. Apart from the consistency check, this is currently the case because NP_ENABLE is actually copied from VMCB01 to VMCB02, not from VMCB12. On the other hand, the APM does mention two other consistency checks for NP_ENABLE, both of which are missing (paraphrased): In Volume #2, 15.25.3 (24593—Rev. 3.42—March 2024): If VMRUN is executed with hCR0.PG cleared to zero and NP_ENABLE set to 1, VMRUN terminates with #VMEXIT(VMEXIT_INVALID) In Volume #2, 15.25.4 (24593—Rev. 3.42—March 2024): When VMRUN is executed with nested paging enabled (NP_ENABLE = 1), the following conditions are considered illegal state combinations, in addition to those mentioned in “Canonicalization and Consistency Checks”: • Any MBZ bit of nCR3 is set. • Any G_PAT.PA field has an unsupported type encoding or any reserved field in G_PAT has a nonzero value. Replace the existing consistency check with consistency checks on hCR0.PG and nCR3. Only perform the consistency checks if L1 has X86_FEATURE_NPT and NP_ENABLE is set in VMCB12. The G_PAT consistency check will be addressed separately. As it is now possible for an L1 to run L2 with NP_ENABLE set but ignored, also check that L1 has X86_FEATURE_NPT in nested_npt_enabled(). Pass L1's CR0 to __nested_vmcb_check_controls(). 
In nested_vmcb_check_controls(), L1's CR0 is available through kvm_read_cr0(), as vcpu->arch.cr0 is not updated to L2's CR0 until later through nested_vmcb02_prepare_save() -> svm_set_cr0(). In svm_set_nested_state(), L1's CR0 is available in the captured save area, as svm_get_nested_state() captures L1's save area when running L2, and L1's CR0 is stashed in VMCB01 on nested VMRUN (in nested_svm_vmrun()). Fixes: 4b16184c1cca ("KVM: SVM: Initialize Nested Nested MMU context on VMRUN") Cc: stable(a)vger.kernel.org Signed-off-by: Yosry Ahmed <yosry.ahmed(a)linux.dev> --- arch/x86/kvm/svm/nested.c | 21 ++++++++++++++++----- arch/x86/kvm/svm/svm.h | 3 ++- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 74211c5c68026..87bcc5eff96e8 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -325,7 +325,8 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size) } static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, - struct vmcb_ctrl_area_cached *control) + struct vmcb_ctrl_area_cached *control, + unsigned long l1_cr0) { if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN))) return false; @@ -333,8 +334,12 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, if (CC(control->asid == 0)) return false; - if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled)) - return false; + if (nested_npt_enabled(to_svm(vcpu))) { + if (CC(!kvm_vcpu_is_legal_gpa(vcpu, control->nested_cr3))) + return false; + if (CC(!(l1_cr0 & X86_CR0_PG))) + return false; + } if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa, MSRPM_SIZE))) @@ -400,7 +405,12 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl; - return __nested_vmcb_check_controls(vcpu, ctl); + /* + * Make sure we did not enter guest mode yet, in which case + * 
kvm_read_cr0() could return L2's CR0. + */ + WARN_ON_ONCE(is_guest_mode(vcpu)); + return __nested_vmcb_check_controls(vcpu, ctl, kvm_read_cr0(vcpu)); } static @@ -1831,7 +1841,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, ret = -EINVAL; __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); - if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) + /* 'save' contains L1 state saved from before VMRUN */ + if (!__nested_vmcb_check_controls(vcpu, &ctl_cached, save->cr0)) goto out_free; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f6fb70ddf7272..3e805a43ffcdb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -552,7 +552,8 @@ static inline bool gif_set(struct vcpu_svm *svm) static inline bool nested_npt_enabled(struct vcpu_svm *svm) { - return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; + return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_NPT) && + svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) -- 2.51.2.1041.gc1ab5b90ca-goog

19 hours, 51 minutes

2
6
0 0

[merged mm-nonmm-stable] ocfs2-fix-kernel-bug-in-ocfs2_find_victim_chain.patch removed from -mm tree

by Andrew Morton

The quilt patch titled Subject: ocfs2: fix kernel BUG in ocfs2_find_victim_chain has been removed from the -mm tree. Its filename was ocfs2-fix-kernel-bug-in-ocfs2_find_victim_chain.patch This patch was dropped because it was merged into the mm-nonmm-stable branch of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm ------------------------------------------------------ From: Prithvi Tambewagh <activprithvi(a)gmail.com> Subject: ocfs2: fix kernel BUG in ocfs2_find_victim_chain Date: Mon, 1 Dec 2025 18:37:11 +0530 syzbot reported a kernel BUG in ocfs2_find_victim_chain() because the `cl_next_free_rec` field of the allocation chain list (next free slot in the chain list) is 0, triggering the BUG_ON(!cl->cl_next_free_rec) condition in ocfs2_find_victim_chain() and panicking the kernel. To fix this, an if condition is introduced in ocfs2_claim_suballoc_bits(), just before calling ocfs2_find_victim_chain(), the code block in it being executed when either of the following conditions is true: 1. `cl_next_free_rec` is equal to 0, indicating that there are no free chains in the allocation chain list 2. `cl_next_free_rec` is greater than `cl_count` (the total number of chains in the allocation chain list) Either of them being true is indicative of the fact that there are no chains left for usage. This is addressed using ocfs2_error(), which prints the error log for debugging purposes, rather than panicking the kernel. 
Link: https://lkml.kernel.org/r/20251201130711.143900-1-activprithvi@gmail.com Signed-off-by: Prithvi Tambewagh <activprithvi(a)gmail.com> Reported-by: syzbot+96d38c6e1655c1420a72(a)syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=96d38c6e1655c1420a72 Tested-by: syzbot+96d38c6e1655c1420a72(a)syzkaller.appspotmail.com Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com> Cc: Mark Fasheh <mark(a)fasheh.com> Cc: Joel Becker <jlbec(a)evilplan.org> Cc: Junxiao Bi <junxiao.bi(a)oracle.com> Cc: Changwei Ge <gechangwei(a)live.cn> Cc: Jun Piao <piaojun(a)huawei.com> Cc: Heming Zhao <heming.zhao(a)suse.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- fs/ocfs2/suballoc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) --- a/fs/ocfs2/suballoc.c~ocfs2-fix-kernel-bug-in-ocfs2_find_victim_chain +++ a/fs/ocfs2/suballoc.c @@ -1993,6 +1993,16 @@ static int ocfs2_claim_suballoc_bits(str } cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + if (!le16_to_cpu(cl->cl_next_free_rec) || + le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) { + status = ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has invalid next " + "free chain record %u, but only %u total\n", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le16_to_cpu(cl->cl_next_free_rec), + le16_to_cpu(cl->cl_count)); + goto bail; + } victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; _ Patches currently in -mm which might be from activprithvi(a)gmail.com are

20 hours, 20 minutes

1
0
0 0

[merged mm-stable] mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather.patch removed from -mm tree

by Andrew Morton

The quilt patch titled Subject: mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather has been removed from the -mm tree. Its filename was mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather.patch This patch was dropped because it was merged into the mm-stable branch of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm ------------------------------------------------------ From: "David Hildenbrand (Red Hat)" <david(a)kernel.org> Subject: mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather Date: Fri, 5 Dec 2025 22:35:58 +0100 As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race") we can end up in some situations where we perform so many IPI broadcasts when unsharing hugetlb PMD page tables that it severely regresses some workloads. In particular, when we fork()+exit(), or when we munmap() a large area backed by many shared PMD tables, we perform one IPI broadcast per unshared PMD table. There are two optimizations to be had: (1) When we process (unshare) multiple such PMD tables, such as during exit(), it is sufficient to send a single IPI broadcast (as long as we respect locking rules) instead of one per PMD table. Locking prevents that any of these PMD tables could get reused before we drop the lock. (2) When we are not the last sharer (> 2 users including us), there is no need to send the IPI broadcast. The shared PMD tables cannot become exclusive (fully unshared) before an IPI will be broadcasted by the last sharer. Concurrent GUP-fast could walk into a PMD table just before we unshared it. It could then succeed in grabbing a page from the shared page table even after munmap() etc succeeded (and suppressed an IPI). But there is no difference compared to GUP-fast just sleeping for a while after grabbing the page and re-enabling IRQs. 
Most importantly, GUP-fast will never walk into page tables that are no-longer shared, because the last sharer will issue an IPI broadcast. (if ever required, checking whether the PUD changed in GUP-fast after grabbing the page like we do in the PTE case could handle this) So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather infrastructure so we can implement these optimizations and demystify the code at least a bit. Extend the mmu_gather infrastructure to be able to deal with our special hugetlb PMD table sharing implementation. We'll consolidate the handling for (full) unsharing of PMD tables in tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track in "struct mmu_gather" whether we had (full) unsharing of PMD tables. Because locking is very special (concurrent unsharing+reuse must be prevented), we disallow deferring flushing to tlb_finish_mmu() and instead require an explicit earlier call to tlb_flush_unshared_tables(). From hugetlb code, we call huge_pmd_unshare_flush() where we make sure that the expected lock protecting us from concurrent unsharing+reuse is still held. Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that tlb_flush_unshared_tables() was properly called earlier. Document it all properly. Notes about tlb_remove_table_sync_one() interaction with unsharing: There are two fairly tricky things: (1) tlb_remove_table_sync_one() is a NOP on architectures without CONFIG_MMU_GATHER_RCU_TABLE_FREE. Here, the assumption is that the previous TLB flush would send an IPI to all relevant CPUs. Careful: some architectures like x86 only send IPIs to all relevant CPUs when tlb->freed_tables is set. The relevant architectures should be selecting MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable kernels and it might have been problematic before this patch. Also, the arch flushing behavior (independent of IPIs) is different when tlb->freed_tables is set. 
Do we have to enlighten them to also take care of tlb->unshared_tables? So far we didn't care, so hopefully we are fine. Of course, we could be setting tlb->freed_tables as well, but that might then unnecessarily flush too much, because the semantics of tlb->freed_tables are a bit fuzzy. This patch changes nothing in this regard. (2) tlb_remove_table_sync_one() is not a NOP on architectures with CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync. Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB) we still issue IPIs during TLB flushes and don't actually need the second tlb_remove_table_sync_one(). This optimization can be implemented on top of this, by checking e.g., in tlb_remove_table_sync_one() whether we really need IPIs. But as described in (1), it really must honor tlb->freed_tables then to send IPIs to all relevant CPUs. Further note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a concern, as we are holding the i_mmap_lock the whole time, preventing concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed separately as a cleanup later. There are plenty more cleanups to be had, but they have to wait until this is fixed. 
Link: https://lkml.kernel.org/r/20251205213558.2980480-5-david@kernel.org Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race") Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org> Reported-by: Uschakow, Stanislav" <suschako(a)amazon.de> Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/ Tested-by: Laurence Oberman <loberman(a)redhat.com> Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)kernel.org> Cc: Arnd Bergmann <arnd(a)arndb.de> Cc: Jann Horn <jannh(a)google.com> Cc: Liam Howlett <liam.howlett(a)oracle.com> Cc: Liu Shixin <liushixin2(a)huawei.com> Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com> Cc: Muchun Song <muchun.song(a)linux.dev> Cc: Nadav Amit <nadav.amit(a)gmail.com> Cc: Nicholas Piggin <npiggin(a)gmail.com> Cc: Oscar Salvador <osalvador(a)suse.de> Cc: Peter Zijlstra <peterz(a)infradead.org> Cc: Prakash Sangappa <prakash.sangappa(a)oracle.com> Cc: Rik van Riel <riel(a)surriel.com> Cc: Vlastimil Babka <vbabka(a)suse.cz> Cc: Will Deacon <will(a)kernel.org> Cc: Lance Yang <lance.yang(a)linux.dev> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- include/asm-generic/tlb.h | 69 ++++++++++++++++++++ include/linux/hugetlb.h | 19 +++-- mm/hugetlb.c | 121 ++++++++++++++++++++---------------- mm/mmu_gather.c | 6 + mm/mprotect.c | 2 mm/rmap.c | 25 +++++-- 6 files changed, 173 insertions(+), 69 deletions(-) --- a/include/asm-generic/tlb.h~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather +++ a/include/asm-generic/tlb.h @@ -364,6 +364,17 @@ struct mmu_gather { unsigned int vma_huge : 1; unsigned int vma_pfn : 1; + /* + * Did we unshare (unmap) any shared page tables? + */ + unsigned int unshared_tables : 1; + + /* + * Did we unshare any page tables such that they are now exclusive + * and could get reused+modified by the new owner? 
+ */ + unsigned int fully_unshared_tables : 1; + unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER @@ -400,6 +411,7 @@ static inline void __tlb_reset_range(str tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; + tlb->unshared_tables = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an @@ -484,7 +496,7 @@ static inline void tlb_flush_mmu_tlbonly * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || - tlb->cleared_puds || tlb->cleared_p4ds)) + tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables)) return; tlb_flush(tlb); @@ -773,6 +785,61 @@ static inline bool huge_pmd_needs_flush( } #endif +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt, + unsigned long addr) +{ + /* + * The caller must make sure that concurrent unsharing + exclusive + * reuse is impossible until tlb_flush_unshared_tables() was called. + */ + VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt)); + ptdesc_pmd_pts_dec(pt); + + /* Clearing a PUD pointing at a PMD table with PMD leaves. */ + tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE); + + /* + * If the page table is now exclusively owned, we fully unshared + * a page table. + */ + if (!ptdesc_pmd_is_shared(pt)) + tlb->fully_unshared_tables = true; + tlb->unshared_tables = true; +} + +static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb) +{ + /* + * As soon as the caller drops locks to allow for reuse of + * previously-shared tables, these tables could get modified and + * even reused outside of hugetlb context. So flush the TLB now. + * + * Note that we cannot defer the flush to a later point even if we are + * not the last sharer of the page table. 
+ */ + if (tlb->unshared_tables) + tlb_flush_mmu_tlbonly(tlb); + + /* + * Similarly, we must make sure that concurrent GUP-fast will not + * walk previously-shared page tables that are getting modified+reused + * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast. + * + * We only perform this when we are the last sharer of a page table, + * as the IPI will reach all CPUs: any GUP-fast. + * + * Note that on configs where tlb_remove_table_sync_one() is a NOP, + * the expectation is that the tlb_flush_mmu_tlbonly() would have issued + * required IPIs already for us. + */ + if (tlb->fully_unshared_tables) { + tlb_remove_table_sync_one(); + tlb->fully_unshared_tables = false; + } +} +#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ + #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */ --- a/include/linux/hugetlb.h~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather +++ a/include/linux/hugetlb.h @@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct * pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); @@ -271,7 +272,7 @@ void hugetlb_vma_unlock_write(struct vm_ int hugetlb_vma_trylock_write(struct vm_area_struct *vma); void hugetlb_vma_assert_locked(struct vm_area_struct *vma); void hugetlb_vma_lock_release(struct kref *kref); -long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t 
newprot, unsigned long cp_flags); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); @@ -300,13 +301,17 @@ static inline struct address_space *huge return NULL; } -static inline int huge_pmd_unshare(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline int huge_pmd_unshare(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } +static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb, + struct vm_area_struct *vma) +{ +} + static inline void adjust_range_if_pmd_sharing_possible( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) @@ -432,7 +437,7 @@ static inline void move_hugetlb_state(st { } -static inline long hugetlb_change_protection( +static inline long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) --- a/mm/hugetlb.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather +++ a/mm/hugetlb.c @@ -5096,8 +5096,9 @@ int move_hugetlb_page_tables(struct vm_a unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; - bool shared_pmd = false; + struct mmu_gather tlb; + tlb_gather_mmu(&tlb, vma->vm_mm); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); @@ -5122,12 +5123,12 @@ int move_hugetlb_page_tables(struct vm_a if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; - if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { - shared_pmd = true; + if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } + tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr); dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); if (!dst_pte) @@ -5136,13 +5137,13 @@ int move_hugetlb_page_tables(struct vm_a 
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); } - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, old_end - len, old_end); + tlb_flush_mmu_tlbonly(&tlb); + huge_pmd_unshare_flush(&tlb, vma); + mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); return len + old_addr - old_end; } @@ -5161,7 +5162,6 @@ void __unmap_hugepage_range(struct mmu_g unsigned long sz = huge_page_size(h); bool adjust_reservation; unsigned long last_addr_mask; - bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -5184,10 +5184,8 @@ void __unmap_hugepage_range(struct mmu_g } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(tlb, vma, address, ptep)) { spin_unlock(ptl); - tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); - force_flush = true; address |= last_addr_mask; continue; } @@ -5303,14 +5301,7 @@ void __unmap_hugepage_range(struct mmu_g } tlb_end_vma(tlb, vma); - /* - * There is nothing protecting a previously-shared page table that we - * unshared through huge_pmd_unshare() from getting freed after we - * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare() - * succeeded, flush the range corresponding to the pud. 
- */ - if (force_flush) - tlb_flush_mmu_tlbonly(tlb); + huge_pmd_unshare_flush(tlb, vma); } void __hugetlb_zap_begin(struct vm_area_struct *vma, @@ -6399,7 +6390,7 @@ out_release_nounlock: } #endif /* CONFIG_USERFAULTFD */ -long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { @@ -6409,7 +6400,6 @@ long hugetlb_change_protection(struct vm pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); - bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; @@ -6452,7 +6442,7 @@ long hugetlb_change_protection(struct vm } } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(tlb, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn about it if it @@ -6461,7 +6451,6 @@ long hugetlb_change_protection(struct vm WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); - shared_pmd = true; address |= last_addr_mask; continue; } @@ -6522,22 +6511,16 @@ long hugetlb_change_protection(struct vm pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; + tlb_remove_huge_tlb_entry(h, tlb, ptep, address); } next: spin_unlock(ptl); cond_resched(); } - /* - * There is nothing protecting a previously-shared page table that we - * unshared through huge_pmd_unshare() from getting freed after we - * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare() - * succeeded, flush the range corresponding to the pud. 
- */ - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, start, end); + + tlb_flush_mmu_tlbonly(tlb); + huge_pmd_unshare_flush(tlb, vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are * downgrading page table protection not changing it to point to a new @@ -6904,18 +6887,27 @@ out: return pte; } -/* - * unmap huge page backed by shared pte. +/** + * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * @addr: the address we are trying to unshare. + * @ptep: pointer into the (pmd) page table. + * + * Called with the page table lock held, the i_mmap_rwsem held in write mode + * and the hugetlb vma lock held in write mode. * - * Called with page table lock held. + * Note: The caller must call huge_pmd_unshare_flush() before dropping the + * i_mmap_rwsem. * - * returns: 1 successfully unmapped a shared pte page - * 0 the underlying pte page is not shared, or it is the last user + * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it + * was not a shared PMD table. */ -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { unsigned long sz = huge_page_size(hstate_vma(vma)); + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); @@ -6927,18 +6919,36 @@ int huge_pmd_unshare(struct mm_struct *m i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); pud_clear(pud); - /* - * Once our caller drops the rmap lock, some other process might be - * using this page table as a normal, non-hugetlb page table. - * Wait for pending gup_fast() in other threads to finish before letting - * that happen. 
- */ - tlb_remove_table_sync_one(); - ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); + + tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); + mm_dec_nr_pmds(mm); return 1; } +/* + * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * + * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table + * unsharing with concurrent page table walkers (TLB, GUP-fast, etc.). + * + * This function must be called after a sequence of huge_pmd_unshare() + * calls while still holding the i_mmap_rwsem. + */ +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + /* + * We must synchronize page table unsharing such that nobody will + * try reusing a previously-shared page table while it might still + * be in use by previous sharers (TLB, GUP_fast). + */ + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + + tlb_flush_unshared_tables(tlb); +} + #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, @@ -6947,12 +6957,16 @@ pte_t *huge_pmd_share(struct mm_struct * return NULL; } -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { return 0; } +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ +} + void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { @@ -7219,6 +7233,7 @@ static void hugetlb_unshare_pmds(struct unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; + struct mmu_gather tlb; unsigned long address; spinlock_t *ptl; pte_t *ptep; @@ -7229,6 +7244,7 @@ static void hugetlb_unshare_pmds(struct if (start >= end) return; + tlb_gather_mmu(&tlb, mm); 
flush_cache_range(vma, start, end); /* * No need to call adjust_range_if_pmd_sharing_possible(), because @@ -7248,10 +7264,10 @@ static void hugetlb_unshare_pmds(struct if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - huge_pmd_unshare(mm, vma, address, ptep); + huge_pmd_unshare(&tlb, vma, address, ptep); spin_unlock(ptl); } - flush_hugetlb_tlb_range(vma, start, end); + huge_pmd_unshare_flush(&tlb, vma); if (take_locks) { i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); @@ -7261,6 +7277,7 @@ static void hugetlb_unshare_pmds(struct * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); } /* --- a/mm/mmu_gather.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather +++ a/mm/mmu_gather.c @@ -469,6 +469,12 @@ void tlb_gather_mmu_fullmm(struct mmu_ga void tlb_finish_mmu(struct mmu_gather *tlb) { /* + * We expect an earlier huge_pmd_unshare_flush() call to sort this out, + * due to complicated locking requirements with page table unsharing. 
+ */ + VM_WARN_ON_ONCE(tlb->fully_unshared_tables); + + /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB * flush by batching, one thread may end up seeing inconsistent PTEs --- a/mm/mprotect.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather +++ a/mm/mprotect.c @@ -652,7 +652,7 @@ long change_protection(struct mmu_gather #endif if (is_vm_hugetlb_page(vma)) - pages = hugetlb_change_protection(vma, start, end, newprot, + pages = hugetlb_change_protection(tlb, vma, start, end, newprot, cp_flags); else pages = change_protection_range(tlb, vma, start, end, newprot, --- a/mm/rmap.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather +++ a/mm/rmap.c @@ -76,7 +76,7 @@ #include <linux/mm_inline.h> #include <linux/oom.h> -#include <asm/tlbflush.h> +#include <asm/tlb.h> #define CREATE_TRACE_POINTS #include <trace/events/migrate.h> @@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct foli * if unsuccessful. */ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) goto walk_abort; - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + + tlb_gather_mmu(&tlb, mm); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. @@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct foli goto walk_done; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) @@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct fo * fail if unsuccessful. 
*/ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + tlb_gather_mmu(&tlb, mm); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. @@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct fo break; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); _ Patches currently in -mm which might be from david(a)kernel.org are

20 hours, 26 minutes

1
0
0 0

[merged mm-stable] mm-hugetlb-fix-hugetlb_pmd_shared.patch removed from -mm tree

by Andrew Morton

The quilt patch titled Subject: mm/hugetlb: fix hugetlb_pmd_shared() has been removed from the -mm tree. Its filename was mm-hugetlb-fix-hugetlb_pmd_shared.patch This patch was dropped because it was merged into the mm-stable branch of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm ------------------------------------------------------ From: "David Hildenbrand (Red Hat)" <david(a)kernel.org> Subject: mm/hugetlb: fix hugetlb_pmd_shared() Date: Fri, 5 Dec 2025 22:35:55 +0100 Patch series "mm/hugetlb: fixes for PMD table sharing (incl. using mmu_gather)". One functional fix, one performance regression fix, and two related comment fixes. I cleaned up my prototype I recently shared [1] for the performance fix, deferring most of the cleanups I had in the prototype to a later point. While doing that I identified the other things. The goal of this patch set is to be backported to stable trees "fairly" easily. At least patch #1 and #4. Patch #1 fixes hugetlb_pmd_shared() not detecting any sharing Patch #2 + #3 are simple comment fixes that patch #4 interacts with. Patch #4 is a fix for the reported performance regression due to excessive IPI broadcasts during fork()+exit(). The last patch is all about TLB flushes, IPIs and mmu_gather. Read: complicated. I added as many comments + description as I possibly could, and I am hoping for review from Jann. There are plenty of cleanups in the future to be had + one reasonable optimization on x86. But that's all out of scope for this series. This patch (of 4): We switched from (wrongly) using the page count to an independent shared count. Now, shared page tables have a refcount of 1 (excluding speculative references) and instead use ptdesc->pt_share_count to identify sharing. We didn't convert hugetlb_pmd_shared(), so right now, we would never detect a shared PMD table as such, because sharing/unsharing no longer touches the refcount of a PMD table. 
Page migration, like mbind() or migrate_pages() would allow for migrating folios mapped into such shared PMD tables, even though the folios are not exclusive. In smaps we would account them as "private" although they are "shared", and we would be wrongly setting the PM_MMAP_EXCLUSIVE in the pagemap interface. Fix it by properly using ptdesc_pmd_is_shared() in hugetlb_pmd_shared(). Link: https://lkml.kernel.org/r/20251205213558.2980480-1-david@kernel.org Link: https://lkml.kernel.org/r/20251205213558.2980480-2-david@kernel.org Link: https://lore.kernel.org/all/8cab934d-4a56-44aa-b641-bfd7e23bd673@kernel.org/ [1] Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org> Tested-by: Laurence Oberman <loberman(a)redhat.com> Reviewed-by: Rik van Riel <riel(a)surriel.com> Reviewed-by: Lance Yang <lance.yang(a)linux.dev> Cc: Liu Shixin <liushixin2(a)huawei.com> Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)kernel.org> Cc: Arnd Bergmann <arnd(a)arndb.de> Cc: Jann Horn <jannh(a)google.com> Cc: Liam Howlett <liam.howlett(a)oracle.com> Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com> Cc: Muchun Song <muchun.song(a)linux.dev> Cc: Nadav Amit <nadav.amit(a)gmail.com> Cc: Nicholas Piggin <npiggin(a)gmail.com> Cc: Oscar Salvador <osalvador(a)suse.de> Cc: Peter Zijlstra <peterz(a)infradead.org> Cc: Prakash Sangappa <prakash.sangappa(a)oracle.com> Cc: Vlastimil Babka <vbabka(a)suse.cz> Cc: Will Deacon <will(a)kernel.org> Cc: Uschakow, Stanislav" <suschako(a)amazon.de> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- a/include/linux/hugetlb.h~mm-hugetlb-fix-hugetlb_pmd_shared +++ a/include/linux/hugetlb.h @@ -1326,7 +1326,7 @@ static inline __init void hugetlb_cma_re #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline bool hugetlb_pmd_shared(pte_t *pte) { - return 
page_count(virt_to_page(pte)) > 1; + return ptdesc_pmd_is_shared(virt_to_ptdesc(pte)); } #else static inline bool hugetlb_pmd_shared(pte_t *pte) _ Patches currently in -mm which might be from david(a)kernel.org are

20 hours, 26 minutes

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror