This reverts commit b3b274bc9d3d7307308aeaf75f70731765ac999a.
On the DragonBoard 820c (which uses APQ8096/MSM8996) this change causes
the CPUs to downclock to roughly half speed under sustained load. The
regression is visible both during boot and when running CPU stress
workloads such as stress-ng: the CPUs initially ramp up to the expected
frequency, then drop to a lower OPP even though the system is clearly
CPU-bound.
Bisecting points to this commit and reverting it restores the expected
behaviour on the DragonBoard 820c - the CPUs track the cpufreq policy
and run at full performance under load.
The exact interaction with the ACD is not yet fully understood and we
would like to keep ACD in use to avoid possible SoC reliability issues.
Until we have a better fix that preserves ACD while avoiding this
performance regression, revert the bisected patch to restore the
previous behaviour.
Fixes: b3b274bc9d3d ("clk: qcom: cpu-8996: simplify the cpu_clk_notifier_cb")
Cc: stable(a)vger.kernel.org # v6.3+
Link: https://lore.kernel.org/linux-arm-msm/20230113120544.59320-8-dmitry.baryshk…
Cc: Dmitry Baryshkov <dmitry.baryshkov(a)oss.qualcomm.com>
Signed-off-by: Christopher Obbard <christopher.obbard(a)linaro.org>
---
Hi all,
This series contains a single revert for a regression affecting the
APQ8096/MSM8996 (DragonBoard 820c).
The commit being reverted, b3b274bc9d3d ("clk: qcom: cpu-8996: simplify the cpu_clk_notifier_cb"),
introduces a significant performance issue where the CPUs downclock to
~50% of their expected frequency under sustained load. The problem is
reproducible both at boot and when running CPU-bound workloads such as
stress-ng.
Bisecting the issue pointed directly to this commit, and reverting it
restores correct cpufreq behaviour.
The root cause appears to be related to the interaction between the
simplified notifier callback and ACD (Adaptive Clock Distribution).
Since we would prefer to keep ACD enabled for SoC reliability reasons,
a revert is the safest option until a proper fix is identified.
Full details are included in the commit message.
Feedback & suggestions welcome.
Cheers!
Christopher Obbard
---
drivers/clk/qcom/clk-cpu-8996.c | 30 +++++++++++-------------------
1 file changed, 11 insertions(+), 19 deletions(-)
diff --git a/drivers/clk/qcom/clk-cpu-8996.c b/drivers/clk/qcom/clk-cpu-8996.c
index 21d13c0841ed..028476931747 100644
--- a/drivers/clk/qcom/clk-cpu-8996.c
+++ b/drivers/clk/qcom/clk-cpu-8996.c
@@ -547,35 +547,27 @@ static int cpu_clk_notifier_cb(struct notifier_block *nb, unsigned long event,
{
struct clk_cpu_8996_pmux *cpuclk = to_clk_cpu_8996_pmux_nb(nb);
struct clk_notifier_data *cnd = data;
+ int ret;
switch (event) {
case PRE_RATE_CHANGE:
+ ret = clk_cpu_8996_pmux_set_parent(&cpuclk->clkr.hw, ALT_INDEX);
qcom_cpu_clk_msm8996_acd_init(cpuclk->clkr.regmap);
-
- /*
- * Avoid overvolting. clk_core_set_rate_nolock() walks from top
- * to bottom, so it will change the rate of the PLL before
- * chaging the parent of PMUX. This can result in pmux getting
- * clocked twice the expected rate.
- *
- * Manually switch to PLL/2 here.
- */
- if (cnd->new_rate < DIV_2_THRESHOLD &&
- cnd->old_rate > DIV_2_THRESHOLD)
- clk_cpu_8996_pmux_set_parent(&cpuclk->clkr.hw, SMUX_INDEX);
-
break;
- case ABORT_RATE_CHANGE:
- /* Revert manual change */
- if (cnd->new_rate < DIV_2_THRESHOLD &&
- cnd->old_rate > DIV_2_THRESHOLD)
- clk_cpu_8996_pmux_set_parent(&cpuclk->clkr.hw, ACD_INDEX);
+ case POST_RATE_CHANGE:
+ if (cnd->new_rate < DIV_2_THRESHOLD)
+ ret = clk_cpu_8996_pmux_set_parent(&cpuclk->clkr.hw,
+ SMUX_INDEX);
+ else
+ ret = clk_cpu_8996_pmux_set_parent(&cpuclk->clkr.hw,
+ ACD_INDEX);
break;
default:
+ ret = 0;
break;
}
- return NOTIFY_OK;
+ return notifier_from_errno(ret);
};
static int qcom_cpu_clk_msm8996_driver_probe(struct platform_device *pdev)
---
base-commit: c17e270dfb342a782d69c4a7c4c32980455afd9c
change-id: 20251202-wip-obbardc-qcom-msm8096-clk-cpu-fix-downclock-b7561da4cb95
Best regards,
--
Christopher Obbard <christopher.obbard(a)linaro.org>
When of_find_net_device_by_node() successfully acquires a reference to
a network device but the subsequent call to dsa_port_parse_cpu()
fails, dsa_port_parse_of() returns without dropping the reference
it holds on the network device.
of_find_net_device_by_node() increments the reference count of the
returned structure, which should be balanced with a corresponding
put_device() when the reference is no longer needed.
Found by code review.
Cc: stable(a)vger.kernel.org
Fixes: deff710703d8 ("net: dsa: Allow default tag protocol to be overridden from DT")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
Changes in v2:
- simplified the patch as suggested;
- modified the Fixes tag as suggested.
---
net/dsa/dsa.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index a20efabe778f..31b409a47491 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1247,6 +1247,7 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn)
struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0);
const char *name = of_get_property(dn, "label", NULL);
bool link = of_property_read_bool(dn, "link");
+ int err = 0;
dp->dn = dn;
@@ -1260,7 +1261,11 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn)
return -EPROBE_DEFER;
user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL);
- return dsa_port_parse_cpu(dp, conduit, user_protocol);
+ err = dsa_port_parse_cpu(dp, conduit, user_protocol);
+ if (err)
+ put_device(conduit);
+
+ return err;
}
if (link)
--
2.17.1
When the filesystem is being mounted, the kernel panics while the slot
map allocation for the local node is being written to disk. This occurs
because the slot map buffer head block number, which should be greater
than or equal to `OCFS2_SUPER_BLOCK_BLKNO` (which evaluates to 2), is
less than it, indicating disk metadata corruption. This triggers
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO) in ocfs2_write_block(),
causing the kernel to panic.
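For context, a sketch of the guard that fires (simplified and assumed
from the commit text, not copied verbatim from
fs/ocfs2/buffer_head_io.c):

    int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
                          struct ocfs2_caching_info *ci)
    {
            /* No valid metadata write may target a block below the
             * superblock (block 2), hence the guard. */
            BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
            /* ... submit the buffer for I/O ... */
    }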
Fix this by adding a check in ocfs2_update_disk_slot(), right before
the call to ocfs2_write_block(): if `bh->b_blocknr` is less than
`OCFS2_SUPER_BLOCK_BLKNO`, call ocfs2_error(), which logs the error for
debugging, and return its value to the caller of
ocfs2_update_disk_slot(), i.e. ocfs2_find_slot(). If that value is
zero, return -EIO instead.
Reported-by: syzbot+c818e5c4559444f88aa0(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c818e5c4559444f88aa0
Tested-by: syzbot+c818e5c4559444f88aa0(a)syzkaller.appspotmail.com
Cc: stable(a)vger.kernel.org
Signed-off-by: Prithvi Tambewagh <activprithvi(a)gmail.com>
---
v1->v2:
- Remove usage of le16_to_cpu() from ocfs2_error()
- Cast bh->b_blocknr to unsigned long long
- Remove type casting for OCFS2_SUPER_BLOCK_BLKNO
- Fix Sparse warnings reported in v1 by kernel test robot
- Update title from 'ocfs2: Fix kernel BUG in ocfs2_write_block' to
'ocfs2: fix kernel BUG in ocfs2_write_block'
v1 link: https://lore.kernel.org/all/20251206154819.175479-1-activprithvi@gmail.com/…
fs/ocfs2/slot_map.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e544c704b583..e916a2e8f92d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -193,6 +193,16 @@ static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
else
ocfs2_update_disk_slot_old(si, slot_num, &bh);
spin_unlock(&osb->osb_lock);
+ if (bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO) {
+ status = ocfs2_error(osb->sb,
+ "Invalid Slot Map Buffer Head "
+ "Block Number : %llu, Should be >= %d",
+ (unsigned long long)bh->b_blocknr,
+ OCFS2_SUPER_BLOCK_BLKNO);
+ if (!status)
+ return -EIO;
+ return status;
+ }
status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
if (status < 0)
base-commit: 24172e0d79900908cf5ebf366600616d29c9b417
--
2.43.0
tegra_ahb_enable_smmu() utilizes driver_find_device_by_of_node() which
internally calls driver_find_device() to locate the matching device.
driver_find_device() increments the ref count of the found device by
calling get_device(), but tegra_ahb_enable_smmu() fails to call
put_device() to decrement the reference count before returning. This
results in a reference count leak of the device, which may prevent the
device from being properly released and cause a memory leak.
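For reference, a minimal sketch of the lookup/release contract being
restored (a simplified caller; the driver structure name is assumed):

    struct device *dev;

    dev = driver_find_device_by_of_node(&tegra_ahb_driver.driver, dn);
    if (!dev)
            return -EPROBE_DEFER;

    /* ... program the AHB registers via the found device ... */

    put_device(dev); /* balance the get_device() in driver_find_device() */
    return 0;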
Found by code review.
Cc: stable(a)vger.kernel.org
Fixes: 89c788bab1f0 ("ARM: tegra: Add SMMU enabler in AHB")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
drivers/amba/tegra-ahb.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/amba/tegra-ahb.c b/drivers/amba/tegra-ahb.c
index f23c3ed01810..3ed5cef34806 100644
--- a/drivers/amba/tegra-ahb.c
+++ b/drivers/amba/tegra-ahb.c
@@ -148,6 +148,7 @@ int tegra_ahb_enable_smmu(struct device_node *dn)
val = gizmo_readl(ahb, AHB_ARBITRATION_XBAR_CTRL);
val |= AHB_ARBITRATION_XBAR_CTRL_SMMU_INIT_DONE;
gizmo_writel(ahb, val, AHB_ARBITRATION_XBAR_CTRL);
+ put_device(dev);
return 0;
}
EXPORT_SYMBOL(tegra_ahb_enable_smmu);
--
2.17.1
kmb_probe() obtains a reference to a platform device via
of_find_device_by_node(). This call increases the reference count of
the returned device, which should be dropped by calling put_device()
when the device is no longer needed. However, the code fails to call
put_device() in several error handling paths and in the normal device
removal path. This can result in reference count leaks that prevent
proper cleanup of the platform device when the driver is unloaded or
during error recovery.
Add put_device() in all code paths where dsi_pdev is no longer needed,
including error paths and the normal removal path.
Found by code review.
Cc: stable(a)vger.kernel.org
Fixes: 7f7b96a8a0a1 ("drm/kmb: Add support for KeemBay Display")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
drivers/gpu/drm/kmb/kmb_drv.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/kmb/kmb_drv.c b/drivers/gpu/drm/kmb/kmb_drv.c
index 7c2eb1152fc2..9733337abe92 100644
--- a/drivers/gpu/drm/kmb/kmb_drv.c
+++ b/drivers/gpu/drm/kmb/kmb_drv.c
@@ -474,6 +474,8 @@ static void kmb_remove(struct platform_device *pdev)
/* Unregister DSI host */
kmb_dsi_host_unregister(kmb->kmb_dsi);
+ if (kmb->kmb_dsi && kmb->kmb_dsi->pdev)
+ put_device(&kmb->kmb_dsi->pdev->dev);
drm_atomic_helper_shutdown(drm);
}
@@ -518,17 +520,20 @@ static int kmb_probe(struct platform_device *pdev)
ret = kmb_dsi_host_bridge_init(get_device(&dsi_pdev->dev));
if (ret == -EPROBE_DEFER) {
- return -EPROBE_DEFER;
+ ret = -EPROBE_DEFER;
+ goto err_free2;
} else if (ret) {
DRM_ERROR("probe failed to initialize DSI host bridge\n");
- return ret;
+ goto err_free2;
}
/* Create DRM device */
kmb = devm_drm_dev_alloc(dev, &kmb_driver,
struct kmb_drm_private, drm);
- if (IS_ERR(kmb))
- return PTR_ERR(kmb);
+ if (IS_ERR(kmb)) {
+ ret = PTR_ERR(kmb);
+ goto err_free2;
+ }
dev_set_drvdata(dev, &kmb->drm);
@@ -577,7 +582,8 @@ static int kmb_probe(struct platform_device *pdev)
err_free1:
dev_set_drvdata(dev, NULL);
kmb_dsi_host_unregister(kmb->kmb_dsi);
-
+err_free2:
+ put_device(&dsi_pdev->dev);
return ret;
}
--
2.17.1
Changing the enable/disable sequence in commit c9b1150a68d9
("drm/atomic-helper: Re-order bridge chain pre-enable and post-disable")
has caused regressions on multiple platforms: R-Car, MCDE, Rockchip.
This is an alternate series to Linus' series:
https://lore.kernel.org/all/20251202-mcde-drm-regression-thirdfix-v6-0-f1bf…
This series first reverts the original commit and reverts a fix for
mediatek which is no longer needed. It then exposes helper functions
from DRM core, and finally implements the new sequence only in the tidss
driver.
There is one more fix upstream for the original commit, commit
5d91394f2361 ("drm/exynos: fimd: Guard display clock control with
runtime PM calls"), but I have not reverted that one as it looks like a
valid patch in its own right.
I added Cc stable v6.17+ to all patches, but I didn't add Fixes tags, as
I wasn't sure what they should point to. But I could perhaps add Fixes:
<original commit> to all of these.
Signed-off-by: Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
---
Linus Walleij (1):
drm/atomic-helper: Export and namespace some functions
Tomi Valkeinen (3):
Revert "drm/atomic-helper: Re-order bridge chain pre-enable and post-disable"
Revert "drm/mediatek: dsi: Fix DSI host and panel bridge pre-enable order"
drm/tidss: Fix enable/disable order
drivers/gpu/drm/drm_atomic_helper.c | 122 ++++++++++++++----
drivers/gpu/drm/mediatek/mtk_dsi.c | 6 -
drivers/gpu/drm/tidss/tidss_kms.c | 30 ++++-
include/drm/drm_atomic_helper.h | 22 ++++
include/drm/drm_bridge.h | 249 ++++++++++--------------------------
5 files changed, 214 insertions(+), 215 deletions(-)
---
base-commit: 88e721ab978a86426aa08da520de77430fa7bb84
change-id: 20251205-drm-seq-fix-b4ed1f56604b
Best regards,
--
Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
From: Alex Deucher <alexander.deucher(a)amd.com>
commit eb296c09805ee37dd4ea520a7fb3ec157c31090f upstream.
SI hardware doesn't support pasids, user mode queues, or
KIQ/MES so there is no need for this. Doing so results in
a segfault as these callbacks are non-existent for SI.
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4744
Fixes: f3854e04b708 ("drm/amdgpu: attach tlb fence to the PTs update")
Reviewed-by: Timur Kristóf <timur.kristof(a)gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Hans de Goede <johannes.goede(a)oss.qualcomm.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 676e24fb8864..cdcafde3c71a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1066,7 +1066,9 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
}
/* Prepare a TLB flush fence to be attached to PTs */
- if (!params->unlocked) {
+ if (!params->unlocked &&
+ /* SI doesn't support pasid or KIQ/MES */
+ params->adev->family > AMDGPU_FAMILY_SI) {
amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
/* Makes sure no PD/PT is freed before the flush */
--
2.52.0
The following commit has been merged into the x86/boot branch of tip:
Commit-ID: adbf61cc47cb72b102682e690ad323e1eda652c2
Gitweb: https://git.kernel.org/tip/adbf61cc47cb72b102682e690ad323e1eda652c2
Author: Yazen Ghannam <yazen.ghannam(a)amd.com>
AuthorDate: Tue, 11 Nov 2025 14:53:57
Committer: Ingo Molnar <mingo(a)kernel.org>
CommitterDate: Sun, 14 Dec 2025 09:19:03 +01:00
x86/acpi/boot: Correct acpi_is_processor_usable() check again
ACPI v6.3 defined a new "Online Capable" MADT LAPIC flag. This bit is
used in conjunction with the "Enabled" MADT LAPIC flag to determine if
a CPU can be enabled/hotplugged by the OS after boot.
Before the new bit was defined, the "Enabled" bit was explicitly
described like this (ACPI v6.0 wording provided):
"If zero, this processor is unusable, and the operating system
support will not attempt to use it"
This means that CPU hotplug (based on MADT) is not possible. Many BIOS
implementations follow this guidance. They may include LAPIC entries in
MADT for unavailable CPUs, but since these entries are marked with
"Enabled=0" it is expected that the OS will completely ignore these
entries.
However, QEMU will do the same (include entries with "Enabled=0") for
the purpose of allowing CPU hotplug within the guest.
Comment from QEMU function pc_madt_cpu_entry():
/* ACPI spec says that LAPIC entry for non present
* CPU may be omitted from MADT or it must be marked
* as disabled. However omitting non present CPU from
* MADT breaks hotplug on linux. So possible CPUs
* should be put in MADT but kept disabled.
*/
Recent Linux topology changes broke the QEMU use case. A follow-up fix
for the QEMU use case broke bare metal topology enumeration.
Rework the Linux MADT LAPIC flags check to allow the QEMU use case only
for guests and to maintain the ACPI spec behavior for bare metal.
Remove an unnecessary check added to fix a bare metal case introduced by
the QEMU "fix".
[ bp: Change logic as Michal suggested. ]
[ mingo: Removed misapplied -stable tag. ]
Fixes: fed8d8773b8e ("x86/acpi/boot: Correct acpi_is_processor_usable() check")
Fixes: f0551af02130 ("x86/topology: Ignore non-present APIC IDs in a present package")
Closes: https://lore.kernel.org/r/20251024204658.3da9bf3f.michal.pecio@gmail.com
Reported-by: Michal Pecio <michal.pecio(a)gmail.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam(a)amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Tested-by: Michal Pecio <michal.pecio(a)gmail.com>
Tested-by: Ricardo Neri <ricardo.neri-calderon(a)linux.intel.com>
Link: https://lore.kernel.org/20251111145357.4031846-1-yazen.ghannam@amd.com
Cc: stable(a)vger.kernel.org
---
arch/x86/kernel/acpi/boot.c | 12 ++++++++----
arch/x86/kernel/cpu/topology.c | 15 ---------------
2 files changed, 8 insertions(+), 19 deletions(-)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9fa321a..d6138b2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -35,6 +35,7 @@
#include <asm/smp.h>
#include <asm/i8259.h>
#include <asm/setup.h>
+#include <asm/hypervisor.h>
#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
@@ -164,11 +165,14 @@ static bool __init acpi_is_processor_usable(u32 lapic_flags)
if (lapic_flags & ACPI_MADT_ENABLED)
return true;
- if (!acpi_support_online_capable ||
- (lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
- return true;
+ if (acpi_support_online_capable)
+ return lapic_flags & ACPI_MADT_ONLINE_CAPABLE;
- return false;
+ /*
+ * QEMU expects legacy "Enabled=0" LAPIC entries to be counted as usable
+ * in order to support CPU hotplug in guests.
+ */
+ return !hypervisor_is_type(X86_HYPER_NATIVE);
}
static int __init
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index f55ea3c..23190a7 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -27,7 +27,6 @@
#include <xen/xen.h>
#include <asm/apic.h>
-#include <asm/hypervisor.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
#include <asm/msr.h>
@@ -236,20 +235,6 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
cpuid_to_apicid[cpu] = apic_id;
topo_set_cpuids(cpu, apic_id, acpi_id);
} else {
- u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
-
- /*
- * Check for present APICs in the same package when running
- * on bare metal. Allow the bogosity in a guest.
- */
- if (hypervisor_is_type(X86_HYPER_NATIVE) &&
- topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
- pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
- apic_id);
- topo_info.nr_rejected_cpus++;
- return;
- }
-
topo_info.nr_disabled_cpus++;
}
Hello all,
I have the following problem:
https://gitlab.postmarketos.org/postmarketOS/pmbootstrap/-/issues/2635
In short, what is happening is the following:
- The kernel boots and outputs via UART when I build the kernel with the
following:
make LLVM=1 ARCH="$arm" CC="${CC:-gcc}"
- The kernel doesn't boot and there is no output via UART when I build
the kernel with the following:
make LLVM=1 ARCH="$arm"
The only difference being: CC="${CC:-gcc}". Is this expected? I think
this was present in the Linux kernel ever since Rust was enabled for
ARMv7, and I never encountered it because postmarketOS was originally
building the first way.
Thanks,
Rudraksha
This is the start of the stable review cycle for the 6.12.62 release.
There are 49 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Fri, 12 Dec 2025 07:29:38 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.12.62-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.12.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 6.12.62-rc1
Daniele Palmas <dnlplm(a)gmail.com>
bus: mhi: host: pci_generic: Add Telit FN990B40 modem support
Daniele Palmas <dnlplm(a)gmail.com>
bus: mhi: host: pci_generic: Add Telit FN920C04 modem support
Navaneeth K <knavaneeth786(a)gmail.com>
staging: rtl8723bs: fix out-of-bounds read in OnBeacon ESR IE parsing
Navaneeth K <knavaneeth786(a)gmail.com>
staging: rtl8723bs: fix stack buffer overflow in OnAssocReq IE parsing
Navaneeth K <knavaneeth786(a)gmail.com>
staging: rtl8723bs: fix out-of-bounds read in rtw_get_ie() parser
Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
comedi: check device's attached status in compat ioctls
Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
comedi: multiq3: sanitize config options in multiq3_attach()
Ian Abbott <abbotti(a)mev.co.uk>
comedi: c6xdigio: Fix invalid PNP driver unregistration
Zenm Chen <zenmchen(a)gmail.com>
wifi: rtw88: Add USB ID 2001:3329 for D-Link AC13U rev. A1
Zenm Chen <zenmchen(a)gmail.com>
wifi: rtl8xxxu: Add USB ID 2001:3328 for D-Link AN3U rev. A1
Linus Torvalds <torvalds(a)linux-foundation.org>
samples: work around glibc redefining some of our defines wrong
Huacai Chen <chenhuacai(a)kernel.org>
LoongArch: Mask all interrupts during kexec/kdump
Naoki Ueki <naoki25519(a)gmail.com>
HID: elecom: Add support for ELECOM M-XT3URBK (018F)
Antheas Kapenekakis <lkml(a)antheas.dev>
platform/x86/amd/pmc: Add spurious_8042 to Xbox Ally
Antheas Kapenekakis <lkml(a)antheas.dev>
platform/x86/amd: pmc: Add Lenovo Legion Go 2 to pmc quirk list
Jia Ston <ston.jia(a)outlook.com>
platform/x86: huawei-wmi: add keys for HONOR models
April Grimoire <april(a)aprilg.moe>
HID: apple: Add SONiX AK870 PRO to non_apple_keyboards quirk list
Armin Wolf <W_Armin(a)gmx.de>
platform/x86: acer-wmi: Ignore backlight event
Praveen Talari <praveen.talari(a)oss.qualcomm.com>
pinctrl: qcom: msm: Fix deadlock in pinmux configuration
Keith Busch <kbusch(a)kernel.org>
nvme: fix admin request_queue lifetime
Mario Limonciello (AMD) <superm1(a)kernel.org>
HID: hid-input: Extend Elan ignore battery quirk to USB
Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
bfs: Reconstruct file type when loading from disk
Lushih Hsieh <bruce(a)mail.kh.edu.tw>
ALSA: usb-audio: Add native DSD quirks for PureAudio DAC series
Harish Kasiviswanathan <Harish.Kasiviswanathan(a)amd.com>
drm/amdkfd: Fix GPU mappings for APU after prefetch
Yiqi Sun <sunyiqixm(a)gmail.com>
smb: fix invalid username check in smb3_fs_context_parse_param()
Max Chou <max.chou(a)realtek.com>
Bluetooth: btrtl: Avoid loading the config file on security chips
Ian Forbes <ian.forbes(a)broadcom.com>
drm/vmwgfx: Use kref in vmw_bo_dirty
Robin Gong <yibin.gong(a)nxp.com>
spi: imx: keep dma request disabled before dma transfer setup
Alvaro Gamez Machado <alvaro.gamez(a)hazent.com>
spi: xilinx: increase number of retries before declaring stall
Song Liu <song(a)kernel.org>
ftrace: bpf: Fix IPMODIFY + DIRECT in modify_ftrace_direct()
Johan Hovold <johan(a)kernel.org>
USB: serial: kobil_sct: fix TIOCMBIS and TIOCMBIC
Johan Hovold <johan(a)kernel.org>
USB: serial: belkin_sa: fix TIOCMBIS and TIOCMBIC
Magne Bruno <magne.bruno(a)addi-data.com>
serial: add support of CPCI cards
Johan Hovold <johan(a)kernel.org>
USB: serial: ftdi_sio: match on interface number for jtag
Fabio Porcedda <fabio.porcedda(a)gmail.com>
USB: serial: option: move Telit 0x10c7 composition in the right place
Fabio Porcedda <fabio.porcedda(a)gmail.com>
USB: serial: option: add Telit Cinterion FE910C04 new compositions
Slark Xiao <slark_xiao(a)163.com>
USB: serial: option: add Foxconn T99W760
Omar Sandoval <osandov(a)fb.com>
KVM: SVM: Don't skip unrelated instruction if INT3/INTO is replaced
Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
comedi: pcl818: fix null-ptr-deref in pcl818_ai_cancel()
Alexey Nepomnyashih <sdl(a)nppct.ru>
ext4: add i_data_sem protection in ext4_destroy_inline_data_nolock()
Alexander Sverdlin <alexander.sverdlin(a)siemens.com>
locking/spinlock/debug: Fix data-race in do_raw_write_lock
Qianchang Zhao <pioooooooooip(a)gmail.com>
ksmbd: ipc: fix use-after-free in ipc_msg_send_request
Deepanshu Kartikey <kartikey406(a)gmail.com>
ext4: refresh inline data size before write operations
Ye Bin <yebin10(a)huawei.com>
jbd2: avoid bug_on in jbd2_journal_get_create_access() when file system corrupted
Bagas Sanjaya <bagasdotme(a)gmail.com>
Documentation: process: Also mention Sasha Levin as stable tree maintainer
Sabrina Dubroca <sd(a)queasysnail.net>
xfrm: flush all states in xfrm_state_fini
Sabrina Dubroca <sd(a)queasysnail.net>
xfrm: also call xfrm_state_delete_tunnel at destroy time for states that were never added
Sabrina Dubroca <sd(a)queasysnail.net>
Revert "xfrm: destroy xfrm_state synchronously on net exit path"
Sabrina Dubroca <sd(a)queasysnail.net>
xfrm: delete x->tunnel as we delete x
-------------
Diffstat:
Documentation/process/2.Process.rst | 6 ++-
Makefile | 4 +-
arch/loongarch/kernel/machine_kexec.c | 2 +
arch/x86/include/asm/kvm_host.h | 9 ++++
arch/x86/kvm/svm/svm.c | 24 +++++----
arch/x86/kvm/x86.c | 21 ++++++++
drivers/bluetooth/btrtl.c | 24 +++++----
drivers/bus/mhi/host/pci_generic.c | 52 +++++++++++++++++++
drivers/comedi/comedi_fops.c | 42 ++++++++++++---
drivers/comedi/drivers/c6xdigio.c | 46 ++++++++++++----
drivers/comedi/drivers/multiq3.c | 9 ++++
drivers/comedi/drivers/pcl818.c | 5 +-
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +
drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 12 ++---
drivers/hid/hid-apple.c | 1 +
drivers/hid/hid-elecom.c | 6 ++-
drivers/hid/hid-ids.h | 3 +-
drivers/hid/hid-input.c | 5 +-
drivers/hid/hid-quirks.c | 3 +-
drivers/net/wireless/realtek/rtl8xxxu/core.c | 3 ++
drivers/net/wireless/realtek/rtw88/rtw8822cu.c | 2 +
drivers/nvme/host/core.c | 3 +-
drivers/pinctrl/qcom/pinctrl-msm.c | 2 +-
drivers/platform/x86/acer-wmi.c | 4 ++
drivers/platform/x86/amd/pmc/pmc-quirks.c | 25 +++++++++
drivers/platform/x86/huawei-wmi.c | 4 ++
drivers/spi/spi-imx.c | 15 ++++--
drivers/spi/spi-xilinx.c | 2 +-
drivers/staging/rtl8723bs/core/rtw_ieee80211.c | 14 ++---
drivers/staging/rtl8723bs/core/rtw_mlme_ext.c | 13 +++--
drivers/tty/serial/8250/8250_pci.c | 37 +++++++++++++
drivers/usb/serial/belkin_sa.c | 28 ++++++----
drivers/usb/serial/ftdi_sio.c | 72 +++++++++-----------------
drivers/usb/serial/kobil_sct.c | 18 +++----
drivers/usb/serial/option.c | 22 ++++++--
fs/bfs/inode.c | 19 ++++++-
fs/ext4/inline.c | 14 ++++-
fs/jbd2/transaction.c | 19 +++++--
fs/smb/client/fs_context.c | 2 +-
fs/smb/server/transport_ipc.c | 7 ++-
include/net/xfrm.h | 13 ++---
kernel/locking/spinlock_debug.c | 4 +-
kernel/trace/ftrace.c | 40 ++++++++++----
net/ipv4/ipcomp.c | 2 +
net/ipv6/ipcomp6.c | 2 +
net/ipv6/xfrm6_tunnel.c | 2 +-
net/key/af_key.c | 2 +-
net/xfrm/xfrm_ipcomp.c | 1 -
net/xfrm/xfrm_state.c | 41 ++++++---------
net/xfrm/xfrm_user.c | 2 +-
samples/vfs/test-statx.c | 6 +++
samples/watch_queue/watch_test.c | 6 +++
sound/usb/quirks.c | 6 +++
53 files changed, 521 insertions(+), 207 deletions(-)
The struct ip_tunnel_info has a flexible array member named
options that is protected by a counted_by(options_len)
attribute.
The compiler will use this information to enforce runtime bounds
checking deployed by FORTIFY_SOURCE string helpers.
As laid out in the GCC documentation, the counter must be
initialized before the first reference to the flexible array
member.
In the normal case the ip_tunnel_info_opts_set() helper is used,
which initializes options_len properly. However, in the GRE
ERSPAN code a partial update is done, preventing the use of the
helper function.
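A sketch of the annotation in question (simplified; member types
abbreviated from struct ip_tunnel_info in include/net/ip_tunnels.h):

    struct ip_tunnel_info {
            /* ... */
            u8 options_len;
            u8 options[] __counted_by(options_len);
    };

The counter therefore has to be written before options is touched,
otherwise FORTIFY_SOURCE treats the array as zero-sized:

    info->options_len = sizeof(*md);   /* set the counter first */
    md = ip_tunnel_info_opts(info);    /* now within bounds */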
Before this change the handling of ERSPAN traffic in GRE tunnels
would cause a kernel panic when the kernel is compiled with
GCC 15+ and having FORTIFY_SOURCE configured:
memcpy: detected buffer overflow: 4 byte write of buffer size 0
Call Trace:
<IRQ>
__fortify_panic+0xd/0xf
erspan_rcv.cold+0x68/0x83
? ip_route_input_slow+0x816/0x9d0
gre_rcv+0x1b2/0x1c0
gre_rcv+0x8e/0x100
? raw_v4_input+0x2a0/0x2b0
ip_protocol_deliver_rcu+0x1ea/0x210
ip_local_deliver_finish+0x86/0x110
ip_local_deliver+0x65/0x110
? ip_rcv_finish_core+0xd6/0x360
ip_rcv+0x186/0x1a0
Link: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-co…
Reported-at: https://launchpad.net/bugs/2129580
Fixes: bb5e62f2d547 ("net: Add options as a flexible array to struct ip_tunnel_info")
Signed-off-by: Frode Nordahl <fnordahl(a)ubuntu.com>
---
net/ipv4/ip_gre.c | 18 ++++++++++++++++--
net/ipv6/ip6_gre.c | 18 ++++++++++++++++--
2 files changed, 32 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 761a53c6a89a..285a656c9e41 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -330,6 +330,22 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (!tun_dst)
return PACKET_REJECT;
+ /* The struct ip_tunnel_info has a flexible array member named
+ * options that is protected by a counted_by(options_len)
+ * attribute.
+ *
+ * The compiler will use this information to enforce runtime bounds
+ * checking deployed by FORTIFY_SOURCE string helpers.
+ *
+ * As laid out in the GCC documentation, the counter must be
+ * initialized before the first reference to the flexible array
+ * member.
+ *
+ * Link: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-co…
+ */
+ info = &tun_dst->u.tun_info;
+ info->options_len = sizeof(*md);
+
/* skb can be uncloned in __iptunnel_pull_header, so
* old pkt_md is no longer valid and we need to reset
* it
@@ -344,10 +360,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
ERSPAN_V2_MDSIZE);
- info = &tun_dst->u.tun_info;
__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
info->key.tun_flags);
- info->options_len = sizeof(*md);
}
skb_reset_mac_header(skb);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index c82a75510c0e..eb840a11b93b 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -535,6 +535,22 @@ static int ip6erspan_rcv(struct sk_buff *skb,
if (!tun_dst)
return PACKET_REJECT;
+ /* The struct ip_tunnel_info has a flexible array member named
+ * options that is protected by a counted_by(options_len)
+ * attribute.
+ *
+ * The compiler will use this information to enforce runtime bounds
+ * checking deployed by FORTIFY_SOURCE string helpers.
+ *
+ * As laid out in the GCC documentation, the counter must be
+ * initialized before the first reference to the flexible array
+ * member.
+ *
+ * Link: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-co…
+ */
+ info = &tun_dst->u.tun_info;
+ info->options_len = sizeof(*md);
+
/* skb can be uncloned in __iptunnel_pull_header, so
* old pkt_md is no longer valid and we need to reset
* it
@@ -543,7 +559,6 @@ static int ip6erspan_rcv(struct sk_buff *skb,
skb_network_header_len(skb);
pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
sizeof(*ershdr));
- info = &tun_dst->u.tun_info;
md = ip_tunnel_info_opts(info);
md->version = ver;
md2 = &md->u.md2;
@@ -551,7 +566,6 @@ static int ip6erspan_rcv(struct sk_buff *skb,
ERSPAN_V2_MDSIZE);
__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
info->key.tun_flags);
- info->options_len = sizeof(*md);
ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
--
2.43.0
Hi Sasha,
On 13-Dec-25 10:35, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> media: ov02c10: Fix default vertical flip
This fix is incomplete: it leads to wrong colors, and it causes
the image to be upside down on some Dell XPS models where it
currently is the right way up.
There is a series of fixes which applies on top of this to
fix both issues:
https://lore.kernel.org/linux-media/20251210112436.167212-1-johannes.goede@…
For now (without the fixes on top) we are better off not adding
this patch to the stable series. Can you drop this patch,
please?
Same for 6.17 and other stable series.
Regards,
Hans
>
> to the 6.18-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> media-ov02c10-fix-default-vertical-flip.patch
> and it can be found in the queue-6.18 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit 14cc4474799a595caeccdb8fdf2ca4b867cef972
> Author: Sebastian Reichel <sre(a)kernel.org>
> Date: Wed Aug 20 02:13:19 2025 +0200
>
> media: ov02c10: Fix default vertical flip
>
> [ Upstream commit d5ebe3f7d13d4cee3ff7e718de23564915aaf163 ]
>
> The driver right now defaults to setting the vertical flip bit. This
> conflicts with proper handling of the rotation property defined in
> ACPI or device tree, so drop the VFLIP bit. It should be handled via
> V4L2_CID_VFLIP instead.
>
> Reported-by: Frederic Stuyk <fstuyk(a)runbox.com>
> Closes: https://lore.kernel.org/all/b6df9ae7-ea9f-4e5a-8065-5b130f534f37@runbox.com/
> Fixes: 44f89010dae0 ("media: i2c: Add Omnivision OV02C10 sensor driver")
> Signed-off-by: Sebastian Reichel <sre(a)kernel.org>
> Reviewed-by: Bryan O'Donoghue <bod(a)kernel.org>
> Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
> Signed-off-by: Hans Verkuil <hverkuil+cisco(a)kernel.org>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/drivers/media/i2c/ov02c10.c b/drivers/media/i2c/ov02c10.c
> index 8c4d85dc7922e..8e22ff446b0c4 100644
> --- a/drivers/media/i2c/ov02c10.c
> +++ b/drivers/media/i2c/ov02c10.c
> @@ -174,7 +174,7 @@ static const struct reg_sequence sensor_1928x1092_30fps_setting[] = {
> {0x3816, 0x01},
> {0x3817, 0x01},
>
> - {0x3820, 0xb0},
> + {0x3820, 0xa0},
> {0x3821, 0x00},
> {0x3822, 0x80},
> {0x3823, 0x08},
Hi Sasha,
Em Sat, 13 Dec 2025 04:49:42 -0500
Sasha Levin <sashal(a)kernel.org> escreveu:
> This is a note to let you know that I've just added the patch titled
>
> RAS: Report all ARM processor CPER information to userspace
>
> to the 6.18-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> ras-report-all-arm-processor-cper-information-to-use.patch
> and it can be found in the queue-6.18 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
You should also backport this patch(*):
96b010536ee0 efi/cper: align ARM CPER type with UEFI 2.9A/2.10 specs
It fixes a bug in the UEFI parser for the ARM Processor Error record:
the specs were not clear about how the error type should be
reported. The kernel implementation was assuming that this was an
enum, but UEFI errata 2.9A makes it clear that the value is a bitmap.
So all kernels up to 6.18 are not parsing the field the
expected way: only "Cache error" was properly reported; the other
three types were wrong.
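To illustrate the difference (hypothetical bit positions and values,
not the real cper.h definitions):

    /* enum-style parsing: exactly one type can ever match */
    if (err_type == 0x02)
            pr_info("cache error\n");

    /* bitmap-style parsing per UEFI 2.9A: each set bit is a type */
    if (err_type & BIT(1))
            pr_info("cache error\n");
    if (err_type & BIT(2))
            pr_info("TLB error\n");
    if (err_type & BIT(3))
            pr_info("bus error\n");
    if (err_type & BIT(4))
            pr_info("micro-architectural error\n");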
(*) You may need to backport these patches as well:
a976d790f494 efi/cper: Add a new helper function to print bitmasks
8ad2c72e21ef efi/cper: Adjust infopfx size to accept an extra space
Regards,
Mauro
The struct ip_tunnel_info has a flexible array member named
options that is protected by a counted_by(options_len)
attribute.
The compiler will use this information to enforce runtime bounds
checking deployed by FORTIFY_SOURCE string helpers.
As laid out in the GCC documentation, the counter must be
initialized before the first reference to the flexible array
member.
After scanning through the files that use struct ip_tunnel_info
and also refer to options or options_len, it appears the normal
case is to use the ip_tunnel_info_opts_set() helper.
Said helper would initialize options_len properly before copying
data into options. However, in the GRE ERSPAN code a partial
update is done, preventing the use of the helper function.
Before this change the handling of ERSPAN traffic in GRE tunnels
would cause a kernel panic when the kernel is compiled with
GCC 15+ and having FORTIFY_SOURCE configured:
memcpy: detected buffer overflow: 4 byte write of buffer size 0
Call Trace:
<IRQ>
__fortify_panic+0xd/0xf
erspan_rcv.cold+0x68/0x83
? ip_route_input_slow+0x816/0x9d0
gre_rcv+0x1b2/0x1c0
gre_rcv+0x8e/0x100
? raw_v4_input+0x2a0/0x2b0
ip_protocol_deliver_rcu+0x1ea/0x210
ip_local_deliver_finish+0x86/0x110
ip_local_deliver+0x65/0x110
? ip_rcv_finish_core+0xd6/0x360
ip_rcv+0x186/0x1a0
Cc: stable(a)vger.kernel.org
Link: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-co…
Reported-at: https://launchpad.net/bugs/2129580
Fixes: bb5e62f2d547 ("net: Add options as a flexible array to struct ip_tunnel_info")
Signed-off-by: Frode Nordahl <fnordahl(a)ubuntu.com>
---
v2:
- target correct netdev tree and properly cc stable in commit message.
- replace repeated long in-line comments and link with a single line.
- document search for any similar offenses in the code base in commit
message.
v1: https://lore.kernel.org/all/20251212073202.13153-1-fnordahl@ubuntu.com/
net/ipv4/ip_gre.c | 6 ++++--
net/ipv6/ip6_gre.c | 6 ++++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 761a53c6a89a..8178c44a3cdd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -330,6 +330,10 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (!tun_dst)
return PACKET_REJECT;
+ /* MUST set options_len before referencing options */
+ info = &tun_dst->u.tun_info;
+ info->options_len = sizeof(*md);
+
/* skb can be uncloned in __iptunnel_pull_header, so
* old pkt_md is no longer valid and we need to reset
* it
@@ -344,10 +348,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
ERSPAN_V2_MDSIZE);
- info = &tun_dst->u.tun_info;
__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
info->key.tun_flags);
- info->options_len = sizeof(*md);
}
skb_reset_mac_header(skb);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index c82a75510c0e..4603554d4c7f 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -535,6 +535,10 @@ static int ip6erspan_rcv(struct sk_buff *skb,
if (!tun_dst)
return PACKET_REJECT;
+ /* MUST set options_len before referencing options */
+ info = &tun_dst->u.tun_info;
+ info->options_len = sizeof(*md);
+
/* skb can be uncloned in __iptunnel_pull_header, so
* old pkt_md is no longer valid and we need to reset
* it
@@ -543,7 +547,6 @@ static int ip6erspan_rcv(struct sk_buff *skb,
skb_network_header_len(skb);
pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
sizeof(*ershdr));
- info = &tun_dst->u.tun_info;
md = ip_tunnel_info_opts(info);
md->version = ver;
md2 = &md->u.md2;
@@ -551,7 +554,6 @@ static int ip6erspan_rcv(struct sk_buff *skb,
ERSPAN_V2_MDSIZE);
__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
info->key.tun_flags);
- info->options_len = sizeof(*md);
ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
--
2.51.0
After an innocuous optimization change in clang-22, allmodconfig (which
enables CONFIG_KASAN and CONFIG_WERROR) breaks with:
drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:1724:6: error: stack frame size (3144) exceeds limit (3072) in 'dml32_ModeSupportAndSystemConfigurationFull' [-Werror,-Wframe-larger-than]
1724 | void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_lib)
| ^
With clang-21, this function was already pretty close to the existing
limit of 3072 bytes.
drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:1724:6: error: stack frame size (2904) exceeds limit (2048) in 'dml32_ModeSupportAndSystemConfigurationFull' [-Werror,-Wframe-larger-than]
1724 | void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_lib)
| ^
A similar situation occurred in dml2, which was resolved by
commit e4479aecf658 ("drm/amd/display: Increase sanitizer frame larger
than limit when compile testing with clang") by increasing the limit for
clang when compile testing with certain sanitizer enabled, so that
allmodconfig (an easy testing target) continues to work.
Apply that same change to the dml folder to clear up the warning for
allmodconfig, unbreaking the build.
Cc: stable(a)vger.kernel.org
Closes: https://github.com/ClangBuiltLinux/linux/issues/2135
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
drivers/gpu/drm/amd/display/dc/dml/Makefile | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile
index b357683b4255..268b5fbdb48b 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/Makefile
+++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile
@@ -30,7 +30,11 @@ dml_rcflags := $(CC_FLAGS_NO_FPU)
ifneq ($(CONFIG_FRAME_WARN),0)
ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y)
- frame_warn_limit := 3072
+ ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_COMPILE_TEST),yy)
+ frame_warn_limit := 4096
+ else
+ frame_warn_limit := 3072
+ endif
else
frame_warn_limit := 2048
endif
---
base-commit: f24e96d69f5b9eb0f3b9c49e53c385c50729edfd
change-id: 20251213-dml-bump-frame-warn-clang-sanitizers-0a34fc916aec
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
synaptics_i2c_irq() schedules touch->dwork via mod_delayed_work().
The delayed work performs I2C transactions and may still be running
(or get queued) when the device is removed.
synaptics_i2c_remove() currently frees 'touch' without canceling
touch->dwork. If removal happens while the work is pending/running,
the work handler may dereference freed memory, leading to a potential
use-after-free.
Cancel the delayed work synchronously before unregistering/freeing
the device.
Fixes: eef3e4cab72e ("Input: add driver for Synaptics I2C touchpad")
Reported-by: Minseong Kim <ii4gsp(a)gmail.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Minseong Kim <ii4gsp(a)gmail.com>
---
drivers/input/mouse/synaptics_i2c.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/input/mouse/synaptics_i2c.c b/drivers/input/mouse/synaptics_i2c.c
index a0d707e47d93..fe30bf9aea3a 100644
--- a/drivers/input/mouse/synaptics_i2c.c
+++ b/drivers/input/mouse/synaptics_i2c.c
@@ -593,6 +593,8 @@ static void synaptics_i2c_remove(struct i2c_client *client)
if (!polling_req)
free_irq(client->irq, touch);
+ cancel_delayed_work_sync(&touch->dwork);
+
input_unregister_device(touch->input);
kfree(touch);
}
--
2.39.5
virtio_fs_add_queues_sysfs() creates per-queue sysfs kobjects via
kobject_create_and_add(). The current code checks the wrong variable
after the allocation:
- kobject_create_and_add() may return NULL on failure.
- The code incorrectly checks fs->mqs_kobj (the parent kobject), which is
expected to be non-NULL at this point.
- If kobject_create_and_add() fails, fsvq->kobj is NULL but the code can
still call sysfs_create_group(fsvq->kobj, ...), leading to a NULL pointer
dereference and kernel panic (DoS).
Fix by validating fsvq->kobj immediately after kobject_create_and_add()
and aborting on failure, so sysfs_create_group() is never called with a
NULL kobject.
Reported-by: Qianchang Zhao <pioooooooooip(a)gmail.com>
Reported-by: Zhitong Liu <liuzhitong1993(a)gmail.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Qianchang Zhao <pioooooooooip(a)gmail.com>
---
fs/fuse/virtio_fs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 6bc7c97b0..b2f6486fe 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs)
sprintf(buff, "%d", i);
fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj);
- if (!fs->mqs_kobj) {
+ if (!fsvq->kobj) {
ret = -ENOMEM;
goto out_del;
}
--
2.34.1
KVM currently fails a nested VMRUN and injects VMEXIT_INVALID (aka
SVM_EXIT_ERR) if L1 sets NP_ENABLE and the host does not support NPTs.
On first glance, it seems like the check should actually be for
guest_cpu_cap_has(X86_FEATURE_NPT) instead, as it is possible for the
host to support NPTs but the guest CPUID to not advertise it.
However, the consistency check is not architectural to begin with. The
APM does not mention VMEXIT_INVALID if NP_ENABLE is set on a processor
that does not have X86_FEATURE_NPT. Hence, NP_ENABLE should be ignored
if X86_FEATURE_NPT is not available for L1. Apart from the consistency
check, this is currently the case because NP_ENABLE is actually copied
from VMCB01 to VMCB02, not from VMCB12.
On the other hand, the APM does mention two other consistency checks for
NP_ENABLE, both of which are missing (paraphrased):
In Volume #2, 15.25.3 (24593—Rev. 3.42—March 2024):
If VMRUN is executed with hCR0.PG cleared to zero and NP_ENABLE set to
1, VMRUN terminates with #VMEXIT(VMEXIT_INVALID)
In Volume #2, 15.25.4 (24593—Rev. 3.42—March 2024):
When VMRUN is executed with nested paging enabled (NP_ENABLE = 1), the
following conditions are considered illegal state combinations, in
addition to those mentioned in “Canonicalization and Consistency
Checks”:
• Any MBZ bit of nCR3 is set.
• Any G_PAT.PA field has an unsupported type encoding or any
reserved field in G_PAT has a nonzero value.
Replace the existing consistency check with consistency checks on
hCR0.PG and nCR3. Only perform the consistency checks if L1 has
X86_FEATURE_NPT and NP_ENABLE is set in VMCB12. The G_PAT consistency
check will be addressed separately.
As it is now possible for an L1 to run L2 with NP_ENABLE set but
ignored, also check that L1 has X86_FEATURE_NPT in nested_npt_enabled().
Pass L1's CR0 to __nested_vmcb_check_controls(). In
nested_vmcb_check_controls(), L1's CR0 is available through
kvm_read_cr0(), as vcpu->arch.cr0 is not updated to L2's CR0 until later
through nested_vmcb02_prepare_save() -> svm_set_cr0().
In svm_set_nested_state(), L1's CR0 is available in the captured save
area, as svm_get_nested_state() captures L1's save area when running L2,
and L1's CR0 is stashed in VMCB01 on nested VMRUN (in
nested_svm_vmrun()).
Fixes: 4b16184c1cca ("KVM: SVM: Initialize Nested Nested MMU context on VMRUN")
Cc: stable(a)vger.kernel.org
Signed-off-by: Yosry Ahmed <yosry.ahmed(a)linux.dev>
---
arch/x86/kvm/svm/nested.c | 21 ++++++++++++++++-----
arch/x86/kvm/svm/svm.h | 3 ++-
2 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 74211c5c68026..87bcc5eff96e8 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -325,7 +325,8 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
}
static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
- struct vmcb_ctrl_area_cached *control)
+ struct vmcb_ctrl_area_cached *control,
+ unsigned long l1_cr0)
{
if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
return false;
@@ -333,8 +334,12 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
if (CC(control->asid == 0))
return false;
- if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
- return false;
+ if (nested_npt_enabled(to_svm(vcpu))) {
+ if (CC(!kvm_vcpu_is_legal_gpa(vcpu, control->nested_cr3)))
+ return false;
+ if (CC(!(l1_cr0 & X86_CR0_PG)))
+ return false;
+ }
if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
MSRPM_SIZE)))
@@ -400,7 +405,12 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
- return __nested_vmcb_check_controls(vcpu, ctl);
+ /*
+ * Make sure we did not enter guest mode yet, in which case
+ * kvm_read_cr0() could return L2's CR0.
+ */
+ WARN_ON_ONCE(is_guest_mode(vcpu));
+ return __nested_vmcb_check_controls(vcpu, ctl, kvm_read_cr0(vcpu));
}
static
@@ -1831,7 +1841,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
ret = -EINVAL;
__nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
- if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
+ /* 'save' contains L1 state saved from before VMRUN */
+ if (!__nested_vmcb_check_controls(vcpu, &ctl_cached, save->cr0))
goto out_free;
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f6fb70ddf7272..3e805a43ffcdb 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -552,7 +552,8 @@ static inline bool gif_set(struct vcpu_svm *svm)
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
{
- return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+ return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_NPT) &&
+ svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}
static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
--
2.51.2.1041.gc1ab5b90ca-goog
From: Chen Linxuan <me(a)black-desk.cn>
When using fsconfig(..., FSCONFIG_CMD_CREATE, ...), the filesystem
context is retrieved from the file descriptor. Since the file structure
persists across syscall restarts, the context state is preserved:
// fs/fsopen.c
SYSCALL_DEFINE5(fsconfig, ...)
{
...
fc = fd_file(f)->private_data;
...
ret = vfs_fsconfig_locked(fc, cmd, &param);
...
}
In vfs_cmd_create(), the context phase is transitioned to
FS_CONTEXT_CREATING before calling vfs_get_tree():
// fs/fsopen.c
static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
{
...
fc->phase = FS_CONTEXT_CREATING;
...
ret = vfs_get_tree(fc);
...
}
However, vfs_get_tree() may return -ERESTARTNOINTR if the filesystem
implementation needs to restart the syscall. For example, cgroup v1 does
this when it encounters a race condition where the root is dying:
// kernel/cgroup/cgroup-v1.c
int cgroup1_get_tree(struct fs_context *fc)
{
...
if (unlikely(ret > 0)) {
msleep(10);
return restart_syscall();
}
return ret;
}
If the syscall is restarted, fsconfig() is called again and retrieves
the *same* fs_context. However, vfs_cmd_create() rejects the call
because the phase was left as FS_CONTEXT_CREATING during the first
attempt:
if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
return -EBUSY;
Fix this by resetting fc->phase back to FS_CONTEXT_CREATE_PARAMS if
vfs_get_tree() returns -ERESTARTNOINTR.
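To illustrate the failure mode from userspace, a minimal sketch using
the raw syscalls (surrounding function, error handling, and the mount
options needed for a real cgroup v1 mount are omitted):

    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/mount.h>

    int fsfd = (int)syscall(SYS_fsopen, "cgroup", 0);

    /* If cgroup1_get_tree() hits the dying-root race, the kernel
     * transparently restarts this call. Before the fix, the restarted
     * attempt found fc->phase still set to FS_CONTEXT_CREATING and
     * returned -EBUSY instead of retrying the create.
     */
    syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);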
Cc: stable(a)vger.kernel.org
Signed-off-by: Chen Linxuan <me(a)black-desk.cn>
---
fs/fsopen.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/fsopen.c b/fs/fsopen.c
index f645c99204eb..8a7cb031af50 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -229,6 +229,10 @@ static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
fc->exclusive = exclusive;
ret = vfs_get_tree(fc);
+ if (ret == -ERESTARTNOINTR) {
+ fc->phase = FS_CONTEXT_CREATE_PARAMS;
+ return ret;
+ }
if (ret) {
fc->phase = FS_CONTEXT_FAILED;
return ret;
---
base-commit: 187d0801404f415f22c0b31531982c7ea97fa341
change-id: 20251213-mount-ebusy-8ee3888a7e4f
Best regards,
--
Chen Linxuan <me(a)black-desk.cn>
Hello,
move_local_task_to_local_dsq() was missing post-enqueue handling which
matters now that scx_bpf_dsq_move() can be called while the CPU is busy.
v2: Updated commit messages and added Cc stable.
v1: http://lkml.kernel.org/r/20251211224809.3383633-1-tj@kernel.org
0001-sched_ext-Factor-out-local_dsq_post_enq-from-dispatc.patch
0002-sched_ext-Fix-missing-post-enqueue-handling-in-move_.patch
Based on sched_ext/for-6.19-fixes (9f769637a93f).
diffstat:
kernel/sched/ext.c | 44 +++++++++++++++++++++++++++++---------------
1 file changed, 29 insertions(+), 15 deletions(-)
--
tejun
Automatically unpin pages on cleanup. The test currently fails with
the error
[ 58.246263] drm-kunit-mock-device drm_gem_shmem_test_get_sg_table.drm-kunit-mock-device: [drm] drm_WARN_ON(refcount_read(&shmem->pages_pin_count))
while cleaning up the GEM object. The pin count has to be zero at this
point.
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Fixes: d586b535f144 ("drm/shmem-helper: Add and use pages_pin_count")
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v6.16+
---
drivers/gpu/drm/tests/drm_gem_shmem_test.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/tests/drm_gem_shmem_test.c b/drivers/gpu/drm/tests/drm_gem_shmem_test.c
index 872881ec9c30..1d50bab51ef3 100644
--- a/drivers/gpu/drm/tests/drm_gem_shmem_test.c
+++ b/drivers/gpu/drm/tests/drm_gem_shmem_test.c
@@ -34,6 +34,9 @@ KUNIT_DEFINE_ACTION_WRAPPER(sg_free_table_wrapper, sg_free_table,
KUNIT_DEFINE_ACTION_WRAPPER(drm_gem_shmem_free_wrapper, drm_gem_shmem_free,
struct drm_gem_shmem_object *);
+KUNIT_DEFINE_ACTION_WRAPPER(drm_gem_shmem_unpin_wrapper, drm_gem_shmem_unpin,
+ struct drm_gem_shmem_object *);
+
/*
* Test creating a shmem GEM object backed by shmem buffer. The test
* case succeeds if the GEM object is successfully allocated with the
@@ -212,6 +215,9 @@ static void drm_gem_shmem_test_get_sg_table(struct kunit *test)
ret = drm_gem_shmem_pin(shmem);
KUNIT_ASSERT_EQ(test, ret, 0);
+ ret = kunit_add_action_or_reset(test, drm_gem_shmem_unpin_wrapper, shmem);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
sgt = drm_gem_shmem_get_sg_table(shmem);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sgt);
KUNIT_EXPECT_NULL(test, shmem->sgt);
--
2.52.0
Calling napi_disable() on an already disabled napi can cause a
deadlock. In commit 4bc12818b363 ("virtio-net: disable delayed refill
when pausing rx"), to avoid the deadlock, when pausing the RX in
virtnet_rx_pause[_all](), we disable and cancel the delayed refill work.
However, in the virtnet_rx_resume_all(), we enable the delayed refill
work too early before enabling all the receive queue napis.
The deadlock can be reproduced by running
selftests/drivers/net/hw/xsk_reconfig.py with multiqueue virtio-net
device and inserting a cond_resched() inside the for loop in
virtnet_rx_resume_all() to increase the success rate. Because the worker
processing the delayed refilled work runs on the same CPU as
virtnet_rx_resume_all(), a reschedule is needed to cause the deadlock.
In real scenario, the contention on netdev_lock can cause the
reschedule.
This fixes the deadlock by ensuring all receive queue's napis are
enabled before we enable the delayed refill work in
virtnet_rx_resume_all() and virtnet_open().
Fixes: 4bc12818b363 ("virtio-net: disable delayed refill when pausing rx")
Reported-by: Paolo Abeni <pabeni(a)redhat.com>
Closes: https://netdev-ctrl.bots.linux.dev/logs/vmksft/drv-hw-dbg/results/400961/3-…
Cc: stable(a)vger.kernel.org
Signed-off-by: Bui Quang Minh <minhquangbui99(a)gmail.com>
---
Changes in v2:
- Move try_fill_recv() before rx napi_enable()
- Link to v1: https://lore.kernel.org/netdev/20251208153419.18196-1-minhquangbui99@gmail.…
---
drivers/net/virtio_net.c | 71 +++++++++++++++++++++++++---------------
1 file changed, 45 insertions(+), 26 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8e04adb57f52..4e08880a9467 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3214,21 +3214,31 @@ static void virtnet_update_settings(struct virtnet_info *vi)
static int virtnet_open(struct net_device *dev)
{
struct virtnet_info *vi = netdev_priv(dev);
+ bool schedule_refill = false;
int i, err;
- enable_delayed_refill(vi);
-
+ /* - We must call try_fill_recv before enabling napi of the same receive
+ * queue so that it doesn't race with the call in virtnet_receive.
+ * - We must enable and schedule delayed refill work only when we have
+ * enabled all the receive queue's napi. Otherwise, in refill_work, we
+ * have a deadlock when calling napi_disable on an already disabled
+ * napi.
+ */
for (i = 0; i < vi->max_queue_pairs; i++) {
if (i < vi->curr_queue_pairs)
/* Make sure we have some buffers: if oom use wq. */
if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
- schedule_delayed_work(&vi->refill, 0);
+ schedule_refill = true;
err = virtnet_enable_queue_pair(vi, i);
if (err < 0)
goto err_enable_qp;
}
+ enable_delayed_refill(vi);
+ if (schedule_refill)
+ schedule_delayed_work(&vi->refill, 0);
+
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
if (vi->status & VIRTIO_NET_S_LINK_UP)
netif_carrier_on(vi->dev);
@@ -3463,39 +3473,48 @@ static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
__virtnet_rx_pause(vi, rq);
}
-static void __virtnet_rx_resume(struct virtnet_info *vi,
- struct receive_queue *rq,
- bool refill)
+static void virtnet_rx_resume_all(struct virtnet_info *vi)
{
- bool running = netif_running(vi->dev);
bool schedule_refill = false;
+ int i;
- if (refill && !try_fill_recv(vi, rq, GFP_KERNEL))
- schedule_refill = true;
- if (running)
- virtnet_napi_enable(rq);
-
- if (schedule_refill)
- schedule_delayed_work(&vi->refill, 0);
-}
+ if (netif_running(vi->dev)) {
+ /* See the comment in virtnet_open for the ordering rule
+ * of try_fill_recv, receive queue napi_enable and delayed
+ * refill enable/schedule.
+ */
+ for (i = 0; i < vi->max_queue_pairs; i++) {
+ if (i < vi->curr_queue_pairs)
+ if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
+ schedule_refill = true;
-static void virtnet_rx_resume_all(struct virtnet_info *vi)
-{
- int i;
+ virtnet_napi_enable(&vi->rq[i]);
+ }
- enable_delayed_refill(vi);
- for (i = 0; i < vi->max_queue_pairs; i++) {
- if (i < vi->curr_queue_pairs)
- __virtnet_rx_resume(vi, &vi->rq[i], true);
- else
- __virtnet_rx_resume(vi, &vi->rq[i], false);
+ enable_delayed_refill(vi);
+ if (schedule_refill)
+ schedule_delayed_work(&vi->refill, 0);
}
}
static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq)
{
- enable_delayed_refill(vi);
- __virtnet_rx_resume(vi, rq, true);
+ bool schedule_refill = false;
+
+ if (netif_running(vi->dev)) {
+ /* See the comment in virtnet_open for the ordering rule
+ * of try_fill_recv, receive queue napi_enable and delayed
+ * refill enable/schedule.
+ */
+ if (!try_fill_recv(vi, rq, GFP_KERNEL))
+ schedule_refill = true;
+
+ virtnet_napi_enable(rq);
+
+ enable_delayed_refill(vi);
+ if (schedule_refill)
+ schedule_delayed_work(&vi->refill, 0);
+ }
}
static int virtnet_rx_resize(struct virtnet_info *vi,
--
2.43.0
Initialize the eb.vma array with values of 0 when the eb structure is
first set up. In particular, this sets the eb->vma[i].vma pointers to
NULL, simplifying cleanup and getting rid of the bug described below.
During the execution of eb_lookup_vmas(), the eb->vma array is
successively filled up with struct eb_vma objects. This process includes
calling eb_add_vma(), which might fail; however, even in the event of
failure, eb->vma[i].vma is set for the currently processed buffer.
If eb_add_vma() fails, eb_lookup_vmas() returns with an error, which
prompts a call to eb_release_vmas() to clean up the mess. Since
eb_lookup_vmas() might fail during processing any (possibly not first)
buffer, eb_release_vmas() checks whether a buffer's vma is NULL to know
at what point the lookup function failed.
In eb_lookup_vmas(), eb->vma[i].vma is set to NULL if either the helper
function eb_lookup_vma() or eb_validate_vma() fails. eb->vma[i+1].vma is
set to NULL in case i915_gem_object_userptr_submit_init() fails; the
current one needs to be cleaned up by eb_release_vmas() at this point,
so the next one is set. If eb_add_vma() fails, neither the current nor
the next vma is nullified, which is a source of a NULL deref bug
described in [1].
When entering eb_lookup_vmas(), the vma pointers hold the slab poison
value instead of NULL. This doesn't matter for the actual lookup, since
they get overwritten anyway; however, eb_release_vmas() only recognizes
NULL as the stopping value, so until now the pointers had to be
nullified one by one on intermediate failure. This patch changes the
approach: fill them all with NULL at the start instead of handling that
manually during failure.
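Schematically, the NULL-sentinel cleanup pattern this relies on (a
sketch, not the literal eb_release_vmas() body):

    memset(eb->vma, 0, count * sizeof(*eb->vma));  /* every .vma starts NULL */

    /* ... eb_lookup_vmas() fills entries 0..i-1, then fails at entry i ... */

    for (i = 0; i < count; i++) {
        struct eb_vma *ev = &eb->vma[i];

        if (!ev->vma)      /* NULL marks the first slot never filled in */
            break;
        /* drop the references taken for ev->vma */
    }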
Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/15062
Fixes: 544460c33821 ("drm/i915: Multi-BB execbuf")
Reported-by: Gangmin Kim <km.kim1503(a)gmail.com>
Cc: <stable(a)vger.kernel.org> # 5.16.x
Signed-off-by: Krzysztof Niemiec <krzysztof.niemiec(a)intel.com>
---
I messed up the continuity in previous revisions; the original patch
was sent as [1], and the first revision (which I didn't mark as v2 due
to the title change) was sent as [2].
This is the full current changelog:
v4:
- delete an empty line (Janusz), reword the comment a bit (Krzysztof,
Janusz)
v3:
- use memset() to fill the entire eb.vma array with zeros instead of
looping through the elements (Janusz)
- add a comment clarifying the mechanism of the initial allocation (Janusz)
- change the commit log again, including title
- rearrange the tags to keep checkpatch happy
v2:
- set the eb->vma[i].vma pointers to NULL during setup instead of
ad-hoc at failure (Janusz)
- romanize the reporter's name (Andi, offline)
- change the commit log, including title
[1] https://patchwork.freedesktop.org/series/156832/
[2] https://patchwork.freedesktop.org/series/158036/
.../gpu/drm/i915/gem/i915_gem_execbuffer.c | 37 +++++++++----------
1 file changed, 17 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index b057c2fa03a4..348023d13668 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -951,13 +951,13 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
vma = eb_lookup_vma(eb, eb->exec[i].handle);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
- goto err;
+ return err;
}
err = eb_validate_vma(eb, &eb->exec[i], vma);
if (unlikely(err)) {
i915_vma_put(vma);
- goto err;
+ return err;
}
err = eb_add_vma(eb, &current_batch, i, vma);
@@ -966,19 +966,8 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
if (i915_gem_object_is_userptr(vma->obj)) {
err = i915_gem_object_userptr_submit_init(vma->obj);
- if (err) {
- if (i + 1 < eb->buffer_count) {
- /*
- * Execbuffer code expects last vma entry to be NULL,
- * since we already initialized this entry,
- * set the next value to NULL or we mess up
- * cleanup handling.
- */
- eb->vma[i + 1].vma = NULL;
- }
-
+ if (err)
return err;
- }
eb->vma[i].flags |= __EXEC_OBJECT_USERPTR_INIT;
eb->args->flags |= __EXEC_USERPTR_USED;
@@ -986,10 +975,6 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
}
return 0;
-
-err:
- eb->vma[i].vma = NULL;
- return err;
}
static int eb_lock_vmas(struct i915_execbuffer *eb)
@@ -3375,7 +3360,8 @@ i915_gem_do_execbuffer(struct drm_device *dev,
eb.exec = exec;
eb.vma = (struct eb_vma *)(exec + args->buffer_count + 1);
- eb.vma[0].vma = NULL;
+ memset(eb.vma, 0x00, args->buffer_count * sizeof(struct eb_vma));
+
eb.batch_pool = NULL;
eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
@@ -3584,7 +3570,18 @@ i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data,
if (err)
return err;
- /* Allocate extra slots for use by the command parser */
+ /*
+ * Allocate extra slots for use by the command parser.
+ *
+ * Note that this allocation handles two different arrays (the
+ * exec2_list array, and the eventual eb.vma array introduced in
+ * i915_gem_do_execbuffer()), that reside in virtually contiguous
+ * memory. Also note that the allocation intentionally doesn't fill the
+ * area with zeros (because the exec2_list part doesn't need to be, as
+ * it's immediately overwritten by user data a few lines below).
+ * However, the eb.vma part is explicitly zeroed later in
+ * i915_gem_do_execbuffer().
+ */
exec2_list = kvmalloc_array(count + 2, eb_element_size(),
__GFP_NOWARN | GFP_KERNEL);
if (exec2_list == NULL) {
--
2.45.2
Commit a4e772898f8b ("PCI: Add missing bridge lock to pci_bus_lock()")
delegates the bridge device's pci_dev_trylock() to pci_bus_trylock() in
pci_slot_trylock(), but it forgets to remove the corresponding
pci_dev_unlock() when pci_bus_trylock() fails.
Before the commit, the code did:
if (!pci_dev_trylock(dev)) /* <- lock bridge device */
goto unlock;
if (dev->subordinate) {
if (!pci_bus_trylock(dev->subordinate)) {
pci_dev_unlock(dev); /* <- unlock bridge device */
goto unlock;
}
}
After the commit, pci_slot_trylock() no longer takes the bridge-device
lock itself, but the pci_dev_unlock(dev) on the failure path was left in
place, leading to the bug.
This yields one of two errors:
1. A warning that the lock is being unlocked when no one holds it.
2. An incorrect unlock of a lock that belongs to another thread.
Fix it by removing the now-redundant pci_dev_unlock(dev) on the failure
path.
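With the fix applied, the path reads:

    if (dev->subordinate) {
        if (!pci_bus_trylock(dev->subordinate)) /* also locks the bridge device */
            goto unlock;
    } else if (!pci_dev_trylock(dev))
        goto unlock;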
Fixes: a4e772898f8b ("PCI: Add missing bridge lock to pci_bus_lock()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jinhui Guo <guojinhui.liam(a)bytedance.com>
Acked-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
---
Hi, all
v1: https://lore.kernel.org/all/20251211123635.2215-1-guojinhui.liam@bytedance.…
Changelog in v1 -> v2
- The v1 commit message was too brief, so I’ve sent v2 with more detail.
- Remove the braces from the if (!pci_bus_trylock(dev->subordinate)) statement.
Sorry for the noise.
Best Regards,
Jinhui
drivers/pci/pci.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 13dbb405dc31..59319e08fca6 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5346,10 +5346,8 @@ static int pci_slot_trylock(struct pci_slot *slot)
if (!dev->slot || dev->slot != slot)
continue;
if (dev->subordinate) {
- if (!pci_bus_trylock(dev->subordinate)) {
- pci_dev_unlock(dev);
+ if (!pci_bus_trylock(dev->subordinate))
goto unlock;
- }
} else if (!pci_dev_trylock(dev))
goto unlock;
}
--
2.20.1
Commit a4e772898f8b ("PCI: Add missing bridge lock to pci_bus_lock()")
delegates the bridge device's pci_dev_trylock() to pci_bus_trylock()
in pci_slot_trylock(), but it leaves a redundant pci_dev_unlock() when
pci_bus_trylock() fails.
Remove the redundant bridge-device pci_dev_unlock() in pci_slot_trylock(),
since that lock is no longer taken there.
Fixes: a4e772898f8b ("PCI: Add missing bridge lock to pci_bus_lock()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jinhui Guo <guojinhui.liam(a)bytedance.com>
---
drivers/pci/pci.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 13dbb405dc31..75a98819db6f 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5347,7 +5347,6 @@ static int pci_slot_trylock(struct pci_slot *slot)
continue;
if (dev->subordinate) {
if (!pci_bus_trylock(dev->subordinate)) {
- pci_dev_unlock(dev);
goto unlock;
}
} else if (!pci_dev_trylock(dev))
--
2.20.1
Add missing drm_gem_object_put() call when drm_gem_object_lookup()
successfully returns an object. This fixes a GEM object reference
leak that can prevent driver modules from unloading when using
prime buffers.
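The underlying rule is that drm_gem_object_lookup() returns the object
with a reference held, so every successful lookup must be balanced by a
drm_gem_object_put() on every exit path. A sketch of the balanced shape
(labels illustrative):

    obj = drm_gem_object_lookup(file_priv, args->handle); /* +1 reference */
    if (!obj)
        return -ENOENT;
    /* ... use obj ... */
    ret = 0;
out:
    drm_gem_object_put(obj);                              /* -1 on all paths */
    return ret;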
Fixes: 53096728b891 ("drm: Add DRM prime interface to reassign GEM handle")
Cc: <stable(a)vger.kernel.org> # v6.18+
Signed-off-by: Karol Wachowski <karol.wachowski(a)linux.intel.com>
---
Changes between v3 and v2:
- correctly add CC: tag this time
Changes between v1 and v2:
- move setting ret value under if branch as suggested in review
- add Cc: stable 6.18+
---
drivers/gpu/drm/drm_gem.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index ca1956608261..bcc08a6aebf8 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -1010,8 +1010,10 @@ int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data,
if (!obj)
return -ENOENT;
- if (args->handle == args->new_handle)
- return 0;
+ if (args->handle == args->new_handle) {
+ ret = 0;
+ goto out;
+ }
mutex_lock(&file_priv->prime.lock);
@@ -1043,6 +1045,8 @@ int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data,
out_unlock:
mutex_unlock(&file_priv->prime.lock);
+out:
+ drm_gem_object_put(obj);
return ret;
}
--
2.43.0
Two somewhat related fixes addressing different issues found by
syzkaller, and producing the exact same splat: a WARNING in
subflow_data_ready().
- Patch 1: fallback earlier on simultaneous connections to avoid a
warning. A fix for v5.19.
- Patch 2: ensure context reset on disconnect, also to avoid a similar
warning. A fix for v6.2.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Paolo Abeni (2):
mptcp: fallback earlier on simult connection
mptcp: ensure context reset on disconnect()
net/mptcp/options.c | 10 ++++++++++
net/mptcp/protocol.c | 8 +++++---
net/mptcp/protocol.h | 9 ++++-----
net/mptcp/subflow.c | 6 ------
4 files changed, 19 insertions(+), 14 deletions(-)
---
base-commit: 885bebac9909994050bbbeed0829c727e42bd1b7
change-id: 20251212-net-mptcp-subflow_data_ready-warn-fd8126208c90
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
The documentation for `Cursor::peek_next` incorrectly describes it as
"Access the previous node without moving the cursor" when it actually
accesses the next node. Update the description to correctly state
"Access the next node without moving the cursor" to match the function
name and implementation.
Reported-by: Miguel Ojeda <ojeda(a)kernel.org>
Closes: https://github.com/Rust-for-Linux/linux/issues/1205
Fixes: 98c14e40e07a0 ("rust: rbtree: add cursor")
Cc: stable(a)vger.kernel.org
Signed-off-by: WeiKang Guo <guoweikang.kernel(a)outlook.com>
---
rust/kernel/rbtree.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rust/kernel/rbtree.rs b/rust/kernel/rbtree.rs
index 4729eb56827a..cd187e2ca328 100644
--- a/rust/kernel/rbtree.rs
+++ b/rust/kernel/rbtree.rs
@@ -985,7 +985,7 @@ pub fn peek_prev(&self) -> Option<(&K, &V)> {
self.peek(Direction::Prev)
}
- /// Access the previous node without moving the cursor.
+ /// Access the next node without moving the cursor.
pub fn peek_next(&self) -> Option<(&K, &V)> {
self.peek(Direction::Next)
}
--
2.52.0
Before this change the LED was added to leds_list before led_init_core()
was called, i.e. it was put on the list before
led_classdev.set_brightness_work was initialized.
This leaves a window where led_trigger_register() of a LED's default
trigger will call led_trigger_set() which calls led_set_brightness()
which in turn will end up queueing the *uninitialized*
led_classdev.set_brightness_work.
This race gets hit by the lenovo-thinkpad-t14s EC driver which registers
2 LEDs with a default trigger provided by snd_ctl_led.ko in quick
succession. The first led_classdev_register() causes an async modprobe of
snd_ctl_led to run, and that async modprobe manages to hit exactly the
window where the second LED is on leds_list without led_init_core()
having been called for it, resulting in:
------------[ cut here ]------------
WARNING: CPU: 11 PID: 5608 at kernel/workqueue.c:4234 __flush_work+0x344/0x390
Hardware name: LENOVO 21N2S01F0B/21N2S01F0B, BIOS N42ET93W (2.23 ) 09/01/2025
...
Call trace:
__flush_work+0x344/0x390 (P)
flush_work+0x2c/0x50
led_trigger_set+0x1c8/0x340
led_trigger_register+0x17c/0x1c0
led_trigger_register_simple+0x84/0xe8
snd_ctl_led_init+0x40/0xf88 [snd_ctl_led]
do_one_initcall+0x5c/0x318
do_init_module+0x9c/0x2b8
load_module+0x7e0/0x998
Close the race window by moving the adding of the LED to leds_list to
after the led_init_core() call.
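Schematically, the fixed ordering in led_classdev_register_ext()
(condensed from the diff; the assumption here is that led_init_core()
is what initializes set_brightness_work):

    led_init_core(led_cdev);            /* set_brightness_work is now valid */

    down_write(&leds_list_lock);        /* only now expose the LED, so      */
    list_add_tail(&led_cdev->node, &leds_list); /* triggers can find it     */
    up_write(&leds_list_lock);

    led_trigger_set_default(led_cdev);  /* may queue set_brightness_work    */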
Cc: Sebastian Reichel <sre(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Hans de Goede <johannes.goede(a)oss.qualcomm.com>
---
Note no Fixes tag as this problem has been around for a long long time,
so I could not really find a good commit for the Fixes tag.
---
drivers/leds/led-class.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index f3faf37f9a08..6b9fa060c3a1 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -560,11 +560,6 @@ int led_classdev_register_ext(struct device *parent,
#ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED
led_cdev->brightness_hw_changed = -1;
#endif
- /* add to the list of leds */
- down_write(&leds_list_lock);
- list_add_tail(&led_cdev->node, &leds_list);
- up_write(&leds_list_lock);
-
if (!led_cdev->max_brightness)
led_cdev->max_brightness = LED_FULL;
@@ -574,6 +569,11 @@ int led_classdev_register_ext(struct device *parent,
led_init_core(led_cdev);
+ /* add to the list of leds */
+ down_write(&leds_list_lock);
+ list_add_tail(&led_cdev->node, &leds_list);
+ up_write(&leds_list_lock);
+
#ifdef CONFIG_LEDS_TRIGGERS
led_trigger_set_default(led_cdev);
#endif
--
2.52.0
lkkbd_interrupt() schedules lk->tq with schedule_work(), and the work
handler lkkbd_reinit() dereferences the lkkbd structure and its
serio/input_dev fields.
lkkbd_disconnect() frees the lkkbd structure without cancelling this
work, so the work can run after the structure has been freed, leading
to a potential use-after-free.
Cancel the pending work in lkkbd_disconnect() before unregistering and
freeing the device, following the same pattern as sunkbd.
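The resulting teardown order (schematic; only the cancel_work_sync()
line is new):

    cancel_work_sync(&lk->tq);       /* wait for a queued lkkbd_reinit()   */
    input_get_device(lk->dev);
    input_unregister_device(lk->dev);
    serio_close(serio);
    /* ... lk is freed only after this point, so no work can touch it ... */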
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Minseong Kim <ii4gsp(a)gmail.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Minseong Kim <ii4gsp(a)gmail.com>
---
drivers/input/keyboard/lkkbd.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/input/keyboard/lkkbd.c b/drivers/input/keyboard/lkkbd.c
index c035216dd27c..72c477aab1fc 100644
--- a/drivers/input/keyboard/lkkbd.c
+++ b/drivers/input/keyboard/lkkbd.c
@@ -684,6 +684,8 @@ static void lkkbd_disconnect(struct serio *serio)
{
struct lkkbd *lk = serio_get_drvdata(serio);
+ cancel_work_sync(&lk->tq);
+
input_get_device(lk->dev);
input_unregister_device(lk->dev);
serio_close(serio);
--
2.39.5
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 4368a3b96c427ea3299be8cedb28684ba4157529
Gitweb: https://git.kernel.org/tip/4368a3b96c427ea3299be8cedb28684ba4157529
Author: Yazen Ghannam <yazen.ghannam(a)amd.com>
AuthorDate: Tue, 11 Nov 2025 14:53:57
Committer: Ingo Molnar <mingo(a)kernel.org>
CommitterDate: Fri, 12 Dec 2025 09:38:30 +01:00
x86/acpi/boot: Correct acpi_is_processor_usable() check again
ACPI v6.3 defined a new "Online Capable" MADT LAPIC flag. This bit is
used in conjunction with the "Enabled" MADT LAPIC flag to determine if
a CPU can be enabled/hotplugged by the OS after boot.
Before the new bit was defined, the "Enabled" bit was explicitly
described like this (ACPI v6.0 wording provided):
"If zero, this processor is unusable, and the operating system
support will not attempt to use it"
This means that CPU hotplug (based on MADT) is not possible. Many BIOS
implementations follow this guidance. They may include LAPIC entries in
MADT for unavailable CPUs, but since these entries are marked with
"Enabled=0" it is expected that the OS will completely ignore these
entries.
However, QEMU will do the same (include entries with "Enabled=0") for
the purpose of allowing CPU hotplug within the guest.
Comment from QEMU function pc_madt_cpu_entry():
/* ACPI spec says that LAPIC entry for non present
* CPU may be omitted from MADT or it must be marked
* as disabled. However omitting non present CPU from
* MADT breaks hotplug on linux. So possible CPUs
* should be put in MADT but kept disabled.
*/
Recent Linux topology changes broke the QEMU use case. A following fix
for the QEMU use case broke bare metal topology enumeration.
Rework the Linux MADT LAPIC flags check to allow the QEMU use case only
for guests and to maintain the ACPI spec behavior for bare metal.
Remove an unnecessary check added to fix a bare metal case introduced by
the QEMU "fix".
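The resulting policy for a LAPIC entry, as implemented by the diff
below, can be summarized as:

    Enabled=1                               -> usable
    Enabled=0, FW advertises Online Capable -> usable iff Online Capable=1
    Enabled=0, legacy FW, bare metal        -> not usable (per the ACPI spec)
    Enabled=0, legacy FW, running as guest  -> usable (QEMU hotplug case)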
[ bp: Change logic as Michal suggested. ]
Fixes: fed8d8773b8e ("x86/acpi/boot: Correct acpi_is_processor_usable() check")
Fixes: f0551af02130 ("x86/topology: Ignore non-present APIC IDs in a present package")
Closes: https://lore.kernel.org/r/20251024204658.3da9bf3f.michal.pecio@gmail.com
Reported-by: Michal Pecio <michal.pecio(a)gmail.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam(a)amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Tested-by: Michal Pecio <michal.pecio(a)gmail.com>
Tested-by: Ricardo Neri <ricardo.neri-calderon(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/20251111145357.4031846-1-yazen.ghannam@amd.com
---
arch/x86/kernel/acpi/boot.c | 12 ++++++++----
arch/x86/kernel/cpu/topology.c | 15 ---------------
2 files changed, 8 insertions(+), 19 deletions(-)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9fa321a..d6138b2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -35,6 +35,7 @@
#include <asm/smp.h>
#include <asm/i8259.h>
#include <asm/setup.h>
+#include <asm/hypervisor.h>
#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
@@ -164,11 +165,14 @@ static bool __init acpi_is_processor_usable(u32 lapic_flags)
if (lapic_flags & ACPI_MADT_ENABLED)
return true;
- if (!acpi_support_online_capable ||
- (lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
- return true;
+ if (acpi_support_online_capable)
+ return lapic_flags & ACPI_MADT_ONLINE_CAPABLE;
- return false;
+ /*
+ * QEMU expects legacy "Enabled=0" LAPIC entries to be counted as usable
+ * in order to support CPU hotplug in guests.
+ */
+ return !hypervisor_is_type(X86_HYPER_NATIVE);
}
static int __init
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index f55ea3c..23190a7 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -27,7 +27,6 @@
#include <xen/xen.h>
#include <asm/apic.h>
-#include <asm/hypervisor.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
#include <asm/msr.h>
@@ -236,20 +235,6 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
cpuid_to_apicid[cpu] = apic_id;
topo_set_cpuids(cpu, apic_id, acpi_id);
} else {
- u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
-
- /*
- * Check for present APICs in the same package when running
- * on bare metal. Allow the bogosity in a guest.
- */
- if (hypervisor_is_type(X86_HYPER_NATIVE) &&
- topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
- pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
- apic_id);
- topo_info.nr_rejected_cpus++;
- return;
- }
-
topo_info.nr_disabled_cpus++;
}
Add two flags for KVM_CAP_X2APIC_API to allow userspace to control support
for Suppress EOI Broadcasts, which KVM completely mishandles. When x2APIC
support was first added, KVM incorrectly advertised and "enabled" Suppress
EOI Broadcast, without fully supporting the I/O APIC side of the equation,
i.e. without adding directed EOI to KVM's in-kernel I/O APIC.
That flaw was carried over to split IRQCHIP support, i.e. KVM advertised
support for Suppress EOI Broadcasts irrespective of whether or not the
userspace I/O APIC implementation supported directed EOIs. Even worse,
KVM didn't actually suppress EOI broadcasts, i.e. userspace VMMs without
support for directed EOI came to rely on the "spurious" broadcasts.
KVM "fixed" the in-kernel I/O APIC implementation by completely disabling
support for Suppress EOI Broadcasts in commit 0bcc3fb95b97 ("KVM: lapic:
stop advertising DIRECTED_EOI when in-kernel IOAPIC is in use"), but
didn't do anything to remedy userspace I/O APIC implementations.
KVM's bogus handling of Suppress EOI Broadcast is problematic when the
guest relies on interrupts being masked in the I/O APIC until well after
the initial local APIC EOI. E.g. Windows with Credential Guard enabled
handles interrupts in the following order:
1. Interrupt for L2 arrives.
2. L1 APIC EOIs the interrupt.
3. L1 resumes L2 and injects the interrupt.
4. L2 EOIs after servicing.
5. L1 performs the I/O APIC EOI.
Because KVM EOIs the I/O APIC at step #2, the guest can get an interrupt
storm, e.g. if the IRQ line is still asserted and userspace reacts to the
EOI by re-injecting the IRQ, because the guest doesn't de-assert the line
until step #4, and doesn't expect the interrupt to be re-enabled until
step #5.
Unfortunately, simply "fixing" the bug isn't an option, as KVM has no way
of knowing if the userspace I/O APIC supports directed EOIs, i.e.
suppressing EOI broadcasts would result in interrupts being stuck masked
in the userspace I/O APIC due to step #5 being ignored by userspace. And
fully disabling support for Suppress EOI Broadcast is also undesirable, as
picking up the fix would require a guest reboot, *and* more importantly
would change the virtual CPU model exposed to the guest without any buy-in
from userspace.
Add KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and
KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST flags to allow userspace to
explicitly enable or disable support for Suppress EOI Broadcasts while
using split IRQCHIP mode. This gives userspace control over the virtual
CPU model exposed to the guest, as KVM should never have enabled support
for Suppress EOI Broadcast without a userspace opt-in. Not setting
either flag will result in legacy quirky behavior for backward
compatibility.
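For a VMM, opting in is a one-time KVM_ENABLE_CAP call on the VM file
descriptor. A minimal sketch (error handling condensed; vm_fd is assumed
to be an open VM fd, and combining with USE_32BIT_IDS is just one
plausible configuration):

    struct kvm_enable_cap cap = {
        .cap = KVM_CAP_X2APIC_API,
        .args = { KVM_X2APIC_API_USE_32BIT_IDS |
                  KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST },
    };

    if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
        err(1, "KVM_ENABLE_CAP(KVM_CAP_X2APIC_API)");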
Note, Suppress EOI Broadcasts is defined only in Intel's SDM, not in AMD's
APM. But the bit is writable on some AMD CPUs, e.g. Turin, and KVM's ABI
is to support Directed EOI (KVM's name) irrespective of guest CPU vendor.
Fixes: 7543a635aa09 ("KVM: x86: Add KVM exit for IOAPIC EOIs")
Closes: https://lore.kernel.org/kvm/7D497EF1-607D-4D37-98E7-DAF95F099342@nutanix.com
Cc: stable(a)vger.kernel.org
Suggested-by: David Woodhouse <dwmw2(a)infradead.org>
Co-developed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Khushit Shah <khushit.shah(a)nutanix.com>
---
v4:
- Add KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and
KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST flags to allow userspace to
explicitly enable or disable support for Suppress EOI Broadcasts while
using split IRQCHIP mode.
After the inclusion of David Woodhouse's patch to support IOAPIC version 0x20,
we can tweak the uAPI to support kernel IRQCHIP mode as well.
Testing:
- Setting both the flags fails with EINVAL.
- Setting flags in kernel IRQCHIP mode fails with EINVAL.
- Setting flags in split IRQCHIP mode succeeds and both the flags work
as expected.
---
Documentation/virt/kvm/api.rst | 27 ++++++++++++++++++++--
arch/x86/include/asm/kvm_host.h | 7 ++++++
arch/x86/include/uapi/asm/kvm.h | 6 +++--
arch/x86/kvm/lapic.c | 40 +++++++++++++++++++++++++++++++++
arch/x86/kvm/x86.c | 19 +++++++++++++---
5 files changed, 92 insertions(+), 7 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 57061fa29e6a..b26528e0fec1 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7800,8 +7800,10 @@ Will return -EBUSY if a VCPU has already been created.
Valid feature flags in args[0] are::
- #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
- #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+ #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
+ #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+ #define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST (1ULL << 2)
+ #define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST (1ULL << 3)
Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of
KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC,
@@ -7814,6 +7816,27 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC
without interrupt remapping. This is undesirable in logical mode,
where 0xff represents CPUs 0-7 in cluster 0.
+Setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST instructs KVM to enable
+Suppress EOI Broadcasts. KVM will advertise support for Suppress EOI Broadcast
+to the guest and suppress LAPIC EOI broadcasts when the guest sets the
+Suppress EOI Broadcast bit in the SPIV register.
+
+Setting KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST disables support for
+Suppress EOI Broadcasts entirely, i.e. instructs KVM to NOT advertise support
+to the guest.
+
+Modern VMMs should either enable KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST or
+KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST. If not, legacy quirky behavior will
+be used by KVM, which is to advertise support for Suppress EOI Broadcasts but
+not actually suppressing EOI broadcasts.
+
+Currently, both KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and
+KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST must only be set when in split IRQCHIP
+mode. Otherwise, the ioctl will fail with an EINVAL error.
+
+Setting both KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and
+KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST will fail with an EINVAL error.
+
7.8 KVM_CAP_S390_USER_INSTR0
----------------------------
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f..4a6d94dc7a2a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1229,6 +1229,12 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
};
+enum kvm_suppress_eoi_broadcast_mode {
+ KVM_SUPPRESS_EOI_BROADCAST_QUIRKED, /* Legacy behavior */
+ KVM_SUPPRESS_EOI_BROADCAST_ENABLED, /* Enable Suppress EOI broadcast */
+ KVM_SUPPRESS_EOI_BROADCAST_DISABLED /* Disable Suppress EOI broadcast */
+};
+
struct kvm_x86_msr_filter {
u8 count;
bool default_allow:1;
@@ -1480,6 +1486,7 @@ struct kvm_arch {
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
+ enum kvm_suppress_eoi_broadcast_mode suppress_eoi_broadcast_mode;
bool has_mapped_host_mmio;
bool guest_can_read_msr_platform_info;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d420c9c066d4..d30241429fa8 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -913,8 +913,10 @@ struct kvm_sev_snp_launch_finish {
__u64 pad1[4];
};
-#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
-#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+#define KVM_X2APIC_API_USE_32BIT_IDS (_BITULL(0))
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (_BITULL(1))
+#define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST (_BITULL(2))
+#define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST (_BITULL(3))
struct kvm_hyperv_eventfd {
__u32 conn_id;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0ae7f913d782..1ef0bd3eff1e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -105,6 +105,34 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
apic_test_vector(vector, apic->regs + APIC_IRR);
}
+static inline bool kvm_lapic_advertise_suppress_eoi_broadcast(struct kvm *kvm)
+{
+ /*
+ * Advertise Suppress EOI broadcast support to the guest unless the VMM
+ * explicitly disabled it.
+ *
+ * Historically, KVM advertised this capability even though it did not
+ * actually suppress EOIs.
+ */
+ return kvm->arch.suppress_eoi_broadcast_mode !=
+ KVM_SUPPRESS_EOI_BROADCAST_DISABLED;
+}
+
+static inline bool kvm_lapic_ignore_suppress_eoi_broadcast(struct kvm *kvm)
+{
+ /*
+ * Returns true if KVM should ignore the suppress EOI broadcast bit set by
+ * the guest and broadcast EOIs anyway.
+ *
+ * Only returns false when the VMM explicitly enabled Suppress EOI
+ * broadcast. If disabled by VMM, the bit should be ignored as it is not
+ * supported. Legacy behavior was to ignore the bit and broadcast EOIs
+ * anyway.
+ */
+ return kvm->arch.suppress_eoi_broadcast_mode !=
+ KVM_SUPPRESS_EOI_BROADCAST_ENABLED;
+}
+
__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu);
@@ -562,6 +590,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
* IOAPIC.
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) &&
+ kvm_lapic_advertise_suppress_eoi_broadcast(vcpu->kvm) &&
!ioapic_in_kernel(vcpu->kvm))
v |= APIC_LVR_DIRECTED_EOI;
kvm_lapic_set_reg(apic, APIC_LVR, v);
@@ -1517,6 +1546,17 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
/* Request a KVM exit to inform the userspace IOAPIC. */
if (irqchip_split(apic->vcpu->kvm)) {
+ /*
+ * Don't exit to userspace if the guest has enabled Directed
+ * EOI, a.k.a. Suppress EOI Broadcasts, in which case the local
+ * APIC doesn't broadcast EOIs (the guest must EOI the target
+ * I/O APIC(s) directly). Ignore the suppression if userspace
+ * has NOT explicitly enabled Suppress EOI broadcast.
+ */
+ if ((kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+ !kvm_lapic_ignore_suppress_eoi_broadcast(apic->vcpu->kvm))
+ return;
+
apic->vcpu->arch.pending_ioapic_eoi = vector;
kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
return;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c9c2aa6f4705..81b40fdb5f5f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -121,8 +121,11 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
-#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
- KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+#define KVM_X2APIC_API_VALID_FLAGS \
+ (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK | \
+ KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST | \
+ KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
@@ -6777,12 +6780,22 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = -EINVAL;
if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
break;
+ if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) &&
+ (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST))
+ break;
+ if (!irqchip_split(kvm) &&
+ ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) ||
+ (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)))
+ break;
if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
kvm->arch.x2apic_format = true;
if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
kvm->arch.x2apic_broadcast_quirk_disabled = true;
-
+ if (cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST)
+ kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_ENABLED;
+ if (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)
+ kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_DISABLED;
r = 0;
break;
case KVM_CAP_X86_DISABLE_EXITS:
--
2.39.3
The vdd_mpu regulator maximum voltage was previously limited to 1.2985V,
which prevented the CPU from reaching the 1GHz operating point. This
limitation was put in place because voltage changes were not working
correctly, causing the board to stall when attempting higher frequencies.
Increase the maximum voltage to 1.3515V to allow the full 1GHz OPP to be
used.
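In devicetree terms the change boils down to raising the regulator
ceiling (the node label is illustrative; the values follow from the
voltages above):

    &vdd_mpu {
        regulator-max-microvolt = <1351500>;    /* was <1298500> */
    };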
Add TPS65219 PMIC driver fixes that properly implement LOCK register
handling, making voltage transitions work reliably.
Changes in v4:
- Move the registers unlock in the probe instead of a custom regmap write
operation.
- Link to v3: https://lore.kernel.org/r/20251112-fix_tps65219-v3-0-e49bab4c01ce@bootlin.c…
Changes in v3:
- Remove an unused variable
- Link to v2: https://lore.kernel.org/r/20251106-fix_tps65219-v2-0-a7d608c4272f@bootlin.c…
Changes in v2:
- Setup a custom regmap_bus only for the TPS65214 instead of checking
the chip_id every time reg_write is called.
- Add the am335x-bonegreen-eco devicetree change in the same patch
series.
Signed-off-by: Kory Maincent (TI.com) <kory.maincent(a)bootlin.com>
---
Kory Maincent (TI.com) (2):
mfd: tps65219: Implement LOCK register handling for TPS65214
ARM: dts: am335x-bonegreen-eco: Enable 1GHz OPP by increasing vdd_mpu voltage
arch/arm/boot/dts/ti/omap/am335x-bonegreen-eco.dts | 2 +-
drivers/mfd/tps65219.c | 7 +++++++
include/linux/mfd/tps65219.h | 2 ++
3 files changed, 10 insertions(+), 1 deletion(-)
---
base-commit: 1c353dc8d962de652bc7ad2ba2e63f553331391c
change-id: 20251106-fix_tps65219-dd62141d22cf
Best regards,
--
Köry Maincent, Bootlin
Embedded Linux and kernel engineering
https://bootlin.com
The recent refactoring of where runtime PM is enabled, done in commit
f1eb4e792bb1 ("spi: spi-cadence-quadspi: Enable pm runtime earlier to
avoid imbalance"), means that a pm_runtime_disable() in the error paths
of probe() can trigger a runtime disable which in turn results in
duplicate clock disables. Early on in the probe function we do a
pm_runtime_get_noresume() since the probe function leaves the device in
a powered up state, but in the error path we can't assume that PM is
enabled, so we also manually disable everything, including clocks. This
means that when runtime PM is active both it and the probe function release
the same reference to the main clock for the IP, triggering warnings from
the clock subsystem:
[ 8.693719] clk:75:7 already disabled
[ 8.693791] WARNING: CPU: 1 PID: 185 at /usr/src/kernel/drivers/clk/clk.c:1188 clk_core_disable+0xa0/0xb
...
[ 8.694261] clk_core_disable+0xa0/0xb4 (P)
[ 8.694272] clk_disable+0x38/0x60
[ 8.694283] cqspi_probe+0x7c8/0xc5c [spi_cadence_quadspi]
[ 8.694309] platform_probe+0x5c/0xa4
Avoid this confused ownership by moving the pm_runtime_get_noresume() to
after the last point at which the probe() function can fail.
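Schematically, the reference flow before and after (condensed from the
diff):

    /* before: */
    pm_runtime_enable(dev);
    pm_runtime_get_noresume(dev);   /* taken although probe can still fail  */
    ret = cqspi_setup_flash(cqspi); /* error path disables clocks by hand   */
                                    /* ... and runtime PM disables them too */

    /* after: */
    pm_runtime_enable(dev);
    /* ... all remaining failure points ... */
    pm_runtime_get_noresume(dev);   /* only once probe can no longer fail   */
    pm_runtime_put_autosuspend(dev);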
Reported-by: Francesco Dolcini <francesco(a)dolcini.it>
Closes: https://lore.kernel.org/r/20251201072844.GA6785@francesco-nb
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
drivers/spi/spi-cadence-quadspi.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
index af6d050da1c8..0833b6f666d0 100644
--- a/drivers/spi/spi-cadence-quadspi.c
+++ b/drivers/spi/spi-cadence-quadspi.c
@@ -1985,7 +1985,6 @@ static int cqspi_probe(struct platform_device *pdev)
pm_runtime_enable(dev);
pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT);
pm_runtime_use_autosuspend(dev);
- pm_runtime_get_noresume(dev);
}
ret = cqspi_setup_flash(cqspi);
@@ -2012,6 +2011,7 @@ static int cqspi_probe(struct platform_device *pdev)
}
if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) {
+ pm_runtime_get_noresume(dev);
pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
}
---
base-commit: 7d0a66e4bb9081d75c82ec4957c50034cb0ea449
change-id: 20251202-spi-cadence-qspi-runtime-pm-imbalance-657740cf7eae
Best regards,
--
Mark Brown <broonie(a)kernel.org>
As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix
huge_pmd_unshare() vs GUP-fast race") we can end up in some situations
where we perform so many IPI broadcasts when unsharing hugetlb PMD page
tables that it severely regresses some workloads.
In particular, when we fork()+exit(), or when we munmap() a large
area backed by many shared PMD tables, we perform one IPI broadcast per
unshared PMD table.
There are two optimizations to be had:
(1) When we process (unshare) multiple such PMD tables, such as during
exit(), it is sufficient to send a single IPI broadcast (as long as
we respect locking rules) instead of one per PMD table.
Locking prevents that any of these PMD tables could get reuse before
we drop the lock.
(2) When we are not the last sharer (> 2 users including us), there is
no need to send the IPI broadcast. The shared PMD tables cannot
become exclusive (fully unshared) before an IPI will be broadcasted
by the last sharer.
Concurrent GUP-fast could walk into a PMD table just before we
unshared it. It could then succeed in grabbing a page from the
shared page table even after munmap() etc. succeeded (and suppressed
an IPI). But there is no difference compared to GUP-fast just
sleeping for a while after grabbing the page and re-enabling IRQs.
Most importantly, GUP-fast will never walk into page tables that are
no longer shared, because the last sharer will issue an IPI
broadcast.
(if ever required, checking whether the PUD changed in GUP-fast
after grabbing the page like we do in the PTE case could handle
this)
So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather
infrastructure so we can implement these optimizations and demystify the
code at least a bit. Extend the mmu_gather infrastructure to be able to
deal with our special hugetlb PMD table sharing implementation.
We'll consolidate the handling for (full) unsharing of PMD tables in
tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track
in "struct mmu_gather" whether we had (full) unsharing of PMD tables.
Because locking is very special (concurrent unsharing+reuse must be
prevented), we disallow deferring flushing to tlb_finish_mmu() and instead
require an explicit earlier call to tlb_flush_unshared_tables().
From hugetlb code, we call huge_pmd_unshare_flush() where we make sure
that the expected lock protecting us from concurrent unsharing+reuse is
still held.
Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that
tlb_flush_unshared_tables() was properly called earlier.
Document it all properly.
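For reference, the calling sequence the series converges on (condensed
from the diff; locking shown as comments):

    tlb_gather_mmu(&tlb, mm);
    /* i_mmap_rwsem held in write mode, hugetlb vma lock held */
    huge_pmd_unshare(&tlb, vma, addr, ptep);  /* once per shared PMD table */
    /* ... */
    huge_pmd_unshare_flush(&tlb, vma);  /* TLB flush + IPI, while still locked */
    i_mmap_unlock_write(mapping);
    tlb_finish_mmu(&tlb);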
Notes about tlb_remove_table_sync_one() interaction with unsharing:
There are two fairly tricky things:
(1) tlb_remove_table_sync_one() is a NOP on architectures without
CONFIG_MMU_GATHER_RCU_TABLE_FREE.
Here, the assumption is that the previous TLB flush would send an
IPI to all relevant CPUs. Careful: some architectures like x86 only
send IPIs to all relevant CPUs when tlb->freed_tables is set.
The relevant architectures should be selecting
MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable
kernels and it might have been problematic before this patch.
Also, the arch flushing behavior (independent of IPIs) is different
when tlb->freed_tables is set. Do we have to enlighten them to also
take care of tlb->unshared_tables? So far we didn't care, so
hopefully we are fine. Of course, we could be setting
tlb->freed_tables as well, but that might then unnecessarily flush
too much, because the semantics of tlb->freed_tables are a bit
fuzzy.
This patch changes nothing in this regard.
(2) tlb_remove_table_sync_one() is not a NOP on architectures with
CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync.
Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB)
we still issue IPIs during TLB flushes and don't actually need the
second tlb_remove_table_sync_one().
This optimization can be implemented on top of this, by checking, e.g., in
tlb_remove_table_sync_one() whether we really need IPIs. But as
described in (1), it really must honor tlb->freed_tables then to
send IPIs to all relevant CPUs.
Further note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a
concern, as we are holding the i_mmap_lock the whole time, preventing
concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed
separately as a cleanup later.
There are plenty more cleanups to be had, but they have to wait until
this is fixed.
Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race")
Reported-by: "Uschakow, Stanislav" <suschako(a)amazon.de>
Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/
Tested-by: Laurence Oberman <loberman(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
---
include/asm-generic/tlb.h | 74 ++++++++++++++++++++++-
include/linux/hugetlb.h | 19 +++---
mm/hugetlb.c | 121 ++++++++++++++++++++++----------------
mm/mmu_gather.c | 7 +++
mm/mprotect.c | 2 +-
mm/rmap.c | 25 +++++---
6 files changed, 179 insertions(+), 69 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 1fff717cae510..706416babb3d6 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -364,6 +364,20 @@ struct mmu_gather {
unsigned int vma_huge : 1;
unsigned int vma_pfn : 1;
+ /*
+ * Did we unshare (unmap) any shared page tables? For now only
+ * used for hugetlb PMD table sharing.
+ */
+ unsigned int unshared_tables : 1;
+
+ /*
+ * Did we unshare any page tables such that they are now exclusive
+ * and could get reused+modified by the new owner? When setting this
+ * flag, "unshared_tables" will be set as well. For now only used
+ * for hugetlb PMD table sharing.
+ */
+ unsigned int fully_unshared_tables : 1;
+
unsigned int batch_count;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -400,6 +414,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
tlb->cleared_pmds = 0;
tlb->cleared_puds = 0;
tlb->cleared_p4ds = 0;
+ tlb->unshared_tables = 0;
/*
* Do not reset mmu_gather::vma_* fields here, we do not
* call into tlb_start_vma() again to set them if there is an
@@ -484,7 +499,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
* these bits.
*/
if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
- tlb->cleared_puds || tlb->cleared_p4ds))
+ tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables))
return;
tlb_flush(tlb);
@@ -773,6 +788,63 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#endif
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt,
+ unsigned long addr)
+{
+ /*
+ * The caller must make sure that concurrent unsharing + exclusive
+ * reuse is impossible until tlb_flush_unshared_tables() was called.
+ */
+ VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt));
+ ptdesc_pmd_pts_dec(pt);
+
+ /* Clearing a PUD pointing at a PMD table with PMD leaves. */
+ tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE);
+
+ /*
+ * If the page table is now exclusively owned, we fully unshared
+ * a page table.
+ */
+ if (!ptdesc_pmd_is_shared(pt))
+ tlb->fully_unshared_tables = true;
+ tlb->unshared_tables = true;
+}
+
+static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
+{
+ /*
+ * As soon as the caller drops locks to allow for reuse of
+ * previously-shared tables, these tables could get modified and
+ * even reused outside of hugetlb context, so we have to make sure that
+ * any page table walkers (incl. TLB, GUP-fast) are aware of that
+ * change.
+ *
+ * Even if we are not fully unsharing a PMD table, we must
+ * flush the TLB for the unsharer now.
+ */
+ if (tlb->unshared_tables)
+ tlb_flush_mmu_tlbonly(tlb);
+
+ /*
+ * Similarly, we must make sure that concurrent GUP-fast will not
+ * walk previously-shared page tables that are getting modified+reused
+ * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast.
+ *
+ * We only perform this when we are the last sharer of a page table,
+ * as the IPI will reach all CPUs and wait out any concurrent GUP-fast.
+ *
+ * Note that on configs where tlb_remove_table_sync_one() is a NOP,
+ * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
+ * required IPIs already for us.
+ */
+ if (tlb->fully_unshared_tables) {
+ tlb_remove_table_sync_one();
+ tlb->fully_unshared_tables = false;
+ }
+}
+#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
+
#endif /* CONFIG_MMU */
#endif /* _ASM_GENERIC__TLB_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 03c8725efa289..63b248c6bfd47 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
@@ -271,7 +272,7 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
@@ -300,13 +301,17 @@ static inline struct address_space *hugetlb_folio_mapping_lock_write(
return NULL;
}
-static inline int huge_pmd_unshare(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static inline int huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
return 0;
}
+static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb,
+ struct vm_area_struct *vma)
+{
+}
+
static inline void adjust_range_if_pmd_sharing_possible(
struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
@@ -432,7 +437,7 @@ static inline void move_hugetlb_state(struct folio *old_folio,
{
}
-static inline long hugetlb_change_protection(
+static inline long hugetlb_change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c77cdef12a32..7fef0b94b5d1e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5096,8 +5096,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
- bool shared_pmd = false;
+ struct mmu_gather tlb;
+ tlb_gather_mmu(&tlb, vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
@@ -5122,12 +5123,12 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
continue;
- if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
- shared_pmd = true;
+ if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
continue;
}
+ tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr);
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
@@ -5136,13 +5137,13 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
}
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, old_end - len, old_end);
+ tlb_flush_mmu_tlbonly(&tlb);
+ huge_pmd_unshare_flush(&tlb, vma);
+
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
return len + old_addr - old_end;
}
@@ -5161,7 +5162,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
bool adjust_reservation;
unsigned long last_addr_mask;
- bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -5184,10 +5184,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
spin_unlock(ptl);
- tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
- force_flush = true;
address |= last_addr_mask;
continue;
}
@@ -5303,14 +5301,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
tlb_end_vma(tlb, vma);
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (force_flush)
- tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
}
void __hugetlb_zap_begin(struct vm_area_struct *vma,
@@ -6399,7 +6390,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
#endif /* CONFIG_USERFAULTFD */
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6409,7 +6400,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
long pages = 0, psize = huge_page_size(h);
- bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -6452,7 +6442,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
}
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6461,7 +6451,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
- shared_pmd = true;
address |= last_addr_mask;
continue;
}
@@ -6522,22 +6511,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
}
next:
spin_unlock(ptl);
cond_resched();
}
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, start, end);
+
+ tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
* downgrading page table protection not changing it to point to a new
@@ -6904,18 +6887,27 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return pte;
}
-/*
- * unmap huge page backed by shared pte.
+/**
+ * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ * @addr: the address we are trying to unshare.
+ * @ptep: pointer into the (pmd) page table.
+ *
+ * Called with the page table lock held, the i_mmap_rwsem held in write mode
+ * and the hugetlb vma lock held in write mode.
*
- * Called with page table lock held.
+ * Note: The caller must call huge_pmd_unshare_flush() before dropping the
+ * i_mmap_rwsem.
*
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
+ * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it
+ * was not a shared PMD table.
*/
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
+ struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
@@ -6927,18 +6919,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
pud_clear(pud);
- /*
- * Once our caller drops the rmap lock, some other process might be
- * using this page table as a normal, non-hugetlb page table.
- * Wait for pending gup_fast() in other threads to finish before letting
- * that happen.
- */
- tlb_remove_table_sync_one();
- ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+
+ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
+
mm_dec_nr_pmds(mm);
return 1;
}
+/**
+ * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ *
+ * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table
+ * unsharing with concurrent page table walkers.
+ *
+ * This function must be called after a sequence of huge_pmd_unshare()
+ * calls while still holding the i_mmap_rwsem.
+ */
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ /*
+ * We must synchronize page table unsharing such that nobody will
+ * try reusing a previously-shared page table while it might still
+ * be in use by previous sharers (TLB, GUP_fast).
+ */
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+ tlb_flush_unshared_tables(tlb);
+}
+
#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -6947,12 +6957,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+}
+
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
@@ -7219,6 +7233,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
+ struct mmu_gather tlb;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
@@ -7229,6 +7244,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (start >= end)
return;
+ tlb_gather_mmu(&tlb, mm);
flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
@@ -7248,10 +7264,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- huge_pmd_unshare(mm, vma, address, ptep);
+ huge_pmd_unshare(&tlb, vma, address, ptep);
spin_unlock(ptl);
}
- flush_hugetlb_tlb_range(vma, start, end);
+ huge_pmd_unshare_flush(&tlb, vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
@@ -7261,6 +7277,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
}
/*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 247e3f9db6c7a..030a162a263ba 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -426,6 +426,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#endif
tlb->vma_pfn = 0;
+ tlb->fully_unshared_tables = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
}
@@ -468,6 +469,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
*/
void tlb_finish_mmu(struct mmu_gather *tlb)
{
+ /*
+ * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
+ * due to complicated locking requirements with page table unsharing.
+ */
+ VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
+
/*
* If there are parallel threads are doing PTE changes on same range
* under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 283889e4f1cec..5c330e817129e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -652,7 +652,7 @@ long change_protection(struct mmu_gather *tlb,
#endif
if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot,
+ pages = hugetlb_change_protection(tlb, vma, start, end, newprot,
cp_flags);
else
pages = change_protection_range(tlb, vma, start, end, newprot,
diff --git a/mm/rmap.c b/mm/rmap.c
index 748f48727a162..d6799afe11147 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -76,7 +76,7 @@
#include <linux/mm_inline.h>
#include <linux/oom.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
@@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma))
goto walk_abort;
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
goto walk_done;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
@@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* fail if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma)) {
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
--
2.52.0
We switched from (wrongly) using the page count to an independent
shared count. Now, shared page tables have a refcount of 1 (excluding
speculative references) and instead use ptdesc->pt_share_count to
identify sharing.
We didn't convert hugetlb_pmd_shared(), so right now, we would never
detect a shared PMD table as such, because sharing/unsharing no longer
touches the refcount of a PMD table.
Page migration, like mbind() or migrate_pages(), would allow for migrating
folios mapped into such shared PMD tables, even though the folios are
not exclusive. In smaps we would account them as "private" although they
are "shared", and we would wrongly set the PM_MMAP_EXCLUSIVE bit in the
pagemap interface.
Fix it by properly using ptdesc_pmd_is_shared() in hugetlb_pmd_shared().
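As a rough illustration, here is a minimal userspace toy (hypothetical names, deliberately simplified semantics, not the kernel's actual ptdesc layout) of why a refcount-based check can no longer observe sharing once sharing only bumps a dedicated counter:

```c
#include <stdio.h>
#include <stdbool.h>

/* Toy stand-in for struct ptdesc: sharing bumps pt_share_count only. */
struct toy_ptdesc {
	int refcount;       /* stays 1 for a shared PMD table */
	int pt_share_count; /* one increment per additional sharer */
};

static bool old_check(const struct toy_ptdesc *pt) { return pt->refcount > 1; }
static bool new_check(const struct toy_ptdesc *pt) { return pt->pt_share_count > 0; }

int main(void)
{
	struct toy_ptdesc pt = { .refcount = 1, .pt_share_count = 2 };

	printf("old check says shared: %d\n", old_check(&pt)); /* 0: misses it */
	printf("new check says shared: %d\n", new_check(&pt)); /* 1: detected */
	return 0;
}
```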
Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count")
Reviewed-by: Rik van Riel <riel(a)surriel.com>
Reviewed-by: Lance Yang <lance.yang(a)linux.dev>
Tested-by: Lance Yang <lance.yang(a)linux.dev>
Reviewed-by: Harry Yoo <harry.yoo(a)oracle.com>
Tested-by: Laurence Oberman <loberman(a)redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Acked-by: Oscar Salvador <osalvador(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Cc: Liu Shixin <liushixin2(a)huawei.com>
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
---
include/linux/hugetlb.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 019a1c5281e4e..03c8725efa289 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1326,7 +1326,7 @@ static inline __init void hugetlb_cma_reserve(int order)
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
- return page_count(virt_to_page(pte)) > 1;
+ return ptdesc_pmd_is_shared(virt_to_ptdesc(pte));
}
#else
static inline bool hugetlb_pmd_shared(pte_t *pte)
--
2.52.0
From: Sam James <sam(a)gentoo.org>
[ Upstream commit 9aeed9041929812a10a6d693af050846942a1d16 ]
Similar in nature to ab107276607af90b13a5994997e19b7b9731e251. glibc-2.42
drops the legacy termio struct, but the ioctls.h header still defines some
TC* constants in terms of termio (via sizeof). Hardcode the values instead.
This fixes building Python for example, which falls over like:
./Modules/termios.c:1119:16: error: invalid application of 'sizeof' to incomplete type 'struct termio'
Link: https://bugs.gentoo.org/961769
Link: https://bugs.gentoo.org/962600
Signed-off-by: Sam James <sam(a)gentoo.org>
Reviewed-by: Magnus Lindholm <linmag7(a)gmail.com>
Link: https://lore.kernel.org/r/6ebd3451908785cad53b50ca6bc46cfe9d6bc03c.17649224…
Signed-off-by: Magnus Lindholm <linmag7(a)gmail.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
LLM Generated explanations, may be completely bogus:
## Analysis Summary
### 1. COMMIT MESSAGE ANALYSIS
This commit addresses a **userspace build failure** caused by glibc-2.42
removing the legacy `struct termio` definition. The kernel's uapi header
`arch/alpha/include/uapi/asm/ioctls.h` uses `sizeof(struct termio)` in
macro expansions, which fails when that struct is undefined.
**Key signals:**
- Links to two real bug reports (Gentoo bugs #961769 and #962600)
- References a prior fix (ab107276607af) for powerpc with the exact same
issue
- Has `Reviewed-by:` tag
- Demonstrates real-world impact: Python build failure
### 2. CODE CHANGE ANALYSIS
**What the change does:**
```c
// Before: Uses sizeof(struct termio) in macro expansion
#define TCGETA _IOR('t', 23, struct termio)
// After: Pre-computed constant
#define TCGETA 0x40127417
```
**Verification of the hardcoded values:**
Looking at `arch/alpha/include/uapi/asm/ioctl.h`, the ioctl encoding on
alpha is:
- `_IOC_SIZESHIFT` = 16, `_IOC_DIRSHIFT` = 29
- `_IOC_READ` = 2, `_IOC_WRITE` = 4
For `TCGETA = _IOR('t', 23, struct termio)`:
- dir=2, type=0x74, nr=0x17, size=18(0x12)
- Result: `(2<<29)|(0x74<<8)|(0x17)|(0x12<<16)` = **0x40127417** ✓
The hardcoded values are mathematically correct.
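As an extra sanity check, a small standalone C program (assuming the alpha field layout quoted above and sizeof(struct termio) == 18) reproduces all four hardcoded constants:

```c
#include <stdio.h>

/* Field layout per arch/alpha/include/uapi/asm/ioctl.h */
enum { TYPESHIFT = 8, SIZESHIFT = 16, DIRSHIFT = 29, IOC_READ = 2, IOC_WRITE = 4 };

static unsigned int ioc(unsigned int dir, unsigned int type,
			unsigned int nr, unsigned int size)
{
	return (dir << DIRSHIFT) | (size << SIZESHIFT) | (type << TYPESHIFT) | nr;
}

int main(void)
{
	unsigned int sz = 18; /* sizeof(struct termio) on alpha */

	printf("TCGETA  = %#010x\n", ioc(IOC_READ,  't', 23, sz)); /* 0x40127417 */
	printf("TCSETA  = %#010x\n", ioc(IOC_WRITE, 't', 24, sz)); /* 0x80127418 */
	printf("TCSETAW = %#010x\n", ioc(IOC_WRITE, 't', 25, sz)); /* 0x80127419 */
	printf("TCSETAF = %#010x\n", ioc(IOC_WRITE, 't', 28, sz)); /* 0x8012741c */
	return 0;
}
```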
### 3. CLASSIFICATION
**Category: BUILD FIX**
This falls under the **build fixes** exception category - it's critical
for users who need to build userspace software with modern glibc. Other
architectures (powerpc, sh, xtensa) already have identical fixes in the
tree.
### 4. SCOPE AND RISK ASSESSMENT
- **Lines changed:** 4
- **Files touched:** 1 (alpha-specific uapi header)
- **Risk: EXTREMELY LOW**
- No runtime behavior change - only affects compilation
- The ioctl numbers are identical (pre-computed vs macro-computed)
- Pattern already established in other architectures
### 5. USER IMPACT
- **Affected users:** Anyone on alpha using glibc 2.42+ (e.g., Gentoo
users)
- **Severity:** Critical for affected users - cannot build Python, and
potentially many other programs that use termios
- **Scope:** Limited to alpha architecture, but complete blocker for
those users
### 6. STABILITY INDICATORS
- Has `Reviewed-by:` tag
- Same pattern accepted for powerpc (commit ab107276607af)
- Documented with actual bug reports showing real users affected
- No runtime changes - purely compile-time fix
### 7. DEPENDENCY CHECK
- **No dependencies** - standalone fix
- The affected file exists in all stable trees (uapi header)
- Clean application expected
### Final Assessment
**Pros:**
- Fixes a real, documented build breakage with modern glibc
- Extremely small and localized (4 lines, 1 file)
- Zero runtime risk - mathematically equivalent values
- Established precedent with powerpc fix
- Has review tag and bug report links
**Cons:**
- No explicit `Cc: stable(a)vger.kernel.org` tag
- Alpha is a niche architecture
**Verdict:** This is textbook stable material. It's a build fix that:
1. Is obviously correct (values are mathematically equivalent)
2. Fixes a real bug (complete build failure)
3. Is small and contained (4 lines)
4. Has no runtime risk whatsoever
5. Follows established precedent from other architectures
The lack of a stable tag is not disqualifying - many valid stable
patches don't have it. Build compatibility with modern toolchains is
exactly what stable trees need to maintain.
**YES**
arch/alpha/include/uapi/asm/ioctls.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/ioctls.h b/arch/alpha/include/uapi/asm/ioctls.h
index 971311605288f..a09d04b49cc65 100644
--- a/arch/alpha/include/uapi/asm/ioctls.h
+++ b/arch/alpha/include/uapi/asm/ioctls.h
@@ -23,10 +23,10 @@
#define TCSETSW _IOW('t', 21, struct termios)
#define TCSETSF _IOW('t', 22, struct termios)
-#define TCGETA _IOR('t', 23, struct termio)
-#define TCSETA _IOW('t', 24, struct termio)
-#define TCSETAW _IOW('t', 25, struct termio)
-#define TCSETAF _IOW('t', 28, struct termio)
+#define TCGETA 0x40127417
+#define TCSETA 0x80127418
+#define TCSETAW 0x80127419
+#define TCSETAF 0x8012741c
#define TCSBRK _IO('t', 29)
#define TCXONC _IO('t', 30)
--
2.51.0
Commit 9a7c987fb92b ("crypto: arm64/ghash - Use API partial block
handling") made ghash_finup() pass the wrong buffer to
ghash_do_simd_update(). As a result, ghash-neon now produces incorrect
outputs when the message length isn't divisible by 16 bytes. Fix this.
(I didn't notice this earlier because this code is reached only on CPUs
that support NEON but not PMULL. I haven't yet found a way to get
qemu-system-aarch64 to emulate that configuration.)
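To make the failure mode concrete, here is a tiny standalone toy (not the real GHASH code; the names and the 0xaa filler are purely illustrative) showing why the padded copy, not the caller's pointer, must be hashed for a partial tail block:

```c
#include <stdio.h>
#include <string.h>

#define BLOCK 16

static void show_block(const char *label, const unsigned char *p)
{
	printf("%s: ", label);
	for (int i = 0; i < BLOCK; i++)
		printf("%02x", p[i]);
	printf("\n");
}

int main(void)
{
	unsigned char src[BLOCK];
	unsigned char buf[BLOCK] = {};
	size_t len = 3; /* message length not divisible by 16 */

	memset(src, 0xaa, sizeof(src)); /* stands in for trailing garbage */
	memcpy(src, "abc", len);

	memcpy(buf, src, len);
	show_block("correct (buf)", buf); /* 616263 then zero padding */
	show_block("buggy   (src)", src); /* 616263 then 0xaa garbage  */
	return 0;
}
```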
Fixes: 9a7c987fb92b ("crypto: arm64/ghash - Use API partial block handling")
Cc: stable(a)vger.kernel.org
Reported-by: Diederik de Haas <diederik(a)cknow-tech.com>
Closes: https://lore.kernel.org/linux-crypto/DETXT7QI62KE.F3CGH2VWX1SC@cknow-tech.c…
Signed-off-by: Eric Biggers <ebiggers(a)kernel.org>
---
If it's okay, I'd like to just take this via libcrypto-fixes.
arch/arm64/crypto/ghash-ce-glue.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 7951557a285a..ef249d06c92c 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -131,11 +131,11 @@ static int ghash_finup(struct shash_desc *desc, const u8 *src,
if (len) {
u8 buf[GHASH_BLOCK_SIZE] = {};
memcpy(buf, src, len);
- ghash_do_simd_update(1, ctx->digest, src, key, NULL,
+ ghash_do_simd_update(1, ctx->digest, buf, key, NULL,
pmull_ghash_update_p8);
memzero_explicit(buf, sizeof(buf));
}
return ghash_export(desc, dst);
}
base-commit: 7a3984bbd69055898add0fe22445f99435f33450
--
2.52.0
The Dell XPS 13 9350 and XPS 16 9640 both have an upside-down mounted
OV02C10 sensor. This rotation of 180° is reported in neither the SSDB nor
the _PLD for the sensor (both report a rotation of 0°).
Add a DMI quirk mechanism for upside-down sensors and add 2 initial entries
to the DMI quirk list for these 2 laptops.
Note the OV02C10 driver was originally developed on an XPS 16 9640, which
resulted in inverted vflip + hflip settings, making it look like the sensor
was upright on the XPS 16 9640 and upside down elsewhere; this has been
fixed in commit 69fe27173396 ("media: ov02c10: Fix default vertical flip").
This makes this commit a regression fix, since the video is now upside down
on these Dell XPS models where it was not before.
Fixes: d5ebe3f7d13d ("media: ov02c10: Fix default vertical flip")
Cc: stable(a)vger.kernel.org
Reviewed-by: Bryan O'Donoghue <bod(a)kernel.org>
Signed-off-by: Hans de Goede <johannes.goede(a)oss.qualcomm.com>
---
Changes in v2:
- Fix fixes tag to use the correct commit hash
- Drop || COMPILE_TEST from Kconfig to fix compile errors when ACPI is disabled
---
drivers/media/pci/intel/Kconfig | 2 +-
drivers/media/pci/intel/ipu-bridge.c | 29 ++++++++++++++++++++++++++++
2 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/drivers/media/pci/intel/Kconfig b/drivers/media/pci/intel/Kconfig
index d9fcddce028b..3f14ca110d06 100644
--- a/drivers/media/pci/intel/Kconfig
+++ b/drivers/media/pci/intel/Kconfig
@@ -6,7 +6,7 @@ source "drivers/media/pci/intel/ivsc/Kconfig"
config IPU_BRIDGE
tristate "Intel IPU Bridge"
- depends on ACPI || COMPILE_TEST
+ depends on ACPI
depends on I2C
help
The IPU bridge is a helper library for Intel IPU drivers to
diff --git a/drivers/media/pci/intel/ipu-bridge.c b/drivers/media/pci/intel/ipu-bridge.c
index 58ea01d40c0d..6463b2a47d78 100644
--- a/drivers/media/pci/intel/ipu-bridge.c
+++ b/drivers/media/pci/intel/ipu-bridge.c
@@ -5,6 +5,7 @@
#include <acpi/acpi_bus.h>
#include <linux/cleanup.h>
#include <linux/device.h>
+#include <linux/dmi.h>
#include <linux/i2c.h>
#include <linux/mei_cl_bus.h>
#include <linux/platform_device.h>
@@ -99,6 +100,28 @@ static const struct ipu_sensor_config ipu_supported_sensors[] = {
IPU_SENSOR_CONFIG("XMCC0003", 1, 321468000),
};
+/*
+ * DMI matches for laptops which have their sensor mounted upside-down
+ * without reporting a rotation of 180° in either the SSDB or the _PLD.
+ */
+static const struct dmi_system_id upside_down_sensor_dmi_ids[] = {
+ {
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "XPS 13 9350"),
+ },
+ .driver_data = "OVTI02C1",
+ },
+ {
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "XPS 16 9640"),
+ },
+ .driver_data = "OVTI02C1",
+ },
+ {} /* Terminating entry */
+};
+
static const struct ipu_property_names prop_names = {
.clock_frequency = "clock-frequency",
.rotation = "rotation",
@@ -249,6 +272,12 @@ static int ipu_bridge_read_acpi_buffer(struct acpi_device *adev, char *id,
static u32 ipu_bridge_parse_rotation(struct acpi_device *adev,
struct ipu_sensor_ssdb *ssdb)
{
+ const struct dmi_system_id *dmi_id;
+
+ dmi_id = dmi_first_match(upside_down_sensor_dmi_ids);
+ if (dmi_id && acpi_dev_hid_match(adev, dmi_id->driver_data))
+ return 180;
+
switch (ssdb->degree) {
case IPU_SENSOR_ROTATION_NORMAL:
return 0;
--
2.52.0
During sensor calibration I noticed that with the hflip control set to
false/disabled the image was mirrored.
The horizontal flip control is inverted and needs to be set to 1 to not
flip. This is something which seems to be common with various recent
Omnivision sensors, the ov01a10 and ov08x40 also have an inverted
mirror control.
Invert the hflip control to fix the sensor mirroring by default.
Fixes: b7cd2ba3f692 ("media: ov02c10: Support hflip and vflip")
Cc: stable(a)vger.kernel.org
Cc: Sebastian Reichel <sre(a)kernel.org>
Reviewed-by: Bryan O'Donoghue <bod(a)kernel.org>
Signed-off-by: Hans de Goede <johannes.goede(a)oss.qualcomm.com>
---
drivers/media/i2c/ov02c10.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/media/i2c/ov02c10.c b/drivers/media/i2c/ov02c10.c
index 384c2f0b1608..f912ae142040 100644
--- a/drivers/media/i2c/ov02c10.c
+++ b/drivers/media/i2c/ov02c10.c
@@ -170,7 +170,7 @@ static const struct reg_sequence sensor_1928x1092_30fps_setting[] = {
{0x3816, 0x01},
{0x3817, 0x01},
- {0x3820, 0xa0},
+ {0x3820, 0xa8},
{0x3821, 0x00},
{0x3822, 0x80},
{0x3823, 0x08},
@@ -462,9 +462,9 @@ static int ov02c10_set_ctrl(struct v4l2_ctrl *ctrl)
case V4L2_CID_HFLIP:
cci_write(ov02c10->regmap, OV02C10_ISP_X_WIN_CONTROL,
- ctrl->val ? 1 : 2, &ret);
+ ctrl->val ? 2 : 1, &ret);
cci_update_bits(ov02c10->regmap, OV02C10_ROTATE_CONTROL,
- BIT(3), ov02c10->hflip->val << 3, &ret);
+ BIT(3), ctrl->val ? 0 : BIT(3), &ret);
break;
case V4L2_CID_VFLIP:
--
2.52.0
The ov02c10 is capable of having its (crop) window shifted around with 1
pixel precision while streaming.
This allows changing the x/y window coordinates when changing flipping to
preserve the bayer-pattern.
__v4l2_ctrl_handler_setup() will now write the window coordinates at 0x3810
and 0x3812 so these can be dropped from sensor_1928x1092_30fps_setting.
Since the bayer-pattern is now unchanged, the V4L2_CTRL_FLAG_MODIFY_LAYOUT
flag can be dropped from the flip controls.
Note the original use of the V4L2_CTRL_FLAG_MODIFY_LAYOUT flag was
incomplete: besides setting the flag, the driver should also have reported
a different mbus code when getting the source pad's format depending on
the hflip / vflip settings; see the ov2680.c driver for an example.
Fixes: b7cd2ba3f692 ("media: ov02c10: Support hflip and vflip")
Cc: stable(a)vger.kernel.org
Cc: Sebastian Reichel <sre(a)kernel.org>
Reviewed-by: Bryan O'Donoghue <bod(a)kernel.org>
Signed-off-by: Hans de Goede <johannes.goede(a)oss.qualcomm.com>
---
drivers/media/i2c/ov02c10.c | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/drivers/media/i2c/ov02c10.c b/drivers/media/i2c/ov02c10.c
index 6369841de88b..384c2f0b1608 100644
--- a/drivers/media/i2c/ov02c10.c
+++ b/drivers/media/i2c/ov02c10.c
@@ -165,10 +165,6 @@ static const struct reg_sequence sensor_1928x1092_30fps_setting[] = {
{0x3809, 0x88},
{0x380a, 0x04},
{0x380b, 0x44},
- {0x3810, 0x00},
- {0x3811, 0x02},
- {0x3812, 0x00},
- {0x3813, 0x01},
{0x3814, 0x01},
{0x3815, 0x01},
{0x3816, 0x01},
@@ -465,11 +461,15 @@ static int ov02c10_set_ctrl(struct v4l2_ctrl *ctrl)
break;
case V4L2_CID_HFLIP:
+ cci_write(ov02c10->regmap, OV02C10_ISP_X_WIN_CONTROL,
+ ctrl->val ? 1 : 2, &ret);
cci_update_bits(ov02c10->regmap, OV02C10_ROTATE_CONTROL,
BIT(3), ov02c10->hflip->val << 3, &ret);
break;
case V4L2_CID_VFLIP:
+ cci_write(ov02c10->regmap, OV02C10_ISP_Y_WIN_CONTROL,
+ ctrl->val ? 2 : 1, &ret);
cci_update_bits(ov02c10->regmap, OV02C10_ROTATE_CONTROL,
BIT(4), ov02c10->vflip->val << 4, &ret);
break;
@@ -551,13 +551,9 @@ static int ov02c10_init_controls(struct ov02c10 *ov02c10)
ov02c10->hflip = v4l2_ctrl_new_std(ctrl_hdlr, &ov02c10_ctrl_ops,
V4L2_CID_HFLIP, 0, 1, 1, 0);
- if (ov02c10->hflip)
- ov02c10->hflip->flags |= V4L2_CTRL_FLAG_MODIFY_LAYOUT;
ov02c10->vflip = v4l2_ctrl_new_std(ctrl_hdlr, &ov02c10_ctrl_ops,
V4L2_CID_VFLIP, 0, 1, 1, 0);
- if (ov02c10->vflip)
- ov02c10->vflip->flags |= V4L2_CTRL_FLAG_MODIFY_LAYOUT;
v4l2_ctrl_new_std_menu_items(ctrl_hdlr, &ov02c10_ctrl_ops,
V4L2_CID_TEST_PATTERN,
--
2.52.0
From: "Chia-Lin Kao (AceLan)" <acelan.kao(a)canonical.com>
[ Upstream commit b169e1733cadb614e87f69d7a5ae1b186c50d313 ]
Dell Pro Rugged 10/12 tablets have a reliable VGBS method.
If VGBS is not called on boot, the on-screen keyboard won't appear if the
device is booted without a keyboard.
Call VGBS on boot on these devices to get the initial state of
SW_TABLET_MODE in a reliable way.
Signed-off-by: Chia-Lin Kao (AceLan) <acelan.kao(a)canonical.com>
Reviewed-by: Hans de Goede <johannes.goede(a)oss.qualcomm.com>
Link: https://patch.msgid.link/20251127070407.656463-1-acelan.kao@canonical.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
LLM Generated explanations, may be completely bogus:
## Summary Analysis
### What This Commit Does
This commit adds two Dell tablet models (Dell Pro Rugged 10 Tablet
RA00260 and Dell Pro Rugged 12 Tablet RA02260) to the
`dmi_vgbs_allow_list` DMI quirk table. When a device matches this list,
the driver calls the VGBS ACPI method at probe time to properly detect
the initial SW_TABLET_MODE state.
### The Bug Being Fixed
Without this quirk, on these Dell tablets:
- The VGBS method isn't called at boot
- The kernel doesn't know the device is in tablet mode
- The on-screen keyboard won't appear if the device is booted without a
physical keyboard attached
- This makes the device difficult/impossible to use in tablet-only mode
### Classification: Hardware Quirk/Workaround
This falls squarely into the **QUIRKS and WORKAROUNDS** category which
is explicitly allowed for stable backporting:
- It's a DMI-based allowlist entry for specific hardware models
- The pattern is identical to existing entries (HP Spectre, Microsoft
Surface Go, HP Dragonfly G2)
- The mechanism has been stable since Linux 5.10
### Risk Assessment
**Risk Level: EXTREMELY LOW**
- **Lines changed**: ~12 lines (just two DMI match entries)
- **Files touched**: 1 file
- **Scope**: Only affects Dell Pro Rugged 10/12 tablets
- **Cannot regress other hardware**: DMI matching is device-specific
- **Pattern proven**: Same structure as existing entries that have
worked for years
### Stability Indicators
- **Reviewed-by:** Hans de Goede (well-known x86 platform maintainer)
- **Reviewed-by:** Ilpo Järvinen (Intel platform maintainer)
- The `dmi_vgbs_allow_list` infrastructure has existed since v5.10-rc1
(commit 537b0dd4729e7)
### Stable Tree Criteria Assessment
| Criterion | Status |
|-----------|--------|
| Obviously correct | ✅ Yes - trivial data addition |
| Fixes real bug | ✅ Yes - on-screen keyboard not working |
| Small and contained | ✅ Yes - ~12 lines in 1 file |
| No new features | ✅ Yes - enables existing functionality |
| Applies cleanly | ✅ Yes - simple addition to allowlist |
### Concerns
1. **No explicit "Cc: stable" tag** - However, DMI quirk additions are
commonly appropriate for stable even without explicit tags
2. **No "Fixes:" tag** - This is adding new device support rather than
fixing a regression in existing code
### Verdict
This commit is appropriate for stable backporting because:
1. It's a minimal, surgical hardware quirk that only affects two
specific Dell tablet models
2. It fixes a real usability issue for affected users (tablet becomes
difficult to use without keyboard)
3. Zero risk of regression for any other hardware
4. The pattern exactly matches existing stable entries
5. The infrastructure has been stable since v5.10, available in all
current LTS kernels
6. Well-reviewed by experienced platform maintainers
The lack of explicit stable tags is not disqualifying - this is a
textbook example of a hardware quirk addition that should go to stable
trees to enable proper device functionality.
**YES**
drivers/platform/x86/intel/hid.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c
index 9c07a7faf18fe..560cc063198e1 100644
--- a/drivers/platform/x86/intel/hid.c
+++ b/drivers/platform/x86/intel/hid.c
@@ -177,6 +177,18 @@ static const struct dmi_system_id dmi_vgbs_allow_list[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Elite Dragonfly G2 Notebook PC"),
},
},
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell Pro Rugged 10 Tablet RA00260"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell Pro Rugged 12 Tablet RA02260"),
+ },
+ },
{ }
};
--
2.51.0
From: Jens Reidel <adrian(a)mainlining.org>
[ Upstream commit e3c13e0caa8ceb7dec1a7c4fcfd9dbef56a69fbe ]
Set CLK_OPS_PARENT_ENABLE to ensure the parent gets prepared and enabled
when switching to it, fixing an "rcg didn't update its configuration"
warning.
Signed-off-by: Jens Reidel <adrian(a)mainlining.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov(a)oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250919-sm7150-dispcc-fixes-v1-3-308ad47c5fce@ma…
Signed-off-by: Bjorn Andersson <andersson(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
LLM Generated explanations, may be completely bogus:
### 3. Technical Understanding
The `CLK_OPS_PARENT_ENABLE` flag (BIT(12) in clk-provider.h) ensures
that "parents need enable during gate/ungate, set rate and re-parent."
Without this flag, when the pixel clock source tries to switch to a new
parent clock, the parent might not be enabled, causing the RCG (Root
Clock Generator) to fail to update its configuration.
This is a well-known pattern in Qualcomm clock drivers - pixel clocks
dynamically switch parents during display panel initialization, and the
parent must be running for the configuration switch to complete.
### 4. Scope and Risk Assessment
- **Lines changed**: 1 line (adding a single flag)
- **Files touched**: 1 file
- **Complexity**: Very low
- **Risk**: Very low - this is an established pattern used across many
Qualcomm dispcc/gcc drivers
### 5. User Impact
- Affects SM7150-based devices (mobile phones)
- The warning indicates clock configuration issues that could affect
display initialization
- Limited scope - only SM7150 platform users
### 6. Stable Tree Applicability
- Driver was added in **v6.11** (commit 3829c412197e1)
- Only applicable to v6.11.y, v6.12.y, and newer stable trees
- NOT applicable to LTS kernels: 6.6.y, 6.1.y, 5.15.y (driver doesn't
exist)
### 7. Key Observations
**Against backporting:**
1. **No "Cc: stable(a)vger.kernel.org" tag** - The maintainers did not
request stable backport
2. **No "Fixes:" tag** - Doesn't reference the original commit where the
bug was introduced
3. **Very new driver** - SM7150 dispcc was added in v6.11, limiting
stable applicability
4. **Not a critical issue** - A warning message, not a crash, panic, or
data corruption
5. **Mobile SoC** - Users running stable kernels rarely use cutting-edge
mobile phone support
**For backporting:**
1. Valid bug fix with proven pattern
2. Extremely small and low-risk change
3. Has proper review from Qualcomm clock maintainers
drivers/clk/qcom/dispcc-sm7150.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/clk/qcom/dispcc-sm7150.c b/drivers/clk/qcom/dispcc-sm7150.c
index bdfff246ed3fe..ddc7230b8aea7 100644
--- a/drivers/clk/qcom/dispcc-sm7150.c
+++ b/drivers/clk/qcom/dispcc-sm7150.c
@@ -356,7 +356,7 @@ static struct clk_rcg2 dispcc_mdss_pclk0_clk_src = {
.name = "dispcc_mdss_pclk0_clk_src",
.parent_data = dispcc_parent_data_4,
.num_parents = ARRAY_SIZE(dispcc_parent_data_4),
- .flags = CLK_SET_RATE_PARENT,
+ .flags = CLK_SET_RATE_PARENT | CLK_OPS_PARENT_ENABLE,
.ops = &clk_pixel_ops,
},
};
--
2.51.0
The below “No resource for ep” warning appears when a StartTransfer
command is issued for bulk or interrupt endpoints in
`dwc3_gadget_ep_enable` while a previous StartTransfer on the same
endpoint is still in progress. Gadget function drivers can invoke
`usb_ep_enable` (which triggers a new StartTransfer command) before the
earlier transfer has completed. Because the previous StartTransfer is
still active, `dwc3_gadget_ep_disable` can skip the required
`EndTransfer` due to `DWC3_EP_DELAY_STOP`; the endpoint resources then
remain busy with the previous StartTransfer, and the dwc3 driver emits
the "No resource for ep" warning.
To resolve this, `dwc3_gadget_ep_enable` now checks the
`DWC3_EP_TRANSFER_STARTED` flag before issuing a new
StartTransfer. By preventing a second StartTransfer on an already busy
endpoint, the resource conflict is eliminated, the warning disappears,
and potential kernel panics caused by `panic_on_warn` are avoided.
------------[ cut here ]------------
dwc3 13200000.dwc3: No resource for ep1out
WARNING: CPU: 0 PID: 700 at drivers/usb/dwc3/gadget.c:398 dwc3_send_gadget_ep_cmd+0x2f8/0x76c
Call trace:
dwc3_send_gadget_ep_cmd+0x2f8/0x76c
__dwc3_gadget_ep_enable+0x490/0x7c0
dwc3_gadget_ep_enable+0x6c/0xe4
usb_ep_enable+0x5c/0x15c
mp_eth_stop+0xd4/0x11c
__dev_close_many+0x160/0x1c8
__dev_change_flags+0xfc/0x220
dev_change_flags+0x24/0x70
devinet_ioctl+0x434/0x524
inet_ioctl+0xa8/0x224
sock_do_ioctl+0x74/0x128
sock_ioctl+0x3bc/0x468
__arm64_sys_ioctl+0xa8/0xe4
invoke_syscall+0x58/0x10c
el0_svc_common+0xa8/0xdc
do_el0_svc+0x1c/0x28
el0_svc+0x38/0x88
el0t_64_sync_handler+0x70/0xbc
el0t_64_sync+0x1a8/0x1ac
Fixes: a97ea994605e ("usb: dwc3: gadget: offset Start Transfer latency for bulk EPs")
Cc: stable(a)vger.kernel.org
Signed-off-by: Selvarasu Ganesan <selvarasu.g(a)samsung.com>
---
Changes in v2:
- Removed change-id.
- Updated commit message.
Link to v1: https://lore.kernel.org/linux-usb/20251117152812.622-1-selvarasu.g@samsung.…
---
drivers/usb/dwc3/gadget.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 1f67fb6aead5..8d3caa71ea12 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -963,8 +963,9 @@ static int __dwc3_gadget_ep_enable(struct dwc3_ep *dep, unsigned int action)
* Issue StartTransfer here with no-op TRB so we can always rely on No
* Response Update Transfer command.
*/
- if (usb_endpoint_xfer_bulk(desc) ||
- usb_endpoint_xfer_int(desc)) {
+ if ((usb_endpoint_xfer_bulk(desc) ||
+ usb_endpoint_xfer_int(desc)) &&
+ !(dep->flags & DWC3_EP_TRANSFER_STARTED)) {
struct dwc3_gadget_ep_cmd_params params;
struct dwc3_trb *trb;
dma_addr_t trb_dma;
--
2.34.1
Previously, the AMPDU state bit for a given TID was set before attempting
to start a BA session, which could result in the AMPDU state being marked
active even if ieee80211_start_tx_ba_session() failed. This patch changes
the logic to clear the AMPDU state bit again if starting the BA session
fails, ensuring proper synchronization between AMPDU state and BA session
status.
This fixes potential issues with aggregation state tracking and improves
compatibility with mac80211 BA session management.
Fixes: 44eb173bdd4f ("wifi: mt76: mt7925: add link handling in mt7925_txwi_free")
Cc: stable(a)vger.kernel.org
Signed-off-by: Quan Zhou <quan.zhou(a)mediatek.com>
---
v2: modify to avoid weakening atomicity
---
drivers/net/wireless/mediatek/mt76/mt7925/mac.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
index 871b67101976..5e5b1df78633 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c
@@ -881,8 +881,10 @@ static void mt7925_tx_check_aggr(struct ieee80211_sta *sta, struct sk_buff *skb,
else
mlink = &msta->deflink;
- if (!test_and_set_bit(tid, &mlink->wcid.ampdu_state))
- ieee80211_start_tx_ba_session(sta, tid, 0);
+ if (!test_and_set_bit(tid, &mlink->wcid.ampdu_state)) {
+ if (ieee80211_start_tx_ba_session(sta, tid, 0))
+ clear_bit(tid, &mlink->wcid.ampdu_state);
+ }
}
static bool
--
2.45.2
[BUG]
The following write sequence with 64K page size and 4K fs block size
leads to file extent items being inserted without any data
checksum:
mkfs.btrfs -s 4k -f $dev > /dev/null
mount $dev $mnt
xfs_io -f -c "pwrite 0 16k" -c "pwrite 32k 4k" -c "pwrite 60k 4k" \
-c "truncate 16k" $mnt/foobar
umount $mnt
This results in the following 2 file extent items being inserted (extra
trace point added to insert_ordered_extent_file_extent()):
btrfs_finish_one_ordered: root=5 ino=257 file_off=61440 num_bytes=4096 csum_bytes=0
btrfs_finish_one_ordered: root=5 ino=257 file_off=0 num_bytes=16384 csum_bytes=16384
Note that for file offset 60K we're inserting a file extent without any
data checksum.
Also note that range [32K, 36K) didn't reach
insert_ordered_extent_file_extent(), which is the correct behavior as
that OE is fully truncated and should not result in any file extent.
Although the file extent at 60K will later be dropped by btrfs_truncate(),
if the transaction is committed after the file extent is inserted but
before it is dropped, there is a small window where we have a
file extent beyond EOF without any data checksum.
That will cause "btrfs check" to report an error.
[CAUSE]
The sequence happens like this:
- Buffered write dirtied the page cache and updated isize
Now the inode size is 64K, with the following page cache layout:
0 16K 32K 48K 64K
|/////////////| |//| |//|
- Truncate the inode to 16K
Which will trigger writeback through:
btrfs_setsize()
|- truncate_setsize()
| Now the inode size is set to 16K
|
|- btrfs_truncate()
|- btrfs_wait_ordered_range() for [16K, u64(-1))
|- btrfs_fdatawrite_range() for [16K, u64(-1))
|- extent_writepage() for folio 0
|- writepage_delalloc()
| Generated OE for [0, 16K), [32K, 36K) and [60K, 64K)
|
|- extent_writepage_io()
Then inside extent_writepage_io(), the dirty fs blocks are handled
differently:
- Submit write for range [0, 16K)
As they are still inside the inode size (16K).
- Mark OE [32K, 36K) as truncated
Since we only call btrfs_lookup_first_ordered_range() once, which
returned the first OE after file offset 16K.
- Mark all OEs inside range [16K, 64K) as finished
Which will mark OE ranges [32K, 36K) and [60K, 64K) as finished.
For OE [32K, 36K) since it's already marked as truncated, and its
truncated length is 0, no file extent will be inserted.
For OE [60K, 64K) it has never been submitted thus has no data
checksum, and we insert the file extent as usual.
This is the root cause of file extent at 60K to be inserted without
any data checksum.
- Clear dirty flags for range [16K, 64K)
It is the function btrfs_folio_clear_dirty() which search and clear
any dirty blocks inside that range.
[FIX]
The bug itself was introduced a long time ago, way before subpage and
larger folio support.
At that time, fs block size must match page size, thus the range
[cur, end) is just one fs block.
But later with subpage and larger folios, the same range [cur, end)
can have multiple blocks and ordered extents.
Later commit 18de34daa7c6 ("btrfs: truncate ordered extent when skipping
writeback past i_size") fixed a bug related to subpage/larger
folios, but it still used the old range [cur, end), meaning only
the first OE will be marked as truncated.
The proper fix here is to make EOF handling block-by-block, not trying
to handle the whole range to @end.
By this we always locate and truncate the OE for every dirty block.
Cc: stable(a)vger.kernel.org # 5.15+
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
---
fs/btrfs/extent_io.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 629fd5af4286..a4b74023618d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1728,7 +1728,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
struct btrfs_ordered_extent *ordered;
ordered = btrfs_lookup_first_ordered_range(inode, cur,
- folio_end - cur);
+ fs_info->sectorsize);
/*
* We have just run delalloc before getting here, so
* there must be an ordered extent.
@@ -1742,7 +1742,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
btrfs_put_ordered_extent(ordered);
btrfs_mark_ordered_io_finished(inode, folio, cur,
- end - cur, true);
+ fs_info->sectorsize, true);
/*
* This range is beyond i_size, thus we don't need to
* bother writing back.
@@ -1751,8 +1751,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_folio_clear_dirty(fs_info, folio, cur, end - cur);
- break;
+ btrfs_folio_clear_dirty(fs_info, folio, cur, fs_info->sectorsize);
+ continue;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
if (unlikely(ret < 0)) {
--
2.52.0
Since we recently started warning about uses of drm_atomic_get_crtc_state()
after the atomic check phase completes, we've started getting warnings about
this in nouveau. It appears a misplaced drm_atomic_get_crtc_state() call has
been hiding in our .prepare_fb callback for a while.
So, fix this by adding a new nv50_head_atom_get_new() function and use that
in our .prepare_fb callback instead.
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Fixes: 1590700d94ac ("drm/nouveau/kms/nv50-: split each resource type into their own source files")
Cc: <stable(a)vger.kernel.org> # v4.18+
---
V2:
* Don't call IS_ERR against the return value of
drm_atomic_get_new_crtc_state(), it doesn't return pointer errors.
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
---
drivers/gpu/drm/nouveau/dispnv50/atom.h | 13 +++++++++++++
drivers/gpu/drm/nouveau/dispnv50/wndw.c | 2 +-
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/nouveau/dispnv50/atom.h b/drivers/gpu/drm/nouveau/dispnv50/atom.h
index 93f8f4f645784..b43c4f9bbcdf5 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/atom.h
+++ b/drivers/gpu/drm/nouveau/dispnv50/atom.h
@@ -152,8 +152,21 @@ static inline struct nv50_head_atom *
nv50_head_atom_get(struct drm_atomic_state *state, struct drm_crtc *crtc)
{
struct drm_crtc_state *statec = drm_atomic_get_crtc_state(state, crtc);
+
if (IS_ERR(statec))
return (void *)statec;
+
+ return nv50_head_atom(statec);
+}
+
+static inline struct nv50_head_atom *
+nv50_head_atom_get_new(struct drm_atomic_state *state, struct drm_crtc *crtc)
+{
+ struct drm_crtc_state *statec = drm_atomic_get_new_crtc_state(state, crtc);
+
+ if (!statec)
+ return NULL;
+
return nv50_head_atom(statec);
}
diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c
index ef9e410babbfb..9a2c20fce0f3e 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c
@@ -583,7 +583,7 @@ nv50_wndw_prepare_fb(struct drm_plane *plane, struct drm_plane_state *state)
asyw->image.offset[0] = nvbo->offset;
if (wndw->func->prepare) {
- asyh = nv50_head_atom_get(asyw->state.state, asyw->state.crtc);
+ asyh = nv50_head_atom_get_new(asyw->state.state, asyw->state.crtc);
if (IS_ERR(asyh))
return PTR_ERR(asyh);
base-commit: c7685d11108acb387e44e3d81194d0d8959eaa44
--
2.52.0
From: Jan H. Schönherr <jschoenh(a)amazon.de>
It is possible to degrade host performance by manipulating performance
counters from a VM and tricking the host hypervisor to enable branch
tracing. When the guest programs a CPU to track branch instructions and
deliver an interrupt after exactly one branch instruction, the value one
is handled by the host KVM/perf subsystems and treated incorrectly as a
special value to enable the branch trace store (BTS) subsystem. It
should not be possible to enable BTS from a guest. When BTS is enabled,
it leads to general host performance degradation to both VMs and host.
Perf considers the combination of PERF_COUNT_HW_BRANCH_INSTRUCTIONS with
a sample_period of 1 a special case and handles this as a BTS event (see
intel_pmu_has_bts_period()) -- a deviation from the usual semantic,
where the sample_period represents the amount of branch instructions to
encounter before the overflow handler is invoked.
Nothing prevents a guest from programming its vPMU with the above
settings (count branch, interrupt after one branch), which causes KVM to
erroneously instruct perf to create a BTS event within
pmc_reprogram_counter(), which does not have the desired semantics.
The guest could also do more benign actions and request an interrupt
after a more reasonable number of branch instructions via its vPMU. In
that case counting works initially. However, KVM occasionally pauses and
resumes the created performance counters. If the remaining number of
branch instructions until the interrupt has reached exactly 1,
pmc_resume_counter() fails to resume the counter and a BTS event is
created instead, with its incorrect semantics.
Fix this behavior by not passing the special value "1" as sample_period
to perf. Instead, perform the same quirk that happens later in
x86_perf_event_set_period() anyway, when the performance counter is
transferred to the actual PMU: bump the sample_period to 2.
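The arithmetic behind the special value can be sketched in a few lines of
standalone C (assuming, for illustration, a 48-bit counter width):

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t bitmask = (1ULL << 48) - 1;   /* pmc_bitmask() for a 48-bit PMC */
	uint64_t counter = 0xffffffffffffULL;  /* guest-programmed counter value */
	uint64_t period = (0 - counter) & bitmask; /* get_sample_period() -> 1 */

	if (period == 1) /* the fix: skip the BTS magic value */
		period = 2;

	printf("sample_period = %llu\n", (unsigned long long)period);
	return 0;
}
```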
Testing:
From guest:
`./wrmsr -p 12 0x186 0x1100c4`
`./wrmsr -p 12 0xc1 0xffffffffffff`
`./wrmsr -p 12 0x186 0x5100c4`
This sequence sets up branch instruction counting, initializes the counter
to overflow after one event (0xffffffffffff), and then enables edge
detection (bit 18) for branch events.
./wrmsr -p 12 0x186 0x1100c4
Writes to IA32_PERFEVTSEL0 (0x186)
Value 0x1100c4 breaks down as:
Event = 0xC4 (Branch instructions)
Bits 16-17: 0x1 (User mode only)
Bit 22: 1 (Enable counter)
./wrmsr -p 12 0xc1 0xffffffffffff
Writes to IA32_PMC0 (0xC1)
Sets counter to maximum value (0xffffffffffff)
This effectively sets up the counter to overflow on the next branch
./wrmsr -p 12 0x186 0x5100c4
Updates IA32_PERFEVTSEL0 again
Similar to first command but adds bit 18 (0x4 to 0x5)
Enables edge detection (bit 18)
These MSR writes are trapped by the hypervisor in KVM and forwarded to
the perf subsystem to create corresponding monitoring events.
It is possible to repro this problem in a more realistic guest scenario:
`perf record -e branches:u -c 2 -a &`
`perf record -e branches:u -c 2 -a &`
This presumably triggers the issue by KVM pausing and resuming the
performance counter at the wrong moment, when its value is about to
overflow.
Signed-off-by: Jan H. Schönherr <jschoenh(a)amazon.de>
Signed-off-by: Fernand Sieber <sieberf(a)amazon.com>
Reviewed-by: David Woodhouse <dwmw(a)amazon.co.uk>
Reviewed-by: Hendrik Borghorst <hborghor(a)amazon.de>
Link: https://lore.kernel.org/r/20251124100220.238177-1-sieberf@amazon.com
---
arch/x86/kvm/pmu.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 487ad19a236e..547512028e24 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -225,6 +225,19 @@ static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
+ /*
+ * A sample_period of 1 might get mistaken by perf for a BTS event, see
+ * intel_pmu_has_bts_period(). This would prevent re-arming the counter
+ * via pmc_resume_counter(), followed by the accidental creation of an
+ * actual BTS event, which we do not want.
+ *
+ * Avoid this by bumping the sampling period. Note, that we do not lose
+ * any precision, because the same quirk happens later anyway (for
+ * different reasons) in x86_perf_event_set_period().
+ */
+ if (sample_period == 1)
+ sample_period = 2;
+
if (!sample_period)
sample_period = pmc_bitmask(pmc) + 1;
return sample_period;
--
2.43.0
Amazon Development Centre (South Africa) (Proprietary) Limited
29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
Registration Number: 2004 / 034463 / 07
of_get_child_by_name() returns a node pointer with refcount incremented.
Use the __free() attribute to manage the pgc_node reference, ensuring
automatic of_node_put() cleanup when pgc_node goes out of scope.
This eliminates the need for explicit error handling paths and avoids
reference count leaks.
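For reference, a minimal sketch of the scope-based cleanup pattern being used
(relying on linux/cleanup.h and the device_node cleanup class from
linux/of.h; example_probe is a hypothetical name, not the driver's code):

```c
#include <linux/cleanup.h>
#include <linux/of.h>
#include <linux/platform_device.h>

static int example_probe(struct platform_device *pdev)
{
	struct device_node *child __free(device_node) =
		of_get_child_by_name(pdev->dev.of_node, "pgc");

	if (!child)
		return -ENODEV;

	/* ... use child; of_node_put() runs automatically on every return */
	return 0;
}
```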
Fixes: 721cabf6c660 ("soc: imx: move PGC handling to a new GPC driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
Change in V4:
- Fix typo error in code
Change in V3:
- Ensure variable is assigned when using cleanup attribute
Change in V2:
- Use __free() attribute instead of explicit of_node_put() calls
---
drivers/pmdomain/imx/gpc.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/pmdomain/imx/gpc.c b/drivers/pmdomain/imx/gpc.c
index f18c7e6e75dd..56a78cc86584 100644
--- a/drivers/pmdomain/imx/gpc.c
+++ b/drivers/pmdomain/imx/gpc.c
@@ -403,13 +403,12 @@ static int imx_gpc_old_dt_init(struct device *dev, struct regmap *regmap,
static int imx_gpc_probe(struct platform_device *pdev)
{
const struct imx_gpc_dt_data *of_id_data = device_get_match_data(&pdev->dev);
- struct device_node *pgc_node;
+ struct device_node *pgc_node __free(device_node)
+ = of_get_child_by_name(pdev->dev.of_node, "pgc");
struct regmap *regmap;
void __iomem *base;
int ret;
- pgc_node = of_get_child_by_name(pdev->dev.of_node, "pgc");
-
/* bail out if DT too old and doesn't provide the necessary info */
if (!of_property_present(pdev->dev.of_node, "#power-domain-cells") &&
!pgc_node)
--
2.34.1
According to PCIe r7.0, sec. 6.2.11, "Implementation Note: Determination
of DPC Control", it is recommended that the Operating System link the
enablement of Downstream Port Containment (DPC) to the enablement of
Advanced Error Reporting (AER).
However, AER is advertised only on Root Port (RP) or Root Complex Event
Collector (RCEC) devices. On the other hand, DPC may be advertised on
any PCIe device in the hierarchy. In fact, since the main use case of DPC
is for the switch upstream of an endpoint device to trigger a signal to
the host bridge, it is imperative that it be supported on non-RP,
non-RCEC devices.
Previously portdrv has interpreted the spec to mean that the AER service
must be available on the same device in order for DPC to be available.
This is not what the implementation note meant to imply. If the firmware
hands Linux control of AER via _OSC on the host bridge upstream of the
device, then Linux should be allowed to assume control of DPC on the device.
The comment above this check alludes to this, by saying:
> With dpc-native, allow Linux to use DPC even if it doesn't have
> permission to use AER.
However, permission to use AER is negotiated at the host bridge, not
per-device. So we should not link DPC to enabling AER at the device.
Instead, DPC should be enabled if the OS has control of AER for the
host bridge that is upstream of the device in question, or if dpc-native
was set on the command line.
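The resulting decision can be summarized by a small predicate (a sketch only;
the helper name is hypothetical and the real logic lives inline in
get_port_device_capability()):

```c
#include <stdbool.h>

/* DPC is offered as a port service iff the device has the DPC capability,
 * AER support is compiled in, and either the OS owns AER for the host
 * bridge or the user forced pcie_ports=dpc-native on the command line. */
static bool want_dpc_service(bool has_dpc_cap, bool aer_available,
			     bool host_native_aer, bool dpc_native)
{
	return has_dpc_cap && aer_available && (host_native_aer || dpc_native);
}
```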
Cc: stable(a)vger.kernel.org
Signed-off-by: Darshit Shah <darnshah(a)amazon.de>
Reviewed-by: Lukas Wunner <lukas(a)wunner.de>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy(a)linux.intel.com>
---
Changes from v2:
* + stable(a)vger.kernel.org since moving to v6.2+ breaks DPC on our systems
* Minor stylistic changes to commit message
* NO functional changes
drivers/pci/pcie/portdrv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c
index 38a41ccf79b9..8db2fa140ae2 100644
--- a/drivers/pci/pcie/portdrv.c
+++ b/drivers/pci/pcie/portdrv.c
@@ -264,7 +264,7 @@ static int get_port_device_capability(struct pci_dev *dev)
*/
if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC) &&
pci_aer_available() &&
- (pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER)))
+ (host->native_aer || pcie_ports_dpc_native))
services |= PCIE_PORT_SERVICE_DPC;
/* Enable bandwidth control if more than one speed is supported. */
--
2.47.3
Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597
scx_enable() calls scx_bypass(true) to initialize in bypass mode and then
scx_bypass(false) on success to exit. If scx_enable() fails during task
initialization - e.g. scx_cgroup_init() or scx_init_task() returns an error -
it jumps to err_disable while bypass is still active. scx_disable_workfn()
then calls scx_bypass(true/false) for its own bypass, leaving the bypass depth
at 1 instead of 0. This causes the system to remain permanently in bypass mode
after a failed scx_enable().
Failures after task initialization is complete - e.g. scx_tryset_enable_state()
at the end - already call scx_bypass(false) before reaching the error path and
are not affected. This only affects a subset of failure modes.
Fix it by tracking whether scx_enable() called scx_bypass(true) in a bool and
having scx_disable_workfn() call an extra scx_bypass(false) to clear it. This
is a temporary measure as the bypass depth will be moved into the sched
instance, which will make this tracking unnecessary.
Fixes: 8c2090c504e9 ("sched_ext: Initialize in bypass mode")
Cc: stable(a)vger.kernel.org # v6.12+
Reported-by: Chris Mason <clm(a)meta.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
---
kernel/sched/ext.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -41,6 +41,13 @@ static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+/*
+ * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass
+ * depth on enable failure. Will be removed when bypass depth is moved into the
+ * sched instance.
+ */
+static bool scx_bypassed_for_enable;
+
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -4318,6 +4325,11 @@ static void scx_disable_workfn(struct kt
scx_dsp_max_batch = 0;
free_kick_syncs();
+ if (scx_bypassed_for_enable) {
+ scx_bypassed_for_enable = false;
+ scx_bypass(false);
+ }
+
mutex_unlock(&scx_enable_mutex);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4970,6 +4982,7 @@ static int scx_enable(struct sched_ext_o
* Init in bypass mode to guarantee forward progress.
*/
scx_bypass(true);
+ scx_bypassed_for_enable = true;
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
@@ -5067,6 +5080,7 @@ static int scx_enable(struct sched_ext_o
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
+ scx_bypassed_for_enable = false;
scx_bypass(false);
if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
--
tejun
Initialize the eb.vma array with values of 0 when the eb structure is
first set up. In particular, this sets the eb->vma[i].vma pointers to
NULL, simplifying cleanup and getting rid of the bug described below.
During the execution of eb_lookup_vmas(), the eb->vma array is
successively filled up with struct eb_vma objects. This process includes
calling eb_add_vma(), which might fail; however, even in the event of
failure, eb->vma[i].vma is set for the currently processed buffer.
If eb_add_vma() fails, eb_lookup_vmas() returns with an error, which
prompts a call to eb_release_vmas() to clean up the mess. Since
eb_lookup_vmas() might fail during processing any (possibly not first)
buffer, eb_release_vmas() checks whether a buffer's vma is NULL to know
at what point did the lookup function fail.
In eb_lookup_vmas(), eb->vma[i].vma is set to NULL if either the helper
function eb_lookup_vma() or eb_validate_vma() fails. eb->vma[i+1].vma is
set to NULL in case i915_gem_object_userptr_submit_init() fails; the
current one needs to be cleaned up by eb_release_vmas() at this point,
so the next one is set. If eb_add_vma() fails, neither the current nor
the next vma is nullified, which is a source of a NULL deref bug
described in the issue linked in the Closes tag.
When entering eb_lookup_vmas(), the vma pointers are set to the slab
poison value, instead of NULL. This doesn't matter for the actual
lookup, since it gets overwritten anyway, however the eb_release_vmas()
function only recognizes NULL as the stopping value, hence the pointers
are being nullified as they go in case of intermediate failure. This
patch changes the approach to filling them all with NULL at the start
instead, rather than handling that manually during failure.
Reported-by: Gangmin Kim <km.kim1503(a)gmail.com>
Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/15062
Fixes: 544460c33821 ("drm/i915: Multi-BB execbuf")
Cc: <stable(a)vger.kernel.org> # 5.16.x
Signed-off-by: Krzysztof Niemiec <krzysztof.niemiec(a)intel.com>
---
I messed up the continuity in previous revisions; the original patch
was sent as [1], and the first revision (which I didn't mark as v2 due
to the title change) was sent as [2].
This is the full current changelog:
v3:
- use memset() to fill the entire eb.vma array with zeros instead of
looping through the elements (Janusz)
- add a comment clarifying the mechanism of the initial allocation (Janusz)
- change the commit log again, including title
- rearrange the tags to keep checkpatch happy
v2:
- set the eb->vma[i].vma pointers to NULL during setup instead of
ad-hoc at failure (Janusz)
- romanize the reporter's name (Andi, offline)
- change the commit log, including title
[1] https://patchwork.freedesktop.org/series/156832/
[2] https://patchwork.freedesktop.org/series/158036/
.../gpu/drm/i915/gem/i915_gem_execbuffer.c | 36 +++++++++----------
1 file changed, 16 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index b057c2fa03a4..5f2b736b53ab 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -951,13 +951,13 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
vma = eb_lookup_vma(eb, eb->exec[i].handle);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
- goto err;
+ return err;
}
err = eb_validate_vma(eb, &eb->exec[i], vma);
if (unlikely(err)) {
i915_vma_put(vma);
- goto err;
+ return err;
}
err = eb_add_vma(eb, ¤t_batch, i, vma);
@@ -966,19 +966,8 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
if (i915_gem_object_is_userptr(vma->obj)) {
err = i915_gem_object_userptr_submit_init(vma->obj);
- if (err) {
- if (i + 1 < eb->buffer_count) {
- /*
- * Execbuffer code expects last vma entry to be NULL,
- * since we already initialized this entry,
- * set the next value to NULL or we mess up
- * cleanup handling.
- */
- eb->vma[i + 1].vma = NULL;
- }
-
+ if (err)
return err;
- }
eb->vma[i].flags |= __EXEC_OBJECT_USERPTR_INIT;
eb->args->flags |= __EXEC_USERPTR_USED;
@@ -986,10 +975,6 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
}
return 0;
-
-err:
- eb->vma[i].vma = NULL;
- return err;
}
static int eb_lock_vmas(struct i915_execbuffer *eb)
@@ -3375,7 +3360,9 @@ i915_gem_do_execbuffer(struct drm_device *dev,
eb.exec = exec;
eb.vma = (struct eb_vma *)(exec + args->buffer_count + 1);
- eb.vma[0].vma = NULL;
+
+ memset(eb.vma, 0x00, args->buffer_count * sizeof(struct eb_vma));
+
eb.batch_pool = NULL;
eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
@@ -3584,7 +3571,16 @@ i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data,
if (err)
return err;
- /* Allocate extra slots for use by the command parser */
+ /*
+ * Allocate extra slots for use by the command parser.
+ *
+ * Note that this allocation handles two different arrays (the
+ * exec2_list array, and the eventual eb.vma array introduced in
+ * i915_gem_do_execbuffer()) that reside in virtually contiguous
+ * memory. Also note that the allocation doesn't fill the area with
+ * zeros (the first part doesn't need to be); only the second part
+ * is explicitly zeroed later in i915_gem_do_execbuffer().
+ */
exec2_list = kvmalloc_array(count + 2, eb_element_size(),
__GFP_NOWARN | GFP_KERNEL);
if (exec2_list == NULL) {
--
2.45.2
When nvmem_cell_read() fails in mt798x_phy_calibration(), the function
returns without calling nvmem_cell_put(), leaking the cell reference.
Move nvmem_cell_put() right after nvmem_cell_read() to ensure the cell
reference is always released regardless of the read result.
Found via static analysis and code review.
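The general acquire/read/release pattern the fix follows looks like
this (an illustrative fragment, not the driver code; the cell name is
hypothetical):

    struct nvmem_cell *cell;
    size_t len;
    u32 *buf;

    cell = nvmem_cell_get(dev, "phy-cal-data");   /* hypothetical cell name */
    if (IS_ERR(cell))
            return PTR_ERR(cell);

    buf = (u32 *)nvmem_cell_read(cell, &len);
    nvmem_cell_put(cell);   /* drop the reference unconditionally */
    if (IS_ERR(buf))
            return PTR_ERR(buf);   /* no reference leak on the error path */

Releasing the cell immediately after the read means every later return,
success or failure, is covered.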
Fixes: 98c485eaf509 ("net: phy: add driver for MediaTek SoC built-in GE PHYs")
Cc: stable(a)vger.kernel.org
Signed-off-by: Miaoqian Lin <linmq006(a)gmail.com>
---
drivers/net/phy/mediatek/mtk-ge-soc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/phy/mediatek/mtk-ge-soc.c b/drivers/net/phy/mediatek/mtk-ge-soc.c
index cd09fbf92ef2..2c4bbc236202 100644
--- a/drivers/net/phy/mediatek/mtk-ge-soc.c
+++ b/drivers/net/phy/mediatek/mtk-ge-soc.c
@@ -1167,9 +1167,9 @@ static int mt798x_phy_calibration(struct phy_device *phydev)
}
buf = (u32 *)nvmem_cell_read(cell, &len);
+ nvmem_cell_put(cell);
if (IS_ERR(buf))
return PTR_ERR(buf);
- nvmem_cell_put(cell);
if (!buf[0] || !buf[1] || !buf[2] || !buf[3] || len < 4 * sizeof(u32)) {
phydev_err(phydev, "invalid efuse data\n");
--
2.25.1
The hw_pp pointer is checked almost everywhere in
dpu_encoder_phys_wb_setup_ctl(), but in a single place the check is
missing.
Also use convenient locals instead of phys_enc->* where available.
Cc: stable(a)vger.kernel.org
Fixes: d7d0e73f7de33 ("drm/msm/dpu: introduce the dpu_encoder_phys_* for writeback")
Signed-off-by: Nikolay Kuratov <kniv(a)yandex-team.ru>
---
drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_wb.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_wb.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_wb.c
index 46f348972a97..6d28f2281c76 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_wb.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_wb.c
@@ -247,14 +247,12 @@ static void dpu_encoder_phys_wb_setup_ctl(struct dpu_encoder_phys *phys_enc)
if (hw_cdm)
intf_cfg.cdm = hw_cdm->idx;
- if (phys_enc->hw_pp->merge_3d && phys_enc->hw_pp->merge_3d->ops.setup_3d_mode)
- phys_enc->hw_pp->merge_3d->ops.setup_3d_mode(phys_enc->hw_pp->merge_3d,
- mode_3d);
+ if (hw_pp && hw_pp->merge_3d && hw_pp->merge_3d->ops.setup_3d_mode)
+ hw_pp->merge_3d->ops.setup_3d_mode(hw_pp->merge_3d, mode_3d);
/* setup which pp blk will connect to this wb */
- if (hw_pp && phys_enc->hw_wb->ops.bind_pingpong_blk)
- phys_enc->hw_wb->ops.bind_pingpong_blk(phys_enc->hw_wb,
- phys_enc->hw_pp->idx);
+ if (hw_pp && hw_wb->ops.bind_pingpong_blk)
+ hw_wb->ops.bind_pingpong_blk(hw_wb, hw_pp->idx);
phys_enc->hw_ctl->ops.setup_intf_cfg(phys_enc->hw_ctl, &intf_cfg);
} else if (phys_enc->hw_ctl && phys_enc->hw_ctl->ops.setup_intf_cfg) {
--
2.34.1
Hi,
The following patch has already been merged into mainline (master) and
the linux-6.12.y stable branch:
net: lan743x: Allocate rings outside ZONE_DMA (commit
8a8f3f4991761a70834fe6719d09e9fd338a766e)
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=…
I believe this patch should be backported. Our customer is experiencing
an issue where a Microchip LAN7430 NIC fails to work in a crashkernel
environment.
Because the driver uses the ZONE_DMA flag, memory allocation fails when
crashkernel reserves memory, preventing the card from functioning,
especially when they try to transfer the vmcore file over the network.
We are using older kernel versions and request that this fix be applied
to the following branches:
linux-6.1.y
linux-6.6.y
linux-6.12.y
Could you please help backport this fix?
Thank you for your time and support.
Best regards,
Liyin
We switched from (wrongly) using the page count to an independent
shared count. Now, shared page tables have a refcount of 1 (excluding
speculative references) and instead use ptdesc->pt_share_count to
identify sharing.
We didn't convert hugetlb_pmd_shared(), so right now, we would never
detect a shared PMD table as such, because sharing/unsharing no longer
touches the refcount of a PMD table.
Page migration, like mbind() or migrate_pages(), would allow for
migrating folios mapped into such shared PMD tables, even though the
folios are not exclusive. In smaps we would account them as "private"
although they are "shared", and we would wrongly set PM_MMAP_EXCLUSIVE
in the pagemap interface.
Fix it by properly using ptdesc_pmd_is_shared() in hugetlb_pmd_shared().
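Conceptually, the new check reads the explicit share counter instead of
the page refcount. A sketch of the assumed semantics (the exact helper
lives in the ptdesc code; the field name follows commit 59d9094df3d7):

    /* Sketch, assuming pt_share_count is the sharing counter. */
    static inline bool ptdesc_pmd_is_shared_sketch(struct ptdesc *ptdesc)
    {
            /* shared iff at least one other user holds the table */
            return atomic_read(&ptdesc->pt_share_count) != 0;
    }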
Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count")
Cc: <stable(a)vger.kernel.org>
Cc: Liu Shixin <liushixin2(a)huawei.com>
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
---
include/linux/hugetlb.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 019a1c5281e4e..03c8725efa289 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1326,7 +1326,7 @@ static inline __init void hugetlb_cma_reserve(int order)
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
- return page_count(virt_to_page(pte)) > 1;
+ return ptdesc_pmd_is_shared(virt_to_ptdesc(pte));
}
#else
static inline bool hugetlb_pmd_shared(pte_t *pte)
--
2.52.0
Hi, all
We hit hard-lockups when the Intel IOMMU waits indefinitely for an ATS invalidation
that cannot complete, especially under GDR high-load conditions.
1. Hard lockup when a passthrough PCIe NIC with ATS enabled goes link-down in Intel
IOMMU non-scalable mode. Two scenarios exist: NIC link-down with an explicit
link-down event and link-down without any event.
a) NIC link-down with an explicit link-down event.
Call Trace:
qi_submit_sync
qi_flush_dev_iotlb
__context_flush_dev_iotlb.part.0
domain_context_clear_one_cb
pci_for_each_dma_alias
device_block_translation
blocking_domain_attach_dev
iommu_deinit_device
__iommu_group_remove_device
iommu_release_device
iommu_bus_notifier
blocking_notifier_call_chain
bus_notify
device_del
pci_remove_bus_device
pci_stop_and_remove_bus_device
pciehp_unconfigure_device
pciehp_disable_slot
pciehp_handle_presence_or_link_change
pciehp_ist
b) NIC link-down without an event: hard lockup on VM destroy.
Call Trace:
qi_submit_sync
qi_flush_dev_iotlb
__context_flush_dev_iotlb.part.0
domain_context_clear_one_cb
pci_for_each_dma_alias
device_block_translation
blocking_domain_attach_dev
__iommu_attach_device
__iommu_device_set_domain
__iommu_group_set_domain_internal
iommu_detach_group
vfio_iommu_type1_detach_group
vfio_group_detach_container
vfio_group_fops_release
__fput
2. Hard lockup when a passthrough PCIe NIC with ATS enabled goes link-down in Intel
IOMMU scalable mode; NIC link-down without an event hard-locks on VM destroy.
Call Trace:
qi_submit_sync
qi_flush_dev_iotlb
intel_pasid_tear_down_entry
device_block_translation
blocking_domain_attach_dev
__iommu_attach_device
__iommu_device_set_domain
__iommu_group_set_domain_internal
iommu_detach_group
vfio_iommu_type1_detach_group
vfio_group_detach_container
vfio_group_fops_release
__fput
Fix both issues with two patches:
1. Skip dev-IOTLB flush for inaccessible devices in __context_flush_dev_iotlb() using
pci_device_is_present().
2. Use pci_device_is_present() instead of pci_dev_is_disconnected() to decide when to
skip ATS invalidation in devtlb_invalidation_with_pasid().
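The shape of the intended guard, as a sketch (illustrative only; the
real patches touch drivers/iommu/intel/pasid.c and the exact call sites
differ):

    /* Skip the dev-IOTLB (ATS) invalidation when the device is gone. */
    if (!pci_device_is_present(pdev))
            return;   /* a queued ATS invalidation would never complete */

    qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, addr, mask);

pci_device_is_present() also covers devices that went away without a
surprise-removal event, which pci_dev_is_disconnected() misses.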
Best Regards,
Jinhui
---
v1: https://lore.kernel.org/all/20251210171431.1589-1-guojinhui.liam@bytedance.…
Changelog in v1 -> v2 (suggested by Baolu Lu)
- Simplify the pci_device_is_present() check in __context_flush_dev_iotlb().
- Add Cc: stable(a)vger.kernel.org to both patches.
Jinhui Guo (2):
iommu/vt-d: Skip dev-iotlb flush for inaccessible PCIe device without
scalable mode
iommu/vt-d: Flush dev-IOTLB only when PCIe device is accessible in
scalable mode
drivers/iommu/intel/pasid.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
--
2.20.1
As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix
huge_pmd_unshare() vs GUP-fast race") we can end up in some situations
where we perform so many IPI broadcasts when unsharing hugetlb PMD page
tables that it severely regresses some workloads.
In particular, when we fork()+exit(), or when we munmap() a large
area backed by many shared PMD tables, we perform one IPI broadcast per
unshared PMD table.
There are two optimizations to be had:
(1) When we process (unshare) multiple such PMD tables, such as during
exit(), it is sufficient to send a single IPI broadcast (as long as
we respect locking rules) instead of one per PMD table.
Locking prevents that any of these PMD tables could get reuse before
we drop the lock.
(2) When we are not the last sharer (> 2 users including us), there is
no need to send the IPI broadcast. The shared PMD tables cannot
become exclusive (fully unshared) before an IPI will be broadcasted
by the last sharer.
Concurrent GUP-fast could walk into a PMD table just before we
unshared it. It could then succeed in grabbing a page from the
shared page table even after munmap() etc. succeeded (and suppressed
an IPI). But there is no difference compared to GUP-fast just
sleeping for a while after grabbing the page and re-enabling IRQs.
Most importantly, GUP-fast will never walk into page tables that are
no-longer shared, because the last sharer will issue an IPI
broadcast.
(if ever required, checking whether the PUD changed in GUP-fast
after grabbing the page like we do in the PTE case could handle
this)
So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather
infrastructure so we can implement these optimizations and demystify the
code at least a bit. Extend the mmu_gather infrastructure to be able to
deal with our special hugetlb PMD table sharing implementation.
We'll consolidate the handling for (full) unsharing of PMD tables in
tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track
in "struct mmu_gather" whether we had (full) unsharing of PMD tables.
Because locking is very special (concurrent unsharing+reuse must be
prevented), we disallow deferring flushing to tlb_finish_mmu() and instead
require an explicit earlier call to tlb_flush_unshared_tables().
From hugetlb code, we call huge_pmd_unshare_flush() where we make sure
that the expected lock protecting us from concurrent unsharing+reuse is
still held.
Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that
tlb_flush_unshared_tables() was properly called earlier.
Document it all properly.
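The resulting calling convention, sketched under the locking rules
described above (illustrative, not a verbatim call site):

    struct mmu_gather tlb;

    tlb_gather_mmu(&tlb, mm);
    i_mmap_lock_write(mapping);

    /* unshare any number of PMD tables under the lock */
    huge_pmd_unshare(&tlb, vma, addr, ptep);

    /* one TLB flush + at most one IPI broadcast for the whole batch */
    huge_pmd_unshare_flush(&tlb, vma);

    i_mmap_unlock_write(mapping);
    tlb_finish_mmu(&tlb);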
Notes about tlb_remove_table_sync_one() interaction with unsharing:
There are two fairly tricky things:
(1) tlb_remove_table_sync_one() is a NOP on architectures without
CONFIG_MMU_GATHER_RCU_TABLE_FREE.
Here, the assumption is that the previous TLB flush would send an
IPI to all relevant CPUs. Careful: some architectures like x86 only
send IPIs to all relevant CPUs when tlb->freed_tables is set.
The relevant architectures should be selecting
MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable
kernels and it might have been problematic before this patch.
Also, the arch flushing behavior (independent of IPIs) is different
when tlb->freed_tables is set. Do we have to enlighten them to also
take care of tlb->unshared_tables? So far we didn't care, so
hopefully we are fine. Of course, we could be setting
tlb->freed_tables as well, but that might then unnecessarily flush
too much, because the semantics of tlb->freed_tables are a bit
fuzzy.
This patch changes nothing in this regard.
(2) tlb_remove_table_sync_one() is not a NOP on architectures with
CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync.
Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB)
we still issue IPIs during TLB flushes and don't actually need the
second tlb_remove_table_sync_one().
This optimization can be implemented on top of this, by checking e.g. in
tlb_remove_table_sync_one() whether we really need IPIs. But as
described in (1), it really must honor tlb->freed_tables then to
send IPIs to all relevant CPUs.
Further note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a
concern, as we are holding the i_mmap_lock the whole time, preventing
concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed
separately as a cleanup later.
There are plenty more cleanups to be had, but they have to wait until
this is fixed.
Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race")
Reported-by: "Uschakow, Stanislav" <suschako(a)amazon.de>
Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/
Cc: <stable(a)vger.kernel.org>
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
---
include/asm-generic/tlb.h | 69 +++++++++++++++++++++-
include/linux/hugetlb.h | 19 +++---
mm/hugetlb.c | 121 ++++++++++++++++++++++----------------
mm/mmu_gather.c | 6 ++
mm/mprotect.c | 2 +-
mm/rmap.c | 25 +++++---
6 files changed, 173 insertions(+), 69 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 1fff717cae510..324a21f53b644 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -364,6 +364,17 @@ struct mmu_gather {
unsigned int vma_huge : 1;
unsigned int vma_pfn : 1;
+ /*
+ * Did we unshare (unmap) any shared page tables?
+ */
+ unsigned int unshared_tables : 1;
+
+ /*
+ * Did we unshare any page tables such that they are now exclusive
+ * and could get reused+modified by the new owner?
+ */
+ unsigned int fully_unshared_tables : 1;
+
unsigned int batch_count;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -400,6 +411,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
tlb->cleared_pmds = 0;
tlb->cleared_puds = 0;
tlb->cleared_p4ds = 0;
+ tlb->unshared_tables = 0;
/*
* Do not reset mmu_gather::vma_* fields here, we do not
* call into tlb_start_vma() again to set them if there is an
@@ -484,7 +496,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
* these bits.
*/
if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
- tlb->cleared_puds || tlb->cleared_p4ds))
+ tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables))
return;
tlb_flush(tlb);
@@ -773,6 +785,61 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#endif
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt,
+ unsigned long addr)
+{
+ /*
+ * The caller must make sure that concurrent unsharing + exclusive
+ * reuse is impossible until tlb_flush_unshared_tables() was called.
+ */
+ VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt));
+ ptdesc_pmd_pts_dec(pt);
+
+ /* Clearing a PUD pointing at a PMD table with PMD leaves. */
+ tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE);
+
+ /*
+ * If the page table is now exclusively owned, we fully unshared
+ * a page table.
+ */
+ if (!ptdesc_pmd_is_shared(pt))
+ tlb->fully_unshared_tables = true;
+ tlb->unshared_tables = true;
+}
+
+static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
+{
+ /*
+ * As soon as the caller drops locks to allow for reuse of
+ * previously-shared tables, these tables could get modified and
+ * even reused outside of hugetlb context. So flush the TLB now.
+ *
+ * Note that we cannot defer the flush to a later point even if we are
+ * not the last sharer of the page table.
+ */
+ if (tlb->unshared_tables)
+ tlb_flush_mmu_tlbonly(tlb);
+
+ /*
+ * Similarly, we must make sure that concurrent GUP-fast will not
+ * walk previously-shared page tables that are getting modified+reused
+ * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast.
+ *
+ * We only perform this when we are the last sharer of a page table,
+ * as the IPI will reach all CPUs, including any running GUP-fast.
+ *
+ * Note that on configs where tlb_remove_table_sync_one() is a NOP,
+ * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
+ * required IPIs already for us.
+ */
+ if (tlb->fully_unshared_tables) {
+ tlb_remove_table_sync_one();
+ tlb->fully_unshared_tables = false;
+ }
+}
+#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
+
#endif /* CONFIG_MMU */
#endif /* _ASM_GENERIC__TLB_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 03c8725efa289..63b248c6bfd47 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
@@ -271,7 +272,7 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
@@ -300,13 +301,17 @@ static inline struct address_space *hugetlb_folio_mapping_lock_write(
return NULL;
}
-static inline int huge_pmd_unshare(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static inline int huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
return 0;
}
+static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb,
+ struct vm_area_struct *vma)
+{
+}
+
static inline void adjust_range_if_pmd_sharing_possible(
struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
@@ -432,7 +437,7 @@ static inline void move_hugetlb_state(struct folio *old_folio,
{
}
-static inline long hugetlb_change_protection(
+static inline long hugetlb_change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c77cdef12a32..3db94693a06fc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5096,8 +5096,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
- bool shared_pmd = false;
+ struct mmu_gather tlb;
+ tlb_gather_mmu(&tlb, vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
@@ -5122,12 +5123,12 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
continue;
- if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
- shared_pmd = true;
+ if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
continue;
}
+ tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr);
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
@@ -5136,13 +5137,13 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
}
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, old_end - len, old_end);
+ tlb_flush_mmu_tlbonly(&tlb);
+ huge_pmd_unshare_flush(&tlb, vma);
+
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
return len + old_addr - old_end;
}
@@ -5161,7 +5162,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
bool adjust_reservation;
unsigned long last_addr_mask;
- bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -5184,10 +5184,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
spin_unlock(ptl);
- tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
- force_flush = true;
address |= last_addr_mask;
continue;
}
@@ -5303,14 +5301,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
tlb_end_vma(tlb, vma);
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (force_flush)
- tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
}
void __hugetlb_zap_begin(struct vm_area_struct *vma,
@@ -6399,7 +6390,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
#endif /* CONFIG_USERFAULTFD */
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6409,7 +6400,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
long pages = 0, psize = huge_page_size(h);
- bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -6452,7 +6442,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
}
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6461,7 +6451,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
- shared_pmd = true;
address |= last_addr_mask;
continue;
}
@@ -6522,22 +6511,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
}
next:
spin_unlock(ptl);
cond_resched();
}
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, start, end);
+
+ tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
* downgrading page table protection not changing it to point to a new
@@ -6904,18 +6887,27 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return pte;
}
-/*
- * unmap huge page backed by shared pte.
+/**
+ * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ * @addr: the address we are trying to unshare.
+ * @ptep: pointer into the (pmd) page table.
+ *
+ * Called with the page table lock held, the i_mmap_rwsem held in write mode
+ * and the hugetlb vma lock held in write mode.
*
- * Called with page table lock held.
+ * Note: The caller must call huge_pmd_unshare_flush() before dropping the
+ * i_mmap_rwsem.
*
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
+ * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it
+ * was not a shared PMD table.
*/
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
+ struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
@@ -6927,18 +6919,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
pud_clear(pud);
- /*
- * Once our caller drops the rmap lock, some other process might be
- * using this page table as a normal, non-hugetlb page table.
- * Wait for pending gup_fast() in other threads to finish before letting
- * that happen.
- */
- tlb_remove_table_sync_one();
- ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+
+ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
+
mm_dec_nr_pmds(mm);
return 1;
}
+/*
+ * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ *
+ * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table
+ * unsharing with concurrent page table walkers (TLB, GUP-fast, etc.).
+ *
+ * This function must be called after a sequence of huge_pmd_unshare()
+ * calls while still holding the i_mmap_rwsem.
+ */
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ /*
+ * We must synchronize page table unsharing such that nobody will
+ * try reusing a previously-shared page table while it might still
+ * be in use by previous sharers (TLB, GUP-fast).
+ */
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+ tlb_flush_unshared_tables(tlb);
+}
+
#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -6947,12 +6957,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+}
+
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
@@ -7219,6 +7233,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
+ struct mmu_gather tlb;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
@@ -7229,6 +7244,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (start >= end)
return;
+ tlb_gather_mmu(&tlb, mm);
flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
@@ -7248,10 +7264,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- huge_pmd_unshare(mm, vma, address, ptep);
+ huge_pmd_unshare(&tlb, vma, address, ptep);
spin_unlock(ptl);
}
- flush_hugetlb_tlb_range(vma, start, end);
+ huge_pmd_unshare_flush(&tlb, vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
@@ -7261,6 +7277,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
}
/*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 247e3f9db6c7a..822a790127375 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -468,6 +468,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
*/
void tlb_finish_mmu(struct mmu_gather *tlb)
{
+ /*
+ * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
+ * due to complicated locking requirements with page table unsharing.
+ */
+ VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
+
/*
* If there are parallel threads are doing PTE changes on same range
* under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 283889e4f1cec..5c330e817129e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -652,7 +652,7 @@ long change_protection(struct mmu_gather *tlb,
#endif
if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot,
+ pages = hugetlb_change_protection(tlb, vma, start, end, newprot,
cp_flags);
else
pages = change_protection_range(tlb, vma, start, end, newprot,
diff --git a/mm/rmap.c b/mm/rmap.c
index 748f48727a162..d6799afe11147 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -76,7 +76,7 @@
#include <linux/mm_inline.h>
#include <linux/oom.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
@@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma))
goto walk_abort;
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
goto walk_done;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
@@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* fail if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma)) {
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
--
2.52.0
Since we recently started warning about uses of drm_atomic_get_crtc_state()
after the atomic check phase completes, we've started getting warnings
about this in nouveau. It appears a misplaced drm_atomic_get_crtc_state()
call has been hiding in our .prepare_fb callback for a while.
So, fix this by adding a new nv50_head_atom_get_new() function and use that
in our .prepare_fb callback instead.
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Fixes: 1590700d94ac ("drm/nouveau/kms/nv50-: split each resource type into their own source files")
Cc: <stable(a)vger.kernel.org> # v4.18+
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
---
drivers/gpu/drm/nouveau/dispnv50/atom.h | 13 +++++++++++++
drivers/gpu/drm/nouveau/dispnv50/wndw.c | 2 +-
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/nouveau/dispnv50/atom.h b/drivers/gpu/drm/nouveau/dispnv50/atom.h
index 93f8f4f645784..85b7cf70d13c4 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/atom.h
+++ b/drivers/gpu/drm/nouveau/dispnv50/atom.h
@@ -152,8 +152,21 @@ static inline struct nv50_head_atom *
nv50_head_atom_get(struct drm_atomic_state *state, struct drm_crtc *crtc)
{
struct drm_crtc_state *statec = drm_atomic_get_crtc_state(state, crtc);
+
if (IS_ERR(statec))
return (void *)statec;
+
+ return nv50_head_atom(statec);
+}
+
+static inline struct nv50_head_atom *
+nv50_head_atom_get_new(struct drm_atomic_state *state, struct drm_crtc *crtc)
+{
+ struct drm_crtc_state *statec = drm_atomic_get_new_crtc_state(state, crtc);
+
+ if (IS_ERR(statec))
+ return (void *)statec;
+
return nv50_head_atom(statec);
}
diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c
index ef9e410babbfb..9a2c20fce0f3e 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c
@@ -583,7 +583,7 @@ nv50_wndw_prepare_fb(struct drm_plane *plane, struct drm_plane_state *state)
asyw->image.offset[0] = nvbo->offset;
if (wndw->func->prepare) {
- asyh = nv50_head_atom_get(asyw->state.state, asyw->state.crtc);
+ asyh = nv50_head_atom_get_new(asyw->state.state, asyw->state.crtc);
if (IS_ERR(asyh))
return PTR_ERR(asyh);
--
2.52.0
__pci_read_base() sets the resource start and end addresses when a
resource is larger than 4G but pci_bus_addr_t or resource_size_t is not
capable of representing 64-bit PCI addresses. This creates a
problematic resource that has non-zero flags, but whose start and end
addresses yield a resource size of 1 instead of 0.
Replace the custom resource address setup with resource_set_range(),
which correctly sets the end address to -1 so that resource_size()
returns 0.
For consistency, also use resource_set_range() in the other branch that
does size-based resource setup.
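The arithmetic behind the fix, spelled out (based on the generic
resource helpers in include/linux/ioport.h):

    /* resource_set_range(res, start, size) sets:
     *   res->start = start;
     *   res->end   = start + size - 1;
     *
     * so resource_set_range(res, 0, 0) gives end = -1 and
     * resource_size(res) = end - start + 1 = 0, whereas the old manual
     * start = end = 0 made resource_size() return 1.
     */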
Fixes: 23b13bc76f35 ("PCI: Fail safely if we can't handle BARs larger than 4GB")
Link: https://lore.kernel.org/all/20251207215359.28895-1-ansuelsmth@gmail.com/T/#…
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
Cc: Christian Marangi <ansuelsmth(a)gmail.com>
---
drivers/pci/probe.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 124d2d309c58..b8294a2f11f9 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -287,8 +287,7 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
if ((sizeof(pci_bus_addr_t) < 8 || sizeof(resource_size_t) < 8)
&& sz64 > 0x100000000ULL) {
res->flags |= IORESOURCE_UNSET | IORESOURCE_DISABLED;
- res->start = 0;
- res->end = 0;
+ resource_set_range(res, 0, 0);
pci_err(dev, "%s: can't handle BAR larger than 4GB (size %#010llx)\n",
res_name, (unsigned long long)sz64);
goto out;
@@ -297,8 +296,7 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
if ((sizeof(pci_bus_addr_t) < 8) && l) {
/* Above 32-bit boundary; try to reallocate */
res->flags |= IORESOURCE_UNSET;
- res->start = 0;
- res->end = sz64 - 1;
+ resource_set_range(res, 0, sz64);
pci_info(dev, "%s: can't handle BAR above 4GB (bus address %#010llx)\n",
res_name, (unsigned long long)l64);
goto out;
base-commit: 43dfc13ca972988e620a6edb72956981b75ab6b0
--
2.39.5
How much does a bad hire cost?
A bad hire costs 3X the salary. Avoid it with data, not perceptions.
Hello, ,
Did you know that a bad hire costs up to 3 times the annual salary?
74% of companies admit to having hired the wrong person. The reason: decisions based on perceptions, not objective data.
PsicoSmart helps you evaluate talent with precision:
31 validated psychometric tests to measure leadership, honesty and intelligence
2,500+ specialized technical exams by industry
Identity verification with automatic photo capture
Results in minutes, accessible from any device
Reduce the risk of selection errors by up to 60%.
Want a free demonstration? Reply to this email and I will contact you within 24 hours.
Regards,
--------------
Sincerely: Valeria Pérez
Ciudad de México: (55) 5018 0565
WhatsApp: +52 33 1607 2089
If you no longer wish to receive emails, click here to unsubscribe.
Hi,
Hope you're having a great day!
Are you looking for leads from NRF 2026 Retail's Big Show?
Attendees count: 30,000 Leads
Data Fields: Company Name, Web URL, Contact Name, Title, Direct Email, Phone Number, Mailing Address, Industry, Employee Size, Annual Sales.
If you're interested in these leads, I'd be glad to share the pricing. Let me know!
Thanks for the quick reply. I'm excited to get your thoughts.
Regards
Connie Griggs
Demand Generation Manager
Leads Focus Hub Inc.,
Please reply with REMOVE if you don't wish to receive further emails
Hello,
On Sat, Nov 15, 2025 at 10:59:38AM +0100, Fernando Fernandez Mancera wrote:
> When an IPv6 Router Advertisement (RA) is received for a prefix, the
> kernel creates the corresponding on-link route with flags RTF_ADDRCONF
> and RTF_PREFIX_RT configured and RTF_EXPIRES if lifetime is set.
>
> If later a user configures a static IPv6 address on the same prefix the
> kernel clears the RTF_EXPIRES flag but it doesn't clear the RTF_ADDRCONF
> and RTF_PREFIX_RT. When the next RA for that prefix is received, the
> kernel sees the route as RA-learned and wrongly configures back the
> lifetime. This is problematic because if the route expires, the static
> address won't have the corresponding on-link route.
>
> This fix clears the RTF_ADDRCONF and RTF_PREFIX_RT flags preventing that
> the lifetime is configured when the next RA arrives. If the static
> address is deleted, the route becomes RA-learned again.
>
> Fixes: 14ef37b6d00e ("ipv6: fix route lookup in addrconf_prefix_rcv()")
> Reported-by: Garri Djavadyan <g.djavadyan(a)gmail.com>
> Closes: https://lore.kernel.org/netdev/ba807d39aca5b4dcf395cc11dca61a130a52cfd3.cam…
> Signed-off-by: Fernando Fernandez Mancera <fmancera(a)suse.de>
this commit is in the mainline now as
f72514b3c5698e4b900b25345e09f9ed33123de6 and is supposed to fix
https://bugs.debian.org/1117959.
I would have expected this to get backported to stable (here: 6.12.x),
but it's not in the list for 6.12.62-rc1[1].
Can we please have this patch backported?
[1] https://lore.kernel.org/all/20251210072948.125620687@linuxfoundation.org/
Thanks
Uwe
of_get_child_by_name() returns a node pointer with refcount incremented.
Use the __free() attribute to manage the pgc_node reference, ensuring
automatic of_node_put() cleanup when pgc_node goes out of scope.
This eliminates the need for explicit error handling paths and avoids
reference count leaks.
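For reference, the scope-based cleanup works like this (a minimal
sketch; __free(device_node) is the in-tree cleanup class for
device-node pointers, defined via DEFINE_FREE() in <linux/of.h>):

    struct device_node *child __free(device_node) =
            of_get_child_by_name(dev->of_node, "pgc");

    /* child may be NULL (of_node_put(NULL) is a no-op); of_node_put()
     * runs automatically when child goes out of scope, on every
     * return path, so no explicit error-path cleanup is needed.
     */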
Fixes: 721cabf6c660 ("soc: imx: move PGC handling to a new GPC driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
Change in V3:
- Ensure variable is assigned when using cleanup attribute
Change in V2:
- Use __free() attribute instead of explicit of_node_put() calls
---
drivers/pmdomain/imx/gpc.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/pmdomain/imx/gpc.c b/drivers/pmdomain/imx/gpc.c
index f18c7e6e75dd..0fb3250dbf5f 100644
--- a/drivers/pmdomain/imx/gpc.c
+++ b/drivers/pmdomain/imx/gpc.c
@@ -403,13 +403,12 @@ static int imx_gpc_old_dt_init(struct device *dev, struct regmap *regmap,
static int imx_gpc_probe(struct platform_device *pdev)
{
const struct imx_gpc_dt_data *of_id_data = device_get_match_data(&pdev->dev);
- struct device_node *pgc_node;
+ struct device_node *pgc_node __free(pgc_node)
+ = of_get_child_by_name(pdev->dev.of_node, "pgc");
struct regmap *regmap;
void __iomem *base;
int ret;
- pgc_node = of_get_child_by_name(pdev->dev.of_node, "pgc");
-
/* bail out if DT too old and doesn't provide the necessary info */
if (!of_property_present(pdev->dev.of_node, "#power-domain-cells") &&
!pgc_node)
--
2.34.1
If UFS resume fails, the event history is updated in ufshcd_resume(),
but there is no code anywhere to record UFS suspend failures. Therefore,
add code to record suspend errors in the event history.
Fixes: dd11376b9f1b ("scsi: ufs: Split the drivers/scsi/ufs directory")
Cc: stable(a)vger.kernel.org
Signed-off-by: Seunghwan Baek <sh8267.baek(a)samsung.com>
---
drivers/ufs/core/ufshcd.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 040a0ceb170a..6bb2781aefc7 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -10337,7 +10337,7 @@ static int ufshcd_suspend(struct ufs_hba *hba)
ret = ufshcd_setup_clocks(hba, false);
if (ret) {
ufshcd_enable_irq(hba);
- return ret;
+ goto out;
}
if (ufshcd_is_clkgating_allowed(hba)) {
hba->clk_gating.state = CLKS_OFF;
@@ -10349,6 +10349,9 @@ static int ufshcd_suspend(struct ufs_hba *hba)
/* Put the host controller in low power mode if possible */
ufshcd_hba_vreg_set_lpm(hba);
ufshcd_pm_qos_update(hba, false);
+out:
+ if (ret)
+ ufshcd_update_evt_hist(hba, UFS_EVT_SUSPEND_ERR, (u32)ret);
return ret;
}
--
2.43.0
It may happen that a VF spawned for an E610 adapter has a problem with
setting the link up. This happens when an ixgbevf driver supporting
mailbox API 1.6 cooperates with a PF driver which doesn't support this
version of the API, and hence doesn't support the new approach for
getting PF link data.
In that case the VF asks the PF to provide link data, but as the PF
doesn't support it, it returns -EOPNOTSUPP, which leads to an early bail
from the link configuration sequence.
Avoid this situation by using the legacy VFLINKS approach whenever the
negotiated API version is less than 1.6.
Fixes: 53f0eb62b4d2 ("ixgbevf: fix getting link speed data for E610 devices")
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov(a)intel.com>
Reviewed-by: Piotr Kwapulinski <piotr.kwapulinski(a)intel.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Jedrzej Jagielski <jedrzej.jagielski(a)intel.com>
---
drivers/net/ethernet/intel/ixgbevf/vf.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c
index 29c5ce967938..8af88f615776 100644
--- a/drivers/net/ethernet/intel/ixgbevf/vf.c
+++ b/drivers/net/ethernet/intel/ixgbevf/vf.c
@@ -846,7 +846,8 @@ static s32 ixgbevf_check_mac_link_vf(struct ixgbe_hw *hw,
if (!mac->get_link_status)
goto out;
- if (hw->mac.type == ixgbe_mac_e610_vf) {
+ if (hw->mac.type == ixgbe_mac_e610_vf &&
+ hw->api_version >= ixgbe_mbox_api_16) {
ret_val = ixgbevf_get_pf_link_state(hw, speed, link_up);
if (ret_val)
goto out;
--
2.31.1
It may happen that a VF spawned for an E610 adapter has a problem with
setting the link up. This happens when an ixgbevf driver supporting
mailbox API 1.6 cooperates with a PF driver which doesn't support this
version of the API, and hence doesn't support the new approach for
getting PF link data.
In that case the VF asks the PF to provide link data, but as the PF
doesn't support it, it returns -EOPNOTSUPP, which leads to an early bail
from the link configuration sequence.
Avoid this situation by using the legacy VFLINKS approach whenever the
negotiated API version is less than 1.6.
To reproduce the issue, just create a VF and set its link up; the
adapter must be any from the E610 family, and ixgbevf must support API
1.6 or higher while the PF driver (ixgbe) must not.
Fixes: 53f0eb62b4d2 ("ixgbevf: fix getting link speed data for E610 devices")
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov(a)intel.com>
Reviewed-by: Piotr Kwapulinski <piotr.kwapulinski(a)intel.com>
Reviewed-by: Paul Menzel <pmenzel(a)molgen.mpg.de>
Cc: stable(a)vger.kernel.org
Signed-off-by: Jedrzej Jagielski <jedrzej.jagielski(a)intel.com>
---
v2: extend the commit msg (Paul)
---
drivers/net/ethernet/intel/ixgbevf/vf.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c
index 29c5ce967938..8af88f615776 100644
--- a/drivers/net/ethernet/intel/ixgbevf/vf.c
+++ b/drivers/net/ethernet/intel/ixgbevf/vf.c
@@ -846,7 +846,8 @@ static s32 ixgbevf_check_mac_link_vf(struct ixgbe_hw *hw,
if (!mac->get_link_status)
goto out;
- if (hw->mac.type == ixgbe_mac_e610_vf) {
+ if (hw->mac.type == ixgbe_mac_e610_vf &&
+ hw->api_version >= ixgbe_mbox_api_16) {
ret_val = ixgbevf_get_pf_link_state(hw, speed, link_up);
if (ret_val)
goto out;
--
2.31.1
The Dell XPS 13 9350 and XPS 16 9640 both have an upside-down mounted
OV02C10 sensor. This rotation of 180° is reported in neither the SSDB nor
the _PLD for the sensor (both report a rotation of 0°).
Add a DMI quirk mechanism for upside-down sensors and add 2 initial entries
to the DMI quirk list for these 2 laptops.
Note the OV02C10 driver was originally developed on an XPS 16 9640,
which resulted in inverted vflip + hflip settings, making it look like
the sensor was upright on the XPS 16 9640 and upside down elsewhere;
this has been fixed in commit 69fe27173396 ("media: ov02c10: Fix default
vertical flip").
This makes this commit a regression fix, since the video is now upside
down on these Dell XPS models where it was not before.
Fixes: 69fe27173396 ("media: ov02c10: Fix default vertical flip")
Cc: stable(a)vger.kernel.org
Signed-off-by: Hans de Goede <johannes.goede(a)oss.qualcomm.com>
---
drivers/media/pci/intel/ipu-bridge.c | 29 ++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/drivers/media/pci/intel/ipu-bridge.c b/drivers/media/pci/intel/ipu-bridge.c
index 58ea01d40c0d..6463b2a47d78 100644
--- a/drivers/media/pci/intel/ipu-bridge.c
+++ b/drivers/media/pci/intel/ipu-bridge.c
@@ -5,6 +5,7 @@
#include <acpi/acpi_bus.h>
#include <linux/cleanup.h>
#include <linux/device.h>
+#include <linux/dmi.h>
#include <linux/i2c.h>
#include <linux/mei_cl_bus.h>
#include <linux/platform_device.h>
@@ -99,6 +100,28 @@ static const struct ipu_sensor_config ipu_supported_sensors[] = {
IPU_SENSOR_CONFIG("XMCC0003", 1, 321468000),
};
+/*
+ * DMI matches for laptops which have their sensor mounted upside-down
+ * without reporting a rotation of 180° in either the SSDB or the _PLD.
+ */
+static const struct dmi_system_id upside_down_sensor_dmi_ids[] = {
+ {
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "XPS 13 9350"),
+ },
+ .driver_data = "OVTI02C1",
+ },
+ {
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "XPS 16 9640"),
+ },
+ .driver_data = "OVTI02C1",
+ },
+ {} /* Terminating entry */
+};
+
static const struct ipu_property_names prop_names = {
.clock_frequency = "clock-frequency",
.rotation = "rotation",
@@ -249,6 +272,12 @@ static int ipu_bridge_read_acpi_buffer(struct acpi_device *adev, char *id,
static u32 ipu_bridge_parse_rotation(struct acpi_device *adev,
struct ipu_sensor_ssdb *ssdb)
{
+ const struct dmi_system_id *dmi_id;
+
+ dmi_id = dmi_first_match(upside_down_sensor_dmi_ids);
+ if (dmi_id && acpi_dev_hid_match(adev, dmi_id->driver_data))
+ return 180;
+
switch (ssdb->degree) {
case IPU_SENSOR_ROTATION_NORMAL:
return 0;
--
2.52.0
handshake_req_submit() replaces sk->sk_destruct but never restores it when
submission fails before the request is hashed. handshake_sk_destruct() then
returns early and the original destructor never runs, leaking the socket.
Restore sk_destruct on the error path.
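The save/restore pattern involved, sketched (field names taken from the
commit message; not the full function):

    req->hr_odestruct = sk->sk_destruct;   /* saved at submit time */
    sk->sk_destruct = handshake_sk_destruct;
    ...
    /* on any failure before the request is hashed: */
    sk->sk_destruct = req->hr_odestruct;   /* restore so teardown runs */

Without the restore, handshake_sk_destruct() bails out early for an
unhashed request and the original destructor is lost.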
Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests")
Reviewed-by: Chuck Lever <chuck.lever(a)oracle.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: caoping <caoping(a)cmss.chinamobile.com>
---
net/handshake/request.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/net/handshake/request.c b/net/handshake/request.c
index 274d2c89b6b2..89435ed755cd 100644
--- a/net/handshake/request.c
+++ b/net/handshake/request.c
@@ -276,6 +276,8 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req,
out_unlock:
spin_unlock(&hn->hn_lock);
out_err:
+ /* Restore original destructor so socket teardown still runs on failure */
+ req->hr_sk->sk_destruct = req->hr_odestruct;
trace_handshake_submit_err(net, req, req->hr_sk, ret);
handshake_req_destroy(req);
return ret;
base-commit: 4a26e7032d7d57c998598c08a034872d6f0d3945
--
2.47.3
Some Xe bos are allocated with extra backing-store for the CCS
metadata. It's never been the intention to share the CCS metadata
when exporting such bos as dma-buf. Don't include it in the
dma-buf sg-table.
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Cc: Rodrigo Vivi <rodrigo.vivi(a)intel.com>
Cc: Matthew Brost <matthew.brost(a)intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # v6.8+
Signed-off-by: Thomas Hellström <thomas.hellstrom(a)linux.intel.com>
---
drivers/gpu/drm/xe/xe_dma_buf.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c
index 54e42960daad..7c74a31d4486 100644
--- a/drivers/gpu/drm/xe/xe_dma_buf.c
+++ b/drivers/gpu/drm/xe/xe_dma_buf.c
@@ -124,7 +124,7 @@ static struct sg_table *xe_dma_buf_map(struct dma_buf_attachment *attach,
case XE_PL_TT:
sgt = drm_prime_pages_to_sg(obj->dev,
bo->ttm.ttm->pages,
- bo->ttm.ttm->num_pages);
+ obj->size >> PAGE_SHIFT);
if (IS_ERR(sgt))
return sgt;
--
2.51.1
Commit 2603be9e8167 ("can: gs_usb: gs_can_open(): improve error handling")
added missing error handling to the gs_can_open() function.
The driver uses 2 USB anchors to track the allocated URBs: the TX URBs in
struct gs_can::tx_submitted for each netdev and the RX URBs in struct
gs_usb::rx_submitted for the USB device. gs_can_open() allocates the RX
URBs, while TX URBs are allocated during gs_can_start_xmit().
The cleanup in gs_can_open() kills all anchored dev->tx_submitted
URBs (which is not necessary since the netdev is not yet registered), but
misses the parent->rx_submitted URBs.
Fix the problem by killing the rx_submitted instead of the tx_submitted.
Fixes: 2603be9e8167 ("can: gs_usb: gs_can_open(): improve error handling")
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/20251210-gs_usb-fix-error-handling-v1-1-d6a5a03f10…
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/usb/gs_usb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index e29e85b67fd4..a0233e550a5a 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -1074,7 +1074,7 @@ static int gs_can_open(struct net_device *netdev)
usb_free_urb(urb);
out_usb_kill_anchored_urbs:
if (!parent->active_channels) {
- usb_kill_anchored_urbs(&dev->tx_submitted);
+ usb_kill_anchored_urbs(&parent->rx_submitted);
if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP)
gs_usb_timestamp_stop(parent);
--
2.51.0
Commit 2603be9e8167 ("can: gs_usb: gs_can_open(): improve error handling")
added missing error handling to the gs_can_open() function.
The driver uses 2 USB anchors to track the allocated URBs: the TX URBs in
struct gs_can::tx_submitted for each netdev and the RX URBs in struct
gs_usb::rx_submitted for the USB device. gs_can_open() allocates the RX
URBs, while TX URBs are allocated during gs_can_start_xmit().
The cleanup in gs_can_open() kills all anchored dev->tx_submitted
URBs (which is not necessary since the netdev is not yet registered), but
misses the parent->rx_submitted URBs.
Fix the problem by killing the rx_submitted instead of the tx_submitted.
Fixes: 2603be9e8167 ("can: gs_usb: gs_can_open(): improve error handling")
Cc: stable(a)vger.kernel.org
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/usb/gs_usb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index e29e85b67fd4..a0233e550a5a 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -1074,7 +1074,7 @@ static int gs_can_open(struct net_device *netdev)
usb_free_urb(urb);
out_usb_kill_anchored_urbs:
if (!parent->active_channels) {
- usb_kill_anchored_urbs(&dev->tx_submitted);
+ usb_kill_anchored_urbs(&parent->rx_submitted);
if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP)
gs_usb_timestamp_stop(parent);
---
base-commit: 186468c67fc687650b7fb713d8c627d5c8566886
change-id: 20251210-gs_usb-fix-error-handling-4f980294424c
Best regards,
--
Marc Kleine-Budde <mkl(a)pengutronix.de>
Since commit 9c328f54741b ("net: nfc: nci: Add parameter validation for
packet data") communication with NCI NFC chips has not been working
anymore.
The mentioned commit tries to fix access to uninitialized data, but
fails to account for the fact that in some cases the data packet is of
variable length and can therefore not be compared to the maximum packet
length given by the sizeof(struct).
For these cases it is only possible to check for the minimum packet
length.
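The validation pattern for such variable-length notifications, as a
sketch (the constant and field offsets are hypothetical):

    /* Fixed header plus a length-prefixed parameter blob: only a lower
     * bound can be checked up front; the blob length is validated once
     * it has been read.
     */
    if (skb->len < NCI_NTF_MIN_LEN)   /* hypothetical minimum */
            return -EINVAL;

    param_len = skb->data[NCI_NTF_MIN_LEN - 1];
    if (skb->len < NCI_NTF_MIN_LEN + param_len)
            return -EINVAL;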
Fixes: 9c328f54741b ("net: nfc: nci: Add parameter validation for packet data")
Cc: stable(a)vger.kernel.org
Signed-off-by: Michael Thalmeier <michael.thalmeier(a)hale.at>
---
Changes in v2:
- Reference correct commit hash
---
net/nfc/nci/ntf.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c
index 418b84e2b260..5161e94f067f 100644
--- a/net/nfc/nci/ntf.c
+++ b/net/nfc/nci/ntf.c
@@ -58,7 +58,8 @@ static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev,
struct nci_conn_info *conn_info;
int i;
- if (skb->len < sizeof(struct nci_core_conn_credit_ntf))
+ /* Minimal packet size for num_entries=1 is 1 x __u8 + 1 x conn_credit_entry */
+ if (skb->len < (sizeof(__u8) + sizeof(struct conn_credit_entry)))
return -EINVAL;
ntf = (struct nci_core_conn_credit_ntf *)skb->data;
@@ -364,7 +365,8 @@ static int nci_rf_discover_ntf_packet(struct nci_dev *ndev,
const __u8 *data;
bool add_target = true;
- if (skb->len < sizeof(struct nci_rf_discover_ntf))
+ /* Minimal packet size is 5 if rf_tech_specific_params_len=0 */
+ if (skb->len < (5 * sizeof(__u8)))
return -EINVAL;
data = skb->data;
@@ -596,7 +598,10 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev,
const __u8 *data;
int err = NCI_STATUS_OK;
- if (skb->len < sizeof(struct nci_rf_intf_activated_ntf))
+ /* Minimal packet size is 11 if
+ * rf_tech_specific_params_len=0 and activation_params_len=0
+ */
+ if (skb->len < (11 * sizeof(__u8)))
return -EINVAL;
data = skb->data;
--
2.52.0
Since commit 8fcc7315a10a ("net: nfc: nci: Add parameter validation for
packet data") communication with NCI NFC chips has not been working
anymore.
The mentioned commit tries to fix access to uninitialized data, but
fails to account for the fact that in some cases the data packet is of
variable length and can therefore not be compared to the maximum packet
length given by the sizeof(struct).
For these cases it is only possible to check for the minimum packet
length.
Fixes: 8fcc7315a10a ("net: nfc: nci: Add parameter validation for packet data")
Cc: stable(a)vger.kernel.org
Signed-off-by: Michael Thalmeier <michael.thalmeier(a)hale.at>
---
net/nfc/nci/ntf.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c
index 418b84e2b260..5161e94f067f 100644
--- a/net/nfc/nci/ntf.c
+++ b/net/nfc/nci/ntf.c
@@ -58,7 +58,8 @@ static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev,
struct nci_conn_info *conn_info;
int i;
- if (skb->len < sizeof(struct nci_core_conn_credit_ntf))
+ /* Minimal packet size for num_entries=1 is 1 x __u8 + 1 x conn_credit_entry */
+ if (skb->len < (sizeof(__u8) + sizeof(struct conn_credit_entry)))
return -EINVAL;
ntf = (struct nci_core_conn_credit_ntf *)skb->data;
@@ -364,7 +365,8 @@ static int nci_rf_discover_ntf_packet(struct nci_dev *ndev,
const __u8 *data;
bool add_target = true;
- if (skb->len < sizeof(struct nci_rf_discover_ntf))
+ /* Minimal packet size is 5 if rf_tech_specific_params_len=0 */
+ if (skb->len < (5 * sizeof(__u8)))
return -EINVAL;
data = skb->data;
@@ -596,7 +598,10 @@ static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev,
const __u8 *data;
int err = NCI_STATUS_OK;
- if (skb->len < sizeof(struct nci_rf_intf_activated_ntf))
+ /* Minimal packet size is 11 if
+ * f_tech_specific_params_len=0 and activation_params_len=0
+ */
+ if (skb->len < (11 * sizeof(__u8)))
return -EINVAL;
data = skb->data;
--
2.52.0
On Sun, Dec 07, 2025 at 10:07:49PM -0500, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope
>
> to the 6.17-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> dma-mapping-allow-use-of-dma_bit_mask-64-in-global-s.patch
> and it can be found in the queue-6.17 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit 51a1912d46ceb70602de50c167e440966b9836f1
> Author: James Clark <james.clark(a)linaro.org>
> Date: Thu Oct 30 14:05:27 2025 +0000
>
> dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope
>
> [ Upstream commit a50f7456f853ec3a6f07cbe1d16ad8a8b2501320 ]
This change has a pending bug fix, consider waiting to apply this to the
stable trees.
https://lore.kernel.org/20251207184756.97904-1-johannes.goede@oss.qualcomm.…
Cheers,
Nathan
From: Wenhua Lin <Wenhua.Lin(a)unisoc.com>
[ Upstream commit 29e8a0c587e328ed458380a45d6028adf64d7487 ]
In sprd_clk_init(), when devm_clk_get() returns -EPROBE_DEFER
for either uart or source clock, we should propagate the
error instead of just warning and continuing with NULL clocks.
Currently the driver only emits a warning when clock acquisition
fails and proceeds with NULL clock pointers. This can lead to
issues later when the clocks are actually needed. More importantly,
when the clock provider is not ready yet and returns -EPROBE_DEFER,
we should return this error to allow deferred probing.
This change adds explicit checks for -EPROBE_DEFER after both:
1. devm_clk_get(uport->dev, uart)
2. devm_clk_get(uport->dev, source)
When -EPROBE_DEFER is encountered, the function now returns
-EPROBE_DEFER to let the driver framework retry probing
later when the clock dependencies are resolved.
Signed-off-by: Wenhua Lin <Wenhua.Lin(a)unisoc.com>
Link: https://patch.msgid.link/20251022030840.956589-1-Wenhua.Lin@unisoc.com
Reviewed-by: Cixi Geng <cixi.geng(a)linux.dev>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
LLM Generated explanations, may be completely bogus:
## Commit Analysis: serial: sprd: Return -EPROBE_DEFER when uart clock
is not ready
### 1. COMMIT MESSAGE ANALYSIS
**Subject**: Fixes incorrect handling of `-EPROBE_DEFER` error from
`devm_clk_get()`
**Key observations**:
- No `Cc: stable(a)vger.kernel.org` tag present
- No `Fixes:` tag pointing to when the bug was introduced
- Has `Reviewed-by:` from Cixi Geng
- Accepted by Greg Kroah-Hartman (serial subsystem maintainer)
### 2. CODE CHANGE ANALYSIS
The change adds two checks in `sprd_clk_init()`:
```c
clk_uart = devm_clk_get(uport->dev, "uart");
if (IS_ERR(clk_uart)) {
+ if (PTR_ERR(clk_uart) == -EPROBE_DEFER)
+ return -EPROBE_DEFER;
dev_warn(...);
clk_uart = NULL;
}
clk_parent = devm_clk_get(uport->dev, "source");
if (IS_ERR(clk_parent)) {
+ if (PTR_ERR(clk_parent) == -EPROBE_DEFER)
+ return -EPROBE_DEFER;
dev_warn(...);
clk_parent = NULL;
}
```
**Technical bug mechanism**: When clock providers aren't ready yet,
`devm_clk_get()` returns `-EPROBE_DEFER`. The existing code ignores this
error, sets the clock pointer to NULL, and continues. This bypasses the
kernel's deferred probing mechanism which exists precisely to handle
this dependency ordering scenario.
**Existing pattern**: The function already has identical handling for
the "enable" clock (visible in the context lines). This fix makes the
handling consistent for all three clocks.
### 3. CLASSIFICATION
**Type**: Bug fix (not a feature addition)
This fixes incorrect error handling. The `-EPROBE_DEFER` mechanism is a
fundamental kernel feature for handling driver load order dependencies.
Not propagating this error is a bug.
### 4. SCOPE AND RISK ASSESSMENT
- **Lines changed**: 6 lines added (two 3-line checks)
- **Files touched**: 1 file (`drivers/tty/serial/sprd_serial.c`)
- **Complexity**: Very low - follows identical pattern already in the
same function
- **Subsystem**: Hardware-specific serial driver for Spreadtrum/Unisoc
UARTs
**Risk**: Very low
- Pattern is already established and proven in the same function
- Only affects error handling during probe
- No changes to normal operation when clocks are available
- Worst case: probe failure happens earlier/more explicitly
### 5. USER IMPACT
**Affected users**: Users of Spreadtrum/Unisoc UART hardware (embedded
devices, some Android phones)
**Severity**: Medium-High for affected users - without this fix, the
serial port may not work correctly on systems where clock providers load
after the serial driver. This is common in embedded systems with device
tree-based configurations.
**Real bug**: This is a practical issue in probe ordering scenarios. The
driver would proceed with NULL clocks instead of waiting for
dependencies to be ready.
### 6. STABILITY INDICATORS
- Reviewed-by tag indicates code review
- Maintainer accepted the change
- Simple, straightforward change following existing code patterns
- No complex logic introduced
### 7. DEPENDENCY CHECK
- **Dependencies**: None - self-contained fix
- **Code existence in stable**: The sprd_serial driver has existed since
~2015 (commit 3e1f2029a4b40), so it's present in all active stable
trees
### SUMMARY
**What it fixes**: Incorrect handling of `-EPROBE_DEFER` from
`devm_clk_get()` for two clocks ("uart" and "source"), causing the
driver to proceed with NULL clocks instead of deferring probe when clock
providers aren't ready.
**Stable kernel criteria**:
- ✅ Obviously correct (follows identical pattern already in function)
- ✅ Fixes a real bug (broken deferred probing)
- ✅ Small and contained (6 lines, 1 file)
- ✅ No new features
- ✅ Low risk of regression
**Concerns**:
- No explicit `Cc: stable` tag (author/maintainer didn't flag for
stable)
- Relatively niche driver (Spreadtrum hardware)
**Risk vs Benefit**: The fix is minimal risk (identical pattern to
existing code) and addresses a real bug that could leave serial hardware
non-functional on embedded systems. The benefit outweighs the minimal
risk.
The lack of `Cc: stable` tag is notable but not determinative - this is
a straightforward bug fix that meets all stable criteria. The fix is
small, obviously correct, and addresses a real probe ordering issue that
embedded system users could encounter.
**YES**
drivers/tty/serial/sprd_serial.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/tty/serial/sprd_serial.c b/drivers/tty/serial/sprd_serial.c
index 8c9366321f8e7..092755f356836 100644
--- a/drivers/tty/serial/sprd_serial.c
+++ b/drivers/tty/serial/sprd_serial.c
@@ -1133,6 +1133,9 @@ static int sprd_clk_init(struct uart_port *uport)
clk_uart = devm_clk_get(uport->dev, "uart");
if (IS_ERR(clk_uart)) {
+ if (PTR_ERR(clk_uart) == -EPROBE_DEFER)
+ return -EPROBE_DEFER;
+
dev_warn(uport->dev, "uart%d can't get uart clock\n",
uport->line);
clk_uart = NULL;
@@ -1140,6 +1143,9 @@ static int sprd_clk_init(struct uart_port *uport)
clk_parent = devm_clk_get(uport->dev, "source");
if (IS_ERR(clk_parent)) {
+ if (PTR_ERR(clk_parent) == -EPROBE_DEFER)
+ return -EPROBE_DEFER;
+
dev_warn(uport->dev, "uart%d can't get source clock\n",
uport->line);
clk_parent = NULL;
--
2.51.0
Since Linux v6.7, booting using BootX on an Old World PowerMac produces
an early crash. Stan Johnson writes, "the symptoms are that the screen
goes blank and the backlight stays on, and the system freezes (Linux
doesn't boot)."
Further testing revealed that the failure can be avoided by disabling
CONFIG_BOOTX_TEXT. Bisection revealed that the regression was caused by
a change to the font bitmap pointer that's used when btext_init() begins
painting characters on the display, early in the boot process.
Christophe Leroy explains, "before kernel text is relocated to its final
location ... data is addressed with an offset which is added to the
Global Offset Table (GOT) entries at the start of bootx_init()
by function reloc_got2(). But the pointers that are located inside a
structure are not referenced in the GOT and are therefore not updated by
reloc_got2(). It is therefore needed to apply the offset manually by using
PTRRELOC() macro."
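For illustration, here is a minimal runnable sketch of that failure mode;
the PTRRELOC() body and all names below are simplified stand-ins
(reloc_offset() simulates the link-time/load-time delta), not the
arch/powerpc code:
```c
#include <stdio.h>

/* Simulated offset between link-time and load-time addresses. */
static unsigned long reloc_offset(void) { return 0x1000; }

#define PTRRELOC(x) ((typeof(x))((unsigned long)(x) + reloc_offset()))

struct font_desc_ex {
	const char *name;
	const unsigned char *data;	/* stored as a link-time address */
};

int main(void)
{
	static const unsigned char glyphs[16];
	struct font_desc_ex font = { "sun8x16", glyphs };

	/* &font itself would be fixed up via its GOT entry, but the
	 * pointer embedded in the struct is not, so before relocation
	 * it must be adjusted by hand:
	 */
	printf("raw:      %p\n", (const void *)font.data);
	printf("adjusted: %p\n", (const void *)PTRRELOC(font.data));
	return 0;
}
```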
Cc: Cedar Maxwell <cedarmaxwell(a)mac.com>
Cc: Stan Johnson <userm57(a)yahoo.com>
Cc: "Dr. David Alan Gilbert" <linux(a)treblig.org>
Cc: Benjamin Herrenschmidt <benh(a)kernel.crashing.org>
Cc: stable(a)vger.kernel.org
Link: https://lists.debian.org/debian-powerpc/2025/10/msg00111.html
Link: https://lore.kernel.org/linuxppc-dev/d81ddca8-c5ee-d583-d579-02b19ed95301@y…
Reported-by: Cedar Maxwell <cedarmaxwell(a)mac.com>
Closes: https://lists.debian.org/debian-powerpc/2025/09/msg00031.html
Bisected-by: Stan Johnson <userm57(a)yahoo.com>
Tested-by: Stan Johnson <userm57(a)yahoo.com>
Fixes: 0ebc7feae79a ("powerpc: Use shared font data")
Suggested-by: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Signed-off-by: Finn Thain <fthain(a)linux-m68k.org>
---
Changed since v1:
- Improved commit log entry to better explain the need for PTRRELOC().
---
arch/powerpc/kernel/btext.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c
index 7f63f1cdc6c3..ca00c4824e31 100644
--- a/arch/powerpc/kernel/btext.c
+++ b/arch/powerpc/kernel/btext.c
@@ -20,6 +20,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/udbg.h>
+#include <asm/setup.h>
#define NO_SCROLL
@@ -463,7 +464,7 @@ static noinline void draw_byte(unsigned char c, long locX, long locY)
{
unsigned char *base = calc_base(locX << 3, locY << 4);
unsigned int font_index = c * 16;
- const unsigned char *font = font_sun_8x16.data + font_index;
+ const unsigned char *font = PTRRELOC(font_sun_8x16.data) + font_index;
int rb = dispDeviceRowBytes;
rmci_maybe_on();
--
2.49.1
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit e5bf5ee266633cb18fff6f98f0b7d59a62819eee ]
ffs_epfile_open() can race with removal, ending up with file->private_data
pointing to freed object.
There is a total count of opened files on functionfs (both ep0 and
dynamic ones) and when it hits zero, dynamic files get removed.
Unfortunately, that removal can happen while another thread is
in ffs_epfile_open(), but has not incremented the count yet.
In that case open will succeed, leaving us with UAF on any subsequent
read() or write().
The root cause is that ffs->opened is misused; atomic_dec_and_test() vs.
atomic_add_return() is not a good idea, when object remains visible all
along.
To untangle that
* serialize openers on ffs->mutex (both for ep0 and for dynamic files)
* have dynamic ones use atomic_inc_not_zero() and fail if we had
zero ->opened; in that case the file we are opening is doomed (a
userspace analogue of this check is sketched after this list).
* have the inodes of dynamic files marked on removal (from the
callback of simple_recursive_removal()) - clear ->i_private there.
* have open of dynamic ones verify they hadn't been already removed,
along with checking that state is FFS_ACTIVE.
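A minimal userspace analogue of that atomic_inc_not_zero() handshake
(C11 atomics; illustrative names, not the f_fs code):
```c
#include <stdatomic.h>
#include <errno.h>
#include <stdio.h>

struct obj {
	atomic_int opened;	/* 0 means removal has begun */
};

/* Take a reference only if the count is still non-zero, mirroring
 * atomic_inc_not_zero(): once it hits zero the object is doomed and
 * the open must fail instead of resurrecting it.
 */
static int obj_open(struct obj *o)
{
	int cur = atomic_load(&o->opened);

	while (cur != 0) {
		if (atomic_compare_exchange_weak(&o->opened, &cur, cur + 1))
			return 0;	/* reference taken */
	}
	return -ENODEV;			/* removal in progress */
}

int main(void)
{
	struct obj live = { 1 }, doomed = { 0 };

	printf("live:   %d\n", obj_open(&live));	/* 0 */
	printf("doomed: %d\n", obj_open(&doomed));	/* -ENODEV */
	return 0;
}
```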
Reviewed-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
LLM Generated explanations, may be completely bogus:
## Summary
### What the Commit Fixes
This commit fixes a **Use-After-Free (UAF)** vulnerability in USB gadget
functionfs. The race condition occurs between `ffs_epfile_open()` and
file removal:
1. Thread A is in `ffs_epfile_open()` but hasn't incremented
`ffs->opened` yet
2. Thread B closes the last file handle, triggering removal (since
`opened` is 0)
3. Thread A's open succeeds with `file->private_data` pointing to freed
memory
4. Subsequent `read()`/`write()` operations cause UAF
### Fix Mechanism
The fix implements proper synchronization:
- Serializes openers using `ffs->mutex`
- Uses `atomic_inc_not_zero()` to fail if counter already zero
- Uses `smp_load_acquire()`/`smp_store_release()` for memory ordering (a
userspace analogue follows this list)
- Clears `i_private` during removal via `simple_recursive_removal()`
callback
- Uses `file->private_data` instead of `inode->i_private` in release
path
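A userspace C11 analogue of that acquire/release pairing (illustrative
names, not the driver's code): the remover retires the published pointer
with release semantics, and openers load it with acquire semantics, so
any non-NULL pointer they observe refers to a fully initialized object.
```c
#include <stdatomic.h>
#include <stddef.h>

struct epfile_ex { int ready; };

static struct epfile_ex file_obj = { 1 };
static _Atomic(struct epfile_ex *) i_private = &file_obj;

/* Removal path: retire the pointer; release ordering makes all prior
 * writes visible before the NULL store is observed.
 */
static void retire(void)
{
	atomic_store_explicit(&i_private, NULL, memory_order_release);
}

/* Open path: a non-NULL acquire load guarantees the object's fields
 * are visible; NULL means the file was already removed.
 */
static struct epfile_ex *try_open(void)
{
	return atomic_load_explicit(&i_private, memory_order_acquire);
}

int main(void)
{
	struct epfile_ex *f = try_open();	/* non-NULL before retire() */

	retire();
	return (f != NULL && try_open() == NULL) ? 0 : 1;
}
```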
### Stable Kernel Criteria Assessment
| Criterion | Assessment |
|-----------|------------|
| Obviously correct | ✅ Uses standard kernel primitives, proper locking patterns |
| Fixes real bug | ✅ UAF vulnerability, security-relevant |
| Important issue | ✅ Security bug, potential for crashes/exploitation |
| Small and contained | ✅ Single file, +43/-10 lines, localized changes |
| No new features | ✅ Pure bug fix, no new APIs |
### Risk vs Benefit
**Benefits:**
- Fixes serious UAF vulnerability
- USB gadget functionfs used in Android, embedded systems
- Reviewed by Greg Kroah-Hartman (USB maintainer, stable maintainer)
- Written by Al Viro (highly respected kernel developer)
**Risks:**
- Moderate complexity (changes locking behavior)
- Recent commit (November 2025), limited mainline soak time
- No explicit `Cc: stable(a)vger.kernel.org` tag
### Dependencies
- `ffs_mutex_lock()` - exists in functionfs since early versions
- `simple_recursive_removal()` with callback - available since ~5.x
kernels
- Standard kernel APIs (`atomic_inc_not_zero`, memory barriers) -
universally available
### Concerns
1. **No Fixes: tag** - Makes it harder to determine which stable trees
need this fix
2. **No Cc: stable tag** - May indicate maintainers wanted soak time, or
an oversight given Greg KH reviewed it
3. **Backport effort** - May need adjustment for older stable trees
depending on functionfs evolution
### Conclusion
This is a legitimate UAF security fix that affects real-world users
(Android, embedded USB gadgets). Despite moderate complexity, the fix:
- Addresses a serious vulnerability class (UAF)
- Uses correct synchronization patterns
- Has been reviewed by the appropriate maintainer who also maintains
stable trees
- Is self-contained with no feature additions
The lack of explicit stable tags appears to be an oversight given the
security nature of the bug and Greg KH's review. UAF vulnerabilities
typically warrant expedited backporting.
**YES**
drivers/usb/gadget/function/f_fs.c | 53 ++++++++++++++++++++++++------
1 file changed, 43 insertions(+), 10 deletions(-)
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 47cfbe41fdff8..69f6e3c0f7e00 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -640,13 +640,22 @@ static ssize_t ffs_ep0_read(struct file *file, char __user *buf,
static int ffs_ep0_open(struct inode *inode, struct file *file)
{
- struct ffs_data *ffs = inode->i_private;
+ struct ffs_data *ffs = inode->i_sb->s_fs_info;
+ int ret;
- if (ffs->state == FFS_CLOSING)
- return -EBUSY;
+ /* Acquire mutex */
+ ret = ffs_mutex_lock(&ffs->mutex, file->f_flags & O_NONBLOCK);
+ if (ret < 0)
+ return ret;
- file->private_data = ffs;
ffs_data_opened(ffs);
+ if (ffs->state == FFS_CLOSING) {
+ ffs_data_closed(ffs);
+ mutex_unlock(&ffs->mutex);
+ return -EBUSY;
+ }
+ mutex_unlock(&ffs->mutex);
+ file->private_data = ffs;
return stream_open(inode, file);
}
@@ -1193,14 +1202,33 @@ static ssize_t ffs_epfile_io(struct file *file, struct ffs_io_data *io_data)
static int
ffs_epfile_open(struct inode *inode, struct file *file)
{
- struct ffs_epfile *epfile = inode->i_private;
+ struct ffs_data *ffs = inode->i_sb->s_fs_info;
+ struct ffs_epfile *epfile;
+ int ret;
- if (WARN_ON(epfile->ffs->state != FFS_ACTIVE))
+ /* Acquire mutex */
+ ret = ffs_mutex_lock(&ffs->mutex, file->f_flags & O_NONBLOCK);
+ if (ret < 0)
+ return ret;
+
+ if (!atomic_inc_not_zero(&ffs->opened)) {
+ mutex_unlock(&ffs->mutex);
+ return -ENODEV;
+ }
+ /*
+ * we want the state to be FFS_ACTIVE; FFS_ACTIVE alone is
+ * not enough, though - we might have been through FFS_CLOSING
+ * and back to FFS_ACTIVE, with our file already removed.
+ */
+ epfile = smp_load_acquire(&inode->i_private);
+ if (unlikely(ffs->state != FFS_ACTIVE || !epfile)) {
+ mutex_unlock(&ffs->mutex);
+ ffs_data_closed(ffs);
return -ENODEV;
+ }
+ mutex_unlock(&ffs->mutex);
file->private_data = epfile;
- ffs_data_opened(epfile->ffs);
-
return stream_open(inode, file);
}
@@ -1332,7 +1360,7 @@ static void ffs_dmabuf_put(struct dma_buf_attachment *attach)
static int
ffs_epfile_release(struct inode *inode, struct file *file)
{
- struct ffs_epfile *epfile = inode->i_private;
+ struct ffs_epfile *epfile = file->private_data;
struct ffs_dmabuf_priv *priv, *tmp;
struct ffs_data *ffs = epfile->ffs;
@@ -2352,6 +2380,11 @@ static int ffs_epfiles_create(struct ffs_data *ffs)
return 0;
}
+static void clear_one(struct dentry *dentry)
+{
+ smp_store_release(&dentry->d_inode->i_private, NULL);
+}
+
static void ffs_epfiles_destroy(struct ffs_epfile *epfiles, unsigned count)
{
struct ffs_epfile *epfile = epfiles;
@@ -2359,7 +2392,7 @@ static void ffs_epfiles_destroy(struct ffs_epfile *epfiles, unsigned count)
for (; count; --count, ++epfile) {
BUG_ON(mutex_is_locked(&epfile->mutex));
if (epfile->dentry) {
- simple_recursive_removal(epfile->dentry, NULL);
+ simple_recursive_removal(epfile->dentry, clear_one);
epfile->dentry = NULL;
}
}
--
2.51.0
Hello,
Resend my last email without HTML.
---- zyc zyc <zyc199902(a)zohomail.cn> wrote on Sat, 2025-11-29 18:57:01: ----
> Hello, maintainer
>
> I would like to report what appears to be a regression in 6.12.50 kernel release related to netem.
> It rejects our configuration with the message:
> Error: netem: cannot mix duplicating netems with other netems in tree.
>
> This breaks setups that previously worked correctly for many years.
>
>
> Our team uses multiple netem qdiscs in the same HTB branch, arranged in a parallel fashion using a prio fan-out. Each branch of the prio qdisc has its own distinct netem instance with different duplication characteristics.
>
> This is used to emulate our production conditions where a single logical path fans out into two downstream segments, for example:
>
> - two ECMP next hops with different misbehaviour characteristics, or
> - an HA firewall cluster where only one node is replaying frames, or
> - two LAG / ToR paths where one path intermittently duplicates packets.
>
> In our environments, only a subset of flows are affected, and different downstream devices may cause different styles of duplication.
> This regression breaks existing automated tests, training environments, and network simulation pipelines.
>
> I would be happy to provide our reproducer if needed.
>
> Thank you for your time and for maintaining Linux kernel.
>
>
>
> Best regards,
> zyc
>
>
>
of_get_child_by_name() returns a node pointer with its refcount incremented,
so we should use the __free() attribute to manage the pgc_node reference.
This ensures automatic of_node_put() cleanup when pgc_node goes out of
scope, eliminating the need for explicit error handling paths and
avoiding reference count leaks.
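For background, a runnable userspace sketch of the scope-based cleanup
that the kernel's __free()/DEFINE_FREE() helpers in linux/cleanup.h build
on (the GCC/Clang cleanup attribute; the demo names are made up):
```c
#include <stdio.h>
#include <stdlib.h>

/* Cleanup callback: receives a pointer to the variable going out of scope. */
static void free_charp(char **p)
{
	free(*p);
}

static int demo(int fail_early)
{
	char *buf __attribute__((cleanup(free_charp))) = malloc(64);

	if (!buf)
		return -1;
	if (fail_early)
		return -1;	/* buf is still freed automatically */
	snprintf(buf, 64, "released on scope exit");
	puts(buf);
	return 0;		/* no explicit free() on any path */
}

int main(void)
{
	demo(0);
	demo(1);
	return 0;
}
```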
Fixes: 721cabf6c660 ("soc: imx: move PGC handling to a new GPC driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
Change in V2:
- Use __free() attribute instead of explicit of_node_put() calls
---
drivers/pmdomain/imx/gpc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pmdomain/imx/gpc.c b/drivers/pmdomain/imx/gpc.c
index f18c7e6e75dd..89d5d68c055d 100644
--- a/drivers/pmdomain/imx/gpc.c
+++ b/drivers/pmdomain/imx/gpc.c
@@ -403,7 +403,7 @@ static int imx_gpc_old_dt_init(struct device *dev, struct regmap *regmap,
static int imx_gpc_probe(struct platform_device *pdev)
{
const struct imx_gpc_dt_data *of_id_data = device_get_match_data(&pdev->dev);
- struct device_node *pgc_node;
+ struct device_node *pgc_node __free(device_node) = NULL;
struct regmap *regmap;
void __iomem *base;
int ret;
--
2.34.1
From: Yongxin Liu <yongxin.liu(a)windriver.com>
Zero can be a valid value of num_records. For example, on Intel Atom x6425RE,
only x87 and SSE are supported (features 0, 1), and fpu_user_cfg.max_features
is 3. The for_each_extended_xfeature() loop only iterates feature 2, which is
not enabled, so num_records = 0. This is valid and should not cause core dump
failure.
The issue is that dump_xsave_layout_desc() returns 0 for both genuine errors
(dump_emit() failure) and valid cases (no extended features). Use negative
return values for errors and only abort on genuine failures.
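A trivial runnable sketch of the convention the fix adopts (hypothetical
helper, not the kernel code): negative values are reserved for genuine
failures so that zero remains a legitimate count.
```c
#include <stdio.h>

/* Returns the number of records emitted (0 is valid), or -1 if the
 * emit step itself failed.
 */
static int count_records(int emit_ok, int available)
{
	if (!emit_ok)
		return -1;	/* genuine error: abort the dump */
	return available;	/* 0..N: all are valid counts */
}

int main(void)
{
	printf("%d %d %d\n",
	       count_records(1, 0),	/* 0: nothing to dump, still fine */
	       count_records(1, 3),	/* 3 records */
	       count_records(0, 3));	/* -1: the emit analogue failed */
	return 0;
}
```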
Cc: stable(a)vger.kernel.org
Fixes: ba386777a30b ("x86/elf: Add a new FPU buffer layout info to x86 core files")
Signed-off-by: Yongxin Liu <yongxin.liu(a)windriver.com>
---
V2: Keep error checking but use negative value for genuine error
V1: Remove error checking entirely
---
arch/x86/kernel/fpu/xstate.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 48113c5193aa..76153dfb58c9 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1946,7 +1946,7 @@ static int dump_xsave_layout_desc(struct coredump_params *cprm)
};
if (!dump_emit(cprm, &xc, sizeof(xc)))
- return 0;
+ return -1;
num_records++;
}
@@ -1984,7 +1984,7 @@ int elf_coredump_extra_notes_write(struct coredump_params *cprm)
return 1;
num_records = dump_xsave_layout_desc(cprm);
- if (!num_records)
+ if (num_records < 0)
return 1;
/* Total size should be equal to the number of records */
--
2.46.2
From: Yongxin Liu <yongxin.liu(a)windriver.com>
Zero can be a valid value of num_records. For example, on Intel Atom x6425RE,
only x87 and SSE are supported (features 0, 1), and fpu_user_cfg.max_features
is 3. The for_each_extended_xfeature() loop only iterates feature 2, which is
not enabled, so num_records = 0. This is valid and should not cause core dump
failure.
The size check already validates consistency: if num_records = 0, then
en.n_descsz = 0, so the check passes.
Cc: stable(a)vger.kernel.org
Fixes: ba386777a30b ("x86/elf: Add a new FPU buffer layout info to x86 core files")
Signed-off-by: Yongxin Liu <yongxin.liu(a)windriver.com>
---
arch/x86/kernel/fpu/xstate.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 48113c5193aa..b1dd30eb21a8 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1984,8 +1984,6 @@ int elf_coredump_extra_notes_write(struct coredump_params *cprm)
return 1;
num_records = dump_xsave_layout_desc(cprm);
- if (!num_records)
- return 1;
/* Total size should be equal to the number of records */
if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
--
2.46.2
Replace the RISCV_ISA_V dependency of the RISC-V crypto code with
RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS, which implies RISCV_ISA_V as
well as vector unaligned accesses being efficient.
This is necessary because this code assumes that vector unaligned
accesses are supported and are efficient. (It does so to avoid having
to use lots of extra vsetvli instructions to switch the element width
back and forth between 8 and either 32 or 64.)
This was omitted from the code originally just because the RISC-V kernel
support for detecting this feature didn't exist yet. Support has now
been added, but it's fragmented into per-CPU runtime detection, a
command-line parameter, and a kconfig option. The kconfig option is the
only reasonable way to do it, though, so let's just rely on that.
Fixes: eb24af5d7a05 ("crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS}")
Fixes: bb54668837a0 ("crypto: riscv - add vector crypto accelerated ChaCha20")
Fixes: 600a3853dfa0 ("crypto: riscv - add vector crypto accelerated GHASH")
Fixes: 8c8e40470ffe ("crypto: riscv - add vector crypto accelerated SHA-{256,224}")
Fixes: b3415925a08b ("crypto: riscv - add vector crypto accelerated SHA-{512,384}")
Fixes: 563a5255afa2 ("crypto: riscv - add vector crypto accelerated SM3")
Fixes: b8d06352bbf3 ("crypto: riscv - add vector crypto accelerated SM4")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)kernel.org>
---
arch/riscv/crypto/Kconfig | 12 ++++++++----
lib/crypto/Kconfig | 9 ++++++---
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
index a75d6325607b..14c5acb935e9 100644
--- a/arch/riscv/crypto/Kconfig
+++ b/arch/riscv/crypto/Kconfig
@@ -2,11 +2,12 @@
menu "Accelerated Cryptographic Algorithms for CPU (riscv)"
config CRYPTO_AES_RISCV64
tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS"
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
select CRYPTO_ALGAPI
select CRYPTO_LIB_AES
select CRYPTO_SKCIPHER
help
Block cipher: AES cipher algorithms
@@ -18,21 +19,23 @@ config CRYPTO_AES_RISCV64
- Zvkb vector crypto extension (CTR)
- Zvkg vector crypto extension (XTS)
config CRYPTO_GHASH_RISCV64
tristate "Hash functions: GHASH"
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
select CRYPTO_GCM
help
GCM GHASH function (NIST SP 800-38D)
Architecture: riscv64 using:
- Zvkg vector crypto extension
config CRYPTO_SM3_RISCV64
tristate "Hash functions: SM3 (ShangMi 3)"
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
select CRYPTO_HASH
select CRYPTO_LIB_SM3
help
SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
@@ -40,11 +43,12 @@ config CRYPTO_SM3_RISCV64
- Zvksh vector crypto extension
- Zvkb vector crypto extension
config CRYPTO_SM4_RISCV64
tristate "Ciphers: SM4 (ShangMi 4)"
- depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
select CRYPTO_ALGAPI
select CRYPTO_SM4
help
SM4 block cipher algorithm (OSCCA GB/T 32907-2016,
ISO/IEC 18033-3:2010/Amd 1:2021)
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index a3647352bff6..6871a41e5069 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -59,11 +59,12 @@ config CRYPTO_LIB_CHACHA_ARCH
depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN
default y if ARM
default y if ARM64 && KERNEL_MODE_NEON
default y if MIPS && CPU_MIPS32_R2
default y if PPC64 && CPU_LITTLE_ENDIAN && VSX
- default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
default y if S390
default y if X86_64
config CRYPTO_LIB_CURVE25519
tristate
@@ -182,11 +183,12 @@ config CRYPTO_LIB_SHA256_ARCH
depends on CRYPTO_LIB_SHA256 && !UML
default y if ARM && !CPU_V7M
default y if ARM64
default y if MIPS && CPU_CAVIUM_OCTEON
default y if PPC && SPE
- default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
default y if S390
default y if SPARC64
default y if X86_64
config CRYPTO_LIB_SHA512
@@ -200,11 +202,12 @@ config CRYPTO_LIB_SHA512_ARCH
bool
depends on CRYPTO_LIB_SHA512 && !UML
default y if ARM && !CPU_V7M
default y if ARM64
default y if MIPS && CPU_CAVIUM_OCTEON
- default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+ RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
default y if S390
default y if SPARC64
default y if X86_64
config CRYPTO_LIB_SHA3
base-commit: 43dfc13ca972988e620a6edb72956981b75ab6b0
--
2.52.0
Hi,
This Linux kernel patch series introduces support for error recovery for
passthrough PCI devices on System Z (s390x).
Background
----------
For PCI devices on s390x, an operating system receives platform-specific
error events from firmware rather than through AER. Today, for
passthrough/userspace devices, we don't attempt any error recovery and
ignore any error events for the devices. The passthrough/userspace devices
are managed by the vfio-pci driver. The driver does register error handling
callbacks (error_detected) and, on an error, triggers an eventfd to
userspace. But we need a mechanism to notify userspace
(QEMU/guest/userspace drivers) about the error event.
Proposal
--------
We can expose this error information (currently only the PCI Error Code)
via a device feature. Userspace can then obtain the error information
via VFIO_DEVICE_FEATURE ioctl and take appropriate actions such as driving
a device reset.
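As a rough userspace sketch of that flow (the feature ID, struct layout,
and field names below are assumptions drawn from this cover letter, not
final uapi):
```c
#include <linux/vfio.h>
#include <linux/types.h>
#include <sys/ioctl.h>
#include <string.h>

/* Hypothetical: patch 7 defines the real struct and feature bit. */
struct vfio_device_feature_zpci_err_ex {
	__u16 pec;		/* assumed: PCI error code from firmware */
	__u16 reserved;
};
#define VFIO_DEVICE_FEATURE_ZPCI_ERR_EX 12	/* assumed feature ID */

static int get_zpci_err(int device_fd, __u16 *pec)
{
	char buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_feature_zpci_err_ex)];
	struct vfio_device_feature *feat = (void *)buf;
	struct vfio_device_feature_zpci_err_ex *err = (void *)feat->data;

	memset(buf, 0, sizeof(buf));
	feat->argsz = sizeof(buf);
	feat->flags = VFIO_DEVICE_FEATURE_GET |
		      VFIO_DEVICE_FEATURE_ZPCI_ERR_EX;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feat))
		return -1;	/* error info unavailable */
	*pec = err->pec;
	return 0;
}
```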
This is how a typical flow for passthrough devices to a VM would work:
For passthrough devices to a VM, the driver bound to the device on the host
is vfio-pci. The vfio-pci driver does support the error_detected() callback
(vfio_pci_core_aer_err_detected()), and on a PCI error the s390x recovery
code on the host will call the vfio-pci error_detected() callback. The
vfio-pci error_detected() callback will notify userspace/QEMU via an
eventfd, and return PCI_ERS_RESULT_CAN_RECOVER. At this point the s390x
error recovery on the host will skip any further action (see patch 6) and
let userspace drive the error recovery.
Once userspace/QEMU is notified, it then injects this error into the VM
so device drivers in the VM can take recovery actions. For example for a
passthrough NVMe device, the VM's OS NVMe driver will access the device.
At this point the VM's NVMe driver's error_detected() will drive the
recovery by returning PCI_ERS_RESULT_NEED_RESET, and the s390x error
recovery in the VM's OS will try to do a reset. Resets are privileged
operations and so the VM will need intervention from QEMU to perform the
reset. QEMU will invoke the VFIO_DEVICE_RESET ioctl to now notify the
host that the VM is requesting a reset of the device. The vfio-pci driver
on the host will then perform the reset on the device to recover it.
Thanks
Farhan
ChangeLog
---------
v5 series https://lore.kernel.org/all/20251113183502.2388-1-alifm@linux.ibm.com/
v5 -> v6
- Rebase on 6.18 + Lukas's PCI: Universal error recoverability of
devices series (https://lore.kernel.org/all/cover.1763483367.git.lukas@wunner.de/)
- Re-work config space accessibility check to pci_dev_save_and_disable() (patch 3).
This avoids saving the config space, in the reset path, if the device's config space is
corrupted or inaccessible.
v4 series https://lore.kernel.org/all/20250924171628.826-1-alifm@linux.ibm.com/
v4 -> v5
- Rebase on 6.18-rc5
- Move bug fixes to the beginning of the series (patch 1 and 2). These patches
were posted as a separate fixes series
https://lore.kernel.org/all/a14936ac-47d6-461b-816f-0fd66f869b0f@linux.ibm.…
- Add matching pci_put_dev() for pci_get_slot() (patch 6).
v3 series https://lore.kernel.org/all/20250911183307.1910-1-alifm@linux.ibm.com/
v3 -> v4
- Remove warn messages for each PCI capability not restored (patch 1)
- Check PCI_COMMAND and PCI_STATUS register for error value instead of device id
(patch 1)
- Fix kernel crash in patch 3
- Added reviewed by tags
- Address comments from Niklas's (patches 4, 5, 7)
- Fix compilation error non s390x system (patch 8)
- Explicitly align struct vfio_device_feature_zpci_err (patch 8)
v2 series https://lore.kernel.org/all/20250825171226.1602-1-alifm@linux.ibm.com/
v2 -> v3
- Patch 1 avoids saving any config space state if the device is in error
(suggested by Alex)
- Patch 2 adds additional check only for FLR reset to try other function
reset method (suggested by Alex).
- Patch 3 fixes a bug in s390 for resetting PCI devices with multiple
functions. Creates a new flag pci_slot to allow per function slot.
- Patch 4 fixes a bug in s390 for resource to bus address translation.
- Rebase on 6.17-rc5
v1 series https://lore.kernel.org/all/20250813170821.1115-1-alifm@linux.ibm.com/
v1 - > v2
- Patches 1 and 2 adds some additional checks for FLR/PM reset to
try other function reset method (suggested by Alex).
- Patch 3 fixes a bug in s390 for resetting PCI devices with multiple
functions.
- Patch 7 adds a new device feature for zPCI devices for the VFIO_DEVICE_FEATURE
ioctl. The ioctl is used by userspace to retrieve any PCI error
information for the device (suggested by Alex).
- Patch 8 adds a reset_done() callback for the vfio-pci driver, to
restore the state of the device after a reset.
- Patch 9 removes the pcie check for triggering VFIO_PCI_ERR_IRQ_INDEX.
Farhan Ali (9):
PCI: Allow per function PCI slots
s390/pci: Add architecture specific resource/bus address translation
PCI: Avoid saving config space state if inaccessible
PCI: Add additional checks for flr reset
s390/pci: Update the logic for detecting passthrough device
s390/pci: Store PCI error information for passthrough devices
vfio-pci/zdev: Add a device feature for error information
vfio: Add a reset_done callback for vfio-pci driver
vfio: Remove the pcie check for VFIO_PCI_ERR_IRQ_INDEX
arch/s390/include/asm/pci.h | 29 ++++++++
arch/s390/pci/pci.c | 75 +++++++++++++++++++++
arch/s390/pci/pci_event.c | 107 +++++++++++++++++-------------
drivers/pci/host-bridge.c | 4 +-
drivers/pci/pci.c | 19 +++++-
drivers/pci/slot.c | 25 ++++++-
drivers/vfio/pci/vfio_pci_core.c | 20 ++++--
drivers/vfio/pci/vfio_pci_intrs.c | 3 +-
drivers/vfio/pci/vfio_pci_priv.h | 9 +++
drivers/vfio/pci/vfio_pci_zdev.c | 45 ++++++++++++-
include/linux/pci.h | 1 +
include/uapi/linux/vfio.h | 15 +++++
12 files changed, 291 insertions(+), 61 deletions(-)
--
2.43.0
The quilt patch titled
Subject: ocfs2: fix kernel BUG in ocfs2_find_victim_chain
has been removed from the -mm tree. Its filename was
ocfs2-fix-kernel-bug-in-ocfs2_find_victim_chain.patch
This patch was dropped because it was merged into the mm-nonmm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Prithvi Tambewagh <activprithvi(a)gmail.com>
Subject: ocfs2: fix kernel BUG in ocfs2_find_victim_chain
Date: Mon, 1 Dec 2025 18:37:11 +0530
syzbot reported a kernel BUG in ocfs2_find_victim_chain() because the
`cl_next_free_rec` field of the allocation chain list (the next free slot
in the chain list) is 0, triggering the BUG_ON(!cl->cl_next_free_rec)
condition in ocfs2_find_victim_chain() and panicking the kernel.
To fix this, add a validity check in ocfs2_claim_suballoc_bits() just
before calling ocfs2_find_victim_chain(). The check fires when either of
the following conditions is true:
1. `cl_next_free_rec` is equal to 0, indicating that there are no free
chains in the allocation chain list
2. `cl_next_free_rec` is greater than `cl_count` (the total number of
chains in the allocation chain list)
Either condition indicates that there are no chains left to use.
Such corruption is reported via ocfs2_error(), which logs the problem
for debugging, rather than panicking the kernel.
Link: https://lkml.kernel.org/r/20251201130711.143900-1-activprithvi@gmail.com
Signed-off-by: Prithvi Tambewagh <activprithvi(a)gmail.com>
Reported-by: syzbot+96d38c6e1655c1420a72(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=96d38c6e1655c1420a72
Tested-by: syzbot+96d38c6e1655c1420a72(a)syzkaller.appspotmail.com
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: Heming Zhao <heming.zhao(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/suballoc.c | 10 ++++++++++
1 file changed, 10 insertions(+)
--- a/fs/ocfs2/suballoc.c~ocfs2-fix-kernel-bug-in-ocfs2_find_victim_chain
+++ a/fs/ocfs2/suballoc.c
@@ -1993,6 +1993,16 @@ static int ocfs2_claim_suballoc_bits(str
}
cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+ if (!le16_to_cpu(cl->cl_next_free_rec) ||
+ le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) {
+ status = ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has invalid next "
+ "free chain record %u, but only %u total\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ le16_to_cpu(cl->cl_next_free_rec),
+ le16_to_cpu(cl->cl_count));
+ goto bail;
+ }
victim = ocfs2_find_victim_chain(cl);
ac->ac_chain = victim;
_
Patches currently in -mm which might be from activprithvi(a)gmail.com are
The quilt patch titled
Subject: mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather
has been removed from the -mm tree. Its filename was
mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "David Hildenbrand (Red Hat)" <david(a)kernel.org>
Subject: mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather
Date: Fri, 5 Dec 2025 22:35:58 +0100
As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix
huge_pmd_unshare() vs GUP-fast race") we can end up in some situations
where we perform so many IPI broadcasts when unsharing hugetlb PMD page
tables that it severely regresses some workloads.
In particular, when we fork()+exit(), or when we munmap() a large area
backed by many shared PMD tables, we perform one IPI broadcast per
unshared PMD table.
There are two optimizations to be had:
(1) When we process (unshare) multiple such PMD tables, such as during
exit(), it is sufficient to send a single IPI broadcast (as long as
we respect locking rules) instead of one per PMD table.
Locking prevents any of these PMD tables from getting reused before
we drop the lock.
(2) When we are not the last sharer (> 2 users including us), there is
no need to send the IPI broadcast. The shared PMD tables cannot
become exclusive (fully unshared) before an IPI is broadcast
by the last sharer.
Concurrent GUP-fast could walk into a PMD table just before we
unshared it. It could then succeed in grabbing a page from the
shared page table even after munmap() etc. succeeded (and suppressed
an IPI). But there is no difference compared to GUP-fast just
sleeping for a while after grabbing the page and re-enabling IRQs.
Most importantly, GUP-fast will never walk into page tables that are
no longer shared, because the last sharer will issue an IPI
broadcast.
(if ever required, checking whether the PUD changed in GUP-fast
after grabbing the page like we do in the PTE case could handle
this)
So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather
infrastructure so we can implement these optimizations and demystify the
code at least a bit. Extend the mmu_gather infrastructure to be able to
deal with our special hugetlb PMD table sharing implementation.
We'll consolidate the handling for (full) unsharing of PMD tables in
tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track in
"struct mmu_gather" whether we had (full) unsharing of PMD tables.
Because locking is very special (concurrent unsharing+reuse must be
prevented), we disallow deferring flushing to tlb_finish_mmu() and instead
require an explicit earlier call to tlb_flush_unshared_tables().
From hugetlb code, we call huge_pmd_unshare_flush() where we make sure
that the expected lock protecting us from concurrent unsharing+reuse is
still held.
Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that
tlb_flush_unshared_tables() was properly called earlier.
Document it all properly.
Notes about tlb_remove_table_sync_one() interaction with unsharing:
There are two fairly tricky things:
(1) tlb_remove_table_sync_one() is a NOP on architectures without
CONFIG_MMU_GATHER_RCU_TABLE_FREE.
Here, the assumption is that the previous TLB flush would send an
IPI to all relevant CPUs. Careful: some architectures like x86 only
send IPIs to all relevant CPUs when tlb->freed_tables is set.
The relevant architectures should be selecting
MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable
kernels and it might have been problematic before this patch.
Also, the arch flushing behavior (independent of IPIs) is different
when tlb->freed_tables is set. Do we have to enlighten them to also
take care of tlb->unshared_tables? So far we didn't care, so
hopefully we are fine. Of course, we could be setting
tlb->freed_tables as well, but that might then unnecessarily flush
too much, because the semantics of tlb->freed_tables are a bit
fuzzy.
This patch changes nothing in this regard.
(2) tlb_remove_table_sync_one() is not a NOP on architectures with
CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync.
Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB)
we still issue IPIs during TLB flushes and don't actually need the
second tlb_remove_table_sync_one().
This optimization can be implemented on top of this, by checking, e.g., in
tlb_remove_table_sync_one() whether we really need IPIs. But as
described in (1), it really must honor tlb->freed_tables then to
send IPIs to all relevant CPUs.
Further note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a
concern, as we are holding the i_mmap_lock the whole time, preventing
concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed
separately as a cleanup later.
There are plenty more cleanups to be had, but they have to wait until
this is fixed.
Link: https://lkml.kernel.org/r/20251205213558.2980480-5-david@kernel.org
Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race")
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
Reported-by: "Uschakow, Stanislav" <suschako(a)amazon.de>
Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/
Tested-by: Laurence Oberman <loberman(a)redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)kernel.org>
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Jann Horn <jannh(a)google.com>
Cc: Liam Howlett <liam.howlett(a)oracle.com>
Cc: Liu Shixin <liushixin2(a)huawei.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Prakash Sangappa <prakash.sangappa(a)oracle.com>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Will Deacon <will(a)kernel.org>
Cc: Lance Yang <lance.yang(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/asm-generic/tlb.h | 69 ++++++++++++++++++++
include/linux/hugetlb.h | 19 +++--
mm/hugetlb.c | 121 ++++++++++++++++++++----------------
mm/mmu_gather.c | 6 +
mm/mprotect.c | 2
mm/rmap.c | 25 +++++--
6 files changed, 173 insertions(+), 69 deletions(-)
--- a/include/asm-generic/tlb.h~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/include/asm-generic/tlb.h
@@ -364,6 +364,17 @@ struct mmu_gather {
unsigned int vma_huge : 1;
unsigned int vma_pfn : 1;
+ /*
+ * Did we unshare (unmap) any shared page tables?
+ */
+ unsigned int unshared_tables : 1;
+
+ /*
+ * Did we unshare any page tables such that they are now exclusive
+ * and could get reused+modified by the new owner?
+ */
+ unsigned int fully_unshared_tables : 1;
+
unsigned int batch_count;
#ifndef CONFIG_MMU_GATHER_NO_GATHER
@@ -400,6 +411,7 @@ static inline void __tlb_reset_range(str
tlb->cleared_pmds = 0;
tlb->cleared_puds = 0;
tlb->cleared_p4ds = 0;
+ tlb->unshared_tables = 0;
/*
* Do not reset mmu_gather::vma_* fields here, we do not
* call into tlb_start_vma() again to set them if there is an
@@ -484,7 +496,7 @@ static inline void tlb_flush_mmu_tlbonly
* these bits.
*/
if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
- tlb->cleared_puds || tlb->cleared_p4ds))
+ tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables))
return;
tlb_flush(tlb);
@@ -773,6 +785,61 @@ static inline bool huge_pmd_needs_flush(
}
#endif
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
+static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt,
+ unsigned long addr)
+{
+ /*
+ * The caller must make sure that concurrent unsharing + exclusive
+ * reuse is impossible until tlb_flush_unshared_tables() was called.
+ */
+ VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt));
+ ptdesc_pmd_pts_dec(pt);
+
+ /* Clearing a PUD pointing at a PMD table with PMD leaves. */
+ tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE);
+
+ /*
+ * If the page table is now exclusively owned, we fully unshared
+ * a page table.
+ */
+ if (!ptdesc_pmd_is_shared(pt))
+ tlb->fully_unshared_tables = true;
+ tlb->unshared_tables = true;
+}
+
+static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb)
+{
+ /*
+ * As soon as the caller drops locks to allow for reuse of
+ * previously-shared tables, these tables could get modified and
+ * even reused outside of hugetlb context. So flush the TLB now.
+ *
+ * Note that we cannot defer the flush to a later point even if we are
+ * not the last sharer of the page table.
+ */
+ if (tlb->unshared_tables)
+ tlb_flush_mmu_tlbonly(tlb);
+
+ /*
+ * Similarly, we must make sure that concurrent GUP-fast will not
+ * walk previously-shared page tables that are getting modified+reused
+ * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast.
+ *
+ * We only perform this when we are the last sharer of a page table,
+ * as the IPI will reach all CPUs and wait for any concurrent GUP-fast.
+ *
+ * Note that on configs where tlb_remove_table_sync_one() is a NOP,
+ * the expectation is that the tlb_flush_mmu_tlbonly() would have issued
+ * required IPIs already for us.
+ */
+ if (tlb->fully_unshared_tables) {
+ tlb_remove_table_sync_one();
+ tlb->fully_unshared_tables = false;
+ }
+}
+#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
+
#endif /* CONFIG_MMU */
#endif /* _ASM_GENERIC__TLB_H */
--- a/include/linux/hugetlb.h~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/include/linux/hugetlb.h
@@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
@@ -271,7 +272,7 @@ void hugetlb_vma_unlock_write(struct vm_
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
@@ -300,13 +301,17 @@ static inline struct address_space *huge
return NULL;
}
-static inline int huge_pmd_unshare(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static inline int huge_pmd_unshare(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
return 0;
}
+static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb,
+ struct vm_area_struct *vma)
+{
+}
+
static inline void adjust_range_if_pmd_sharing_possible(
struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
@@ -432,7 +437,7 @@ static inline void move_hugetlb_state(st
{
}
-static inline long hugetlb_change_protection(
+static inline long hugetlb_change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long address,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
--- a/mm/hugetlb.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/mm/hugetlb.c
@@ -5096,8 +5096,9 @@ int move_hugetlb_page_tables(struct vm_a
unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
- bool shared_pmd = false;
+ struct mmu_gather tlb;
+ tlb_gather_mmu(&tlb, vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
@@ -5122,12 +5123,12 @@ int move_hugetlb_page_tables(struct vm_a
if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
continue;
- if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
- shared_pmd = true;
+ if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
continue;
}
+ tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr);
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
@@ -5136,13 +5137,13 @@ int move_hugetlb_page_tables(struct vm_a
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
}
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, old_end - len, old_end);
+ tlb_flush_mmu_tlbonly(&tlb);
+ huge_pmd_unshare_flush(&tlb, vma);
+
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
return len + old_addr - old_end;
}
@@ -5161,7 +5162,6 @@ void __unmap_hugepage_range(struct mmu_g
unsigned long sz = huge_page_size(h);
bool adjust_reservation;
unsigned long last_addr_mask;
- bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -5184,10 +5184,8 @@ void __unmap_hugepage_range(struct mmu_g
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
spin_unlock(ptl);
- tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
- force_flush = true;
address |= last_addr_mask;
continue;
}
@@ -5303,14 +5301,7 @@ void __unmap_hugepage_range(struct mmu_g
}
tlb_end_vma(tlb, vma);
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (force_flush)
- tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
}
void __hugetlb_zap_begin(struct vm_area_struct *vma,
@@ -6399,7 +6390,7 @@ out_release_nounlock:
}
#endif /* CONFIG_USERFAULTFD */
-long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6409,7 +6400,6 @@ long hugetlb_change_protection(struct vm
pte_t pte;
struct hstate *h = hstate_vma(vma);
long pages = 0, psize = huge_page_size(h);
- bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -6452,7 +6442,7 @@ long hugetlb_change_protection(struct vm
}
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6461,7 +6451,6 @@ long hugetlb_change_protection(struct vm
WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
- shared_pmd = true;
address |= last_addr_mask;
continue;
}
@@ -6522,22 +6511,16 @@ long hugetlb_change_protection(struct vm
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
}
next:
spin_unlock(ptl);
cond_resched();
}
- /*
- * There is nothing protecting a previously-shared page table that we
- * unshared through huge_pmd_unshare() from getting freed after we
- * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare()
- * succeeded, flush the range corresponding to the pud.
- */
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, start, end);
+
+ tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
* downgrading page table protection not changing it to point to a new
@@ -6904,18 +6887,27 @@ out:
return pte;
}
-/*
- * unmap huge page backed by shared pte.
+/**
+ * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ * @addr: the address we are trying to unshare.
+ * @ptep: pointer into the (pmd) page table.
+ *
+ * Called with the page table lock held, the i_mmap_rwsem held in write mode
+ * and the hugetlb vma lock held in write mode.
*
- * Called with page table lock held.
+ * Note: The caller must call huge_pmd_unshare_flush() before dropping the
+ * i_mmap_rwsem.
*
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
+ * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it
+ * was not a shared PMD table.
*/
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
+ struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
@@ -6927,18 +6919,36 @@ int huge_pmd_unshare(struct mm_struct *m
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
pud_clear(pud);
- /*
- * Once our caller drops the rmap lock, some other process might be
- * using this page table as a normal, non-hugetlb page table.
- * Wait for pending gup_fast() in other threads to finish before letting
- * that happen.
- */
- tlb_remove_table_sync_one();
- ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+
+ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
+
mm_dec_nr_pmds(mm);
return 1;
}
+/*
+ * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ *
+ * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table
+ * unsharing with concurrent page table walkers (TLB, GUP-fast, etc.).
+ *
+ * This function must be called after a sequence of huge_pmd_unshare()
+ * calls while still holding the i_mmap_rwsem.
+ */
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ /*
+ * We must synchronize page table unsharing such that nobody will
+ * try reusing a previously-shared page table while it might still
+ * be in use by previous sharers (TLB, GUP_fast).
+ */
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+ tlb_flush_unshared_tables(tlb);
+}
+
#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -6947,12 +6957,16 @@ pte_t *huge_pmd_share(struct mm_struct *
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+}
+
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
@@ -7219,6 +7233,7 @@ static void hugetlb_unshare_pmds(struct
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
+ struct mmu_gather tlb;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
@@ -7229,6 +7244,7 @@ static void hugetlb_unshare_pmds(struct
if (start >= end)
return;
+ tlb_gather_mmu(&tlb, mm);
flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
@@ -7248,10 +7264,10 @@ static void hugetlb_unshare_pmds(struct
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- huge_pmd_unshare(mm, vma, address, ptep);
+ huge_pmd_unshare(&tlb, vma, address, ptep);
spin_unlock(ptl);
}
- flush_hugetlb_tlb_range(vma, start, end);
+ huge_pmd_unshare_flush(&tlb, vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
@@ -7261,6 +7277,7 @@ static void hugetlb_unshare_pmds(struct
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
}
/*
--- a/mm/mmu_gather.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/mm/mmu_gather.c
@@ -469,6 +469,12 @@ void tlb_gather_mmu_fullmm(struct mmu_ga
void tlb_finish_mmu(struct mmu_gather *tlb)
{
/*
+ * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
+ * due to complicated locking requirements with page table unsharing.
+ */
+ VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
+
+ /*
* If there are parallel threads are doing PTE changes on same range
* under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
* flush by batching, one thread may end up seeing inconsistent PTEs
--- a/mm/mprotect.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/mm/mprotect.c
@@ -652,7 +652,7 @@ long change_protection(struct mmu_gather
#endif
if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot,
+ pages = hugetlb_change_protection(tlb, vma, start, end, newprot,
cp_flags);
else
pages = change_protection_range(tlb, vma, start, end, newprot,
--- a/mm/rmap.c~mm-hugetlb-fix-excessive-ipi-broadcasts-when-unsharing-pmd-tables-using-mmu_gather
+++ a/mm/rmap.c
@@ -76,7 +76,7 @@
#include <linux/mm_inline.h>
#include <linux/oom.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
@@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct foli
* if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma))
goto walk_abort;
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct foli
goto walk_done;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
@@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct fo
* fail if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma)) {
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ tlb_gather_mmu(&tlb, mm);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
* The PMD table was unmapped,
* consequently unmapping the folio.
@@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct fo
break;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
_
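To make the new contract concrete, below is a minimal sketch of the calling
convention these hunks establish, modelled on the hugetlb_unshare_pmds()
hunk above (mmu notifier calls and error handling elided; the
hugetlb_walk() lookup is assumed from the surrounding code):

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	hugetlb_vma_lock_write(vma);

	for (address = start; address < end; address += PUD_SIZE) {
		pte_t *ptep = hugetlb_walk(vma, address, sz);

		if (!ptep)
			continue;
		ptl = huge_pte_lock(h, mm, ptep);
		/* Batched into the mmu_gather; no flush or IPI happens here. */
		huge_pmd_unshare(&tlb, vma, address, ptep);
		spin_unlock(ptl);
	}

	/* TLB flush + IPIs; must run before the i_mmap_rwsem is dropped. */
	huge_pmd_unshare_flush(&tlb, vma);

	hugetlb_vma_unlock_write(vma);
	i_mmap_unlock_write(vma->vm_file->f_mapping);
	tlb_finish_mmu(&tlb);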
Patches currently in -mm which might be from david(a)kernel.org are
The quilt patch titled
Subject: mm/hugetlb: fix hugetlb_pmd_shared()
has been removed from the -mm tree. Its filename was
mm-hugetlb-fix-hugetlb_pmd_shared.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "David Hildenbrand (Red Hat)" <david(a)kernel.org>
Subject: mm/hugetlb: fix hugetlb_pmd_shared()
Date: Fri, 5 Dec 2025 22:35:55 +0100
Patch series "mm/hugetlb: fixes for PMD table sharing (incl. using
mmu_gather)".
One functional fix, one performance regression fix, and two related
comment fixes.
I cleaned up the prototype I recently shared [1] for the performance fix,
deferring most of the cleanups I had in it to a later point.
While doing that I identified the other things.
The goal is for this patch set to be backportable to stable trees "fairly"
easily, at least patches #1 and #4.
Patch #1 fixes hugetlb_pmd_shared() not detecting any sharing.
Patch #2 + #3 are simple comment fixes that patch #4 interacts with.
Patch #4 is a fix for the reported performance regression due to excessive
IPI broadcasts during fork()+exit().
The last patch is all about TLB flushes, IPIs and mmu_gather. Read:
complicated.
I added as many comments and as much description as I possibly could, and
I am hoping for review from Jann.
There are plenty of cleanups in the future to be had + one reasonable
optimization on x86. But that's all out of scope for this series.
This patch (of 4):
We switched from (wrongly) using the page count to an independent shared
count. Now, shared page tables have a refcount of 1 (excluding
speculative references) and instead use ptdesc->pt_share_count to identify
sharing.
We didn't convert hugetlb_pmd_shared(), so right now, we would never
detect a shared PMD table as such, because sharing/unsharing no longer
touches the refcount of a PMD table.
Page migration, like mbind() or migrate_pages() would allow for migrating
folios mapped into such shared PMD tables, even though the folios are not
exclusive. In smaps we would account them as "private" although they are
"shared", and we would be wrongly setting the PM_MMAP_EXCLUSIVE in the
pagemap interface.
Fix it by properly using ptdesc_pmd_is_shared() in hugetlb_pmd_shared().
Link: https://lkml.kernel.org/r/20251205213558.2980480-1-david@kernel.org
Link: https://lkml.kernel.org/r/20251205213558.2980480-2-david@kernel.org
Link: https://lore.kernel.org/all/8cab934d-4a56-44aa-b641-bfd7e23bd673@kernel.org/ [1]
Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count")
Signed-off-by: David Hildenbrand (Red Hat) <david(a)kernel.org>
Tested-by: Laurence Oberman <loberman(a)redhat.com>
Reviewed-by: Rik van Riel <riel(a)surriel.com>
Reviewed-by: Lance Yang <lance.yang(a)linux.dev>
Cc: Liu Shixin <liushixin2(a)huawei.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)kernel.org>
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Jann Horn <jannh(a)google.com>
Cc: Liam Howlett <liam.howlett(a)oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Prakash Sangappa <prakash.sangappa(a)oracle.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Will Deacon <will(a)kernel.org>
Cc: "Uschakow, Stanislav" <suschako(a)amazon.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/hugetlb.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/include/linux/hugetlb.h~mm-hugetlb-fix-hugetlb_pmd_shared
+++ a/include/linux/hugetlb.h
@@ -1326,7 +1326,7 @@ static inline __init void hugetlb_cma_re
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
- return page_count(virt_to_page(pte)) > 1;
+ return ptdesc_pmd_is_shared(virt_to_ptdesc(pte));
}
#else
static inline bool hugetlb_pmd_shared(pte_t *pte)
_
Patches currently in -mm which might be from david(a)kernel.org are
The quilt patch titled
Subject: powerpc/pseries/cmm: adjust BALLOON_MIGRATE when migrating pages
has been removed from the -mm tree. Its filename was
powerpc-pseries-cmm-adjust-balloon_migrate-when-migrating-pages.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: David Hildenbrand <david(a)redhat.com>
Subject: powerpc/pseries/cmm: adjust BALLOON_MIGRATE when migrating pages
Date: Tue, 21 Oct 2025 12:06:06 +0200
Let's properly adjust BALLOON_MIGRATE like the other drivers.
Note that the INFLATE/DEFLATE events are triggered from the core when
enqueueing/dequeueing pages.
This was found by code inspection.
Link: https://lkml.kernel.org/r/20251021100606.148294-3-david@redhat.com
Fixes: fe030c9b85e6 ("powerpc/pseries/cmm: Implement balloon compaction")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list(a)gmail.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Madhavan Srinivasan <maddy(a)linux.ibm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/powerpc/platforms/pseries/cmm.c | 1 +
1 file changed, 1 insertion(+)
--- a/arch/powerpc/platforms/pseries/cmm.c~powerpc-pseries-cmm-adjust-balloon_migrate-when-migrating-pages
+++ a/arch/powerpc/platforms/pseries/cmm.c
@@ -532,6 +532,7 @@ static int cmm_migratepage(struct balloo
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
balloon_page_insert(b_dev_info, newpage);
+ __count_vm_event(BALLOON_MIGRATE);
b_dev_info->isolated_pages--;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
_
Patches currently in -mm which might be from david(a)redhat.com are
The quilt patch titled
Subject: powerpc/pseries/cmm: call balloon_devinfo_init() also without CONFIG_BALLOON_COMPACTION
has been removed from the -mm tree. Its filename was
powerpc-pseries-cmm-call-balloon_devinfo_init-also-without-config_balloon_compaction.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: David Hildenbrand <david(a)redhat.com>
Subject: powerpc/pseries/cmm: call balloon_devinfo_init() also without CONFIG_BALLOON_COMPACTION
Date: Tue, 21 Oct 2025 12:06:05 +0200
Patch series "powerpc/pseries/cmm: two smaller fixes".
Two smaller fixes identified while doing a bigger rework.
This patch (of 2):
We always have to initialize the balloon_dev_info, even when compaction is
not configured in: otherwise the containing list and the lock are left
uninitialized.
Likely not many such configs exist in practice, but let's CC stable to
be sure.
This was found by code inspection.
Link: https://lkml.kernel.org/r/20251021100606.148294-1-david@redhat.com
Link: https://lkml.kernel.org/r/20251021100606.148294-2-david@redhat.com
Fixes: fe030c9b85e6 ("powerpc/pseries/cmm: Implement balloon compaction")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list(a)gmail.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Madhavan Srinivasan <maddy(a)linux.ibm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/powerpc/platforms/pseries/cmm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/powerpc/platforms/pseries/cmm.c~powerpc-pseries-cmm-call-balloon_devinfo_init-also-without-config_balloon_compaction
+++ a/arch/powerpc/platforms/pseries/cmm.c
@@ -550,7 +550,6 @@ static int cmm_migratepage(struct balloo
static void cmm_balloon_compaction_init(void)
{
- balloon_devinfo_init(&b_dev_info);
b_dev_info.migratepage = cmm_migratepage;
}
#else /* CONFIG_BALLOON_COMPACTION */
@@ -572,6 +571,7 @@ static int cmm_init(void)
if (!firmware_has_feature(FW_FEATURE_CMO) && !simulate)
return -EOPNOTSUPP;
+ balloon_devinfo_init(&b_dev_info);
cmm_balloon_compaction_init();
rc = register_oom_notifier(&cmm_oom_nb);
_
Patches currently in -mm which might be from david(a)redhat.com are
This reverts commit 3b5e5185881edf4ee5a1af575e3aedac4a38a764.
The REO queue lookup table feature was enabled in 6.12.y due to an
upstream backport, but it causes severe RX performance degradation on
QCN9274 hw2.0 devices.
With this feature enabled, the vast majority of received packets are
dropped, reducing throughput drastically and making the device nearly
unusable.
Reverting this change restores full RX performance.
Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.5-01651-QCAHKSWPL_SILICONZ-1
Fixes: 3b5e5185881e ("wifi: ath12k: Enable REO queue lookup table feature on QCN9274 hw2.0")
Signed-off-by: Oliver Sedlbauer <os(a)dev.tdt.de>
---
Note:
This commit reverts a backport that was not a fix. The backported change
breaks previously working behavior on QCN9274 hw2.0 devices and should
not have been applied to the 6.12.y stable kernel.
drivers/net/wireless/ath/ath12k/hw.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/wireless/ath/ath12k/hw.c b/drivers/net/wireless/ath/ath12k/hw.c
index 057ef2d282b2..e3eb22bb9e1c 100644
--- a/drivers/net/wireless/ath/ath12k/hw.c
+++ b/drivers/net/wireless/ath/ath12k/hw.c
@@ -1084,7 +1084,7 @@ static const struct ath12k_hw_params ath12k_hw_params[] = {
.download_calib = true,
.supports_suspend = false,
.tcl_ring_retry = true,
- .reoq_lut_support = true,
+ .reoq_lut_support = false,
.supports_shadow_regs = false,
.num_tcl_banks = 48,
--
2.39.5
`build_assert` relies on the compiler to optimize out its error path,
lest build fails with the dreaded error:
ERROR: modpost: "rust_build_error" [path/to/module.ko] undefined!
It has been observed that very trivial code performing I/O accesses
(sometimes even using an immediate value) would seemingly randomly fail
with this error whenever `CLIPPY=1` was set. The same behavior was also
observed under different, very similar conditions [1][2].
The cause, as pointed out by Gary Guo [3], appears to be that the
failing function is eventually using `build_assert` with its argument,
but is only annotated with `#[inline]`. This gives the compiler freedom
to not inline the function, which it notably did when Clippy was active,
triggering the error.
The fix is to annotate functions passing their argument to
`build_assert` with `#[inline(always)]`, telling the compiler to be as
aggressive as possible with their inlining. This is also the correct
behavior as inlining is mandatory for correct behavior in these cases.
This series fixes all possible points of failure in the kernel crate,
and adds documentation to `build_assert` explaining how to properly
inline functions for which this behavior may arise.
[1] https://lore.kernel.org/all/DEEUYUOAEZU3.1J1HM2YQ10EX1@nvidia.com/
[2] https://lore.kernel.org/all/A1A280D4-836E-4D75-863E-30B1C276C80C@collabora.…
[3] https://lore.kernel.org/all/20251121143008.2f5acc33.gary@garyguo.net/
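The same constraint exists on the C side of the kernel: BUILD_BUG_ON()
can only compile away when its condition folds to a constant, so helpers
that assert on their arguments must be __always_inline rather than plain
inline. A minimal C sketch of the equivalent pattern (field_shift() is a
made-up name for illustration):

	#include <linux/build_bug.h>
	#include <linux/types.h>

	/*
	 * Plain 'inline' is only a hint the compiler may ignore; if the
	 * call is not inlined, 'mask' is no longer a compile-time constant
	 * and the build-time assertion cannot be resolved.
	 */
	static __always_inline u32 field_shift(u32 mask)
	{
		BUILD_BUG_ON(!__builtin_constant_p(mask));
		return __builtin_ctz(mask); /* e.g. field_shift(0x00f0) == 4 */
	}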
Signed-off-by: Alexandre Courbot <acourbot(a)nvidia.com>
---
Changes in v3:
- Add "Fixes:" tags.
- CC stable on fixup patches.
- Link to v2: https://patch.msgid.link/20251128-io-build-assert-v2-0-a9ea9ce7d45d@nvidia.…
Changes in v2:
- Turn into a series and address other similar cases in the kernel crate.
- Link to v1: https://patch.msgid.link/20251127-io-build-assert-v1-1-04237f2e5850@nvidia.…
---
Alexandre Courbot (7):
rust: build_assert: add instructions for use with function arguments
rust: io: always inline functions using build_assert with arguments
rust: cpufreq: always inline functions using build_assert with arguments
rust: bits: always inline functions using build_assert with arguments
rust: sync: refcount: always inline functions using build_assert with arguments
rust: irq: always inline functions using build_assert with arguments
rust: num: bounded: add missing comment for always inlined function
rust/kernel/bits.rs | 6 ++++--
rust/kernel/build_assert.rs | 7 ++++++-
rust/kernel/cpufreq.rs | 2 ++
rust/kernel/io.rs | 9 ++++++---
rust/kernel/io/resource.rs | 2 ++
rust/kernel/irq/flags.rs | 2 ++
rust/kernel/num/bounded.rs | 1 +
rust/kernel/sync/refcount.rs | 3 ++-
8 files changed, 25 insertions(+), 7 deletions(-)
---
base-commit: ba65a4e7120a616d9c592750d9147f6dcafedffa
change-id: 20251127-io-build-assert-3579a5bfb81c
Best regards,
--
Alexandre Courbot <acourbot(a)nvidia.com>
The following commit has been merged into the perf/urgent branch of tip:
Commit-ID: 01439286514ce9d13b8123f8ec3717d7135ff1d6
Gitweb: https://git.kernel.org/tip/01439286514ce9d13b8123f8ec3717d7135ff1d6
Author: Sandipan Das <sandipan.das(a)amd.com>
AuthorDate: Tue, 09 Dec 2025 13:56:38 +05:30
Committer: Ingo Molnar <mingo(a)kernel.org>
CommitterDate: Tue, 09 Dec 2025 12:21:47 +01:00
perf/x86/amd/uncore: Fix the return value of amd_uncore_df_event_init() on error
If amd_uncore_event_init() fails, return an error irrespective of the
pmu_version. Setting hwc->config should be safe even if there is an
error so use this opportunity to simplify the code.
Closes: https://lore.kernel.org/all/aTaI0ci3vZ44lmBn@stanley.mountain/
Fixes: d6389d3ccc13 ("perf/x86/amd/uncore: Refactor uncore management")
Reported-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Sandipan Das <sandipan.das(a)amd.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/076935e23a70335d33bd6e23308b75ae0ad35ba2.176526866…
---
arch/x86/events/amd/uncore.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index e8b6af1..9293ce5 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -656,14 +656,11 @@ static int amd_uncore_df_event_init(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
int ret = amd_uncore_event_init(event);
- if (ret || pmu_version < 2)
- return ret;
-
hwc->config = event->attr.config &
(pmu_version >= 2 ? AMD64_PERFMON_V2_RAW_EVENT_MASK_NB :
AMD64_RAW_EVENT_MASK_NB);
- return 0;
+ return ret;
}
static int amd_uncore_df_add(struct perf_event *event, int flags)
On hardware based on Toradex Verdin AM62 the recovery mechanism added by
commit ad5c6ecef27e ("drm: bridge: ti-sn65dsi83: Add error recovery
mechanism") has been reported [0] to make the display turn on and off and
and the kernel logging "Unexpected link status 0x01".
According to the report, the error recovery mechanism is triggered by the
PLL_UNLOCK error going active. Analysis suggested the board is unable to
provide the correct DSI clock needed by the SN65DSI84, to which the TI
SN65DSI84 reacts by raising PLL_UNLOCK, while the display apparently
still works without issues.
On other hardware, where all the clocks are within the components'
specifications, the PLL_UNLOCK bit does not trigger while the display is in
normal use. It can trigger due to e.g. electromagnetic interference, which is
a transient event and exactly the reason why the error recovery mechanism
was implemented.
Ideally the PLL_UNLOCK bit would be ignored only when working out of
specification, but this requires detecting in software whether it triggers
because the device is working out of specification yet rendering correctly
for the user, or for a genuine reason (e.g. EMI, or working out of
specification in a way that compromises the visual output).
The ongoing analysis as of this writing [1][2] has not yet found a way for
the driver to discriminate between the two cases. So, as a temporary
measure, mask the PLL_UNLOCK error bit unconditionally.
[0] https://lore.kernel.org/r/bhkn6hley4xrol5o3ytn343h4unkwsr26p6s6ltcwexnrsjsd…
[1] https://lore.kernel.org/all/b71e941c-fc8a-4ac1-9407-0fe7df73b412@gmail.com/
[2] https://lore.kernel.org/all/20251125103900.31750-1-francesco@dolcini.it/
Closes: https://lore.kernel.org/r/bhkn6hley4xrol5o3ytn343h4unkwsr26p6s6ltcwexnrsjsd…
Cc: stable(a)vger.kernel.org # 6.15+
Co-developed-by: Hervé Codina <herve.codina(a)bootlin.com>
Signed-off-by: Hervé Codina <herve.codina(a)bootlin.com>
Signed-off-by: Luca Ceresoli <luca.ceresoli(a)bootlin.com>
---
Francesco, Emanuele, João: can you please apply this patch and report
whether the display on the affected boards gets back to working as before?
Cc: João Paulo Gonçalves <jpaulo.silvagoncalves(a)gmail.com>
Cc: Francesco Dolcini <francesco(a)dolcini.it>
Cc: Emanuele Ghidoli <ghidoliemanuele(a)gmail.com>
---
drivers/gpu/drm/bridge/ti-sn65dsi83.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi83.c b/drivers/gpu/drm/bridge/ti-sn65dsi83.c
index 033c44326552..fffb47b62f43 100644
--- a/drivers/gpu/drm/bridge/ti-sn65dsi83.c
+++ b/drivers/gpu/drm/bridge/ti-sn65dsi83.c
@@ -429,7 +429,14 @@ static void sn65dsi83_handle_errors(struct sn65dsi83 *ctx)
*/
ret = regmap_read(ctx->regmap, REG_IRQ_STAT, &irq_stat);
- if (ret || irq_stat) {
+
+ /*
+ * Some hardware (Toradex Verdin AM62) is known to report the
+ * PLL_UNLOCK error interrupt while working without visible
+ * problems. In lack of a reliable way to discriminate such cases
+ * from user-visible PLL_UNLOCK cases, ignore that bit entirely.
+ */
+ if (ret || irq_stat & ~REG_IRQ_STAT_CHA_PLL_UNLOCK) {
/*
* IRQ acknowledged is not always possible (the bridge can be in
* a state where it doesn't answer anymore). To prevent an
@@ -654,7 +661,7 @@ static void sn65dsi83_atomic_enable(struct drm_bridge *bridge,
if (ctx->irq) {
/* Enable irq to detect errors */
regmap_write(ctx->regmap, REG_IRQ_GLOBAL, REG_IRQ_GLOBAL_IRQ_EN);
- regmap_write(ctx->regmap, REG_IRQ_EN, 0xff);
+ regmap_write(ctx->regmap, REG_IRQ_EN, 0xff & ~REG_IRQ_EN_CHA_PLL_UNLOCK_EN);
} else {
/* Use the polling task */
sn65dsi83_monitor_start(ctx);
---
base-commit: c884ee70b15a8d63184d7c1e02eba99676a6fcf7
change-id: 20251126-drm-ti-sn65dsi83-ignore-pll-unlock-4a28aa29eb5c
Best regards,
--
Luca Ceresoli <luca.ceresoli(a)bootlin.com>
When trying to get the system name in the _HID path, after successfully
retrieving the subsystem ID, the return value isn't set to 0 but is
instead still kept at -ENODATA, leading to a false negative:
[ 12.382507] cs35l41 spi-VLV1776:00: Subsystem ID: VLV1776
[ 12.382521] cs35l41 spi-VLV1776:00: probe with driver cs35l41 failed with error -61
Always return 0 when a subsystem ID is found to mitigate these false
negatives.
Link: https://github.com/CachyOS/CachyOS-Handheld/issues/83
Fixes: 46c8b4d2a693 ("ASoC: cs35l41: Fallback to reading Subsystem ID property if not ACPI")
Cc: stable(a)vger.kernel.org # 6.18
Signed-off-by: Eric Naim <dnaim(a)cachyos.org>
---
sound/soc/codecs/cs35l41.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c
index 173d7c59b725..5001a546a3e7 100644
--- a/sound/soc/codecs/cs35l41.c
+++ b/sound/soc/codecs/cs35l41.c
@@ -1188,13 +1188,14 @@ static int cs35l41_get_system_name(struct cs35l41_private *cs35l41)
}
}
-err:
if (sub) {
cs35l41->dsp.system_name = sub;
dev_info(cs35l41->dev, "Subsystem ID: %s\n", cs35l41->dsp.system_name);
- } else
- dev_warn(cs35l41->dev, "Subsystem ID not found\n");
+ return 0;
+ }
+err:
+ dev_warn(cs35l41->dev, "Subsystem ID not found\n");
return ret;
}
--
2.52.0
Vincent reports:
> The ath5k driver seems to do an array-index-out-of-bounds access as
> shown by the UBSAN kernel message:
> UBSAN: array-index-out-of-bounds in drivers/net/wireless/ath/ath5k/base.c:1741:20
> index 4 is out of range for type 'ieee80211_tx_rate [4]'
> ...
> Call Trace:
> <TASK>
> dump_stack_lvl+0x5d/0x80
> ubsan_epilogue+0x5/0x2b
> __ubsan_handle_out_of_bounds.cold+0x46/0x4b
> ath5k_tasklet_tx+0x4e0/0x560 [ath5k]
> tasklet_action_common+0xb5/0x1c0
It is real. 'ts->ts_final_idx' can be 3 on 5212, so:
info->status.rates[ts->ts_final_idx + 1].idx = -1;
with the array defined as:
struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES];
while the size is:
#define IEEE80211_TX_MAX_RATES 4
is indeed bogus.
Set this 'idx = -1' sentinel only if the array index is less than the
array size, as mac80211 will not look at rates beyond the array size
(IEEE80211_TX_MAX_RATES) anyway.
Note: The effect of the OOB write is negligible. It just overwrites the
next member of info->status, i.e. ack_signal.
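For illustration, the relevant mac80211 layout, simplified from
include/net/mac80211.h (a sketch, with unrelated members elided):

	struct ieee80211_tx_info {
		/* ... */
		union {
			struct {
				struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES];
				s32 ack_signal;
				/* ... */
			} status;
			/* ... */
		};
	};

With ts_final_idx == 3, rates[4] is one element past the end of the
array, so the sentinel write lands in the storage of ack_signal.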
Signed-off-by: Jiri Slaby (SUSE) <jirislaby(a)kernel.org>
Reported-by: Vincent Danjean <vdanjean(a)debian.org>
Link: https://lore.kernel.org/all/aQYUkIaT87ccDCin@eldamar.lan
Closes: https://bugs.debian.org/1119093
Fixes: 6d7b97b23e11 ("ath5k: fix tx status reporting issues")
Cc: stable(a)vger.kernel.org
---
[v2] added Fixes + Cc: stable
Cc: 1119093(a)bugs.debian.org
Cc: Salvatore Bonaccorso <carnil(a)debian.org>
Cc: Nick Kossifidis <mickflemm(a)gmail.com>,
Cc: Luis Chamberlain <mcgrof(a)kernel.org>
Cc: linux-wireless(a)vger.kernel.org
---
drivers/net/wireless/ath/ath5k/base.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/wireless/ath/ath5k/base.c b/drivers/net/wireless/ath/ath5k/base.c
index 4d88b02ffa79..917e1b087924 100644
--- a/drivers/net/wireless/ath/ath5k/base.c
+++ b/drivers/net/wireless/ath/ath5k/base.c
@@ -1738,7 +1738,8 @@ ath5k_tx_frame_completed(struct ath5k_hw *ah, struct sk_buff *skb,
}
info->status.rates[ts->ts_final_idx].count = ts->ts_final_retry;
- info->status.rates[ts->ts_final_idx + 1].idx = -1;
+ if (ts->ts_final_idx + 1 < IEEE80211_TX_MAX_RATES)
+ info->status.rates[ts->ts_final_idx + 1].idx = -1;
if (unlikely(ts->ts_status)) {
ah->stats.ack_fail++;
--
2.52.0
The following commit has been merged into the perf/urgent branch of tip:
Commit-ID: 3ce7078debf267e12aab93528b40ba6c373bfa52
Gitweb: https://git.kernel.org/tip/3ce7078debf267e12aab93528b40ba6c373bfa52
Author: Sandipan Das <sandipan.das(a)amd.com>
AuthorDate: Tue, 09 Dec 2025 13:56:38 +05:30
Committer: Ingo Molnar <mingo(a)kernel.org>
CommitterDate: Tue, 09 Dec 2025 09:59:14 +01:00
perf/x86/amd/uncore: Fix the return value of amd_uncore_df_event_init() on error
If amd_uncore_event_init() fails, return an error irrespective of the
pmu_version. Setting hwc->config should be safe even if there is an
error so use this opportunity to simplify the code.
Closes: https://lore.kernel.org/all/aTaI0ci3vZ44lmBn@stanley.mountain/
Fixes: d6389d3ccc13 ("perf/x86/amd/uncore: Refactor uncore management")
Reported-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Sandipan Das <sandipan.das(a)amd.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/076935e23a70335d33bd6e23308b75ae0ad35ba2.176526866…
---
arch/x86/events/amd/uncore.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index e8b6af1..9293ce5 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -656,14 +656,11 @@ static int amd_uncore_df_event_init(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
int ret = amd_uncore_event_init(event);
- if (ret || pmu_version < 2)
- return ret;
-
hwc->config = event->attr.config &
(pmu_version >= 2 ? AMD64_PERFMON_V2_RAW_EVENT_MASK_NB :
AMD64_RAW_EVENT_MASK_NB);
- return 0;
+ return ret;
}
static int amd_uncore_df_add(struct perf_event *event, int flags)
In the non-RT kernel, local_bh_disable() merely disables preemption,
whereas it maps to an actual spin lock in the RT kernel. Consequently,
when attempting to refill RX buffers via netdev_alloc_skb() in
macb_mac_link_up(), a deadlock scenario arises as follows:
Chain caused by macb_mac_link_up():
&bp->lock --> (softirq_ctrl.lock)
Chain caused by macb_start_xmit():
(softirq_ctrl.lock) --> _xmit_ETHER#2 --> &bp->lock
Notably, invoking the mog_init_rings() callback upon link establishment
is unnecessary. Instead, we can exclusively call mog_init_rings() within
the ndo_open() callback. This adjustment resolves the deadlock issue.
Given that mog_init_rings() is only applicable to
non-MACB_CAPS_MACB_IS_EMAC cases, we can simply move it to macb_open()
and simultaneously eliminate the MACB_CAPS_MACB_IS_EMAC check.
Fixes: 633e98a711ac ("net: macb: use resolved link config in mac_link_up()")
Cc: stable(a)vger.kernel.org
Suggested-by: Kevin Hao <kexin.hao(a)windriver.com>
Signed-off-by: Xiaolei Wang <xiaolei.wang(a)windriver.com>
---
V1: https://patchwork.kernel.org/project/netdevbpf/patch/20251128103647.351259-…
V2: Update the correct lock dependency chain and add the Fix tag.
drivers/net/ethernet/cadence/macb_main.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index ca2386b83473..064fccdcf699 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -744,7 +744,6 @@ static void macb_mac_link_up(struct phylink_config *config,
/* Initialize rings & buffers as clearing MACB_BIT(TE) in link down
* cleared the pipeline and control registers.
*/
- bp->macbgem_ops.mog_init_rings(bp);
macb_init_buffers(bp);
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
@@ -2991,6 +2990,8 @@ static int macb_open(struct net_device *dev)
goto pm_exit;
}
+ bp->macbgem_ops.mog_init_rings(bp);
+
for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
napi_enable(&queue->napi_rx);
napi_enable(&queue->napi_tx);
--
2.43.0
Memory region assignment in ath11k_qmi_assign_target_mem_chunk()
assumes that:
1. firmware will make a HOST_DDR_REGION_TYPE request, and
2. this request is processed before CALDB_MEM_REGION_TYPE
In this case, CALDB_MEM_REGION_TYPE can safely be assigned immediately
after the host region.
However, if the HOST_DDR_REGION_TYPE request is not made, or the
reserved-memory node is not present, then res.start and res.end are 0,
and host_ddr_sz remains uninitialized. The physical address should
fall back to ATH11K_QMI_CALDB_ADDRESS. That doesn't happen:
resource_size(&res) returns 1 for an empty resource, and thus the if
clause never takes the fallback path. ab->qmi.target_mem[idx].paddr
is assigned the uninitialized value of host_ddr_sz + 0 (res.start).
Use "if (res.end > res.start)" for the predicate, which correctly
falls back to ATH11K_QMI_CALDB_ADDRESS.
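For reference, resource_size() in include/linux/ioport.h computes the
size of an inclusive [start, end] range, which is why a zero-initialized
resource reports a size of 1 rather than 0:

	/* From include/linux/ioport.h */
	static inline resource_size_t resource_size(const struct resource *res)
	{
		return res->end - res->start + 1;
	}

With res.start == res.end == 0 (the unset case described above),
resource_size(&res) == 1, so a truthiness check on it cannot detect an
empty resource, while "res.end > res.start" can.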
Fixes: 900730dc4705 ("wifi: ath: Use of_reserved_mem_region_to_resource() for "memory-region"")
Cc: stable(a)vger.kernel.org # v6.18
Signed-off-by: Alexandru Gagniuc <mr.nuke.me(a)gmail.com>
---
drivers/net/wireless/ath/ath11k/qmi.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/wireless/ath/ath11k/qmi.c b/drivers/net/wireless/ath/ath11k/qmi.c
index aea56c38bf8f3..6cc26d1c1e2a4 100644
--- a/drivers/net/wireless/ath/ath11k/qmi.c
+++ b/drivers/net/wireless/ath/ath11k/qmi.c
@@ -2054,7 +2054,7 @@ static int ath11k_qmi_assign_target_mem_chunk(struct ath11k_base *ab)
return ret;
}
- if (res.end - res.start + 1 < ab->qmi.target_mem[i].size) {
+ if (resource_size(&res) < ab->qmi.target_mem[i].size) {
ath11k_dbg(ab, ATH11K_DBG_QMI,
"fail to assign memory of sz\n");
return -EINVAL;
@@ -2086,7 +2086,7 @@ static int ath11k_qmi_assign_target_mem_chunk(struct ath11k_base *ab)
}
if (ath11k_core_coldboot_cal_support(ab)) {
- if (resource_size(&res)) {
+ if (res.end > res.start) {
ab->qmi.target_mem[idx].paddr =
res.start + host_ddr_sz;
ab->qmi.target_mem[idx].iaddr =
--
2.45.1
When VM boots with one virtio-crypto PCI device and builtin backend,
run openssl benchmark command with multiple processes, such as
openssl speed -evp aes-128-cbc -engine afalg -seconds 10 -multi 32
openssl processes will hangup and there is error reported like this:
virtio_crypto virtio0: dataq.0:id 3 is not a head!
It seems that the data virtqueue needs protection while it is handled
in the virtio done notification path. With spinlock protection added
in virtcrypto_done_task(), the openssl benchmark with multiple processes
works well.
Fixes: fed93fb62e05 ("crypto: virtio - Handle dataq logic with tasklet")
Cc: stable(a)vger.kernel.org
Signed-off-by: Bibo Mao <maobibo(a)loongson.cn>
---
drivers/crypto/virtio/virtio_crypto_core.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c
index 3d241446099c..ccc6b5c1b24b 100644
--- a/drivers/crypto/virtio/virtio_crypto_core.c
+++ b/drivers/crypto/virtio/virtio_crypto_core.c
@@ -75,15 +75,20 @@ static void virtcrypto_done_task(unsigned long data)
struct data_queue *data_vq = (struct data_queue *)data;
struct virtqueue *vq = data_vq->vq;
struct virtio_crypto_request *vc_req;
+ unsigned long flags;
unsigned int len;
+ spin_lock_irqsave(&data_vq->lock, flags);
do {
virtqueue_disable_cb(vq);
while ((vc_req = virtqueue_get_buf(vq, &len)) != NULL) {
+ spin_unlock_irqrestore(&data_vq->lock, flags);
if (vc_req->alg_cb)
vc_req->alg_cb(vc_req, len);
+ spin_lock_irqsave(&data_vq->lock, flags);
}
} while (!virtqueue_enable_cb(vq));
+ spin_unlock_irqrestore(&data_vq->lock, flags);
}
static void virtcrypto_dataq_callback(struct virtqueue *vq)
--
2.39.3
When the SLAB_STORE_USER debug flag is used, any metadata placed after
the original kmalloc request size (orig_size) is not properly aligned
on 64-bit architectures because its type is unsigned int. When both KASAN
and SLAB_STORE_USER are enabled, kasan_alloc_meta is misaligned.
Note that 64-bit architectures without HAVE_EFFICIENT_UNALIGNED_ACCESS
are assumed to require 64-bit accesses to be 64-bit aligned.
See HAVE_64BIT_ALIGNED_ACCESS and commit adab66b71abf ("Revert:
"ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS"") for more details.
Because not all architectures support unaligned memory accesses,
ensure that all metadata (track, orig_size, kasan_{alloc,free}_meta)
in a slab object are word-aligned. struct track, kasan_{alloc,free}_meta
are aligned by adding __aligned(__alignof__(unsigned long)).
For orig_size, use ALIGN(sizeof(unsigned int), __alignof__(unsigned long)) to
make clear that its size remains unsigned int but it must be aligned to
a word boundary. On 64-bit architectures, this reserves 8 bytes for
orig_size, which is acceptable since kmalloc's original request size
tracking is intended for debugging rather than production use.
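As a concrete illustration of the layout math (a sketch, not the patch
itself):

	/*
	 * ALIGN(x, a) rounds x up to the next multiple of a (a power of
	 * two). On 64-bit: sizeof(unsigned int) == 4 and
	 * __alignof__(unsigned long) == 8, so this reserves 8 bytes.
	 * orig_size still stores an unsigned int, but the metadata that
	 * follows it (e.g. kasan_alloc_meta) starts word-aligned. On
	 * 32-bit both values are 4 and the layout is unchanged.
	 */
	off += ALIGN(sizeof(unsigned int), __alignof__(unsigned long));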
Cc: stable(a)vger.kernel.org
Fixes: 6edf2576a6cc ("mm/slub: enable debugging memory wasting of kmalloc")
Acked-by: Andrey Konovalov <andreyknvl(a)gmail.com>
Signed-off-by: Harry Yoo <harry.yoo(a)oracle.com>
---
v1 -> v2:
- Added Andrey's Acked-by.
- Added references to HAVE_64BIT_ALIGNED_ACCESS and the commit that
resurrected it.
- Used __alignof__() instead of sizeof(), as suggested by Pedro (off-list).
Note: either __alignof__ or sizeof() produces the exactly same mm/slub.o
files, so there's no functional difference.
Thanks!
mm/kasan/kasan.h | 4 ++--
mm/slub.c | 16 +++++++++++-----
2 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 129178be5e64..b86b6e9f456a 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -265,7 +265,7 @@ struct kasan_alloc_meta {
struct kasan_track alloc_track;
/* Free track is stored in kasan_free_meta. */
depot_stack_handle_t aux_stack[2];
-};
+} __aligned(__alignof__(unsigned long));
struct qlist_node {
struct qlist_node *next;
@@ -289,7 +289,7 @@ struct qlist_node {
struct kasan_free_meta {
struct qlist_node quarantine_link;
struct kasan_track free_track;
-};
+} __aligned(__alignof__(unsigned long));
#endif /* CONFIG_KASAN_GENERIC */
diff --git a/mm/slub.c b/mm/slub.c
index a585d0ac45d4..462a39d57b3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -344,7 +344,7 @@ struct track {
int cpu; /* Was running on cpu */
int pid; /* Pid context */
unsigned long when; /* When did the operation occur */
-};
+} __aligned(__alignof__(unsigned long));
enum track_item { TRACK_ALLOC, TRACK_FREE };
@@ -1196,7 +1196,7 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
off += 2 * sizeof(struct track);
if (slub_debug_orig_size(s))
- off += sizeof(unsigned int);
+ off += ALIGN(sizeof(unsigned int), __alignof__(unsigned long));
off += kasan_metadata_size(s, false);
@@ -1392,7 +1392,8 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
off += 2 * sizeof(struct track);
if (s->flags & SLAB_KMALLOC)
- off += sizeof(unsigned int);
+ off += ALIGN(sizeof(unsigned int),
+ __alignof__(unsigned long));
}
off += kasan_metadata_size(s, false);
@@ -7820,9 +7821,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
*/
size += 2 * sizeof(struct track);
- /* Save the original kmalloc request size */
+ /*
+ * Save the original kmalloc request size.
+ * Although the request size is an unsigned int,
+ * make sure that it is aligned to a word boundary.
+ */
if (flags & SLAB_KMALLOC)
- size += sizeof(unsigned int);
+ size += ALIGN(sizeof(unsigned int),
+ __alignof__(unsigned long));
}
#endif
--
2.43.0
Hi,
I would like to request backporting commit b441cf3f8c4b ("xfrm: delete
x->tunnel as we delete x") to all LTS kernels.
This patch actually fixes a use-after-free issue, but it hasn't been
backported to any of the LTS versions, which are still being affected.
As the patch describes, a specific trigger scenario could be:
If a tunnel packet is received (e.g., in ip_local_deliver()), with the
outer layer being IPComp protocol and the inner layer being fragmented
packets, during outer packet processing, it will go through xfrm_input()
to hold a reference to the IPComp xfrm_state. Then, it is re-injected into
the network stack via gro_cells_receive() and placed in the reassembly
queue. When exiting the netns and calling cleanup_net(), although
ipv4_frags_exit_net() is called before xfrm_net_exit(), due to asynchronous
scheduling, fqdir_free_work() may execute after xfrm_state_fini().
In xfrm_state_fini(), xfrm_state_flush() puts and deletes the xfrm_state
for IPPROTO_COMP, but does not delete the xfrm_state for IPPROTO_IPIP.
Meanwhile, the skb in the reassembly queue holds the last reference to the
IPPROTO_COMP xfrm_state, so it isn't destroyed yet. Only when the skb in
the reassembly queue is destroyed does the IPPROTO_COMP xfrm_state get
fully destroyed, which calls ipcomp_destroy() to delete the IPPROTO_IPIP
xfrm_state. However, by this time, the hash tables (net->xfrm.state_byxxx)
have already been kfreed in xfrm_state_fini(), leading to a use-after-free
during the deletion.
The bug has existed since kernel v2.6.29, so the patch should be
backported to all LTS kernels.
thanks,
Slavin Liu
More details about DUOONE OLED for your setup
Hello,
Following up on your interest in portable dual-screen setups, I’d love to share the latest details about the DUOONE OLED SPAN & FOLD, now launching on Kickstarter.
This monitor is designed to eliminate the hassle of heavy or complicated multi-screen setups—whether you're gaming with friends or getting work done. Here’s what makes it stand out:
• True Portability – Ultra-light design that fits in your bag, ready for work or play anywhere.
• One-Cable Setup – Connect with a single USB-C cable (HDMI supported as well).
• OLED Visuals – Vivid colors, deep blacks, and smooth motion for gaming, design, and content creation.
• Flexible Modes – Switch between extended desktop for productivity or head-to-head mode for multiplayer gaming.
You can view the full specs and current Kickstarter pricing here:
View product details on Kickstarter ( https://eeenew.com/index.php?option=com_acym&ctrl=fronturl&task=click&urlid… )
If you have any questions, feel free to reply — happy to help anytime.
Best regards,
Shirley Xue
DUOONE OLED Team
From: Namjae Jeon <linkinjeon(a)kernel.org>
[ Upstream commit b39a1833cc4a2755b02603eec3a71a85e9dff926 ]
Under high concurrency, a tree-connection object (tcon) is freed on
a disconnect path while another path still holds a reference and later
executes *_put()/writes on it.
Reported-by: Qianchang Zhao <pioooooooooip(a)gmail.com>
Reported-by: Zhitong Liu <liuzhitong1993(a)gmail.com>
Signed-off-by: Namjae Jeon <linkinjeon(a)kernel.org>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
LLM Generated explanations, may be completely bogus:
### 3. CLASSIFICATION
This is a **security/stability bug fix**:
- **Use-after-free (UAF)** is a serious memory safety issue
- ksmbd is **network-facing code** (SMB server) - security-sensitive
- Can lead to kernel crashes, data corruption, or potential remote
exploitation
### 4. SCOPE AND RISK ASSESSMENT
**Size**: ~30 lines changed across 3 files
**Files affected**:
- `fs/smb/server/mgmt/tree_connect.c` - core reference counting logic
- `fs/smb/server/mgmt/tree_connect.h` - struct definition
- `fs/smb/server/smb2pdu.c` - disconnect path
**Technical mechanism of the bug**:
The previous fix (commit 33b235a6e6ebe) introduced a waitqueue-based
refcount mechanism:
1. `ksmbd_tree_conn_disconnect()` would decrement refcount, wait for it
to hit 0, then always call `kfree()`
2. `ksmbd_tree_connect_put()` would decrement refcount and wake up
waiters
**The race condition**:
- Thread A: In disconnect, waits for refcount to hit 0, then runs more
code before `kfree()`
- Thread B: Drops last reference via `_put()`, refcount hits 0
- Thread A: Wakes up, but Thread B might still be executing code that
accesses `tcon`
- Thread A: Frees `tcon`
- Thread B: UAF when accessing `tcon` after `_put()` returns
**The fix**: Changes to standard "kref-style" reference counting:
- Whoever drops refcount to 0 immediately calls `kfree()`
- No window between refcount hitting 0 and free
- Removes the buggy waitqueue mechanism entirely
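In other words, a minimal sketch of the two patterns (simplified, not the
exact ksmbd code):

	/* Old (racy): waiter frees while a late _put() may still touch tcon */
	if (!atomic_dec_and_test(&tcon->refcount))
		wait_event(tcon->refcount_q,
			   atomic_read(&tcon->refcount) == 0);
	/* ... more teardown work ... */
	kfree(tcon);	/* another thread may still be inside _put() here */

	/* New (kref-style): whoever drops the last reference frees, atomically */
	if (atomic_dec_and_test(&tcon->refcount))
		kfree(tcon);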
**Risk**: LOW
- The new pattern (free on last put) is the standard kernel pattern
(kref)
- Simpler code is easier to verify correct
- Self-contained within tree_connect subsystem
### 5. USER IMPACT
- **Affected users**: Anyone running ksmbd (kernel SMB server)
- **Trigger**: High concurrency - realistic for file servers
- **Severity**: HIGH - kernel crash or potential security exploitation
- **Real-world occurrence**: Two Reported-by tags confirm users hit this
### 6. STABILITY INDICATORS
- Signed-off by ksmbd maintainer (Namjae Jeon) and CIFS/SMB maintainer
(Steve French)
- Two independent reporters indicate real bug
- The buggy code was introduced in commit 33b235a6e6ebe (v6.6-rc1)
### 7. DEPENDENCY CHECK
- The fix is self-contained
- Depends only on commit 33b235a6e6ebe being present (which introduced
the bug)
- Affects: v6.6 and all later versions
- Should apply cleanly - only 2 minor unrelated commits to
tree_connect.c since the buggy commit
### Summary
| Criterion | Assessment |
|-----------|------------|
| Fixes real bug | ✅ UAF in network-facing code |
| Security impact | ✅ High - potential remote exploitation |
| Small and contained | ✅ ~30 lines, 3 files, single subsystem |
| No new features | ✅ Pure bug fix |
| User-reported | ✅ Two Reported-by tags |
| Clean backport | ✅ Self-contained fix |
This commit fixes a use-after-free vulnerability in ksmbd, the in-kernel
SMB server. The bug exists in the reference counting mechanism for tree
connections and can be triggered under concurrent access - a realistic
scenario for network file servers. UAF bugs in network-facing kernel
code are serious security issues. The fix is small, uses a well-
established kernel pattern (kref-style refcounting), and is self-
contained. It should be backported to all stable kernels containing
commit 33b235a6e6ebe (v6.6+).
**YES**
fs/smb/server/mgmt/tree_connect.c | 18 ++++--------------
fs/smb/server/mgmt/tree_connect.h | 1 -
fs/smb/server/smb2pdu.c | 3 ---
3 files changed, 4 insertions(+), 18 deletions(-)
diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c
index ecfc575086712..d3483d9c757c7 100644
--- a/fs/smb/server/mgmt/tree_connect.c
+++ b/fs/smb/server/mgmt/tree_connect.c
@@ -78,7 +78,6 @@ ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name)
tree_conn->t_state = TREE_NEW;
status.tree_conn = tree_conn;
atomic_set(&tree_conn->refcount, 1);
- init_waitqueue_head(&tree_conn->refcount_q);
ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn,
KSMBD_DEFAULT_GFP));
@@ -100,14 +99,8 @@ ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name)
void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon)
{
- /*
- * Checking waitqueue to releasing tree connect on
- * tree disconnect. waitqueue_active is safe because it
- * uses atomic operation for condition.
- */
- if (!atomic_dec_return(&tcon->refcount) &&
- waitqueue_active(&tcon->refcount_q))
- wake_up(&tcon->refcount_q);
+ if (atomic_dec_and_test(&tcon->refcount))
+ kfree(tcon);
}
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
@@ -119,14 +112,11 @@ int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
xa_erase(&sess->tree_conns, tree_conn->id);
write_unlock(&sess->tree_conns_lock);
- if (!atomic_dec_and_test(&tree_conn->refcount))
- wait_event(tree_conn->refcount_q,
- atomic_read(&tree_conn->refcount) == 0);
-
ret = ksmbd_ipc_tree_disconnect_request(sess->id, tree_conn->id);
ksmbd_release_tree_conn_id(sess, tree_conn->id);
ksmbd_share_config_put(tree_conn->share_conf);
- kfree(tree_conn);
+ if (atomic_dec_and_test(&tree_conn->refcount))
+ kfree(tree_conn);
return ret;
}
diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h
index a42cdd0510411..f0023d86716f2 100644
--- a/fs/smb/server/mgmt/tree_connect.h
+++ b/fs/smb/server/mgmt/tree_connect.h
@@ -33,7 +33,6 @@ struct ksmbd_tree_connect {
int maximal_access;
bool posix_extensions;
atomic_t refcount;
- wait_queue_head_t refcount_q;
unsigned int t_state;
};
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 447e76da44409..aae42d2abf7bc 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -2200,7 +2200,6 @@ int smb2_tree_disconnect(struct ksmbd_work *work)
goto err_out;
}
- WARN_ON_ONCE(atomic_dec_and_test(&tcon->refcount));
tcon->t_state = TREE_DISCONNECTED;
write_unlock(&sess->tree_conns_lock);
@@ -2210,8 +2209,6 @@ int smb2_tree_disconnect(struct ksmbd_work *work)
goto err_out;
}
- work->tcon = NULL;
-
rsp->StructureSize = cpu_to_le16(4);
err = ksmbd_iov_pin_rsp(work, rsp,
sizeof(struct smb2_tree_disconnect_rsp));
--
2.51.0
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x baeb66fbd4201d1c4325074e78b1f557dff89b5b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025120129-earthlike-curse-b3f8@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From baeb66fbd4201d1c4325074e78b1f557dff89b5b Mon Sep 17 00:00:00 2001
From: Jimmy Hu <hhhuuu(a)google.com>
Date: Thu, 23 Oct 2025 05:49:45 +0000
Subject: [PATCH] usb: gadget: udc: fix use-after-free in usb_gadget_state_work
A race condition during gadget teardown can lead to a use-after-free
in usb_gadget_state_work(), as reported by KASAN:
BUG: KASAN: invalid-access in sysfs_notify+0x2c/0xd0
Workqueue: events usb_gadget_state_work
The fundamental race occurs because a concurrent event (e.g., an
interrupt) can call usb_gadget_set_state() and schedule gadget->work
at any time during the cleanup process in usb_del_gadget().
Commit 399a45e5237c ("usb: gadget: core: flush gadget workqueue after
device removal") attempted to fix this by moving flush_work() to after
device_del(). However, this does not fully solve the race, as a new
work item can still be scheduled *after* flush_work() completes but
before the gadget's memory is freed, leading to the same use-after-free.
This patch fixes the race condition robustly by introducing a 'teardown'
flag and a 'state_lock' spinlock to the usb_gadget struct. The flag is
set during cleanup in usb_del_gadget() *before* calling flush_work() to
prevent any new work from being scheduled once cleanup has commenced.
The scheduling site, usb_gadget_set_state(), now checks this flag under
the lock before queueing the work, thus safely closing the race window.
Fixes: 5702f75375aa9 ("usb: gadget: udc-core: move sysfs_notify() to a workqueue")
Cc: stable <stable(a)kernel.org>
Signed-off-by: Jimmy Hu <hhhuuu(a)google.com>
Link: https://patch.msgid.link/20251023054945.233861-1-hhhuuu@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 694653761c44..8dbe79bdc0f9 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -1126,8 +1126,13 @@ static void usb_gadget_state_work(struct work_struct *work)
void usb_gadget_set_state(struct usb_gadget *gadget,
enum usb_device_state state)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&gadget->state_lock, flags);
gadget->state = state;
- schedule_work(&gadget->work);
+ if (!gadget->teardown)
+ schedule_work(&gadget->work);
+ spin_unlock_irqrestore(&gadget->state_lock, flags);
trace_usb_gadget_set_state(gadget, 0);
}
EXPORT_SYMBOL_GPL(usb_gadget_set_state);
@@ -1361,6 +1366,8 @@ static void usb_udc_nop_release(struct device *dev)
void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget,
void (*release)(struct device *dev))
{
+ spin_lock_init(&gadget->state_lock);
+ gadget->teardown = false;
INIT_WORK(&gadget->work, usb_gadget_state_work);
gadget->dev.parent = parent;
@@ -1535,6 +1542,7 @@ EXPORT_SYMBOL_GPL(usb_add_gadget_udc);
void usb_del_gadget(struct usb_gadget *gadget)
{
struct usb_udc *udc = gadget->udc;
+ unsigned long flags;
if (!udc)
return;
@@ -1548,6 +1556,13 @@ void usb_del_gadget(struct usb_gadget *gadget)
kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE);
sysfs_remove_link(&udc->dev.kobj, "gadget");
device_del(&gadget->dev);
+ /*
+ * Set the teardown flag before flushing the work to prevent new work
+ * from being scheduled while we are cleaning up.
+ */
+ spin_lock_irqsave(&gadget->state_lock, flags);
+ gadget->teardown = true;
+ spin_unlock_irqrestore(&gadget->state_lock, flags);
flush_work(&gadget->work);
ida_free(&gadget_id_numbers, gadget->id_number);
cancel_work_sync(&udc->vbus_work);
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 3aaf19e77558..8285b19a25e0 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -376,6 +376,9 @@ struct usb_gadget_ops {
* can handle. The UDC must support this and all slower speeds and lower
* number of lanes.
* @state: the state we are now (attached, suspended, configured, etc)
+ * @state_lock: Spinlock protecting the `state` and `teardown` members.
+ * @teardown: True if the device is undergoing teardown, used to prevent
+ * new work from being scheduled during cleanup.
* @name: Identifies the controller hardware type. Used in diagnostics
* and sometimes configuration.
* @dev: Driver model state for this abstract device.
@@ -451,6 +454,8 @@ struct usb_gadget {
enum usb_ssp_rate max_ssp_rate;
enum usb_device_state state;
+ spinlock_t state_lock;
+ bool teardown;
const char *name;
struct device dev;
unsigned isoch_delay;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x baeb66fbd4201d1c4325074e78b1f557dff89b5b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025120123-borax-cut-5c52@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From baeb66fbd4201d1c4325074e78b1f557dff89b5b Mon Sep 17 00:00:00 2001
From: Jimmy Hu <hhhuuu(a)google.com>
Date: Thu, 23 Oct 2025 05:49:45 +0000
Subject: [PATCH] usb: gadget: udc: fix use-after-free in usb_gadget_state_work
A race condition during gadget teardown can lead to a use-after-free
in usb_gadget_state_work(), as reported by KASAN:
BUG: KASAN: invalid-access in sysfs_notify+0x2c/0xd0
Workqueue: events usb_gadget_state_work
The fundamental race occurs because a concurrent event (e.g., an
interrupt) can call usb_gadget_set_state() and schedule gadget->work
at any time during the cleanup process in usb_del_gadget().
Commit 399a45e5237c ("usb: gadget: core: flush gadget workqueue after
device removal") attempted to fix this by moving flush_work() to after
device_del(). However, this does not fully solve the race, as a new
work item can still be scheduled *after* flush_work() completes but
before the gadget's memory is freed, leading to the same use-after-free.
This patch fixes the race condition robustly by introducing a 'teardown'
flag and a 'state_lock' spinlock to the usb_gadget struct. The flag is
set during cleanup in usb_del_gadget() *before* calling flush_work() to
prevent any new work from being scheduled once cleanup has commenced.
The scheduling site, usb_gadget_set_state(), now checks this flag under
the lock before queueing the work, thus safely closing the race window.
Fixes: 5702f75375aa9 ("usb: gadget: udc-core: move sysfs_notify() to a workqueue")
Cc: stable <stable(a)kernel.org>
Signed-off-by: Jimmy Hu <hhhuuu(a)google.com>
Link: https://patch.msgid.link/20251023054945.233861-1-hhhuuu@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 694653761c44..8dbe79bdc0f9 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -1126,8 +1126,13 @@ static void usb_gadget_state_work(struct work_struct *work)
void usb_gadget_set_state(struct usb_gadget *gadget,
enum usb_device_state state)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&gadget->state_lock, flags);
gadget->state = state;
- schedule_work(&gadget->work);
+ if (!gadget->teardown)
+ schedule_work(&gadget->work);
+ spin_unlock_irqrestore(&gadget->state_lock, flags);
trace_usb_gadget_set_state(gadget, 0);
}
EXPORT_SYMBOL_GPL(usb_gadget_set_state);
@@ -1361,6 +1366,8 @@ static void usb_udc_nop_release(struct device *dev)
void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget,
void (*release)(struct device *dev))
{
+ spin_lock_init(&gadget->state_lock);
+ gadget->teardown = false;
INIT_WORK(&gadget->work, usb_gadget_state_work);
gadget->dev.parent = parent;
@@ -1535,6 +1542,7 @@ EXPORT_SYMBOL_GPL(usb_add_gadget_udc);
void usb_del_gadget(struct usb_gadget *gadget)
{
struct usb_udc *udc = gadget->udc;
+ unsigned long flags;
if (!udc)
return;
@@ -1548,6 +1556,13 @@ void usb_del_gadget(struct usb_gadget *gadget)
kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE);
sysfs_remove_link(&udc->dev.kobj, "gadget");
device_del(&gadget->dev);
+ /*
+ * Set the teardown flag before flushing the work to prevent new work
+ * from being scheduled while we are cleaning up.
+ */
+ spin_lock_irqsave(&gadget->state_lock, flags);
+ gadget->teardown = true;
+ spin_unlock_irqrestore(&gadget->state_lock, flags);
flush_work(&gadget->work);
ida_free(&gadget_id_numbers, gadget->id_number);
cancel_work_sync(&udc->vbus_work);
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 3aaf19e77558..8285b19a25e0 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -376,6 +376,9 @@ struct usb_gadget_ops {
* can handle. The UDC must support this and all slower speeds and lower
* number of lanes.
* @state: the state we are now (attached, suspended, configured, etc)
+ * @state_lock: Spinlock protecting the `state` and `teardown` members.
+ * @teardown: True if the device is undergoing teardown, used to prevent
+ * new work from being scheduled during cleanup.
* @name: Identifies the controller hardware type. Used in diagnostics
* and sometimes configuration.
* @dev: Driver model state for this abstract device.
@@ -451,6 +454,8 @@ struct usb_gadget {
enum usb_ssp_rate max_ssp_rate;
enum usb_device_state state;
+ spinlock_t state_lock;
+ bool teardown;
const char *name;
struct device dev;
unsigned isoch_delay;
On converting to the of_reserved_mem_region_to_resource() helper with
commit 900730dc4705 ("wifi: ath: Use
of_reserved_mem_region_to_resource() for "memory-region"") a logic error
was introduced in the ath11k_core_coldboot_cal_support() if condition.
The original code checked for the presence of hremote_node and skipped
ath11k_core_coldboot_cal_support() handling in the other switch case,
but now everything is driven entirely by the values in the resource
struct.
resource_size() is wrongly assumed (in this case) to return zero when
the passed resource struct is zero-initialized. This is not the case: a
resource struct should always be initialized with correct values (or,
at minimum, have its end value set to -1 to signal that it is not
configured), and resource_size() on a resource struct with start and
end both set to zero returns 1.
On top of this, using resource_size() to check whether a resource
struct is initialized is generally wrong; another mechanism should be
used instead.
To better handle this, use the DEFINE_RES macro to initialize the
resource struct and set the IORESOURCE_UNSET flag by default. Replace
the resource_size() check with a check of the resource struct flags for
IORESOURCE_UNSET.
This change effectively restores the original logic and restores
correct loading of the ath11k firmware (and, with it, working Wi-Fi).
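The resulting pattern, roughly (a sketch using the DEFINE_RES macro and
IORESOURCE_UNSET flag exactly as in the diff below):

  struct resource res = DEFINE_RES(0, 0, IORESOURCE_UNSET);

  /* ... of_reserved_mem_region_to_resource() may fill res ... */

  if (res.flags != IORESOURCE_UNSET) {
          /* the memory-region was actually found and is usable */
  }

Note that a zero-initialized 'struct resource res = {}' would instead
make resource_size(&res) return 1, which is exactly the trap described
above.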
Cc: stable(a)vger.kernel.org
Fixes: 900730dc4705 ("wifi: ath: Use of_reserved_mem_region_to_resource() for "memory-region"")
Link: https://lore.kernel.org/all/20251207215359.28895-1-ansuelsmth@gmail.com/T/#…
Cc: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Signed-off-by: Christian Marangi <ansuelsmth(a)gmail.com>
---
drivers/net/wireless/ath/ath11k/qmi.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/wireless/ath/ath11k/qmi.c b/drivers/net/wireless/ath/ath11k/qmi.c
index ff6a97e328b8..afa663c00620 100644
--- a/drivers/net/wireless/ath/ath11k/qmi.c
+++ b/drivers/net/wireless/ath/ath11k/qmi.c
@@ -2039,8 +2039,8 @@ static int ath11k_qmi_alloc_target_mem_chunk(struct ath11k_base *ab)
static int ath11k_qmi_assign_target_mem_chunk(struct ath11k_base *ab)
{
+ struct resource res = DEFINE_RES(0, 0, IORESOURCE_UNSET);
struct device *dev = ab->dev;
- struct resource res = {};
u32 host_ddr_sz;
int i, idx, ret;
@@ -2086,7 +2086,7 @@ static int ath11k_qmi_assign_target_mem_chunk(struct ath11k_base *ab)
}
if (ath11k_core_coldboot_cal_support(ab)) {
- if (resource_size(&res)) {
+ if (res.flags != IORESOURCE_UNSET) {
ab->qmi.target_mem[idx].paddr =
res.start + host_ddr_sz;
ab->qmi.target_mem[idx].iaddr =
--
2.51.0
When the filesystem is being mounted, the kernel panics while the slot
map allocation for the local node is being written to disk. This occurs
because the slot map buffer head block number, which should be greater
than or equal to OCFS2_SUPER_BLOCK_BLKNO (which evaluates to 2), is
less than it, indicating on-disk metadata corruption. This triggers
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO) in ocfs2_write_block(),
causing the kernel to panic.
Fix this by adding a check in ocfs2_update_disk_slot(), right before
the call to ocfs2_write_block(), for whether bh->b_blocknr is less than
OCFS2_SUPER_BLOCK_BLKNO. If it is, ocfs2_error() is called, which logs
the error for debugging, and its return value is passed back to the
caller of ocfs2_update_disk_slot(), i.e. ocfs2_find_slot(). If that
return value is zero, -EIO is returned instead.
Reported-by: syzbot+c818e5c4559444f88aa0(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c818e5c4559444f88aa0
Tested-by: syzbot+c818e5c4559444f88aa0(a)syzkaller.appspotmail.com
Cc: stable(a)vger.kernel.org
Signed-off-by: Prithvi Tambewagh <activprithvi(a)gmail.com>
---
fs/ocfs2/slot_map.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e544c704b583..788924fc3663 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -193,6 +193,16 @@ static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
else
ocfs2_update_disk_slot_old(si, slot_num, &bh);
spin_unlock(&osb->osb_lock);
+	if (bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO) {
+		status = ocfs2_error(osb->sb,
+				     "Invalid slot map buffer head block number %llu, should be >= %d",
+				     (unsigned long long)bh->b_blocknr,
+				     OCFS2_SUPER_BLOCK_BLKNO);
+		if (!status)
+			return -EIO;
+		return status;
+	}
status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
if (status < 0)
base-commit: 24172e0d79900908cf5ebf366600616d29c9b417
--
2.43.0
tpm2_key_decode() overrides the explicit keyhandle parameter, which can
lead to problems if the loaded parent handle does not match the handle
stored in the key file. This can easily happen, as a handle is by
definition an ambiguous attribute.
Cc: stable(a)vger.kernel.org # v5.13+
Fixes: f2219745250f ("security: keys: trusted: use ASN.1 TPM2 key format for the blobs")
Signed-off-by: Jarkko Sakkinen <jarkko(a)kernel.org>
---
security/keys/trusted-keys/trusted_tpm2.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/security/keys/trusted-keys/trusted_tpm2.c b/security/keys/trusted-keys/trusted_tpm2.c
index fb76c4ea496f..950684e54c71 100644
--- a/security/keys/trusted-keys/trusted_tpm2.c
+++ b/security/keys/trusted-keys/trusted_tpm2.c
@@ -121,7 +121,9 @@ static int tpm2_key_decode(struct trusted_key_payload *payload,
return -ENOMEM;
*buf = blob;
- options->keyhandle = ctx.parent;
+
+ if (!options->keyhandle)
+ options->keyhandle = ctx.parent;
memcpy(blob, ctx.priv, ctx.priv_len);
blob += ctx.priv_len;
--
2.39.5
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 4da3768e1820cf15cced390242d8789aed34f54d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025120802-remedy-glimmer-fc9d@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4da3768e1820cf15cced390242d8789aed34f54d Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov(a)fb.com>
Date: Tue, 4 Nov 2025 09:55:26 -0800
Subject: [PATCH] KVM: SVM: Don't skip unrelated instruction if INT3/INTO is
replaced
When re-injecting a soft interrupt from an INT3, INTO, or (select) INTn
instruction, discard the exception and retry the instruction if the code
stream is changed (e.g. by a different vCPU) between when the CPU
executes the instruction and when KVM decodes the instruction to get the
next RIP.
As effectively predicted by commit 6ef88d6e36c2 ("KVM: SVM: Re-inject
INT3/INTO instead of retrying the instruction"), failure to verify that
the correct INTn instruction was decoded can effectively clobber guest
state due to decoding the wrong instruction and thus specifying the
wrong next RIP.
The bug most often manifests as "Oops: int3" panics on static branch
checks in Linux guests. Enabling or disabling a static branch in Linux
uses the kernel's "text poke" code patching mechanism. To modify code
while other CPUs may be executing that code, Linux (temporarily)
replaces the first byte of the original instruction with an int3 (opcode
0xcc), then patches in the new code stream except for the first byte,
and finally replaces the int3 with the first byte of the new code
stream. If a CPU hits the int3, i.e. executes the code while it's being
modified, then the guest kernel must look up the RIP to determine how to
handle the #BP, e.g. by emulating the new instruction. If the RIP is
incorrect, then this lookup fails and the guest kernel panics.
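In pseudo-C, the sequence looks roughly like this (write_byte(),
write_rest() and sync_cores() are hypothetical helpers standing in for
the real text-poke machinery, shown only to illustrate the ordering):

  void poke_insn(u8 *addr, const u8 *new_insn, size_t len)
  {
          write_byte(addr, 0xcc);              /* 1: plant the int3 trap */
          sync_cores();                        /* all CPUs now see int3 */
          write_rest(addr + 1, new_insn + 1, len - 1); /* 2: patch tail */
          sync_cores();
          write_byte(addr, new_insn[0]);       /* 3: restore first byte */
  }

A CPU that executes the int3 during steps 1-3 takes a #BP whose RIP
must identify the patch site; if KVM re-injects the trap with a wrong
next RIP, the guest's lookup fails and it panics as described above.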
The bug reproduces almost instantly by hacking the guest kernel to
repeatedly check a static branch[1] while running a drgn script[2] on
the host to constantly swap out the memory containing the guest's TSS.
[1]: https://gist.github.com/osandov/44d17c51c28c0ac998ea0334edf90b5a
[2]: https://gist.github.com/osandov/10e45e45afa29b11e0c7209247afc00b
Fixes: 6ef88d6e36c2 ("KVM: SVM: Re-inject INT3/INTO instead of retrying the instruction")
Cc: stable(a)vger.kernel.org
Co-developed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Link: https://patch.msgid.link/1cc6dcdf36e3add7ee7c8d90ad58414eeb6c3d34.176227876…
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f..974d64bf0a4d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2143,6 +2143,11 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
* the gfn, i.e. retrying the instruction will hit a
* !PRESENT fault, which results in a new shadow page
* and sends KVM back to square one.
+ *
+ * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip
+ * an instruction if it could generate a given software
+ * interrupt, which must be encoded via
+ * EMULTYPE_SET_SOFT_INT_VECTOR().
*/
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
@@ -2153,6 +2158,10 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
#define EMULTYPE_PF (1 << 6)
#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
#define EMULTYPE_WRITE_PF_TO_SP (1 << 8)
+#define EMULTYPE_SKIP_SOFT_INT (1 << 9)
+
+#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16)
+#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff)
static inline bool kvm_can_emulate_event_vectoring(int emul_type)
{
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 1ae7b3c5a7c5..459db8154929 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -272,6 +272,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
+ int emul_type,
bool commit_side_effects)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -293,7 +294,7 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
if (unlikely(!commit_side_effects))
old_rflags = svm->vmcb->save.rflags;
- if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ if (!kvm_emulate_instruction(vcpu, emul_type))
return 0;
if (unlikely(!commit_side_effects))
@@ -311,11 +312,13 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- return __svm_skip_emulated_instruction(vcpu, true);
+ return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
}
-static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
+static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector)
{
+ const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
+ EMULTYPE_SET_SOFT_INT_VECTOR(vector);
unsigned long rip, old_rip = kvm_rip_read(vcpu);
struct vcpu_svm *svm = to_svm(vcpu);
@@ -331,7 +334,7 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
* in use, the skip must not commit any side effects such as clearing
* the interrupt shadow or RFLAGS.RF.
*/
- if (!__svm_skip_emulated_instruction(vcpu, !nrips))
+ if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
return -EIO;
rip = kvm_rip_read(vcpu);
@@ -367,7 +370,7 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu)
kvm_deliver_exception_payload(vcpu, ex);
if (kvm_exception_is_soft(ex->vector) &&
- svm_update_soft_interrupt_rip(vcpu))
+ svm_update_soft_interrupt_rip(vcpu, ex->vector))
return;
svm->vmcb->control.event_inj = ex->vector
@@ -3642,11 +3645,12 @@ static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
+ struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt;
struct vcpu_svm *svm = to_svm(vcpu);
u32 type;
- if (vcpu->arch.interrupt.soft) {
- if (svm_update_soft_interrupt_rip(vcpu))
+ if (intr->soft) {
+ if (svm_update_soft_interrupt_rip(vcpu, intr->nr))
return;
type = SVM_EVTINJ_TYPE_SOFT;
@@ -3654,12 +3658,10 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
type = SVM_EVTINJ_TYPE_INTR;
}
- trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
- vcpu->arch.interrupt.soft, reinjected);
+ trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
++vcpu->stat.irq_injections;
- svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
- SVM_EVTINJ_VALID | type;
+ svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
}
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 42ecd093bb4c..8e14c0292809 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9338,6 +9338,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
+static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt,
+ int emulation_type)
+{
+ u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type);
+
+ switch (ctxt->b) {
+ case 0xcc:
+ return vector == BP_VECTOR;
+ case 0xcd:
+ return vector == ctxt->src.val;
+ case 0xce:
+ return vector == OF_VECTOR;
+ default:
+ return false;
+ }
+}
+
/*
* Decode an instruction for emulation. The caller is responsible for handling
* code breakpoints. Note, manually detecting code breakpoints is unnecessary
@@ -9448,6 +9465,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* injecting single-step #DBs.
*/
if (emulation_type & EMULTYPE_SKIP) {
+ if (emulation_type & EMULTYPE_SKIP_SOFT_INT &&
+ !is_soft_int_instruction(ctxt, emulation_type))
+ return 0;
+
if (ctxt->mode != X86EMUL_MODE_PROT64)
ctxt->eip = (u32)ctxt->_eip;
else
The function of_node_get() returns a node pointer with its refcount
incremented, but in the error path of fsi_master_gpio_probe() this
reference is not released before freeing the master structure, causing
a refcount leak.
Add the missing of_node_put() in the error handling path to properly
release the device tree node reference.
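The balanced pattern looks roughly as follows (a sketch with a
hypothetical example_probe()/example_setup(), not the driver's exact
code):

  static int example_probe(struct platform_device *pdev)
  {
          struct example *priv;
          int rc;

          priv = kzalloc(sizeof(*priv), GFP_KERNEL);
          if (!priv)
                  return -ENOMEM;

          priv->of_node = of_node_get(pdev->dev.of_node); /* takes a ref */

          rc = example_setup(priv);           /* hypothetical setup step */
          if (rc)
                  goto err_free;

          return 0;

  err_free:
          of_node_put(priv->of_node);         /* balance of_node_get() */
          kfree(priv);
          return rc;
  }

Every of_node_get() must have a matching of_node_put() on every exit
path, including the error path, or the node can never be freed.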
Fixes: 8ef9ccf81044 ("fsi: master-gpio: Add missing release function")
Cc: stable(a)vger.kernel.org
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
drivers/fsi/fsi-master-gpio.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/fsi/fsi-master-gpio.c b/drivers/fsi/fsi-master-gpio.c
index 69de0b5b9cbd..ae9e156284d0 100644
--- a/drivers/fsi/fsi-master-gpio.c
+++ b/drivers/fsi/fsi-master-gpio.c
@@ -861,6 +861,7 @@ static int fsi_master_gpio_probe(struct platform_device *pdev)
}
return 0;
err_free:
+ of_node_put(master->master.dev.of_node);
kfree(master);
return rc;
}
--
2.34.1
Commit 900730dc4705 ("wifi: ath: Use
of_reserved_mem_region_to_resource() for "memory-region"") uncovered a
massive problem with the usage of resource_size() helper.
The reported commit caused a regression with ath11k WiFi firmware
loading and the change was just a simple replacement of duplicate code
with a new helper of_reserved_mem_region_to_resource().
On reworking this, in the commit also a check for the presence of the
node was replaced with resource_size(&res). This was done following the
logic that if the node wasn't present then it's expected that also the
resource_size is zero, mimicking the same if-else logic.
This was also the reason the regression was mostly hard to catch at
first sight as the rework is correctly done given the assumption on the
used helpers.
BUT this is actually not the case. On further inspection on
resource_size() it was found that it NEVER actually returns 0.
Even if the resource value of start and end are 0, the return value of
resource_size() will ALWAYS be 1, resulting in the broken if-else
condition ALWAYS going in the first if condition.
This was simply confirmed by reading the resource_size() logic:
return res->end - res->start + 1;
Given the confusion, also other case of such usage were searched in the
kernel and with great suprise it seems LOTS of place assume
resource_size() should return zero in the context of the resource start
and end set to 0.
Quoting for example comments in drivers/vfio/pci/vfio_pci_core.c:
/*
* The PCI core shouldn't set up a resource with a
* type but zero size. But there may be bugs that
* cause us to do that.
*/
if (!resource_size(res))
goto no_mmap;
It really seems resource_size() was written with the assumption that
the resource struct is always correctly initialized before calling it
and never left zeroed.
But across the years this got lost, and now there are lots of drivers
that assume resource_size() returns 0 if start and end are both 0.
To better handle this and make resource_size() return the correct value
in this case, add a simple check and return 0 if both the resource
start and end are zero.
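Concretely (illustrative values only):

  struct resource res = {};          /* start == 0, end == 0 */

  /* before this patch: 0 - 0 + 1 == 1 */
  /* after this patch:  start and end are both 0, so 0 */
  resource_size_t sz = resource_size(&res);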
Cc: Rob Herring (Arm) <robh(a)kernel.org>
Cc: stable(a)vger.kernel.org
Fixes: 1a4e564b7db9 ("resource: add resource_size()")
Signed-off-by: Christian Marangi <ansuelsmth(a)gmail.com>
---
include/linux/ioport.h | 3 +++
1 file changed, 3 insertions(+)
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 9afa30f9346f..1b8ce62255db 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -288,6 +288,9 @@ static inline void resource_set_range(struct resource *res,
static inline resource_size_t resource_size(const struct resource *res)
{
+ if (!res->start && !res->end)
+ return 0;
+
return res->end - res->start + 1;
}
static inline unsigned long resource_type(const struct resource *res)
--
2.51.0
Hi,
After a stable kernel update, the hwclock command no longer seems to
work on my SPARC system with an ST M48T59Y-70PC1 RTC:
# hwclock
[...long delay...]
hwclock: select() to /dev/rtc0 to wait for clock tick timed out
On prior kernels, there is no problem:
# hwclock
2025-10-22 22:21:04.806992-04:00
I reproduced the same failure on 6.18-rc2 and bisected to this commit:
commit 795cda8338eab036013314dbc0b04aae728880ab
Author: Esben Haabendal <esben(a)geanix.com>
Date: Fri May 16 09:23:35 2025 +0200
rtc: interface: Fix long-standing race when setting alarm
This commit was backported to all current 6.x stable branches,
as well as 5.15.x, so they all have the same regression.
Reverting this commit on top of 6.18-rc2 corrects the problem.
Let me know if you need any more info!
Thanks,
Nick
Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
when the device's MSI-X table size is larger than the firmware's MSI quota
for the device.
This is because the commit changes how rtas_prepare_msi_irqs() is called:
- Before, it is called when interrupts are allocated at the global
interrupt domain with nvec_in being the number of allocated interrupts.
rtas_prepare_msi_irqs() can return a positive number and the allocation
will be retried.
- Now, it is called at the creation of per-device interrupt domain with
nvec_in being the number of interrupts that the device supports. If
rtas_prepare_msi_irqs() returns positive, domain creation just fails.
For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive
number (the quota). This causes per-device interrupt domain creation to
fail and thus the NVMe driver cannot enable MSI-X.
Rework the code to make this scenario work again:
- pseries_msi_ops_prepare() only prepares as many interrupts as the
quota permits.
- pseries_irq_domain_alloc() fails if the device's quota is exceeded.
Now, if the quota is exceeded, pseries_msi_ops_prepare() will only
prepare as many interrupts as the quota allows. If device drivers
attempt to allocate more interrupts than the quota permits,
pseries_irq_domain_alloc() will return an error code and
msi_handle_pci_fail() will let device drivers retry.
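For context, this is roughly how a driver such as NVMe ends up in this
path (standard PCI API, shown only to illustrate the retry semantics):

  int nvec = pci_msix_vec_count(pdev);    /* device's MSI-X table size */

  /* If the platform quota is smaller than nvec, the MSI core must be
   * able to retry with fewer vectors instead of failing outright. */
  nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_MSIX);
  if (nvec < 0)
          return nvec;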
Reported-by: Nilay Shroff <nilay(a)linux.ibm.com>
Closes: https://lore.kernel.org/linuxppc-dev/6af2c4c2-97f6-4758-be33-256638ef39e5@l…
Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()")
Signed-off-by: Nam Cao <namcao(a)linutronix.de>
Acked-by: Nilay Shroff <nilay(a)linux.ibm.com>
Cc: stable(a)vger.kernel.org
---
arch/powerpc/platforms/pseries/msi.c | 42 ++++++++++++++++++++++++++--
1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index a82aaa786e9e..8898a968a59b 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -19,6 +19,11 @@
#include "pseries.h"
+struct pseries_msi_device {
+ unsigned int msi_quota;
+ unsigned int msi_used;
+};
+
static int query_token, change_token;
#define RTAS_QUERY_FN 0
@@ -433,8 +438,26 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
struct msi_domain_info *info = domain->host_data;
struct pci_dev *pdev = to_pci_dev(dev);
int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
+ int ret;
+
+ struct pseries_msi_device *pseries_dev __free(kfree)
+ = kmalloc(sizeof(*pseries_dev), GFP_KERNEL);
+ if (!pseries_dev)
+ return -ENOMEM;
+
+ ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+ if (ret > 0) {
+ nvec = ret;
+ ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+ }
+ if (ret < 0)
+ return ret;
- return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+ pseries_dev->msi_quota = nvec;
+ pseries_dev->msi_used = 0;
+
+ arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
+ return 0;
}
/*
@@ -443,9 +466,13 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
*/
static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
{
+ struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr;
struct pci_dev *pdev = to_pci_dev(domain->dev);
rtas_disable_msi(pdev);
+
+ WARN_ON(pseries_dev->msi_used);
+ kfree(pseries_dev);
}
static void pseries_msi_shutdown(struct irq_data *d)
@@ -546,12 +573,18 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
unsigned int nr_irqs, void *arg)
{
struct pci_controller *phb = domain->host_data;
+ struct pseries_msi_device *pseries_dev;
msi_alloc_info_t *info = arg;
struct msi_desc *desc = info->desc;
struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
int hwirq;
int i, ret;
+ pseries_dev = info->scratchpad[0].ptr;
+
+ if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
+ return -ENOSPC;
+
hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index);
if (hwirq < 0) {
dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
@@ -567,9 +600,10 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
goto out;
irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
- &pseries_msi_irq_chip, domain->host_data);
+ &pseries_msi_irq_chip, pseries_dev);
}
+ pseries_dev->msi_used++;
return 0;
out:
@@ -582,9 +616,11 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
unsigned int nr_irqs)
{
struct irq_data *d = irq_domain_get_irq_data(domain, virq);
- struct pci_controller *phb = irq_data_get_irq_chip_data(d);
+ struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
+ struct pci_controller *phb = domain->host_data;
pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
+ pseries_dev->msi_used -= nr_irqs;
irq_domain_free_irqs_parent(domain, virq, nr_irqs);
}
--
2.47.3