When bailing out due to group_priority_permit() failure, the queue_args
need to be freed. Fix it by rearranging the function to use the
goto-on-error pattern, such that the success case flows straight without
indentation while error cases jump forward to cleanup.
Cc: stable(a)vger.kernel.org
Fixes: 5f7762042f8a ("drm/panthor: Restrict high priorities on group_create")
Signed-off-by: Jann Horn <jannh(a)google.com>
---
testcase:
```
#include <err.h>
#include <fcntl.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <drm/panthor_drm.h>
#define SYSCHK(x) ({ \
typeof(x) __res = (x); \
if (__res == (typeof(x))-1) \
err(1, "SYSCHK(" #x ")"); \
__res; \
})
#define GPU_PATH "/dev/dri/by-path/platform-fb000000.gpu-card"
int main(void) {
int fd = SYSCHK(open(GPU_PATH, O_RDWR));
while (1) {
struct drm_panthor_queue_create qc[16] = {};
struct drm_panthor_group_create gc = {
.queues = {
.stride = sizeof(struct drm_panthor_queue_create),
.count = 16,
.array = (unsigned long)qc
},
.priority = PANTHOR_GROUP_PRIORITY_HIGH+1/*invalid*/
};
ioctl(fd, DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);
}
}
```
I have tested that without this patch, after running the testcase for a
few seconds and then manually killing it, 2G of RAM in kmalloc-128 have
been leaked. With the patch applied, the memory leak is gone.
(By the way, get_maintainer.pl suggests that I also send this patch to
the general DRM maintainers and the DRM-misc maintainers; looking at
MAINTAINERS, it looks like it is normal that the general DRM maintainers
are listed for everything under drivers/gpu/, but DRM-misc has exclusion
rules for a bunch of drivers but not panthor. I don't know if that is
intentional.)
---
drivers/gpu/drm/panthor/panthor_drv.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index c520f156e2d73f7e735f8bf2d6d8e8efacec9362..815c23cff25f305d884e8e3e263fa22888f7d5ce 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -1032,14 +1032,15 @@ static int panthor_ioctl_group_create(struct drm_device *ddev, void *data,
ret = group_priority_permit(file, args->priority);
if (ret)
- return ret;
+ goto out;
ret = panthor_group_create(pfile, args, queue_args);
- if (ret >= 0) {
- args->group_handle = ret;
- ret = 0;
- }
+ if (ret < 0)
+ goto out;
+ args->group_handle = ret;
+ ret = 0;
+out:
kvfree(queue_args);
return ret;
}
---
base-commit: 9f8e716d46c68112484a23d1742d9ec725e082fc
change-id: 20241113-panthor-fix-gcq-bailout-2d9ac36590ed
--
Jann Horn <jannh(a)google.com>
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 358de8b4f201bc05712484b15f0109b1ae3516a8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024021932-lavish-expel-58e5@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
358de8b4f201 ("kbuild: rpm-pkg: simplify installkernel %post")
0df8e9708594 ("scripts: clean up IA-64 code")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 358de8b4f201bc05712484b15f0109b1ae3516a8 Mon Sep 17 00:00:00 2001
From: Jose Ignacio Tornos Martinez <jtornosm(a)redhat.com>
Date: Mon, 29 Jan 2024 10:28:19 +0100
Subject: [PATCH] kbuild: rpm-pkg: simplify installkernel %post
The new installkernel application that is now included in systemd-udev
package allows installation although destination files are already present
in the boot directory of the kernel package, but is failing with the
implemented workaround for the old installkernel application from grubby
package.
For the new installkernel application, as Davide says:
<<The %post currently does a shuffling dance before calling installkernel.
This isn't actually necessary afaict, and the current implementation
ends up triggering downstream issues such as
https://github.com/systemd/systemd/issues/29568
This commit simplifies the logic to remove the shuffling. For reference,
the original logic was added in commit 3c9c7a14b627("rpm-pkg: add %post
section to create initramfs and grub hooks").>>
But we need to keep the old behavior as well, because the old installkernel
application from grubby package, does not allow this simplification and
we need to be backward compatible to avoid issues with the different
packages.
Mimic Fedora shipping process and store vmlinuz, config amd System.map
in the module directory instead of the boot directory. In this way, we will
avoid the commented problem for all the cases, because the new destination
files are not going to exist in the boot directory of the kernel package.
Replace installkernel tool with kernel-install tool, because the latter is
more complete.
Besides, after installkernel tool execution, check to complete if the
correct package files vmlinuz, System.map and config files are present
in /boot directory, and if necessary, copy manually for install operation.
In this way, take into account if files were not previously copied from
/usr/lib/kernel/install.d/* scripts and if the suitable files for the
requested package are present (it could be others if the rpm files were
replace with a new pacakge with the same release and a different build).
Tested with Fedora 38, Fedora 39, RHEL 9, Oracle Linux 9.3,
openSUSE Tumbleweed and openMandrive ROME, using dnf/zypper and rpm tools.
cc: stable(a)vger.kernel.org
Co-Developed-by: Davide Cavalca <dcavalca(a)meta.com>
Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm(a)redhat.com>
Signed-off-by: Masahiro Yamada <masahiroy(a)kernel.org>
diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec
index 89298983a169..f58726671fb3 100644
--- a/scripts/package/kernel.spec
+++ b/scripts/package/kernel.spec
@@ -55,12 +55,12 @@ patch -p1 < %{SOURCE2}
%{make} %{makeflags} KERNELRELEASE=%{KERNELRELEASE} KBUILD_BUILD_VERSION=%{release}
%install
-mkdir -p %{buildroot}/boot
-cp $(%{make} %{makeflags} -s image_name) %{buildroot}/boot/vmlinuz-%{KERNELRELEASE}
+mkdir -p %{buildroot}/lib/modules/%{KERNELRELEASE}
+cp $(%{make} %{makeflags} -s image_name) %{buildroot}/lib/modules/%{KERNELRELEASE}/vmlinuz
%{make} %{makeflags} INSTALL_MOD_PATH=%{buildroot} modules_install
%{make} %{makeflags} INSTALL_HDR_PATH=%{buildroot}/usr headers_install
-cp System.map %{buildroot}/boot/System.map-%{KERNELRELEASE}
-cp .config %{buildroot}/boot/config-%{KERNELRELEASE}
+cp System.map %{buildroot}/lib/modules/%{KERNELRELEASE}
+cp .config %{buildroot}/lib/modules/%{KERNELRELEASE}/config
ln -fns /usr/src/kernels/%{KERNELRELEASE} %{buildroot}/lib/modules/%{KERNELRELEASE}/build
%if %{with_devel}
%{make} %{makeflags} run-command KBUILD_RUN_COMMAND='${srctree}/scripts/package/install-extmod-build %{buildroot}/usr/src/kernels/%{KERNELRELEASE}'
@@ -70,13 +70,14 @@ ln -fns /usr/src/kernels/%{KERNELRELEASE} %{buildroot}/lib/modules/%{KERNELRELEA
rm -rf %{buildroot}
%post
-if [ -x /sbin/installkernel -a -r /boot/vmlinuz-%{KERNELRELEASE} -a -r /boot/System.map-%{KERNELRELEASE} ]; then
-cp /boot/vmlinuz-%{KERNELRELEASE} /boot/.vmlinuz-%{KERNELRELEASE}-rpm
-cp /boot/System.map-%{KERNELRELEASE} /boot/.System.map-%{KERNELRELEASE}-rpm
-rm -f /boot/vmlinuz-%{KERNELRELEASE} /boot/System.map-%{KERNELRELEASE}
-/sbin/installkernel %{KERNELRELEASE} /boot/.vmlinuz-%{KERNELRELEASE}-rpm /boot/.System.map-%{KERNELRELEASE}-rpm
-rm -f /boot/.vmlinuz-%{KERNELRELEASE}-rpm /boot/.System.map-%{KERNELRELEASE}-rpm
+if [ -x /usr/bin/kernel-install ]; then
+ /usr/bin/kernel-install add %{KERNELRELEASE} /lib/modules/%{KERNELRELEASE}/vmlinuz
fi
+for file in vmlinuz System.map config; do
+ if ! cmp --silent "/lib/modules/%{KERNELRELEASE}/${file}" "/boot/${file}-%{KERNELRELEASE}"; then
+ cp "/lib/modules/%{KERNELRELEASE}/${file}" "/boot/${file}-%{KERNELRELEASE}"
+ fi
+done
%preun
if [ -x /sbin/new-kernel-pkg ]; then
@@ -94,7 +95,6 @@ fi
%defattr (-, root, root)
/lib/modules/%{KERNELRELEASE}
%exclude /lib/modules/%{KERNELRELEASE}/build
-/boot/*
%files headers
%defattr (-, root, root)
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 1e758b613212b6964518a67939535910b5aee831
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021010-antarctic-untried-a72b@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1e758b613212b6964518a67939535910b5aee831 Mon Sep 17 00:00:00 2001
From: Angelo Dureghello <adureghello(a)baylibre.com>
Date: Wed, 8 Jan 2025 18:29:15 +0100
Subject: [PATCH] iio: dac: ad3552r-common: fix ad3541/2r ranges
Fix ad3541/2r voltage ranges to be as per ad3542r datasheet,
rev. C, table 38 (page 57).
The wrong ad354xr ranges was generating erroneous Vpp output.
In more details:
- fix wrong number of ranges, they are 5 ranges, not 6,
- remove non-existent 0-3V range,
- adjust order, since ad3552r_find_range() get a wrong index,
producing a wrong Vpp as output.
Retested all the ranges on real hardware, EVALAD3542RFMCZ:
adi,output-range-microvolt (fdt):
<(000000) (2500000)>; ok (Rfbx1, switch 10)
<(000000) (5000000)>; ok (Rfbx1, switch 10)
<(000000) (10000000)>; ok (Rfbx1, switch 10)
<(-5000000) (5000000)>; ok (Rfbx2, switch +/- 5)
<(-2500000) (7500000)>; ok (Rfbx2, switch -2.5/7.5)
Fixes: 8f2b54824b28 ("drivers:iio:dac: Add AD3552R driver support")
Signed-off-by: Angelo Dureghello <adureghello(a)baylibre.com>
Reviewed-by: David Lechner <dlechner(a)baylibre.com>
Link: https://patch.msgid.link/20250108-wip-bl-ad3552r-axi-v0-iio-testing-carlos-…
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/dac/ad3552r-common.c b/drivers/iio/dac/ad3552r-common.c
index 0f495df2e5ce..03e0864f5084 100644
--- a/drivers/iio/dac/ad3552r-common.c
+++ b/drivers/iio/dac/ad3552r-common.c
@@ -22,11 +22,10 @@ EXPORT_SYMBOL_NS_GPL(ad3552r_ch_ranges, "IIO_AD3552R");
const s32 ad3542r_ch_ranges[AD3542R_MAX_RANGES][2] = {
[AD3542R_CH_OUTPUT_RANGE_0__2P5V] = { 0, 2500 },
- [AD3542R_CH_OUTPUT_RANGE_0__3V] = { 0, 3000 },
[AD3542R_CH_OUTPUT_RANGE_0__5V] = { 0, 5000 },
[AD3542R_CH_OUTPUT_RANGE_0__10V] = { 0, 10000 },
- [AD3542R_CH_OUTPUT_RANGE_NEG_2P5__7P5V] = { -2500, 7500 },
- [AD3542R_CH_OUTPUT_RANGE_NEG_5__5V] = { -5000, 5000 }
+ [AD3542R_CH_OUTPUT_RANGE_NEG_5__5V] = { -5000, 5000 },
+ [AD3542R_CH_OUTPUT_RANGE_NEG_2P5__7P5V] = { -2500, 7500 }
};
EXPORT_SYMBOL_NS_GPL(ad3542r_ch_ranges, "IIO_AD3552R");
diff --git a/drivers/iio/dac/ad3552r.h b/drivers/iio/dac/ad3552r.h
index fd5a3dfd1d1c..4b5581039ae9 100644
--- a/drivers/iio/dac/ad3552r.h
+++ b/drivers/iio/dac/ad3552r.h
@@ -131,7 +131,7 @@
#define AD3552R_CH1_ACTIVE BIT(1)
#define AD3552R_MAX_RANGES 5
-#define AD3542R_MAX_RANGES 6
+#define AD3542R_MAX_RANGES 5
#define AD3552R_QUAD_SPI 2
extern const s32 ad3552r_ch_ranges[AD3552R_MAX_RANGES][2];
@@ -189,16 +189,14 @@ enum ad3552r_ch_vref_select {
enum ad3542r_ch_output_range {
/* Range from 0 V to 2.5 V. Requires Rfb1x connection */
AD3542R_CH_OUTPUT_RANGE_0__2P5V,
- /* Range from 0 V to 3 V. Requires Rfb1x connection */
- AD3542R_CH_OUTPUT_RANGE_0__3V,
/* Range from 0 V to 5 V. Requires Rfb1x connection */
AD3542R_CH_OUTPUT_RANGE_0__5V,
/* Range from 0 V to 10 V. Requires Rfb2x connection */
AD3542R_CH_OUTPUT_RANGE_0__10V,
- /* Range from -2.5 V to 7.5 V. Requires Rfb2x connection */
- AD3542R_CH_OUTPUT_RANGE_NEG_2P5__7P5V,
/* Range from -5 V to 5 V. Requires Rfb2x connection */
AD3542R_CH_OUTPUT_RANGE_NEG_5__5V,
+ /* Range from -2.5 V to 7.5 V. Requires Rfb2x connection */
+ AD3542R_CH_OUTPUT_RANGE_NEG_2P5__7P5V,
};
enum ad3552r_ch_output_range {
Hi Kees,
Bill's PR to disable __counted_by for "whole struct" __bdos cases has now
been merged into 19.1.3 [1], so here's the patch to disable __counted_by
for clang versions < 19.1.3 in the kernel.
Hopefully in the near future __counted_by for whole struct __bdos can be
enabled once again in coordination between the kernel, gcc, and clang.
There has been recent progress on this in [2] thanks to Tavian.
Also see previous discussion on the mailing list [3]
Thanks to everyone for moving this issue along. In particular, Bill for
his PR to clang/llvm, Kees and Thorsten for reproducers of the two issues,
Nathan for Kconfig-ifying this patch, and Miguel for reviewing.
Info for the stable team:
This patch should be backported to kernels >= 6.6 to make sure that those
build correctly with the effected clang versions. This patch cherry-picks
cleanly onto linux-6.11.y. For linux-6.6.y three prerequiste commits are
neded:
16c31dd7fdf6: Compiler Attributes: counted_by: bump min gcc version
2993eb7a8d34: Compiler Attributes: counted_by: fixup clang URL
231dc3f0c936: lkdtm/bugs: Improve warning message for compilers without counted_by support
There are still two merge conflicts even with those prerequistes.
Here's the correct resolution:
1. include/linux/compiler_types.h:
use the incoming change until before (but not including) the
"Apply __counted_by() when the Endianness matches to increase test coverage."
comment
2. lib/overflow_kunit.c:
HEAD is correct
[1] https://github.com/llvm/llvm-project/pull/112786
[2] https://github.com/llvm/llvm-project/pull/112636
[3] https://lore.kernel.org/lkml/3E304FB2-799D-478F-889A-CDFC1A52DCD8@toblux.co…
Best Regards
Jan
Jan Hendrik Farr (1):
Compiler Attributes: disable __counted_by for clang < 19.1.3
drivers/misc/lkdtm/bugs.c | 2 +-
include/linux/compiler_attributes.h | 13 -------------
include/linux/compiler_types.h | 19 +++++++++++++++++++
init/Kconfig | 9 +++++++++
lib/overflow_kunit.c | 2 +-
5 files changed, 30 insertions(+), 15 deletions(-)
--
2.47.0
The xHC resources allocated for USB devices are not released in correct
order after resuming in case when while suspend device was reconnected.
This issue has been detected during the fallowing scenario:
- connect hub HS to root port
- connect LS/FS device to hub port
- wait for enumeration to finish
- force host to suspend
- reconnect hub attached to root port
- wake host
For this scenario during enumeration of USB LS/FS device the Cadence xHC
reports completion error code for xHC commands because the xHC resources
used for devices has not been properly released.
XHCI specification doesn't mention that device can be reset in any order
so, we should not treat this issue as Cadence xHC controller bug.
Similar as during disconnecting in this case the device resources should
be cleared starting form the last usb device in tree toward the root hub.
To fix this issue usbcore driver should call hcd->driver->reset_device
for all USB devices connected to hub which was reconnected while
suspending.
Fixes: 3d82904559f4 ("usb: cdnsp: cdns3 Add main part of Cadence USBSSP DRD Driver")
cc: <stable(a)vger.kernel.org>
Signed-off-by: Pawel Laszczak <pawell(a)cadence.com>
---
Changelog:
v3:
- Changed patch title
- Corrected typo
- Moved hub_hc_release_resources above mutex_lock(hcd->address0_mutex)
v2:
- Replaced disconnection procedure with releasing only the xHC resources
drivers/usb/core/hub.c | 33 +++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index a76bb50b6202..dcba4281ea48 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -6065,6 +6065,36 @@ void usb_hub_cleanup(void)
usb_deregister(&hub_driver);
} /* usb_hub_cleanup() */
+/**
+ * hub_hc_release_resources - clear resources used by host controller
+ * @udev: pointer to device being released
+ *
+ * Context: task context, might sleep
+ *
+ * Function releases the host controller resources in correct order before
+ * making any operation on resuming usb device. The host controller resources
+ * allocated for devices in tree should be released starting from the last
+ * usb device in tree toward the root hub. This function is used only during
+ * resuming device when usb device require reinitialization – that is, when
+ * flag udev->reset_resume is set.
+ *
+ * This call is synchronous, and may not be used in an interrupt context.
+ */
+static void hub_hc_release_resources(struct usb_device *udev)
+{
+ struct usb_hub *hub = usb_hub_to_struct_hub(udev);
+ struct usb_hcd *hcd = bus_to_hcd(udev->bus);
+ int i;
+
+ /* Release up resources for all children before this device */
+ for (i = 0; i < udev->maxchild; i++)
+ if (hub->ports[i]->child)
+ hub_hc_release_resources(hub->ports[i]->child);
+
+ if (hcd->driver->reset_device)
+ hcd->driver->reset_device(hcd, udev);
+}
+
/**
* usb_reset_and_verify_device - perform a USB port reset to reinitialize a device
* @udev: device to reset (not in SUSPENDED or NOTATTACHED state)
@@ -6129,6 +6159,9 @@ static int usb_reset_and_verify_device(struct usb_device *udev)
bos = udev->bos;
udev->bos = NULL;
+ if (udev->reset_resume)
+ hub_hc_release_resources(udev);
+
mutex_lock(hcd->address0_mutex);
for (i = 0; i < PORT_INIT_TRIES; ++i) {
--
2.43.0
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 53dac345395c0d2493cbc2f4c85fe38aef5b63f5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021052-avenging-aflutter-192c@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 53dac345395c0d2493cbc2f4c85fe38aef5b63f5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic(a)kernel.org>
Date: Sat, 18 Jan 2025 00:24:33 +0100
Subject: [PATCH] hrtimers: Force migrate away hrtimers queued after
CPUHP_AP_HRTIMERS_DYING
hrtimers are migrated away from the dying CPU to any online target at
the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
handling tasks involved in the CPU hotplug forward progress.
However wakeups can still be performed by the outgoing CPU after
CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers being
armed. Depending on several considerations (crystal ball power management
based election, earliest timer already enqueued, timer migration enabled or
not), the target may eventually be the current CPU even if offline. If that
happens, the timer is eventually ignored.
The most notable example is RCU which had to deal with each and every of
those wake-ups by deferring them to an online CPU, along with related
workarounds:
_ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
_ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
_ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
The problem isn't confined to RCU though as the stop machine kthread
(which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
of its work through cpu_stop_signal_done() and performs a wake up that
eventually arms the deadline server timer:
WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
Call Trace:
<TASK>
start_dl_timer
enqueue_dl_entity
dl_server_start
enqueue_task_fair
enqueue_task
ttwu_do_activate
try_to_wake_up
complete
cpu_stopper_thread
Instead of providing yet another bandaid to work around the situation, fix
it in the hrtimers infrastructure instead: always migrate away a timer to
an online target whenever it is enqueued from an offline CPU.
This will also allow to revert all the above RCU disgraceful hacks.
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Vlad Poenaru <vlad.wing(a)gmail.com>
Reported-by: Usama Arif <usamaarif642(a)gmail.com>
Signed-off-by: Frederic Weisbecker <frederic(a)kernel.org>
Signed-off-by: Paul E. McKenney <paulmck(a)kernel.org>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Tested-by: Paul E. McKenney <paulmck(a)kernel.org>
Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org
Closes: 20241213203739.1519801-1-usamaarif642(a)gmail.com
diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h
index c3b4b7ed7c16..84a5045f80f3 100644
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
+ call_single_data_t csd;
} ____cacheline_aligned;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4fb81f8c6f1c..deb1aa32814e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -178,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will we issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -249,8 +287,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -259,8 +297,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -706,8 +743,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1195,6 +1230,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1206,9 +1242,15 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
+ /*
+ * Don't force local queuing if this enqueue happens on a unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
/*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
@@ -1238,8 +1280,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 53dac345395c0d2493cbc2f4c85fe38aef5b63f5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021053-unranked-silt-0282@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 53dac345395c0d2493cbc2f4c85fe38aef5b63f5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic(a)kernel.org>
Date: Sat, 18 Jan 2025 00:24:33 +0100
Subject: [PATCH] hrtimers: Force migrate away hrtimers queued after
CPUHP_AP_HRTIMERS_DYING
hrtimers are migrated away from the dying CPU to any online target at
the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
handling tasks involved in the CPU hotplug forward progress.
However wakeups can still be performed by the outgoing CPU after
CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers being
armed. Depending on several considerations (crystal ball power management
based election, earliest timer already enqueued, timer migration enabled or
not), the target may eventually be the current CPU even if offline. If that
happens, the timer is eventually ignored.
The most notable example is RCU which had to deal with each and every of
those wake-ups by deferring them to an online CPU, along with related
workarounds:
_ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
_ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
_ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
The problem isn't confined to RCU though as the stop machine kthread
(which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
of its work through cpu_stop_signal_done() and performs a wake up that
eventually arms the deadline server timer:
WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
Call Trace:
<TASK>
start_dl_timer
enqueue_dl_entity
dl_server_start
enqueue_task_fair
enqueue_task
ttwu_do_activate
try_to_wake_up
complete
cpu_stopper_thread
Instead of providing yet another bandaid to work around the situation, fix
it in the hrtimers infrastructure instead: always migrate away a timer to
an online target whenever it is enqueued from an offline CPU.
This will also allow to revert all the above RCU disgraceful hacks.
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Vlad Poenaru <vlad.wing(a)gmail.com>
Reported-by: Usama Arif <usamaarif642(a)gmail.com>
Signed-off-by: Frederic Weisbecker <frederic(a)kernel.org>
Signed-off-by: Paul E. McKenney <paulmck(a)kernel.org>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Tested-by: Paul E. McKenney <paulmck(a)kernel.org>
Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org
Closes: 20241213203739.1519801-1-usamaarif642(a)gmail.com
diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h
index c3b4b7ed7c16..84a5045f80f3 100644
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
+ call_single_data_t csd;
} ____cacheline_aligned;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4fb81f8c6f1c..deb1aa32814e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -178,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will we issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -249,8 +287,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -259,8 +297,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -706,8 +743,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1195,6 +1230,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1206,9 +1242,15 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
+ /*
+ * Don't force local queuing if this enqueue happens on a unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
/*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
@@ -1238,8 +1280,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 3e74859ee35edc33a022c3f3971df066ea0ca6b9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024123045-parka-sublet-a95d@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3e74859ee35edc33a022c3f3971df066ea0ca6b9 Mon Sep 17 00:00:00 2001
From: Boris Burkov <boris(a)bur.io>
Date: Fri, 13 Dec 2024 12:22:32 -0800
Subject: [PATCH] btrfs: check folio mapping after unlock in
relocate_one_folio()
When we call btrfs_read_folio() to bring a folio uptodate, we unlock the
folio. The result of that is that a different thread can modify the
mapping (like remove it with invalidate) before we call folio_lock().
This results in an invalid page and we need to try again.
In particular, if we are relocating concurrently with aborting a
transaction, this can result in a crash like the following:
BUG: kernel NULL pointer dereference, address: 0000000000000000
PGD 0 P4D 0
Oops: 0000 [#1] SMP
CPU: 76 PID: 1411631 Comm: kworker/u322:5
Workqueue: events_unbound btrfs_reclaim_bgs_work
RIP: 0010:set_page_extent_mapped+0x20/0xb0
RSP: 0018:ffffc900516a7be8 EFLAGS: 00010246
RAX: ffffea009e851d08 RBX: ffffea009e0b1880 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffc900516a7b90 RDI: ffffea009e0b1880
RBP: 0000000003573000 R08: 0000000000000001 R09: ffff88c07fd2f3f0
R10: 0000000000000000 R11: 0000194754b575be R12: 0000000003572000
R13: 0000000003572fff R14: 0000000000100cca R15: 0000000005582fff
FS: 0000000000000000(0000) GS:ffff88c07fd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000407d00f002 CR4: 00000000007706f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
? __die+0x78/0xc0
? page_fault_oops+0x2a8/0x3a0
? __switch_to+0x133/0x530
? wq_worker_running+0xa/0x40
? exc_page_fault+0x63/0x130
? asm_exc_page_fault+0x22/0x30
? set_page_extent_mapped+0x20/0xb0
relocate_file_extent_cluster+0x1a7/0x940
relocate_data_extent+0xaf/0x120
relocate_block_group+0x20f/0x480
btrfs_relocate_block_group+0x152/0x320
btrfs_relocate_chunk+0x3d/0x120
btrfs_reclaim_bgs_work+0x2ae/0x4e0
process_scheduled_works+0x184/0x370
worker_thread+0xc6/0x3e0
? blk_add_timer+0xb0/0xb0
kthread+0xae/0xe0
? flush_tlb_kernel_range+0x90/0x90
ret_from_fork+0x2f/0x40
? flush_tlb_kernel_range+0x90/0x90
ret_from_fork_asm+0x11/0x20
</TASK>
This occurs because cleanup_one_transaction() calls
destroy_delalloc_inodes() which calls invalidate_inode_pages2() which
takes the folio_lock before setting mapping to NULL. We fail to check
this, and subsequently call set_extent_mapping(), which assumes that
mapping != NULL (in fact it asserts that in debug mode)
Note that the "fixes" patch here is not the one that introduced the
race (the very first iteration of this code from 2009) but a more recent
change that made this particular crash happen in practice.
Fixes: e7f1326cc24e ("btrfs: set page extent mapped after read_folio in relocate_one_page")
CC: stable(a)vger.kernel.org # 6.1+
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: Boris Burkov <boris(a)bur.io>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index bf267bdfa8f8..db8b42f674b7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2902,6 +2902,7 @@ static int relocate_one_folio(struct reloc_control *rc,
const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
ASSERT(index <= last_index);
+again:
folio = filemap_lock_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
@@ -2937,6 +2938,11 @@ static int relocate_one_folio(struct reloc_control *rc,
ret = -EIO;
goto release_folio;
}
+ if (folio->mapping != inode->i_mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
}
/*
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 3e74859ee35edc33a022c3f3971df066ea0ca6b9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024123042-limelight-doily-8703@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3e74859ee35edc33a022c3f3971df066ea0ca6b9 Mon Sep 17 00:00:00 2001
From: Boris Burkov <boris(a)bur.io>
Date: Fri, 13 Dec 2024 12:22:32 -0800
Subject: [PATCH] btrfs: check folio mapping after unlock in
relocate_one_folio()
When we call btrfs_read_folio() to bring a folio uptodate, we unlock the
folio. The result of that is that a different thread can modify the
mapping (like remove it with invalidate) before we call folio_lock().
This results in an invalid page and we need to try again.
In particular, if we are relocating concurrently with aborting a
transaction, this can result in a crash like the following:
BUG: kernel NULL pointer dereference, address: 0000000000000000
PGD 0 P4D 0
Oops: 0000 [#1] SMP
CPU: 76 PID: 1411631 Comm: kworker/u322:5
Workqueue: events_unbound btrfs_reclaim_bgs_work
RIP: 0010:set_page_extent_mapped+0x20/0xb0
RSP: 0018:ffffc900516a7be8 EFLAGS: 00010246
RAX: ffffea009e851d08 RBX: ffffea009e0b1880 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffc900516a7b90 RDI: ffffea009e0b1880
RBP: 0000000003573000 R08: 0000000000000001 R09: ffff88c07fd2f3f0
R10: 0000000000000000 R11: 0000194754b575be R12: 0000000003572000
R13: 0000000003572fff R14: 0000000000100cca R15: 0000000005582fff
FS: 0000000000000000(0000) GS:ffff88c07fd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000407d00f002 CR4: 00000000007706f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
? __die+0x78/0xc0
? page_fault_oops+0x2a8/0x3a0
? __switch_to+0x133/0x530
? wq_worker_running+0xa/0x40
? exc_page_fault+0x63/0x130
? asm_exc_page_fault+0x22/0x30
? set_page_extent_mapped+0x20/0xb0
relocate_file_extent_cluster+0x1a7/0x940
relocate_data_extent+0xaf/0x120
relocate_block_group+0x20f/0x480
btrfs_relocate_block_group+0x152/0x320
btrfs_relocate_chunk+0x3d/0x120
btrfs_reclaim_bgs_work+0x2ae/0x4e0
process_scheduled_works+0x184/0x370
worker_thread+0xc6/0x3e0
? blk_add_timer+0xb0/0xb0
kthread+0xae/0xe0
? flush_tlb_kernel_range+0x90/0x90
ret_from_fork+0x2f/0x40
? flush_tlb_kernel_range+0x90/0x90
ret_from_fork_asm+0x11/0x20
</TASK>
This occurs because cleanup_one_transaction() calls
destroy_delalloc_inodes() which calls invalidate_inode_pages2() which
takes the folio_lock before setting mapping to NULL. We fail to check
this, and subsequently call set_extent_mapping(), which assumes that
mapping != NULL (in fact it asserts that in debug mode)
Note that the "fixes" patch here is not the one that introduced the
race (the very first iteration of this code from 2009) but a more recent
change that made this particular crash happen in practice.
Fixes: e7f1326cc24e ("btrfs: set page extent mapped after read_folio in relocate_one_page")
CC: stable(a)vger.kernel.org # 6.1+
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: Boris Burkov <boris(a)bur.io>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index bf267bdfa8f8..db8b42f674b7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2902,6 +2902,7 @@ static int relocate_one_folio(struct reloc_control *rc,
const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
ASSERT(index <= last_index);
+again:
folio = filemap_lock_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
@@ -2937,6 +2938,11 @@ static int relocate_one_folio(struct reloc_control *rc,
ret = -EIO;
goto release_folio;
}
+ if (folio->mapping != inode->i_mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
}
/*
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x dc7eb8755797ed41a0d1b5c0c39df3c8f401b3d9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024012617-overlap-reborn-e124@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
dc7eb8755797 ("arm64/sme: Always exit sme_alloc() early with existing storage")
5d0a8d2fba50 ("arm64/ptrace: Ensure that SME is set up for target when writing SSVE state")
f90b529bcbe5 ("arm64/sme: Implement ZT0 ptrace support")
ce514000da4f ("arm64/sme: Rename za_state to sme_state")
1192b93ba352 ("arm64/fp: Use a struct to pass data to fpsimd_bind_state_to_cpu()")
deeb8f9a80fd ("arm64/fpsimd: Have KVM explicitly say which FP registers to save")
baa8515281b3 ("arm64/fpsimd: Track the saved FPSIMD state type separately to TIF_SVE")
93ae6b01bafe ("KVM: arm64: Discard any SVE state when entering KVM guests")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From dc7eb8755797ed41a0d1b5c0c39df3c8f401b3d9 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie(a)kernel.org>
Date: Mon, 15 Jan 2024 20:15:46 +0000
Subject: [PATCH] arm64/sme: Always exit sme_alloc() early with existing
storage
When sme_alloc() is called with existing storage and we are not flushing we
will always allocate new storage, both leaking the existing storage and
corrupting the state. Fix this by separating the checks for flushing and
for existing storage as we do for SVE.
Callers that reallocate (eg, due to changing the vector length) should
call sme_free() themselves.
Fixes: 5d0a8d2fba50 ("arm64/ptrace: Ensure that SME is set up for target when writing SSVE state")
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240115-arm64-sme-flush-v1-1-7472bd3459b7@kernel…
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 0983be2b1b61..a5dc6f764195 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1217,8 +1217,10 @@ void fpsimd_release_task(struct task_struct *dead_task)
*/
void sme_alloc(struct task_struct *task, bool flush)
{
- if (task->thread.sme_state && flush) {
- memset(task->thread.sme_state, 0, sme_state_size(task));
+ if (task->thread.sme_state) {
+ if (flush)
+ memset(task->thread.sme_state, 0,
+ sme_state_size(task));
return;
}
Commit 60e3318e3e900 ("cifs: use fs_context for automounts") was
released in v6.1.54 and broke the failover when one of the servers
inside DFS becomes unavailable. We reproduced the problem on the EC2
instances of different types. Reverting aforementioned commint on top of
the latest stable verison v6.1.94 helps to resolve the problem.
Earliest working version is v6.2-rc1. There were two big merges of CIFS fixes:
[1] and [2]. We would like to ask for the help to investigate this problem and
if some of those patches need to be backported. Also, is it safe to just revert
problematic commit until proper fixes/backports will be available?
We will help to do testing and confirm if fix works, but let me also list the
steps we used to reproduce the problem if it will help to identify the problem:
1. Create Active Directory domain eg. 'corp.fsxtest.local' in AWS Directory
Service with:
- three AWS FSX file systems filesystem1..filesystem3
- three Windows servers; They have DFS installed as per
https://learn.microsoft.com/en-us/windows-server/storage/dfs-namespaces/dfs…:
- dfs-srv1: EC2AMAZ-2EGTM59
- dfs-srv2: EC2AMAZ-1N36PRD
- dfs-srv3: EC2AMAZ-0PAUH2U
2. Create DFS namespace eg. 'dfs-namespace' in Windows server 2008 mode
and three folders targets in it:
- referral-a mapped to filesystem1.corp.local
- referral-b mapped to filesystem2.corp.local
- referral-c mapped to filesystem3.corp.local
- local folders dfs-srv1..dfs-srv3 in C:\DFSRoots\dfs-namespace of every
Windows server. This helps to quickly define underlying server when
DFS is mounted.
3. Enabled cifs debug logs:
```
echo 'module cifs +p' > /sys/kernel/debug/dynamic_debug/control
echo 'file fs/cifs/* +p' > /sys/kernel/debug/dynamic_debug/control
echo 7 > /proc/fs/cifs/cifsFYI
```
4. Mount DFS namespace on Amazon Linux 2023 instance running any vanilla
kernel v6.1.54+:
```
dmesg -c &>/dev/null
cd /mnt
mount -t cifs -o cred=/mnt/creds,echo_interval=5 \
//corp.fsxtest.local/dfs-namespace \
./dfs-namespace
```
5. List DFS root, it's also required to avoid recursive mounts that happen
during regular 'ls' run:
```
sh -c 'ls dfs-namespace'
dfs-srv2 referral-a referral-b
```
The DFS server is EC2AMAZ-1N36PRD, it's also listed in mount:
```
[root@ip-172-31-2-82 mnt]# mount | grep dfs
//corp.fsxtest.local/dfs-namespace on /mnt/dfs-namespace type cifs (rw,relatime,vers=3.1.1,cache=strict,username=Admin,domain=corp.fsxtest.local,uid=0,noforceuid,gid=0,noforcegid,addr=172.31.11.26,file_mode=0755,dir_mode=0755,soft,nounix,mapposix,rsize=4194304,wsize=4194304,bsize=1048576,echo_interval=5,actimeo=1,closetimeo=1)
//EC2AMAZ-1N36PRD.corp.fsxtest.local/dfs-namespace/referral-a on /mnt/dfs-namespace/referral-a type cifs (rw,relatime,vers=3.1.1,cache=strict,username=Admin,domain=corp.fsxtest.local,uid=0,noforceuid,gid=0,noforcegid,addr=172.31.12.80,file_mode=0755,dir_mode=0755,soft,nounix,mapposix,rsize=4194304,wsize=4194304,bsize=1048576,echo_interval=5,actimeo=1,closetimeo=1)
```
List files in first folder:
```
sh -c 'ls dfs-namespace/referral-a'
filea.txt.txt
```
6. Shutdown DFS server-2.
List DFS root again, server changed from dfs-srv2 to dfs-srv1 EC2AMAZ-2EGTM59:
```
sh -c 'ls dfs-namespace'
dfs-srv1 referral-a referral-b
```
7. Try to list files in another folder, this causes ls to fail with error:
```
sh -c 'ls dfs-namespace/referral-b'
ls: cannot access 'dfs-namespace/referral-b': No route to host```
Sometimes it's also 'Operation now in progress' error.
mount shows the same output:
```
//corp.fsxtest.local/dfs-namespace on /mnt/dfs-namespace type cifs (rw,relatime,vers=3.1.1,cache=strict,username=Admin,domain=corp.fsxtest.local,uid=0,noforceuid,gid=0,noforcegid,addr=172.31.11.26,file_mode=0755,dir_mode=0755,soft,nounix,mapposix,rsize=4194304,wsize=4194304,bsize=1048576,echo_interval=5,actimeo=1,closetimeo=1)
//EC2AMAZ-1N36PRD.corp.fsxtest.local/dfs-namespace/referral-a on /mnt/dfs-namespace/referral-a type cifs (rw,relatime,vers=3.1.1,cache=strict,username=Admin,domain=corp.fsxtest.local,uid=0,noforceuid,gid=0,noforcegid,addr=172.31.12.80,file_mode=0755,dir_mode=0755,soft,nounix,mapposix,rsize=4194304,wsize=4194304,bsize=1048576,echo_interval=5,actimeo=1,closetimeo=1)
```
I also attached kernel debug logs from this test.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
Reported-by: Andrei Paniakin <apanyaki(a)amazon.com>
Bisected-by: Simba Bonga <simbarb(a)amazon.com>
---
#regzbot introduced: v6.1.54..v6.2-rc1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x c2fee09fc167c74a64adb08656cb993ea475197e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021815-protract-greasily-cdea@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2fee09fc167c74a64adb08656cb993ea475197e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 24 Jan 2025 17:18:33 -0800
Subject: [PATCH] KVM: x86: Load DR6 with guest value only before entering
.vcpu_run() loop
Move the conditional loading of hardware DR6 with the guest's DR6 value
out of the core .vcpu_run() loop to fix a bug where KVM can load hardware
with a stale vcpu->arch.dr6.
When the guest accesses a DR and host userspace isn't debugging the guest,
KVM disables DR interception and loads the guest's values into hardware on
VM-Enter and saves them on VM-Exit. This allows the guest to access DRs
at will, e.g. so that a sequence of DR accesses to configure a breakpoint
only generates one VM-Exit.
For DR0-DR3, the logic/behavior is identical between VMX and SVM, and also
identical between KVM_DEBUGREG_BP_ENABLED (userspace debugging the guest)
and KVM_DEBUGREG_WONT_EXIT (guest using DRs), and so KVM handles loading
DR0-DR3 in common code, _outside_ of the core kvm_x86_ops.vcpu_run() loop.
But for DR6, the guest's value doesn't need to be loaded into hardware for
KVM_DEBUGREG_BP_ENABLED, and SVM provides a dedicated VMCB field whereas
VMX requires software to manually load the guest value, and so loading the
guest's value into DR6 is handled by {svm,vmx}_vcpu_run(), i.e. is done
_inside_ the core run loop.
Unfortunately, saving the guest values on VM-Exit is initiated by common
x86, again outside of the core run loop. If the guest modifies DR6 (in
hardware, when DR interception is disabled), and then the next VM-Exit is
a fastpath VM-Exit, KVM will reload hardware DR6 with vcpu->arch.dr6 and
clobber the guest's actual value.
The bug shows up primarily with nested VMX because KVM handles the VMX
preemption timer in the fastpath, and the window between hardware DR6
being modified (in guest context) and DR6 being read by guest software is
orders of magnitude larger in a nested setup. E.g. in non-nested, the
VMX preemption timer would need to fire precisely between #DB injection
and the #DB handler's read of DR6, whereas with a KVM-on-KVM setup, the
window where hardware DR6 is "dirty" extends all the way from L1 writing
DR6 to VMRESUME (in L1).
L1's view:
==========
<L1 disables DR interception>
CPU 0/KVM-7289 [023] d.... 2925.640961: kvm_entry: vcpu 0
A: L1 Writes DR6
CPU 0/KVM-7289 [023] d.... 2925.640963: <hack>: Set DRs, DR6 = 0xffff0ff1
B: CPU 0/KVM-7289 [023] d.... 2925.640967: kvm_exit: vcpu 0 reason EXTERNAL_INTERRUPT intr_info 0x800000ec
D: L1 reads DR6, arch.dr6 = 0
CPU 0/KVM-7289 [023] d.... 2925.640969: <hack>: Sync DRs, DR6 = 0xffff0ff0
CPU 0/KVM-7289 [023] d.... 2925.640976: kvm_entry: vcpu 0
L2 reads DR6, L1 disables DR interception
CPU 0/KVM-7289 [023] d.... 2925.640980: kvm_exit: vcpu 0 reason DR_ACCESS info1 0x0000000000000216
CPU 0/KVM-7289 [023] d.... 2925.640983: kvm_entry: vcpu 0
CPU 0/KVM-7289 [023] d.... 2925.640983: <hack>: Set DRs, DR6 = 0xffff0ff0
L2 detects failure
CPU 0/KVM-7289 [023] d.... 2925.640987: kvm_exit: vcpu 0 reason HLT
L1 reads DR6 (confirms failure)
CPU 0/KVM-7289 [023] d.... 2925.640990: <hack>: Sync DRs, DR6 = 0xffff0ff0
L0's view:
==========
L2 reads DR6, arch.dr6 = 0
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
L2 => L1 nested VM-Exit
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit_inject: reason: DR_ACCESS ext_inf1: 0x0000000000000216
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_entry: vcpu 23
L1 writes DR7, L0 disables DR interception
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000007
CPU 23/KVM-5046 [001] d.... 3410.005613: kvm_entry: vcpu 23
L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005613: <hack>: Set DRs, DR6 = 0xffff0ff0
A: <L1 writes DR6 = 1, no interception, arch.dr6 is still '0'>
B: CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_exit: vcpu 23 reason PREEMPTION_TIMER
CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_entry: vcpu 23
C: L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005614: <hack>: Set DRs, DR6 = 0xffff0ff0
L1 => L2 nested VM-Enter
CPU 23/KVM-5046 [001] d.... 3410.005616: kvm_exit: vcpu 23 reason VMRESUME
L0 reads DR6, arch.dr6 = 0
Reported-by: John Stultz <jstultz(a)google.com>
Closes: https://lkml.kernel.org/r/CANDhNCq5_F3HfFYABqFGCA1bPd_%2BxgNj-iDQhH4tDk%2Bw…
Fixes: 375e28ffc0cf ("KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT")
Fixes: d67668e9dd76 ("KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Tested-by: John Stultz <jstultz(a)google.com>
Link: https://lore.kernel.org/r/20250125011833.3644371-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index c35550581da0..823c0434bbad 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -48,6 +48,7 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b15cde0a9b5c..0b7af5902ff7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1696,6 +1696,7 @@ struct kvm_x86_ops {
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+ void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7640a84e554a..a713c803a3a3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid = sd->next_asid++;
}
-static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- if (svm->vcpu.arch.guest_state_protected)
+ if (vcpu->arch.guest_state_protected)
return;
if (unlikely(value != vmcb->save.dr6)) {
@@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- svm_set_dr6(svm, vcpu->arch.dr6);
- else
- svm_set_dr6(svm, DR6_ACTIVE_LOW);
+ if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
+ .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2427f918e763..43ee9ed11291 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
+ .set_dr6 = vmx_set_dr6,
.set_dr7 = vmx_set_dr7,
.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
.cache_reg = vmx_cache_reg,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f72835e85b6d..6c56d5235f0f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ lockdep_assert_irqs_disabled();
+ set_debugreg(vcpu->arch.dr6, 6);
+}
+
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3295a67c04..430773a5ef8e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e77e61d4fbd..02159c967d29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x c2fee09fc167c74a64adb08656cb993ea475197e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021809-census-mammal-224b@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2fee09fc167c74a64adb08656cb993ea475197e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 24 Jan 2025 17:18:33 -0800
Subject: [PATCH] KVM: x86: Load DR6 with guest value only before entering
.vcpu_run() loop
Move the conditional loading of hardware DR6 with the guest's DR6 value
out of the core .vcpu_run() loop to fix a bug where KVM can load hardware
with a stale vcpu->arch.dr6.
When the guest accesses a DR and host userspace isn't debugging the guest,
KVM disables DR interception and loads the guest's values into hardware on
VM-Enter and saves them on VM-Exit. This allows the guest to access DRs
at will, e.g. so that a sequence of DR accesses to configure a breakpoint
only generates one VM-Exit.
For DR0-DR3, the logic/behavior is identical between VMX and SVM, and also
identical between KVM_DEBUGREG_BP_ENABLED (userspace debugging the guest)
and KVM_DEBUGREG_WONT_EXIT (guest using DRs), and so KVM handles loading
DR0-DR3 in common code, _outside_ of the core kvm_x86_ops.vcpu_run() loop.
But for DR6, the guest's value doesn't need to be loaded into hardware for
KVM_DEBUGREG_BP_ENABLED, and SVM provides a dedicated VMCB field whereas
VMX requires software to manually load the guest value, and so loading the
guest's value into DR6 is handled by {svm,vmx}_vcpu_run(), i.e. is done
_inside_ the core run loop.
Unfortunately, saving the guest values on VM-Exit is initiated by common
x86, again outside of the core run loop. If the guest modifies DR6 (in
hardware, when DR interception is disabled), and then the next VM-Exit is
a fastpath VM-Exit, KVM will reload hardware DR6 with vcpu->arch.dr6 and
clobber the guest's actual value.
The bug shows up primarily with nested VMX because KVM handles the VMX
preemption timer in the fastpath, and the window between hardware DR6
being modified (in guest context) and DR6 being read by guest software is
orders of magnitude larger in a nested setup. E.g. in non-nested, the
VMX preemption timer would need to fire precisely between #DB injection
and the #DB handler's read of DR6, whereas with a KVM-on-KVM setup, the
window where hardware DR6 is "dirty" extends all the way from L1 writing
DR6 to VMRESUME (in L1).
L1's view:
==========
<L1 disables DR interception>
CPU 0/KVM-7289 [023] d.... 2925.640961: kvm_entry: vcpu 0
A: L1 Writes DR6
CPU 0/KVM-7289 [023] d.... 2925.640963: <hack>: Set DRs, DR6 = 0xffff0ff1
B: CPU 0/KVM-7289 [023] d.... 2925.640967: kvm_exit: vcpu 0 reason EXTERNAL_INTERRUPT intr_info 0x800000ec
D: L1 reads DR6, arch.dr6 = 0
CPU 0/KVM-7289 [023] d.... 2925.640969: <hack>: Sync DRs, DR6 = 0xffff0ff0
CPU 0/KVM-7289 [023] d.... 2925.640976: kvm_entry: vcpu 0
L2 reads DR6, L1 disables DR interception
CPU 0/KVM-7289 [023] d.... 2925.640980: kvm_exit: vcpu 0 reason DR_ACCESS info1 0x0000000000000216
CPU 0/KVM-7289 [023] d.... 2925.640983: kvm_entry: vcpu 0
CPU 0/KVM-7289 [023] d.... 2925.640983: <hack>: Set DRs, DR6 = 0xffff0ff0
L2 detects failure
CPU 0/KVM-7289 [023] d.... 2925.640987: kvm_exit: vcpu 0 reason HLT
L1 reads DR6 (confirms failure)
CPU 0/KVM-7289 [023] d.... 2925.640990: <hack>: Sync DRs, DR6 = 0xffff0ff0
L0's view:
==========
L2 reads DR6, arch.dr6 = 0
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
L2 => L1 nested VM-Exit
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit_inject: reason: DR_ACCESS ext_inf1: 0x0000000000000216
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_entry: vcpu 23
L1 writes DR7, L0 disables DR interception
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000007
CPU 23/KVM-5046 [001] d.... 3410.005613: kvm_entry: vcpu 23
L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005613: <hack>: Set DRs, DR6 = 0xffff0ff0
A: <L1 writes DR6 = 1, no interception, arch.dr6 is still '0'>
B: CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_exit: vcpu 23 reason PREEMPTION_TIMER
CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_entry: vcpu 23
C: L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005614: <hack>: Set DRs, DR6 = 0xffff0ff0
L1 => L2 nested VM-Enter
CPU 23/KVM-5046 [001] d.... 3410.005616: kvm_exit: vcpu 23 reason VMRESUME
L0 reads DR6, arch.dr6 = 0
Reported-by: John Stultz <jstultz(a)google.com>
Closes: https://lkml.kernel.org/r/CANDhNCq5_F3HfFYABqFGCA1bPd_%2BxgNj-iDQhH4tDk%2Bw…
Fixes: 375e28ffc0cf ("KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT")
Fixes: d67668e9dd76 ("KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Tested-by: John Stultz <jstultz(a)google.com>
Link: https://lore.kernel.org/r/20250125011833.3644371-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index c35550581da0..823c0434bbad 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -48,6 +48,7 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b15cde0a9b5c..0b7af5902ff7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1696,6 +1696,7 @@ struct kvm_x86_ops {
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+ void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7640a84e554a..a713c803a3a3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid = sd->next_asid++;
}
-static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- if (svm->vcpu.arch.guest_state_protected)
+ if (vcpu->arch.guest_state_protected)
return;
if (unlikely(value != vmcb->save.dr6)) {
@@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- svm_set_dr6(svm, vcpu->arch.dr6);
- else
- svm_set_dr6(svm, DR6_ACTIVE_LOW);
+ if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
+ .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2427f918e763..43ee9ed11291 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
+ .set_dr6 = vmx_set_dr6,
.set_dr7 = vmx_set_dr7,
.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
.cache_reg = vmx_cache_reg,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f72835e85b6d..6c56d5235f0f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ lockdep_assert_irqs_disabled();
+ set_debugreg(vcpu->arch.dr6, 6);
+}
+
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3295a67c04..430773a5ef8e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e77e61d4fbd..02159c967d29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x c2fee09fc167c74a64adb08656cb993ea475197e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021812-candied-hamper-7f08@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2fee09fc167c74a64adb08656cb993ea475197e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 24 Jan 2025 17:18:33 -0800
Subject: [PATCH] KVM: x86: Load DR6 with guest value only before entering
.vcpu_run() loop
Move the conditional loading of hardware DR6 with the guest's DR6 value
out of the core .vcpu_run() loop to fix a bug where KVM can load hardware
with a stale vcpu->arch.dr6.
When the guest accesses a DR and host userspace isn't debugging the guest,
KVM disables DR interception and loads the guest's values into hardware on
VM-Enter and saves them on VM-Exit. This allows the guest to access DRs
at will, e.g. so that a sequence of DR accesses to configure a breakpoint
only generates one VM-Exit.
For DR0-DR3, the logic/behavior is identical between VMX and SVM, and also
identical between KVM_DEBUGREG_BP_ENABLED (userspace debugging the guest)
and KVM_DEBUGREG_WONT_EXIT (guest using DRs), and so KVM handles loading
DR0-DR3 in common code, _outside_ of the core kvm_x86_ops.vcpu_run() loop.
But for DR6, the guest's value doesn't need to be loaded into hardware for
KVM_DEBUGREG_BP_ENABLED, and SVM provides a dedicated VMCB field whereas
VMX requires software to manually load the guest value, and so loading the
guest's value into DR6 is handled by {svm,vmx}_vcpu_run(), i.e. is done
_inside_ the core run loop.
Unfortunately, saving the guest values on VM-Exit is initiated by common
x86, again outside of the core run loop. If the guest modifies DR6 (in
hardware, when DR interception is disabled), and then the next VM-Exit is
a fastpath VM-Exit, KVM will reload hardware DR6 with vcpu->arch.dr6 and
clobber the guest's actual value.
The bug shows up primarily with nested VMX because KVM handles the VMX
preemption timer in the fastpath, and the window between hardware DR6
being modified (in guest context) and DR6 being read by guest software is
orders of magnitude larger in a nested setup. E.g. in non-nested, the
VMX preemption timer would need to fire precisely between #DB injection
and the #DB handler's read of DR6, whereas with a KVM-on-KVM setup, the
window where hardware DR6 is "dirty" extends all the way from L1 writing
DR6 to VMRESUME (in L1).
L1's view:
==========
<L1 disables DR interception>
CPU 0/KVM-7289 [023] d.... 2925.640961: kvm_entry: vcpu 0
A: L1 Writes DR6
CPU 0/KVM-7289 [023] d.... 2925.640963: <hack>: Set DRs, DR6 = 0xffff0ff1
B: CPU 0/KVM-7289 [023] d.... 2925.640967: kvm_exit: vcpu 0 reason EXTERNAL_INTERRUPT intr_info 0x800000ec
D: L1 reads DR6, arch.dr6 = 0
CPU 0/KVM-7289 [023] d.... 2925.640969: <hack>: Sync DRs, DR6 = 0xffff0ff0
CPU 0/KVM-7289 [023] d.... 2925.640976: kvm_entry: vcpu 0
L2 reads DR6, L1 disables DR interception
CPU 0/KVM-7289 [023] d.... 2925.640980: kvm_exit: vcpu 0 reason DR_ACCESS info1 0x0000000000000216
CPU 0/KVM-7289 [023] d.... 2925.640983: kvm_entry: vcpu 0
CPU 0/KVM-7289 [023] d.... 2925.640983: <hack>: Set DRs, DR6 = 0xffff0ff0
L2 detects failure
CPU 0/KVM-7289 [023] d.... 2925.640987: kvm_exit: vcpu 0 reason HLT
L1 reads DR6 (confirms failure)
CPU 0/KVM-7289 [023] d.... 2925.640990: <hack>: Sync DRs, DR6 = 0xffff0ff0
L0's view:
==========
L2 reads DR6, arch.dr6 = 0
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
L2 => L1 nested VM-Exit
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit_inject: reason: DR_ACCESS ext_inf1: 0x0000000000000216
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_entry: vcpu 23
L1 writes DR7, L0 disables DR interception
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000007
CPU 23/KVM-5046 [001] d.... 3410.005613: kvm_entry: vcpu 23
L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005613: <hack>: Set DRs, DR6 = 0xffff0ff0
A: <L1 writes DR6 = 1, no interception, arch.dr6 is still '0'>
B: CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_exit: vcpu 23 reason PREEMPTION_TIMER
CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_entry: vcpu 23
C: L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005614: <hack>: Set DRs, DR6 = 0xffff0ff0
L1 => L2 nested VM-Enter
CPU 23/KVM-5046 [001] d.... 3410.005616: kvm_exit: vcpu 23 reason VMRESUME
L0 reads DR6, arch.dr6 = 0
Reported-by: John Stultz <jstultz(a)google.com>
Closes: https://lkml.kernel.org/r/CANDhNCq5_F3HfFYABqFGCA1bPd_%2BxgNj-iDQhH4tDk%2Bw…
Fixes: 375e28ffc0cf ("KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT")
Fixes: d67668e9dd76 ("KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Tested-by: John Stultz <jstultz(a)google.com>
Link: https://lore.kernel.org/r/20250125011833.3644371-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index c35550581da0..823c0434bbad 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -48,6 +48,7 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b15cde0a9b5c..0b7af5902ff7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1696,6 +1696,7 @@ struct kvm_x86_ops {
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+ void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7640a84e554a..a713c803a3a3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid = sd->next_asid++;
}
-static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- if (svm->vcpu.arch.guest_state_protected)
+ if (vcpu->arch.guest_state_protected)
return;
if (unlikely(value != vmcb->save.dr6)) {
@@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- svm_set_dr6(svm, vcpu->arch.dr6);
- else
- svm_set_dr6(svm, DR6_ACTIVE_LOW);
+ if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
+ .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2427f918e763..43ee9ed11291 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
+ .set_dr6 = vmx_set_dr6,
.set_dr7 = vmx_set_dr7,
.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
.cache_reg = vmx_cache_reg,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f72835e85b6d..6c56d5235f0f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ lockdep_assert_irqs_disabled();
+ set_debugreg(vcpu->arch.dr6, 6);
+}
+
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3295a67c04..430773a5ef8e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e77e61d4fbd..02159c967d29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
FEAT_PMUv3p9 registers such as PMICNTR_EL0, PMICFILTR_EL0, and PMUACR_EL1
access from EL1 requires appropriate EL2 fine grained trap configuration
via FEAT_FGT2 based trap control registers HDFGRTR2_EL2 and HDFGWTR2_EL2.
Otherwise such register accesses will result in traps into EL2.
Add a new helper __init_el2_fgt2() which initializes FEAT_FGT2 based fine
grained trap control registers HDFGRTR2_EL2 and HDFGWTR2_EL2 (setting the
bits nPMICNTR_EL0, nPMICFILTR_EL0 and nPMUACR_EL1) to enable access into
PMICNTR_EL0, PMICFILTR_EL0, and PMUACR_EL1 registers.
Also update booting.rst with SCR_EL3.FGTEn2 requirement for all FEAT_FGT2
based registers to be accessible in EL2.
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Rob Herring <robh(a)kernel.org>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Marc Zyngier <maz(a)kernel.org>
Cc: Oliver Upton <oliver.upton(a)linux.dev>
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: kvmarm(a)lists.linux.dev
Fixes: 0bbff9ed8165 ("perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control")
Fixes: d8226d8cfbaf ("perf: arm_pmuv3: Add support for Armv9.4 PMU instruction counter")
Cc: stable(a)vger.kernel.org
Tested-by: Rob Herring (Arm) <robh(a)kernel.org>
Reviewed-by: Rob Herring (Arm) <robh(a)kernel.org>
Signed-off-by: Anshuman Khandual <anshuman.khandual(a)arm.com>
---
Changes in V3:
- Added 'MDCR_EL3.EnPM2 = 0b1' as a booting requirement per Mark
- Added 'Fixes:' and 'CC: stable' tags per Mark
Changes in V2:
https://lore.kernel.org/all/20250203050828.1049370-8-anshuman.khandual@arm.…
Documentation/arch/arm64/booting.rst | 22 ++++++++++++++++++++++
arch/arm64/include/asm/el2_setup.h | 25 +++++++++++++++++++++++++
2 files changed, 47 insertions(+)
diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst
index cad6fdc96b98..dee7b6de864f 100644
--- a/Documentation/arch/arm64/booting.rst
+++ b/Documentation/arch/arm64/booting.rst
@@ -288,6 +288,12 @@ Before jumping into the kernel, the following conditions must be met:
- SCR_EL3.FGTEn (bit 27) must be initialised to 0b1.
+ For CPUs with the Fine Grained Traps 2 (FEAT_FGT2) extension present:
+
+ - If EL3 is present and the kernel is entered at EL2:
+
+ - SCR_EL3.FGTEn2 (bit 59) must be initialised to 0b1.
+
For CPUs with support for HCRX_EL2 (FEAT_HCX) present:
- If EL3 is present and the kernel is entered at EL2:
@@ -382,6 +388,22 @@ Before jumping into the kernel, the following conditions must be met:
- SMCR_EL2.EZT0 (bit 30) must be initialised to 0b1.
+ For CPUs with the Performance Monitors Extension (FEAT_PMUv3p9):
+
+ - If EL3 is present:
+
+ - MDCR_EL3.EnPM2 (bit 7) must be initialised to 0b1.
+
+ - If the kernel is entered at EL1 and EL2 is present:
+
+ - HDFGRTR2_EL2.nPMICNTR_EL0 (bit 2) must be initialised to 0b1.
+ - HDFGRTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1.
+ - HDFGRTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1.
+
+ - HDFGWTR2_EL2.nPMICNTR_EL0 (bit 2) must be initialised to 0b1.
+ - HDFGWTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1.
+ - HDFGWTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1.
+
For CPUs with Memory Copy and Memory Set instructions (FEAT_MOPS):
- If the kernel is entered at EL1 and EL2 is present:
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 25e162651750..1a0071faf57e 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -233,6 +233,30 @@
.Lskip_fgt_\@:
.endm
+.macro __init_el2_fgt2
+ mrs x1, id_aa64mmfr0_el1
+ ubfx x1, x1, #ID_AA64MMFR0_EL1_FGT_SHIFT, #4
+ cmp x1, #ID_AA64MMFR0_EL1_FGT_FGT2
+ b.lt .Lskip_fgt2_\@
+
+ mov x0, xzr
+ mrs x1, id_aa64dfr0_el1
+ ubfx x1, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4
+ cmp x1, #ID_AA64DFR0_EL1_PMUVer_V3P9
+ b.lt .Lskip_pmuv3p9_\@
+
+ orr x0, x0, #HDFGRTR2_EL2_nPMICNTR_EL0
+ orr x0, x0, #HDFGRTR2_EL2_nPMICFILTR_EL0
+ orr x0, x0, #HDFGRTR2_EL2_nPMUACR_EL1
+.Lskip_pmuv3p9_\@:
+ msr_s SYS_HDFGRTR2_EL2, x0
+ msr_s SYS_HDFGWTR2_EL2, x0
+ msr_s SYS_HFGRTR2_EL2, xzr
+ msr_s SYS_HFGWTR2_EL2, xzr
+ msr_s SYS_HFGITR2_EL2, xzr
+.Lskip_fgt2_\@:
+.endm
+
.macro __init_el2_gcs
mrs_s x1, SYS_ID_AA64PFR1_EL1
ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
@@ -283,6 +307,7 @@
__init_el2_nvhe_idregs
__init_el2_cptr
__init_el2_fgt
+ __init_el2_fgt2
__init_el2_gcs
.endm
--
2.25.1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 6bda857bcbb86fb9d0e54fbef93a093d51172acc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024120342-monsoon-wildcat-d0a1@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6bda857bcbb86fb9d0e54fbef93a093d51172acc Mon Sep 17 00:00:00 2001
From: Muchun Song <muchun.song(a)linux.dev>
Date: Mon, 14 Oct 2024 17:29:33 +0800
Subject: [PATCH] block: fix ordering between checking QUEUE_FLAG_QUIESCED
request adding
Supposing the following scenario.
CPU0 CPU1
blk_mq_insert_request() 1) store
blk_mq_unquiesce_queue()
blk_queue_flag_clear() 3) store
blk_mq_run_hw_queues()
blk_mq_run_hw_queue()
if (!blk_mq_hctx_has_pending()) 4) load
return
blk_mq_run_hw_queue()
if (blk_queue_quiesced()) 2) load
return
blk_mq_sched_dispatch_requests()
The full memory barrier should be inserted between 1) and 2), as well as
between 3) and 4) to make sure that either CPU0 sees QUEUE_FLAG_QUIESCED
is cleared or CPU1 sees dispatch list or setting of bitmap of software
queue. Otherwise, either CPU will not rerun the hardware queue causing
starvation.
So the first solution is to 1) add a pair of memory barrier to fix the
problem, another solution is to 2) use hctx->queue->queue_lock to
synchronize QUEUE_FLAG_QUIESCED. Here, we chose 2) to fix it since
memory barrier is not easy to be maintained.
Fixes: f4560ffe8cec ("blk-mq: use QUEUE_FLAG_QUIESCED to quiesce queue")
Cc: stable(a)vger.kernel.org
Cc: Muchun Song <muchun.song(a)linux.dev>
Signed-off-by: Muchun Song <songmuchun(a)bytedance.com>
Reviewed-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20241014092934.53630-3-songmuchun@bytedance.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5deb9dffca0a..bb4ee2380dce 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2227,6 +2227,24 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
+static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx)
+{
+ bool need_run;
+
+ /*
+ * When queue is quiesced, we may be switching io scheduler, or
+ * updating nr_hw_queues, or other things, and we can't run queue
+ * any more, even blk_mq_hctx_has_pending() can't be called safely.
+ *
+ * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+ * quiesced.
+ */
+ __blk_mq_run_dispatch_ops(hctx->queue, false,
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx));
+ return need_run;
+}
+
/**
* blk_mq_run_hw_queue - Start to run a hardware queue.
* @hctx: Pointer to the hardware queue to run.
@@ -2247,20 +2265,23 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
- /*
- * When queue is quiesced, we may be switching io scheduler, or
- * updating nr_hw_queues, or other things, and we can't run queue
- * any more, even __blk_mq_hctx_has_pending() can't be called safely.
- *
- * And queue will be rerun in blk_mq_unquiesce_queue() if it is
- * quiesced.
- */
- __blk_mq_run_dispatch_ops(hctx->queue, false,
- need_run = !blk_queue_quiesced(hctx->queue) &&
- blk_mq_hctx_has_pending(hctx));
+ need_run = blk_mq_hw_queue_need_run(hctx);
+ if (!need_run) {
+ unsigned long flags;
- if (!need_run)
- return;
+ /*
+ * Synchronize with blk_mq_unquiesce_queue(), because we check
+ * if hw queue is quiesced locklessly above, we need the use
+ * ->queue_lock to make sure we see the up-to-date status to
+ * not miss rerunning the hw queue.
+ */
+ spin_lock_irqsave(&hctx->queue->queue_lock, flags);
+ need_run = blk_mq_hw_queue_need_run(hctx);
+ spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);
+
+ if (!need_run)
+ return;
+ }
if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
blk_mq_delay_run_hw_queue(hctx, 0);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 2003ee8a9aa14d766b06088156978d53c2e9be3d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024120323-snowiness-subway-3844@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2003ee8a9aa14d766b06088156978d53c2e9be3d Mon Sep 17 00:00:00 2001
From: Muchun Song <muchun.song(a)linux.dev>
Date: Mon, 14 Oct 2024 17:29:32 +0800
Subject: [PATCH] block: fix missing dispatching request when queue is started
or unquiesced
Supposing the following scenario with a virtio_blk driver.
CPU0 CPU1 CPU2
blk_mq_try_issue_directly()
__blk_mq_issue_directly()
q->mq_ops->queue_rq()
virtio_queue_rq()
blk_mq_stop_hw_queue()
virtblk_done()
blk_mq_try_issue_directly()
if (blk_mq_hctx_stopped())
blk_mq_request_bypass_insert() blk_mq_run_hw_queue()
blk_mq_run_hw_queue() blk_mq_run_hw_queue()
blk_mq_insert_request()
return
After CPU0 has marked the queue as stopped, CPU1 will see the queue is
stopped. But before CPU1 puts the request on the dispatch list, CPU2
receives the interrupt of completion of request, so it will run the
hardware queue and marks the queue as non-stopped. Meanwhile, CPU1 also
runs the same hardware queue. After both CPU1 and CPU2 complete
blk_mq_run_hw_queue(), CPU1 just puts the request to the same hardware
queue and returns. It misses dispatching a request. Fix it by running
the hardware queue explicitly. And blk_mq_request_issue_directly()
should handle a similar situation. Fix it as well.
Fixes: d964f04a8fde ("blk-mq: fix direct issue")
Cc: stable(a)vger.kernel.org
Cc: Muchun Song <muchun.song(a)linux.dev>
Signed-off-by: Muchun Song <songmuchun(a)bytedance.com>
Reviewed-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20241014092934.53630-2-songmuchun@bytedance.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7d05a56e3639..5deb9dffca0a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2647,6 +2647,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
return;
}
@@ -2677,6 +2678,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
return BLK_STS_OK;
}
From: yangge <yangge1116(a)126.com>
If a large number of CMA memory are configured in system (for example, the
CMA memory accounts for 50% of the system memory), starting a virtual
virtual machine with device passthrough, it will
call pin_user_pages_remote(..., FOLL_LONGTERM, ...) to pin memory.
Normally if a page is present and in CMA area, pin_user_pages_remote()
will migrate the page from CMA area to non-CMA area because of
FOLL_LONGTERM flag. But the current code will cause the migration failure
due to unexpected page refcounts, and eventually cause the virtual machine
fail to start.
If a page is added in LRU batch, its refcount increases one, remove the
page from LRU batch decreases one. Page migration requires the page is not
referenced by others except page mapping. Before migrating a page, we
should try to drain the page from LRU batch in case the page is in it,
however, folio_test_lru() is not sufficient to tell whether the page is
in LRU batch or not, if the page is in LRU batch, the migration will fail.
To solve the problem above, we modify the logic of adding to LRU batch.
Before adding a page to LRU batch, we clear the LRU flag of the page so
that we can check whether the page is in LRU batch by folio_test_lru(page).
It's quite valuable, because likely we don't want to blindly drain the LRU
batch simply because there is some unexpected reference on a page, as
described above.
This change makes the LRU flag of a page invisible for longer, which
may impact some programs. For example, as long as a page is on a LRU
batch, we cannot isolate it, and we cannot check if it's an LRU page.
Further, a page can now only be on exactly one LRU batch. This doesn't
seem to matter much, because a new page is allocated from buddy and
added to the lru batch, or be isolated, it's LRU flag may also be
invisible for a long time.
Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: yangge <yangge1116(a)126.com>
---
mm/swap.c | 43 +++++++++++++++++++++++++++++++------------
1 file changed, 31 insertions(+), 12 deletions(-)
V4:
Adjust commit message according to David's comments
V3:
Add fixes tag
V2:
Adjust code and commit message according to David's comments
diff --git a/mm/swap.c b/mm/swap.c
index dc205bd..9caf6b0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -211,10 +211,6 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- /* block memcg migration while the folio moves between lru */
- if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
- continue;
-
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -255,11 +251,16 @@ static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
void folio_rotate_reclaimable(struct folio *folio)
{
if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
- !folio_test_unevictable(folio) && folio_test_lru(folio)) {
+ !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
unsigned long flags;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock_irqsave(&lru_rotate.lock, flags);
fbatch = this_cpu_ptr(&lru_rotate.fbatch);
folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
@@ -352,11 +353,15 @@ static void folio_activate_drain(int cpu)
void folio_activate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.activate);
folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
@@ -700,6 +705,11 @@ void deactivate_file_folio(struct folio *folio)
return;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
@@ -716,11 +726,16 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
- (folio_test_active(folio) || lru_gen_enabled())) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) ||
+ lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
@@ -737,12 +752,16 @@ void folio_deactivate(struct folio *folio)
*/
void folio_mark_lazyfree(struct folio *folio)
{
- if (folio_test_lru(folio) && folio_test_anon(folio) &&
- folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
--
2.7.4