From: Jeff Xu <jeffxu(a)google.com>
By default, memfd_create() creates a non-sealable MFD, unless the
MFD_ALLOW_SEALING flag is set.
When the MFD_NOEXEC_SEAL flag is initially introduced, the MFD created
with that flag is sealable, even though MFD_ALLOW_SEALING is not set.
This patch changes MFD_NOEXEC_SEAL to be non-sealable by default,
unless MFD_ALLOW_SEALING is explicitly set.
This is a non-backward compatible change. However, as MFD_NOEXEC_SEAL
is new, we expect not many applications will rely on the nature of
MFD_NOEXEC_SEAL being sealable. In most cases, the application already
sets MFD_ALLOW_SEALING if they need a sealable MFD.
Additionally, this enhances the useability of pid namespace sysctl
vm.memfd_noexec. When vm.memfd_noexec equals 1 or 2, the kernel will
add MFD_NOEXEC_SEAL if mfd_create does not specify MFD_EXEC or
MFD_NOEXEC_SEAL, and the addition of MFD_NOEXEC_SEAL enables the MFD
to be sealable. This means, any application that does not desire this
behavior will be unable to utilize vm.memfd_noexec = 1 or 2 to
migrate/enforce non-executable MFD. This adjustment ensures that
applications can anticipate that the sealable characteristic will
remain unmodified by vm.memfd_noexec.
This patch was initially developed by Barnabás Pőcze, and Barnabás
used Debian Code Search and GitHub to try to find potential breakages
and could only find a single one. Dbus-broker's memfd_create() wrapper
is aware of this implicit `MFD_ALLOW_SEALING` behavior, and tries to
work around it [1]. This workaround will break. Luckily, this only
affects the test suite, it does not affect
the normal operations of dbus-broker. There is a PR with a fix[2]. In
addition, David Rheinsberg also raised similar fix in [3]
[1]: https://github.com/bus1/dbus-broker/blob/9eb0b7e5826fc76cad7b025bc46f267d4a…
[2]: https://github.com/bus1/dbus-broker/pull/366
[3]: https://lore.kernel.org/lkml/20230714114753.170814-1-david@readahead.eu/
Cc: stable(a)vger.kernel.org
Fixes: 105ff5339f498a ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC")
Signed-off-by: Barnabás Pőcze <pobrn(a)protonmail.com>
Signed-off-by: Jeff Xu <jeffxu(a)google.com>
Reviewed-by: David Rheinsberg <david(a)readahead.eu>
---
mm/memfd.c | 9 ++++----
tools/testing/selftests/memfd/memfd_test.c | 26 +++++++++++++++++++++-
2 files changed, 29 insertions(+), 6 deletions(-)
diff --git a/mm/memfd.c b/mm/memfd.c
index 7d8d3ab3fa37..8b7f6afee21d 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -356,12 +356,11 @@ SYSCALL_DEFINE2(memfd_create,
inode->i_mode &= ~0111;
file_seals = memfd_file_seals_ptr(file);
- if (file_seals) {
- *file_seals &= ~F_SEAL_SEAL;
+ if (file_seals)
*file_seals |= F_SEAL_EXEC;
- }
- } else if (flags & MFD_ALLOW_SEALING) {
- /* MFD_EXEC and MFD_ALLOW_SEALING are set */
+ }
+
+ if (flags & MFD_ALLOW_SEALING) {
file_seals = memfd_file_seals_ptr(file);
if (file_seals)
*file_seals &= ~F_SEAL_SEAL;
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 95af2d78fd31..8579a93d006b 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -1151,7 +1151,7 @@ static void test_noexec_seal(void)
mfd_def_size,
MFD_CLOEXEC | MFD_NOEXEC_SEAL);
mfd_assert_mode(fd, 0666);
- mfd_assert_has_seals(fd, F_SEAL_EXEC);
+ mfd_assert_has_seals(fd, F_SEAL_SEAL | F_SEAL_EXEC);
mfd_fail_chmod(fd, 0777);
close(fd);
}
@@ -1169,6 +1169,14 @@ static void test_sysctl_sysctl0(void)
mfd_assert_has_seals(fd, 0);
mfd_assert_chmod(fd, 0644);
close(fd);
+
+ fd = mfd_assert_new("kern_memfd_sysctl_0_dfl",
+ mfd_def_size,
+ MFD_CLOEXEC);
+ mfd_assert_mode(fd, 0777);
+ mfd_assert_has_seals(fd, F_SEAL_SEAL);
+ mfd_assert_chmod(fd, 0644);
+ close(fd);
}
static void test_sysctl_set_sysctl0(void)
@@ -1206,6 +1214,14 @@ static void test_sysctl_sysctl1(void)
mfd_assert_has_seals(fd, F_SEAL_EXEC);
mfd_fail_chmod(fd, 0777);
close(fd);
+
+ fd = mfd_assert_new("kern_memfd_sysctl_1_noexec_nosealable",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_NOEXEC_SEAL);
+ mfd_assert_mode(fd, 0666);
+ mfd_assert_has_seals(fd, F_SEAL_EXEC | F_SEAL_SEAL);
+ mfd_fail_chmod(fd, 0777);
+ close(fd);
}
static void test_sysctl_set_sysctl1(void)
@@ -1238,6 +1254,14 @@ static void test_sysctl_sysctl2(void)
mfd_assert_has_seals(fd, F_SEAL_EXEC);
mfd_fail_chmod(fd, 0777);
close(fd);
+
+ fd = mfd_assert_new("kern_memfd_sysctl_2_noexec_notsealable",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_NOEXEC_SEAL);
+ mfd_assert_mode(fd, 0666);
+ mfd_assert_has_seals(fd, F_SEAL_EXEC | F_SEAL_SEAL);
+ mfd_fail_chmod(fd, 0777);
+ close(fd);
}
static void test_sysctl_set_sysctl2(void)
--
2.45.1.288.g0e0cd299f1-goog
amd_rng_mod_init() uses pci_read_config_dword() that returns PCIBIOS_*
codes. The return code is then returned as is but amd_rng_mod_init() is
a module_init() function that should return normal errnos.
Convert PCIBIOS_* returns code using pcibios_err_to_errno() into normal
errno before returning it.
Fixes: 96d63c0297cc ("[PATCH] Add AMD HW RNG driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
---
drivers/char/hw_random/amd-rng.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/char/hw_random/amd-rng.c b/drivers/char/hw_random/amd-rng.c
index 86162a13681e..9a24d19236dc 100644
--- a/drivers/char/hw_random/amd-rng.c
+++ b/drivers/char/hw_random/amd-rng.c
@@ -143,8 +143,10 @@ static int __init amd_rng_mod_init(void)
found:
err = pci_read_config_dword(pdev, 0x58, &pmbase);
- if (err)
+ if (err) {
+ err = pcibios_err_to_errno(err);
goto put_dev;
+ }
pmbase &= 0x0000FF00;
if (pmbase == 0) {
--
2.39.2
Please consider commit
15aa8fb852f995dd
x86/efistub: Omit physical KASLR when memory reservations exist
for backporting to v6.1 and later.
Thanks,
Ard.
This is the start of the stable review cycle for the 5.15.160 release.
There are 23 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 25 May 2024 13:03:15 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.15.160-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.15.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.15.160-rc1
Akira Yokosawa <akiyks(a)gmail.com>
docs: kernel_include.py: Cope with docutils 0.21
Thomas Weißschuh <linux(a)weissschuh.net>
admin-guide/hw-vuln/core-scheduling: fix return type of PR_SCHED_CORE_GET
Jarkko Sakkinen <jarkko(a)kernel.org>
KEYS: trusted: Do not use WARN when encode fails
AngeloGioacchino Del Regno <angelogioacchino.delregno(a)collabora.com>
remoteproc: mediatek: Make sure IPI buffer fits in L2TCM
Daniel Thompson <daniel.thompson(a)linaro.org>
serial: kgdboc: Fix NMI-safety problems from keyboard reset code
Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
usb: typec: ucsi: displayport: Fix potential deadlock
Carlos Llamas <cmllamas(a)google.com>
binder: fix max_thread type inconsistency
Srinivasan Shanmugam <srinivasan.shanmugam(a)amd.com>
drm/amdgpu: Fix possible NULL dereference in amdgpu_ras_query_error_status_helper()
Sean Christopherson <seanjc(a)google.com>
KVM: x86: Clear "has_error_code", not "error_code", for RM exception injection
Eric Dumazet <edumazet(a)google.com>
netlink: annotate data-races around sk->sk_err
Eric Dumazet <edumazet(a)google.com>
netlink: annotate lockless accesses to nlk->max_recvmsg_len
Jakub Kicinski <kuba(a)kernel.org>
net: tls: handle backlogging of crypto requests
Jakub Kicinski <kuba(a)kernel.org>
tls: fix race between async notify and socket close
Jakub Kicinski <kuba(a)kernel.org>
net: tls: factor out tls_*crypt_async_wait()
Sabrina Dubroca <sd(a)queasysnail.net>
tls: extract context alloc/initialization out of tls_set_sw_offload
Jakub Kicinski <kuba(a)kernel.org>
tls: rx: simplify async wait
Doug Berger <opendmb(a)gmail.com>
net: bcmgenet: synchronize UMAC_CMD access
Doug Berger <opendmb(a)gmail.com>
net: bcmgenet: synchronize EXT_RGMII_OOB_CTRL access
Harshit Mogalapalli <harshit.m.mogalapalli(a)oracle.com>
Revert "selftests: mm: fix map_hugetlb failure on 64K page size systems"
Jarkko Sakkinen <jarkko(a)kernel.org>
KEYS: trusted: Fix memory leak in tpm2_key_encode()
NeilBrown <neilb(a)suse.de>
nfsd: don't allow nfsd threads to be signalled.
Sergey Shtylyov <s.shtylyov(a)omp.ru>
pinctrl: core: handle radix_tree_insert() errors in pinctrl_register_one_pin()
Jose Fernandez <josef(a)netflix.com>
drm/amd/display: Fix division by zero in setup_dsc_config
-------------
Diffstat:
.../admin-guide/hw-vuln/core-scheduling.rst | 4 +-
Documentation/sphinx/kernel_include.py | 1 -
Makefile | 4 +-
arch/x86/kvm/x86.c | 11 +-
drivers/android/binder.c | 2 +-
drivers/android/binder_internal.h | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +
drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c | 7 +-
drivers/net/ethernet/broadcom/genet/bcmgenet.c | 12 +-
drivers/net/ethernet/broadcom/genet/bcmgenet.h | 2 +
drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c | 6 +
drivers/net/ethernet/broadcom/genet/bcmmii.c | 4 +
drivers/pinctrl/core.c | 14 +-
drivers/remoteproc/mtk_scp.c | 10 +-
drivers/tty/serial/kgdboc.c | 30 +++-
drivers/usb/typec/ucsi/displayport.c | 4 -
fs/nfs/callback.c | 9 +-
fs/nfsd/nfs4proc.c | 5 +-
fs/nfsd/nfssvc.c | 12 --
include/net/tls.h | 6 -
net/netlink/af_netlink.c | 23 +--
net/sunrpc/svc_xprt.c | 16 +-
net/tls/tls_sw.c | 199 +++++++++++----------
security/keys/trusted-keys/trusted_tpm2.c | 25 ++-
tools/testing/selftests/vm/map_hugetlb.c | 7 -
25 files changed, 243 insertions(+), 175 deletions(-)
On Thu, 30 May 2024 at 21:11, Sasha Levin <sashal(a)kernel.org> wrote:
>
> This is a note to let you know that I've just added the patch titled
>
> arm64: fpsimd: Drop unneeded 'busy' flag
>
> to the 6.6-stable tree
Why?
> which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> arm64-fpsimd-drop-unneeded-busy-flag.patch
> and it can be found in the queue-6.6 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit 37f2773a1ef05374538d5e4ed26cbacebe363241
> Author: Ard Biesheuvel <ardb(a)kernel.org>
> Date: Fri Dec 8 12:32:20 2023 +0100
>
> arm64: fpsimd: Drop unneeded 'busy' flag
>
> [ Upstream commit 9b19700e623f96222c69ecb2adecb1a3e3664cc0 ]
>
> Kernel mode NEON will preserve the user mode FPSIMD state by saving it
> into the task struct before clobbering the registers. In order to avoid
> the need for preserving kernel mode state too, we disallow nested use of
> kernel mode NEON, i..e, use in softirq context while the interrupted
> task context was using kernel mode NEON too.
>
> Originally, this policy was implemented using a per-CPU flag which was
> exposed via may_use_simd(), requiring the users of the kernel mode NEON
> to deal with the possibility that it might return false, and having NEON
> and non-NEON code paths. This policy was changed by commit
> 13150149aa6ded1 ("arm64: fpsimd: run kernel mode NEON with softirqs
> disabled"), and now, softirq processing is disabled entirely instead,
> and so may_use_simd() can never fail when called from task or softirq
> context.
>
> This means we can drop the fpsimd_context_busy flag entirely, and
> instead, ensure that we disable softirq processing in places where we
> formerly relied on the flag for preventing races in the FPSIMD preserve
> routines.
>
> Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
> Reviewed-by: Mark Brown <broonie(a)kernel.org>
> Tested-by: Geert Uytterhoeven <geert+renesas(a)glider.be>
> Link: https://lore.kernel.org/r/20231208113218.3001940-7-ardb@google.com
> [will: Folded in fix from CAMj1kXFhzbJRyWHELCivQW1yJaF=p07LLtbuyXYX3G1WtsdyQg(a)mail.gmail.com]
> Signed-off-by: Will Deacon <will(a)kernel.org>
> Stable-dep-of: b8995a184170 ("Revert "arm64: fpsimd: Implement lazy restore for kernel mode FPSIMD"")
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h
> index 6a75d7ecdcaa2..8e86c9e70e483 100644
> --- a/arch/arm64/include/asm/simd.h
> +++ b/arch/arm64/include/asm/simd.h
> @@ -12,8 +12,6 @@
> #include <linux/preempt.h>
> #include <linux/types.h>
>
> -DECLARE_PER_CPU(bool, fpsimd_context_busy);
> -
> #ifdef CONFIG_KERNEL_MODE_NEON
>
> /*
> @@ -28,17 +26,10 @@ static __must_check inline bool may_use_simd(void)
> /*
> * We must make sure that the SVE has been initialized properly
> * before using the SIMD in kernel.
> - * fpsimd_context_busy is only set while preemption is disabled,
> - * and is clear whenever preemption is enabled. Since
> - * this_cpu_read() is atomic w.r.t. preemption, fpsimd_context_busy
> - * cannot change under our feet -- if it's set we cannot be
> - * migrated, and if it's clear we cannot be migrated to a CPU
> - * where it is set.
> */
> return !WARN_ON(!system_capabilities_finalized()) &&
> system_supports_fpsimd() &&
> - !in_hardirq() && !irqs_disabled() && !in_nmi() &&
> - !this_cpu_read(fpsimd_context_busy);
> + !in_hardirq() && !irqs_disabled() && !in_nmi();
> }
>
> #else /* ! CONFIG_KERNEL_MODE_NEON */
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index 5cdfcc9e3e54b..b805bdab284c4 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -85,13 +85,13 @@
> * softirq kicks in. Upon vcpu_put(), KVM will save the vcpu FP state and
> * flag the register state as invalid.
> *
> - * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may
> - * save the task's FPSIMD context back to task_struct from softirq context.
> - * To prevent this from racing with the manipulation of the task's FPSIMD state
> - * from task context and thereby corrupting the state, it is necessary to
> - * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE
> - * flag with {, __}get_cpu_fpsimd_context(). This will still allow softirqs to
> - * run but prevent them to use FPSIMD.
> + * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may be
> + * called from softirq context, which will save the task's FPSIMD context back
> + * to task_struct. To prevent this from racing with the manipulation of the
> + * task's FPSIMD state from task context and thereby corrupting the state, it
> + * is necessary to protect any manipulation of a task's fpsimd_state or
> + * TIF_FOREIGN_FPSTATE flag with get_cpu_fpsimd_context(), which will suspend
> + * softirq servicing entirely until put_cpu_fpsimd_context() is called.
> *
> * For a certain task, the sequence may look something like this:
> * - the task gets scheduled in; if both the task's fpsimd_cpu field
> @@ -209,27 +209,14 @@ static inline void sme_free(struct task_struct *t) { }
>
> #endif
>
> -DEFINE_PER_CPU(bool, fpsimd_context_busy);
> -EXPORT_PER_CPU_SYMBOL(fpsimd_context_busy);
> -
> static void fpsimd_bind_task_to_cpu(void);
>
> -static void __get_cpu_fpsimd_context(void)
> -{
> - bool busy = __this_cpu_xchg(fpsimd_context_busy, true);
> -
> - WARN_ON(busy);
> -}
> -
> /*
> * Claim ownership of the CPU FPSIMD context for use by the calling context.
> *
> * The caller may freely manipulate the FPSIMD context metadata until
> * put_cpu_fpsimd_context() is called.
> *
> - * The double-underscore version must only be called if you know the task
> - * can't be preempted.
> - *
> * On RT kernels local_bh_disable() is not sufficient because it only
> * serializes soft interrupt related sections via a local lock, but stays
> * preemptible. Disabling preemption is the right choice here as bottom
> @@ -242,14 +229,6 @@ static void get_cpu_fpsimd_context(void)
> local_bh_disable();
> else
> preempt_disable();
> - __get_cpu_fpsimd_context();
> -}
> -
> -static void __put_cpu_fpsimd_context(void)
> -{
> - bool busy = __this_cpu_xchg(fpsimd_context_busy, false);
> -
> - WARN_ON(!busy); /* No matching get_cpu_fpsimd_context()? */
> }
>
> /*
> @@ -261,18 +240,12 @@ static void __put_cpu_fpsimd_context(void)
> */
> static void put_cpu_fpsimd_context(void)
> {
> - __put_cpu_fpsimd_context();
> if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> local_bh_enable();
> else
> preempt_enable();
> }
>
> -static bool have_cpu_fpsimd_context(void)
> -{
> - return !preemptible() && __this_cpu_read(fpsimd_context_busy);
> -}
> -
> unsigned int task_get_vl(const struct task_struct *task, enum vec_type type)
> {
> return task->thread.vl[type];
> @@ -383,7 +356,7 @@ static void task_fpsimd_load(void)
> bool restore_ffr;
>
> WARN_ON(!system_supports_fpsimd());
> - WARN_ON(!have_cpu_fpsimd_context());
> + WARN_ON(preemptible());
>
> if (system_supports_sve() || system_supports_sme()) {
> switch (current->thread.fp_type) {
> @@ -467,7 +440,7 @@ static void fpsimd_save(void)
> unsigned int vl;
>
> WARN_ON(!system_supports_fpsimd());
> - WARN_ON(!have_cpu_fpsimd_context());
> + WARN_ON(preemptible());
>
> if (test_thread_flag(TIF_FOREIGN_FPSTATE))
> return;
> @@ -1583,7 +1556,7 @@ void fpsimd_thread_switch(struct task_struct *next)
> if (!system_supports_fpsimd())
> return;
>
> - __get_cpu_fpsimd_context();
> + WARN_ON_ONCE(!irqs_disabled());
>
> /* Save unsaved fpsimd state, if any: */
> fpsimd_save();
> @@ -1599,8 +1572,6 @@ void fpsimd_thread_switch(struct task_struct *next)
>
> update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE,
> wrong_task || wrong_cpu);
> -
> - __put_cpu_fpsimd_context();
> }
>
> static void fpsimd_flush_thread_vl(enum vec_type type)
> @@ -1892,13 +1863,15 @@ static void fpsimd_flush_cpu_state(void)
> */
> void fpsimd_save_and_flush_cpu_state(void)
> {
> + unsigned long flags;
> +
> if (!system_supports_fpsimd())
> return;
> WARN_ON(preemptible());
> - __get_cpu_fpsimd_context();
> + local_irq_save(flags);
> fpsimd_save();
> fpsimd_flush_cpu_state();
> - __put_cpu_fpsimd_context();
> + local_irq_restore(flags);
> }
>
> #ifdef CONFIG_KERNEL_MODE_NEON
commit 4a63bd179fa8d3fcc44a0d9d71d941ddd62f0c4e upstream.
Currently ALSA timer doesn't have the lower limit of the start tick
time, and it allows a very small size, e.g. 1 tick with 1ns resolution
for hrtimer. Such a situation may lead to an unexpected RCU stall,
where the callback repeatedly queuing the expire update, as reported
by fuzzer.
This patch introduces a sanity check of the timer start tick time, so
that the system returns an error when a too small start size is set.
As of this patch, the lower limit is hard-coded to 100us, which is
small enough but can still work somehow.
[ backport note: the error handling is changed, as the original commit
is based on the recent cleanup with guard() in commit beb45974dd49
-- tiwai ]
Reported-by: syzbot+43120c2af6ca2938cc38(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/000000000000fa00a1061740ab6d@google.com
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/20240514182745.4015-1-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
---
Greg, this is an alternative fix to the original cherry-pick; apply
to 6.8.y and older stable kernels. Thanks!
sound/core/timer.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/sound/core/timer.c b/sound/core/timer.c
index e6e551d4a29e..a0b515981ee9 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -553,6 +553,16 @@ static int snd_timer_start1(struct snd_timer_instance *timeri,
goto unlock;
}
+ /* check the actual time for the start tick;
+ * bail out as error if it's way too low (< 100us)
+ */
+ if (start) {
+ if ((u64)snd_timer_hw_resolution(timer) * ticks < 100000) {
+ result = -EINVAL;
+ goto unlock;
+ }
+ }
+
if (start)
timeri->ticks = timeri->cticks = ticks;
else if (!timeri->cticks)
--
2.43.0
In our production environment, we found many hung tasks which are
blocked for more than 18 hours. Their call traces are like this:
[346278.191038] __schedule+0x2d8/0x890
[346278.191046] schedule+0x4e/0xb0
[346278.191049] perf_event_free_task+0x220/0x270
[346278.191056] ? init_wait_var_entry+0x50/0x50
[346278.191060] copy_process+0x663/0x18d0
[346278.191068] kernel_clone+0x9d/0x3d0
[346278.191072] __do_sys_clone+0x5d/0x80
[346278.191076] __x64_sys_clone+0x25/0x30
[346278.191079] do_syscall_64+0x5c/0xc0
[346278.191083] ? syscall_exit_to_user_mode+0x27/0x50
[346278.191086] ? do_syscall_64+0x69/0xc0
[346278.191088] ? irqentry_exit_to_user_mode+0x9/0x20
[346278.191092] ? irqentry_exit+0x19/0x30
[346278.191095] ? exc_page_fault+0x89/0x160
[346278.191097] ? asm_exc_page_fault+0x8/0x30
[346278.191102] entry_SYSCALL_64_after_hwframe+0x44/0xae
The task was waiting for the refcount become to 1, but from the vmcore,
we found the refcount has already been 1. It seems that the task didn't
get woken up by perf_event_release_kernel() and got stuck forever. The
below scenario may cause the problem.
Thread A Thread B
... ...
perf_event_free_task perf_event_release_kernel
...
acquire event->child_mutex
...
get_ctx
... release event->child_mutex
acquire ctx->mutex
...
perf_free_event (acquire/release event->child_mutex)
...
release ctx->mutex
wait_var_event
acquire ctx->mutex
acquire event->child_mutex
# move existing events to free_list
release event->child_mutex
release ctx->mutex
put_ctx
... ...
In this case, all events of the ctx have been freed, so we couldn't
find the ctx in free_list and Thread A will miss the wakeup. It's thus
necessary to add a wakeup after dropping the reference.
Fixes: 1cf8dfe8a661 ("perf/core: Fix race between close() and fork()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Haifeng Xu <haifeng.xu(a)shopee.com>
Reviewed-by: Frederic Weisbecker <frederic(a)kernel.org>
Acked-by: Mark Rutland <mark.rutland(a)arm.com>
---
Changes since v1:
- Add the fixed tag.
- Simplify v1's patch. (Frederic)
Changes since v2:
- Use Reviewed-by tag instead of Signed-off-by tag.
Changes since v3:
- Add Acked-by tag.
- Cc stable(a)vger.kernel.org. (Mark)
---
kernel/events/core.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4f0c45ab8d7d..15c35070db6a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5340,6 +5340,7 @@ int perf_event_release_kernel(struct perf_event *event)
again:
mutex_lock(&event->child_mutex);
list_for_each_entry(child, &event->child_list, child_list) {
+ void *var = NULL;
/*
* Cannot change, child events are not migrated, see the
@@ -5380,11 +5381,23 @@ int perf_event_release_kernel(struct perf_event *event)
* this can't be the last reference.
*/
put_event(event);
+ } else {
+ var = &ctx->refcount;
}
mutex_unlock(&event->child_mutex);
mutex_unlock(&ctx->mutex);
put_ctx(ctx);
+
+ if (var) {
+ /*
+ * If perf_event_free_task() has deleted all events from the
+ * ctx while the child_mutex got released above, make sure to
+ * notify about the preceding put_ctx().
+ */
+ smp_mb(); /* pairs with wait_var_event() */
+ wake_up_var(var);
+ }
goto again;
}
mutex_unlock(&event->child_mutex);
--
2.25.1
For atomic_sub_and_test() the @i parameter is the value to subtract, not
add. Fix the kerneldoc comment accordingly.
Fixes: ad8110706f38 ("locking/atomic: scripts: generate kerneldoc comments")
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
---
include/linux/atomic/atomic-instrumented.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index debd487fe971..12b558c05384 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -1349,7 +1349,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
/**
* atomic_sub_and_test() - atomic subtract and test if zero with full ordering
- * @i: int value to add
+ * @i: int value to subtract
* @v: pointer to atomic_t
*
* Atomically updates @v to (@v - @i) with full ordering.
--
2.45.0.rc1.225.g2a3ae87e7f-goog
Use {READ,WRITE}_ONCE() to access kvm->last_boosted_vcpu to ensure the
loads and stores are atomic. In the extremely unlikely scenario the
compiler tears the stores, it's theoretically possible for KVM to attempt
to get a vCPU using an out-of-bounds index, e.g. if the write is split
into multiple 8-bit stores, and is paired with a 32-bit load on a VM with
257 vCPUs:
CPU0 CPU1
last_boosted_vcpu = 0xff;
(last_boosted_vcpu = 0x100)
last_boosted_vcpu[15:8] = 0x01;
i = (last_boosted_vcpu = 0x1ff)
last_boosted_vcpu[7:0] = 0x00;
vcpu = kvm->vcpu_array[0x1ff];
As detected by KCSAN:
BUG: KCSAN: data-race in kvm_vcpu_on_spin [kvm] / kvm_vcpu_on_spin [kvm]
write to 0xffffc90025a92344 of 4 bytes by task 4340 on cpu 16:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4112) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
read to 0xffffc90025a92344 of 4 bytes by task 4342 on cpu 4:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4069) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
value changed: 0x00000012 -> 0x00000000
Fixes: 217ece6129f2 ("KVM: use yield_to instead of sleep in kvm_vcpu_on_spin")
Cc: stable(a)vger.kernel.org
Signed-off-by: Breno Leitao <leitao(a)debian.org>
---
Changelog:
v2:
* Reworded the git commit as suggested by Sean
* Dropped the me->kvm->last_boosted_vcpu in favor of
kvm->last_boosted_vcpu as suggested by Sean
v1:
* https://lore.kernel.org/all/20240509090146.146153-1-leitao@debian.org/
---
virt/kvm/kvm_main.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ff0a20565f90..d9ce063c76f9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4066,12 +4066,13 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
struct kvm *kvm = me->kvm;
struct kvm_vcpu *vcpu;
- int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+ int last_boosted_vcpu;
unsigned long i;
int yielded = 0;
int try = 3;
int pass;
+ last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
@@ -4109,7 +4110,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
yielded = kvm_vcpu_yield_to(vcpu);
if (yielded > 0) {
- kvm->last_boosted_vcpu = i;
+ WRITE_ONCE(kvm->last_boosted_vcpu, i);
break;
} else if (yielded < 0) {
try--;
--
2.43.0