From: "Luis Henriques (SUSE)" <luis.henriques(a)linux.dev>
[ commit 23dfdb56581ad92a9967bcd720c8c23356af74c1 upstream ]
The following kernel trace can be triggered with fstest generic/629 when
executed against a filesystem with the fast-commit feature enabled:
INFO: trying to register non-static key.
The code is fine but needs lockdep annotation, or maybe
you didn't initialize this object before use?
turning off the locking correctness validator.
CPU: 0 PID: 866 Comm: mount Not tainted 6.10.0+ #11
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-prebuilt.qemu.org 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x66/0x90
register_lock_class+0x759/0x7d0
__lock_acquire+0x85/0x2630
? __find_get_block+0xb4/0x380
lock_acquire+0xd1/0x2d0
? __ext4_journal_get_write_access+0xd5/0x160
_raw_spin_lock+0x33/0x40
? __ext4_journal_get_write_access+0xd5/0x160
__ext4_journal_get_write_access+0xd5/0x160
ext4_reserve_inode_write+0x61/0xb0
__ext4_mark_inode_dirty+0x79/0x270
? ext4_ext_replay_set_iblocks+0x2f8/0x450
ext4_ext_replay_set_iblocks+0x330/0x450
ext4_fc_replay+0x14c8/0x1540
? jread+0x88/0x2e0
? rcu_is_watching+0x11/0x40
do_one_pass+0x447/0xd00
jbd2_journal_recover+0x139/0x1b0
jbd2_journal_load+0x96/0x390
ext4_load_and_init_journal+0x253/0xd40
ext4_fill_super+0x2cc6/0x3180
...
In the replay path there's an attempt to lock sbi->s_bdev_wb_lock in
function ext4_check_bdev_write_error(). Unfortunately, at this point this
spinlock has not been initialized yet. Moving its initialization to an
earlier point in __ext4_fill_super() fixes this splat.
Signed-off-by: Luis Henriques (SUSE) <luis.henriques(a)linux.dev>
Link: https://patch.msgid.link/20240718094356.7863-1-luis.henriques@linux.dev
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Cc: stable(a)kernel.org
Signed-off-by: Imkanmod Khan <imkanmodkhan(a)gmail.com>
---
fs/ext4/super.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0b2591c07166..53f1deb049ec 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5264,6 +5264,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
+ spin_lock_init(&sbi->s_bdev_wb_lock);
+
ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -5514,7 +5516,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* Save the original bdev mapping's wb_err value which could be
* used to detect the metadata async write error.
*/
- spin_lock_init(&sbi->s_bdev_wb_lock);
errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
&sbi->s_bdev_wb_err);
sb->s_bdev->bd_super = sb;
--
2.25.1
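[ Editor's note: as background to the splat above, lockdep prints
"INFO: trying to register non-static key." whenever a dynamically
allocated lock is taken before spin_lock_init() has registered a lock
class for it, which is exactly what the fast-commit replay path hit
here. A minimal, hypothetical sketch of the failure mode and the
correct ordering (illustrative only, not part of the patch): ]

#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo {
	spinlock_t lock;
};

static void demo_lock_init_ordering(void)
{
	struct demo *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return;

	/*
	 * Taking the lock before spin_lock_init() would trigger the
	 * "trying to register non-static key" splat under lockdep,
	 * because no lock class has been registered for it yet.
	 */

	/* Correct ordering: initialize first, then use. */
	spin_lock_init(&d->lock);
	spin_lock(&d->lock);
	spin_unlock(&d->lock);

	kfree(d);
}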
From: "Luis Henriques (SUSE)" <luis.henriques(a)linux.dev>
[ commit 23dfdb56581ad92a9967bcd720c8c23356af74c1 upstream ]
The following kernel trace can be triggered with fstest generic/629 when
executed against a filesystem with the fast-commit feature enabled:
INFO: trying to register non-static key.
The code is fine but needs lockdep annotation, or maybe
you didn't initialize this object before use?
turning off the locking correctness validator.
CPU: 0 PID: 866 Comm: mount Not tainted 6.10.0+ #11
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-prebuilt.qemu.org 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x66/0x90
register_lock_class+0x759/0x7d0
__lock_acquire+0x85/0x2630
? __find_get_block+0xb4/0x380
lock_acquire+0xd1/0x2d0
? __ext4_journal_get_write_access+0xd5/0x160
_raw_spin_lock+0x33/0x40
? __ext4_journal_get_write_access+0xd5/0x160
__ext4_journal_get_write_access+0xd5/0x160
ext4_reserve_inode_write+0x61/0xb0
__ext4_mark_inode_dirty+0x79/0x270
? ext4_ext_replay_set_iblocks+0x2f8/0x450
ext4_ext_replay_set_iblocks+0x330/0x450
ext4_fc_replay+0x14c8/0x1540
? jread+0x88/0x2e0
? rcu_is_watching+0x11/0x40
do_one_pass+0x447/0xd00
jbd2_journal_recover+0x139/0x1b0
jbd2_journal_load+0x96/0x390
ext4_load_and_init_journal+0x253/0xd40
ext4_fill_super+0x2cc6/0x3180
...
In the replay path there's an attempt to lock sbi->s_bdev_wb_lock in
function ext4_check_bdev_write_error(). Unfortunately, at this point this
spinlock has not been initialized yet. Moving its initialization to an
earlier point in __ext4_fill_super() fixes this splat.
Signed-off-by: Luis Henriques (SUSE) <luis.henriques(a)linux.dev>
Link: https://patch.msgid.link/20240718094356.7863-1-luis.henriques@linux.dev
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Cc: stable(a)kernel.org
Signed-off-by: Imkanmod Khan <imkanmodkhan(a)gmail.com>
---
fs/ext4/super.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 71ced0ada9a2..f019ce64eba4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5366,6 +5366,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
+ spin_lock_init(&sbi->s_bdev_wb_lock);
+
ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -5586,7 +5588,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* Save the original bdev mapping's wb_err value which could be
* used to detect the metadata async write error.
*/
- spin_lock_init(&sbi->s_bdev_wb_lock);
errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
&sbi->s_bdev_wb_err);
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
--
2.25.1
From: David Woodhouse <dwmw(a)amazon.co.uk>
[ Upstream commit 4b5bc2ec9a239bce261ffeafdd63571134102323 ]
Now that the following fix:
d0ceea662d45 ("x86/mm: Add _PAGE_NOPTISHADOW bit to avoid updating userspace page tables")
stops kernel_ident_mapping_init() from scribbling over the end of a
4KiB PGD by assuming the following 4KiB will be a userspace PGD,
there's no good reason for the kexec PGD to be part of a single
8KiB allocation with the control_code_page.
( It's not clear that that was the reason for x86_64 kexec doing it that
way in the first place either; there were no comments to that effect and
it seems to have been the case even before PTI came along. It looks like
it was just a happy accident which prevented memory corruption on kexec. )
Either way, it definitely isn't needed now. Just allocate the PGD
separately on x86_64, like i386 already does.
Signed-off-by: David Woodhouse <dwmw(a)amazon.co.uk>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Vivek Goyal <vgoyal(a)redhat.com>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Eric Biederman <ebiederm(a)xmission.com>
Cc: Ard Biesheuvel <ardb(a)kernel.org>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Link: https://lore.kernel.org/r/20241205153343.3275139-6-dwmw2@infradead.org
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/x86/include/asm/kexec.h | 18 +++++++++---
arch/x86/kernel/machine_kexec_64.c | 45 ++++++++++++++++--------------
2 files changed, 38 insertions(+), 25 deletions(-)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 256eee99afc8f..e2e1ec99c9998 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -16,6 +16,7 @@
# define PAGES_NR 4
#endif
+# define KEXEC_CONTROL_PAGE_SIZE 4096
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
#ifndef __ASSEMBLY__
@@ -44,7 +45,6 @@ struct kimage;
/* Maximum address we can use for the control code buffer */
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
-# define KEXEC_CONTROL_PAGE_SIZE 4096
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_386
@@ -59,9 +59,6 @@ struct kimage;
/* Maximum address we can use for the control pages */
# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
-/* Allocate one page for the pdp and the second for the code */
-# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
-
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_X86_64
#endif
@@ -146,6 +143,19 @@ struct kimage_arch {
};
#else
struct kimage_arch {
+ /*
+ * This is a kimage control page, as it must not overlap with either
+ * source or destination address ranges.
+ */
+ pgd_t *pgd;
+ /*
+ * The virtual mapping of the control code page itself is used only
+ * during the transition, while the current kernel's pages are all
+ * in place. Thus the intermediate page table pages used to map it
+ * are not control pages, but instead just normal pages obtained
+ * with get_zeroed_page(). And have to be tracked (below) so that
+ * they can be freed.
+ */
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 24b6eaacc81eb..5d61a342871b5 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -149,7 +149,8 @@ static void free_transition_pgtable(struct kimage *image)
image->arch.pte = NULL;
}
-static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
+ unsigned long control_page)
{
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
unsigned long vaddr, paddr;
@@ -160,7 +161,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
pte_t *pte;
vaddr = (unsigned long)relocate_kernel;
- paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
+ paddr = control_page;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
@@ -219,7 +220,7 @@ static void *alloc_pgt_page(void *data)
return p;
}
-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+static int init_pgtable(struct kimage *image, unsigned long control_page)
{
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
@@ -228,12 +229,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
.kernpg_flag = _KERNPG_TABLE_NOENC,
};
unsigned long mstart, mend;
- pgd_t *level4p;
int result;
int i;
- level4p = (pgd_t *)__va(start_pgtable);
- clear_page(level4p);
+ image->arch.pgd = alloc_pgt_page(image);
+ if (!image->arch.pgd)
+ return -ENOMEM;
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
info.page_flag |= _PAGE_ENC;
@@ -247,8 +248,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
- result = kernel_ident_mapping_init(&info,
- level4p, mstart, mend);
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
if (result)
return result;
}
@@ -263,8 +264,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
- result = kernel_ident_mapping_init(&info,
- level4p, mstart, mend);
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
if (result)
return result;
@@ -274,15 +275,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
* Prepare EFI systab and ACPI tables for kexec kernel since they are
* not covered by pfn_mapped.
*/
- result = map_efi_systab(&info, level4p);
+ result = map_efi_systab(&info, image->arch.pgd);
if (result)
return result;
- result = map_acpi_tables(&info, level4p);
+ result = map_acpi_tables(&info, image->arch.pgd);
if (result)
return result;
- return init_transition_pgtable(image, level4p);
+ /*
+ * This must be last because the intermediate page table pages it
+ * allocates will not be control pages and may overlap the image.
+ */
+ return init_transition_pgtable(image, image->arch.pgd, control_page);
}
static void load_segments(void)
@@ -299,14 +304,14 @@ static void load_segments(void)
int machine_kexec_prepare(struct kimage *image)
{
- unsigned long start_pgtable;
+ unsigned long control_page;
int result;
/* Calculate the offsets */
- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_page = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
/* Setup the identity mapped 64bit page table */
- result = init_pgtable(image, start_pgtable);
+ result = init_pgtable(image, control_page);
if (result)
return result;
@@ -353,13 +358,12 @@ void machine_kexec(struct kimage *image)
#endif
}
- control_page = page_address(image->control_code_page) + PAGE_SIZE;
+ control_page = page_address(image->control_code_page);
__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
- page_list[PA_TABLE_PAGE] =
- (unsigned long)__pa(page_address(image->control_code_page));
+ page_list[PA_TABLE_PAGE] = (unsigned long)__pa(image->arch.pgd);
if (image->type == KEXEC_TYPE_DEFAULT)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
@@ -578,8 +582,7 @@ static void kexec_mark_crashkres(bool protect)
/* Don't touch the control code page used in crash_kexec().*/
control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
- /* Control code page is located in the 2nd page. */
- kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
+ kexec_mark_range(crashk_res.start, control - 1, protect);
control += KEXEC_CONTROL_PAGE_SIZE;
kexec_mark_range(control, crashk_res.end, protect);
}
--
2.39.5
From: David Woodhouse <dwmw(a)amazon.co.uk>
[ Upstream commit 4b5bc2ec9a239bce261ffeafdd63571134102323 ]
Now that the following fix:
d0ceea662d45 ("x86/mm: Add _PAGE_NOPTISHADOW bit to avoid updating userspace page tables")
stops kernel_ident_mapping_init() from scribbling over the end of a
4KiB PGD by assuming the following 4KiB will be a userspace PGD,
there's no good reason for the kexec PGD to be part of a single
8KiB allocation with the control_code_page.
( It's not clear that that was the reason for x86_64 kexec doing it that
way in the first place either; there were no comments to that effect and
it seems to have been the case even before PTI came along. It looks like
it was just a happy accident which prevented memory corruption on kexec. )
Either way, it definitely isn't needed now. Just allocate the PGD
separately on x86_64, like i386 already does.
Signed-off-by: David Woodhouse <dwmw(a)amazon.co.uk>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Vivek Goyal <vgoyal(a)redhat.com>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Eric Biederman <ebiederm(a)xmission.com>
Cc: Ard Biesheuvel <ardb(a)kernel.org>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Link: https://lore.kernel.org/r/20241205153343.3275139-6-dwmw2@infradead.org
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/x86/include/asm/kexec.h | 18 +++++++++---
arch/x86/kernel/machine_kexec_64.c | 45 ++++++++++++++++--------------
2 files changed, 38 insertions(+), 25 deletions(-)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index c9f6a6c5de3cf..d54dd7084a3e1 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -16,6 +16,7 @@
# define PAGES_NR 4
#endif
+# define KEXEC_CONTROL_PAGE_SIZE 4096
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
#ifndef __ASSEMBLY__
@@ -44,7 +45,6 @@ struct kimage;
/* Maximum address we can use for the control code buffer */
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
-# define KEXEC_CONTROL_PAGE_SIZE 4096
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_386
@@ -59,9 +59,6 @@ struct kimage;
/* Maximum address we can use for the control pages */
# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
-/* Allocate one page for the pdp and the second for the code */
-# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
-
/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_X86_64
#endif
@@ -146,6 +143,19 @@ struct kimage_arch {
};
#else
struct kimage_arch {
+ /*
+ * This is a kimage control page, as it must not overlap with either
+ * source or destination address ranges.
+ */
+ pgd_t *pgd;
+ /*
+ * The virtual mapping of the control code page itself is used only
+ * during the transition, while the current kernel's pages are all
+ * in place. Thus the intermediate page table pages used to map it
+ * are not control pages, but instead just normal pages obtained
+ * with get_zeroed_page(). And have to be tracked (below) so that
+ * they can be freed.
+ */
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 2fa12d1dc6760..8509d809a9f1b 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -149,7 +149,8 @@ static void free_transition_pgtable(struct kimage *image)
image->arch.pte = NULL;
}
-static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
+ unsigned long control_page)
{
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
unsigned long vaddr, paddr;
@@ -160,7 +161,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
pte_t *pte;
vaddr = (unsigned long)relocate_kernel;
- paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
+ paddr = control_page;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
@@ -219,7 +220,7 @@ static void *alloc_pgt_page(void *data)
return p;
}
-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+static int init_pgtable(struct kimage *image, unsigned long control_page)
{
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
@@ -228,12 +229,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
.kernpg_flag = _KERNPG_TABLE_NOENC,
};
unsigned long mstart, mend;
- pgd_t *level4p;
int result;
int i;
- level4p = (pgd_t *)__va(start_pgtable);
- clear_page(level4p);
+ image->arch.pgd = alloc_pgt_page(image);
+ if (!image->arch.pgd)
+ return -ENOMEM;
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
info.page_flag |= _PAGE_ENC;
@@ -247,8 +248,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
- result = kernel_ident_mapping_init(&info,
- level4p, mstart, mend);
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
if (result)
return result;
}
@@ -263,8 +264,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
- result = kernel_ident_mapping_init(&info,
- level4p, mstart, mend);
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
if (result)
return result;
@@ -274,15 +275,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
* Prepare EFI systab and ACPI tables for kexec kernel since they are
* not covered by pfn_mapped.
*/
- result = map_efi_systab(&info, level4p);
+ result = map_efi_systab(&info, image->arch.pgd);
if (result)
return result;
- result = map_acpi_tables(&info, level4p);
+ result = map_acpi_tables(&info, image->arch.pgd);
if (result)
return result;
- return init_transition_pgtable(image, level4p);
+ /*
+ * This must be last because the intermediate page table pages it
+ * allocates will not be control pages and may overlap the image.
+ */
+ return init_transition_pgtable(image, image->arch.pgd, control_page);
}
static void load_segments(void)
@@ -299,14 +304,14 @@ static void load_segments(void)
int machine_kexec_prepare(struct kimage *image)
{
- unsigned long start_pgtable;
+ unsigned long control_page;
int result;
/* Calculate the offsets */
- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_page = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
/* Setup the identity mapped 64bit page table */
- result = init_pgtable(image, start_pgtable);
+ result = init_pgtable(image, control_page);
if (result)
return result;
@@ -360,13 +365,12 @@ void machine_kexec(struct kimage *image)
#endif
}
- control_page = page_address(image->control_code_page) + PAGE_SIZE;
+ control_page = page_address(image->control_code_page);
__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
- page_list[PA_TABLE_PAGE] =
- (unsigned long)__pa(page_address(image->control_code_page));
+ page_list[PA_TABLE_PAGE] = (unsigned long)__pa(image->arch.pgd);
if (image->type == KEXEC_TYPE_DEFAULT)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
@@ -574,8 +578,7 @@ static void kexec_mark_crashkres(bool protect)
/* Don't touch the control code page used in crash_kexec().*/
control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
- /* Control code page is located in the 2nd page. */
- kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
+ kexec_mark_range(crashk_res.start, control - 1, protect);
control += KEXEC_CONTROL_PAGE_SIZE;
kexec_mark_range(control, crashk_res.end, protect);
}
--
2.39.5
From: Bard Liao <yung-chuan.liao(a)linux.intel.com>
[ Upstream commit 569922b82ca660f8b24e705f6cf674e6b1f99cc7 ]
Each CPU DAI should be associated with a widget. However, the topology might
not create the right number of DAI widgets for aggregated amps, which will
cause a NULL pointer dereference.
Check that the DAI widget associated with the CPU DAI is valid, to prevent
a NULL pointer dereference due to missing DAI widgets in topologies with
aggregated amps.
Signed-off-by: Bard Liao <yung-chuan.liao(a)linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan(a)linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi(a)linux.intel.com>
Reviewed-by: Liam Girdwood <liam.r.girdwood(a)intel.com>
Link: https://patch.msgid.link/20241203104853.56956-1-yung-chuan.liao@linux.intel…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
sound/soc/sof/intel/hda-dai.c | 12 ++++++++++++
sound/soc/sof/intel/hda.c | 5 +++++
2 files changed, 17 insertions(+)
diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c
index 82f46ecd94301..2e58a264da556 100644
--- a/sound/soc/sof/intel/hda-dai.c
+++ b/sound/soc/sof/intel/hda-dai.c
@@ -503,6 +503,12 @@ int sdw_hda_dai_hw_params(struct snd_pcm_substream *substream,
int ret;
int i;
+ if (!w) {
+ dev_err(cpu_dai->dev, "%s widget not found, check amp link num in the topology\n",
+ cpu_dai->name);
+ return -EINVAL;
+ }
+
ops = hda_dai_get_ops(substream, cpu_dai);
if (!ops) {
dev_err(cpu_dai->dev, "DAI widget ops not set\n");
@@ -582,6 +588,12 @@ int sdw_hda_dai_hw_params(struct snd_pcm_substream *substream,
*/
for_each_rtd_cpu_dais(rtd, i, dai) {
w = snd_soc_dai_get_widget(dai, substream->stream);
+ if (!w) {
+ dev_err(cpu_dai->dev,
+ "%s widget not found, check amp link num in the topology\n",
+ dai->name);
+ return -EINVAL;
+ }
ipc4_copier = widget_to_copier(w);
memcpy(&ipc4_copier->dma_config_tlv[cpu_dai_id], dma_config_tlv,
sizeof(*dma_config_tlv));
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index 70fc08c8fc99e..f10ed4d102501 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -63,6 +63,11 @@ static int sdw_params_stream(struct device *dev,
struct snd_soc_dapm_widget *w = snd_soc_dai_get_widget(d, params_data->substream->stream);
struct snd_sof_dai_config_data data = { 0 };
+ if (!w) {
+ dev_err(dev, "%s widget not found, check amp link num in the topology\n",
+ d->name);
+ return -EINVAL;
+ }
data.dai_index = (params_data->link_id << 8) | d->id;
data.dai_data = params_data->alh_stream_id;
data.dai_node_id = data.dai_data;
--
2.39.5
From: Stas Sergeev <stsp2(a)yandex.ru>
[ Upstream commit 3ca459eaba1bf96a8c7878de84fa8872259a01e3 ]
Currently tun checks the group permission even if the user has matched.
Besides going against the usual permission semantics, this has a
very interesting implication: if the tun group is not among the
supplementary groups of the tun user, then effectively no one can
access the tun device. CAP_SYS_ADMIN still can, but it's the same as
not setting the tun ownership.
This patch relaxes the group checking so that either the user match
or the group match is enough. This avoids the situation when no one
can access the device even though the ownership is properly set.
Also I simplified the logic by removing the redundant inversions:
tun_not_capable() --> !tun_capable()
Signed-off-by: Stas Sergeev <stsp2(a)yandex.ru>
Reviewed-by: Willem de Bruijn <willemb(a)google.com>
Acked-by: Jason Wang <jasowang(a)redhat.com>
Link: https://patch.msgid.link/20241205073614.294773-1-stsp2@yandex.ru
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/tun.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 0adce9bf7a1e5..87cc7d778c3cf 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -636,14 +636,18 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
return ret;
}
-static inline bool tun_not_capable(struct tun_struct *tun)
+static inline bool tun_capable(struct tun_struct *tun)
{
const struct cred *cred = current_cred();
struct net *net = dev_net(tun->dev);
- return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
- (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
- !ns_capable(net->user_ns, CAP_NET_ADMIN);
+ if (ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return 1;
+ if (uid_valid(tun->owner) && uid_eq(cred->euid, tun->owner))
+ return 1;
+ if (gid_valid(tun->group) && in_egroup_p(tun->group))
+ return 1;
+ return 0;
}
static void tun_set_real_num_queues(struct tun_struct *tun)
@@ -2838,7 +2842,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
!!(tun->flags & IFF_MULTI_QUEUE))
return -EINVAL;
- if (tun_not_capable(tun))
+ if (!tun_capable(tun))
return -EPERM;
err = security_tun_dev_open(tun->security);
if (err < 0)
--
2.39.5
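[ Editor's note: to illustrate the scenario the commit message
describes, here is a hedged userspace sketch (the device name, uid
1000 and gid 1001 are made-up examples) that creates a persistent tun
device with both an owner and a group set via the standard
TUNSETOWNER/TUNSETGROUP ioctls. Before this change, a process running
as uid 1000 but without gid 1001 among its supplementary groups got
-EPERM when reattaching to the device with TUNSETIFF; with the relaxed
check, the uid match alone is sufficient. ]

#include <fcntl.h>
#include <net/if.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/if_tun.h>

int make_owned_tun(void)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
	strncpy(ifr.ifr_name, "tun-demo", IFNAMSIZ - 1);

	if (ioctl(fd, TUNSETIFF, &ifr) < 0)
		goto err;
	if (ioctl(fd, TUNSETOWNER, 1000) < 0)	/* example uid */
		goto err;
	if (ioctl(fd, TUNSETGROUP, 1001) < 0)	/* example gid */
		goto err;
	if (ioctl(fd, TUNSETPERSIST, 1) < 0)
		goto err;

	return fd;
err:
	close(fd);
	return -1;
}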
From: Stas Sergeev <stsp2(a)yandex.ru>
[ Upstream commit 3ca459eaba1bf96a8c7878de84fa8872259a01e3 ]
Currently tun checks the group permission even if the user has matched.
Besides going against the usual permission semantics, this has a
very interesting implication: if the tun group is not among the
supplementary groups of the tun user, then effectively no one can
access the tun device. CAP_SYS_ADMIN still can, but it's the same as
not setting the tun ownership.
This patch relaxes the group checking so that either the user match
or the group match is enough. This avoids the situation when no one
can access the device even though the ownership is properly set.
Also I simplified the logic by removing the redundant inversions:
tun_not_capable() --> !tun_capable()
Signed-off-by: Stas Sergeev <stsp2(a)yandex.ru>
Reviewed-by: Willem de Bruijn <willemb(a)google.com>
Acked-by: Jason Wang <jasowang(a)redhat.com>
Link: https://patch.msgid.link/20241205073614.294773-1-stsp2@yandex.ru
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/tun.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c34c6f0d23efe..52ea9f81d388b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -586,14 +586,18 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
return ret;
}
-static inline bool tun_not_capable(struct tun_struct *tun)
+static inline bool tun_capable(struct tun_struct *tun)
{
const struct cred *cred = current_cred();
struct net *net = dev_net(tun->dev);
- return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
- (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
- !ns_capable(net->user_ns, CAP_NET_ADMIN);
+ if (ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return 1;
+ if (uid_valid(tun->owner) && uid_eq(cred->euid, tun->owner))
+ return 1;
+ if (gid_valid(tun->group) && in_egroup_p(tun->group))
+ return 1;
+ return 0;
}
static void tun_set_real_num_queues(struct tun_struct *tun)
@@ -2772,7 +2776,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
!!(tun->flags & IFF_MULTI_QUEUE))
return -EINVAL;
- if (tun_not_capable(tun))
+ if (!tun_capable(tun))
return -EPERM;
err = security_tun_dev_open(tun->security);
if (err < 0)
--
2.39.5
From: Stas Sergeev <stsp2(a)yandex.ru>
[ Upstream commit 3ca459eaba1bf96a8c7878de84fa8872259a01e3 ]
Currently tun checks the group permission even if the user has matched.
Besides going against the usual permission semantics, this has a
very interesting implication: if the tun group is not among the
supplementary groups of the tun user, then effectively no one can
access the tun device. CAP_SYS_ADMIN still can, but it's the same as
not setting the tun ownership.
This patch relaxes the group checking so that either the user match
or the group match is enough. This avoids the situation when no one
can access the device even though the ownership is properly set.
Also I simplified the logic by removing the redundant inversions:
tun_not_capable() --> !tun_capable()
Signed-off-by: Stas Sergeev <stsp2(a)yandex.ru>
Reviewed-by: Willem de Bruijn <willemb(a)google.com>
Acked-by: Jason Wang <jasowang(a)redhat.com>
Link: https://patch.msgid.link/20241205073614.294773-1-stsp2@yandex.ru
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/tun.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 959ca6b9cd138..a85f743aa1573 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -575,14 +575,18 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
return ret;
}
-static inline bool tun_not_capable(struct tun_struct *tun)
+static inline bool tun_capable(struct tun_struct *tun)
{
const struct cred *cred = current_cred();
struct net *net = dev_net(tun->dev);
- return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
- (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
- !ns_capable(net->user_ns, CAP_NET_ADMIN);
+ if (ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return 1;
+ if (uid_valid(tun->owner) && uid_eq(cred->euid, tun->owner))
+ return 1;
+ if (gid_valid(tun->group) && in_egroup_p(tun->group))
+ return 1;
+ return 0;
}
static void tun_set_real_num_queues(struct tun_struct *tun)
@@ -2718,7 +2722,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
!!(tun->flags & IFF_MULTI_QUEUE))
return -EINVAL;
- if (tun_not_capable(tun))
+ if (!tun_capable(tun))
return -EPERM;
err = security_tun_dev_open(tun->security);
if (err < 0)
--
2.39.5
From: Stas Sergeev <stsp2(a)yandex.ru>
[ Upstream commit 3ca459eaba1bf96a8c7878de84fa8872259a01e3 ]
Currently tun checks the group permission even if the user has matched.
Besides going against the usual permission semantics, this has a
very interesting implication: if the tun group is not among the
supplementary groups of the tun user, then effectively no one can
access the tun device. CAP_SYS_ADMIN still can, but it's the same as
not setting the tun ownership.
This patch relaxes the group checking so that either the user match
or the group match is enough. This avoids the situation when no one
can access the device even though the ownership is properly set.
Also I simplified the logic by removing the redundant inversions:
tun_not_capable() --> !tun_capable()
Signed-off-by: Stas Sergeev <stsp2(a)yandex.ru>
Reviewed-by: Willem de Bruijn <willemb(a)google.com>
Acked-by: Jason Wang <jasowang(a)redhat.com>
Link: https://patch.msgid.link/20241205073614.294773-1-stsp2@yandex.ru
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/tun.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ea98d93138c12..a6c9f9062dbd4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -574,14 +574,18 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
return ret;
}
-static inline bool tun_not_capable(struct tun_struct *tun)
+static inline bool tun_capable(struct tun_struct *tun)
{
const struct cred *cred = current_cred();
struct net *net = dev_net(tun->dev);
- return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
- (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
- !ns_capable(net->user_ns, CAP_NET_ADMIN);
+ if (ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return 1;
+ if (uid_valid(tun->owner) && uid_eq(cred->euid, tun->owner))
+ return 1;
+ if (gid_valid(tun->group) && in_egroup_p(tun->group))
+ return 1;
+ return 0;
}
static void tun_set_real_num_queues(struct tun_struct *tun)
@@ -2767,7 +2771,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
!!(tun->flags & IFF_MULTI_QUEUE))
return -EINVAL;
- if (tun_not_capable(tun))
+ if (!tun_capable(tun))
return -EPERM;
err = security_tun_dev_open(tun->security);
if (err < 0)
--
2.39.5
From: Suleiman Souhlal <suleiman(a)google.com>
[ Upstream commit 108ad0999085df2366dd9ef437573955cb3f5586 ]
When steal time exceeds the measured delta when updating clock_task, we
currently try to catch up the excess in future updates.
However, in some situations this results in inaccurate run times for
whatever uses clock_task afterwards, as it ends up being charged
additional steal time that did not actually happen.
This is because there is a window between reading the elapsed time in
update_rq_clock() and sampling the steal time in update_rq_clock_task().
If the VCPU gets preempted between those two points, any additional
steal time is accounted to the outgoing task even though the calculated
delta did not actually contain any of that "stolen" time.
When this race happens, we can end up with steal time that exceeds the
calculated delta, and the previous code would try to catch up that excess
steal time in future clock updates, which is given to the next,
incoming task, even though it did not actually have any time stolen.
This behavior is particularly bad when steal time can be very long,
which we've seen when trying to extend steal time to contain the duration
that the host was suspended [0]. When this happens, clock_task stays
frozen, during which the running task stays running for the whole
duration, since its run time doesn't increase.
However the race can happen even under normal operation.
Ideally we would read the elapsed cpu time and the steal time atomically,
to prevent this race from happening in the first place, but doing so
is non-trivial.
Since the time between those two points isn't otherwise accounted anywhere,
neither to the outgoing task nor the incoming task (because the "end of
outgoing task" and "start of incoming task" timestamps are the same),
I would argue that the right thing to do is to simply drop any excess steal
time, in order to prevent these issues.
[0] https://lore.kernel.org/kvm/20240820043543.837914-1-suleiman@google.com/
Signed-off-by: Suleiman Souhlal <suleiman(a)google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lore.kernel.org/r/20241118043745.1857272-1-suleiman@google.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/sched/core.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 51ac62637e4ed..39ce8a3d8c573 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -176,13 +176,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
--
2.39.5
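[ Editor's note: a worked toy model of the accounting change, under
assumed numbers (illustrative user-space C, not kernel code). Suppose
the measured delta is 5 units but, because the vCPU was preempted in
the race window, the steal clock has advanced by 8 units since
prev_steal_time_rq. The old code keeps the 3 excess units pending and
charges them against a later, unrelated task's delta; the patched code
drops them. ]

#include <stdint.h>
#include <stdio.h>

static uint64_t prev_steal_time_rq;

/* Old behaviour: only the clamped amount advances prev_steal_time_rq. */
static uint64_t task_time_old(uint64_t delta, uint64_t steal_clock)
{
	uint64_t steal = steal_clock - prev_steal_time_rq;

	if (steal > delta)
		steal = delta;
	prev_steal_time_rq += steal;		/* excess stays pending */
	return delta - steal;
}

/* New behaviour: prev_steal_time_rq catches up to the sampled value. */
static uint64_t task_time_new(uint64_t delta, uint64_t steal_clock)
{
	uint64_t steal = steal_clock - prev_steal_time_rq;

	if (steal > delta)
		steal = delta;
	prev_steal_time_rq = steal_clock;	/* excess is dropped */
	return delta - steal;
}

int main(void)
{
	prev_steal_time_rq = 0;
	printf("old: task time %llu, excess left pending %llu\n",
	       (unsigned long long)task_time_old(5, 8),
	       (unsigned long long)(8 - prev_steal_time_rq));

	prev_steal_time_rq = 0;
	printf("new: task time %llu, excess left pending %llu\n",
	       (unsigned long long)task_time_new(5, 8),
	       (unsigned long long)(8 - prev_steal_time_rq));
	return 0;
}

[ The 3 pending units in the old model are exactly the "excess steal
time" the commit message says would be charged to the next task. ]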
From: Suleiman Souhlal <suleiman(a)google.com>
[ Upstream commit 108ad0999085df2366dd9ef437573955cb3f5586 ]
When steal time exceeds the measured delta when updating clock_task, we
currently try to catch up the excess in future updates.
However, in some situations this results in inaccurate run times for
whatever uses clock_task afterwards, as it ends up being charged
additional steal time that did not actually happen.
This is because there is a window between reading the elapsed time in
update_rq_clock() and sampling the steal time in update_rq_clock_task().
If the VCPU gets preempted between those two points, any additional
steal time is accounted to the outgoing task even though the calculated
delta did not actually contain any of that "stolen" time.
When this race happens, we can end up with steal time that exceeds the
calculated delta, and the previous code would try to catch up that excess
steal time in future clock updates, which is given to the next,
incoming task, even though it did not actually have any time stolen.
This behavior is particularly bad when steal time can be very long,
which we've seen when trying to extend steal time to contain the duration
that the host was suspended [0]. When this happens, clock_task stays
frozen, during which the running task stays running for the whole
duration, since its run time doesn't increase.
However the race can happen even under normal operation.
Ideally we would read the elapsed cpu time and the steal time atomically,
to prevent this race from happening in the first place, but doing so
is non-trivial.
Since the time between those two points isn't otherwise accounted anywhere,
neither to the outgoing task nor the incoming task (because the "end of
outgoing task" and "start of incoming task" timestamps are the same),
I would argue that the right thing to do is to simply drop any excess steal
time, in order to prevent these issues.
[0] https://lore.kernel.org/kvm/20240820043543.837914-1-suleiman@google.com/
Signed-off-by: Suleiman Souhlal <suleiman(a)google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lore.kernel.org/r/20241118043745.1857272-1-suleiman@google.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/sched/core.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7cf45d506688c..42dad8c8d6f28 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -279,13 +279,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
--
2.39.5
From: Suleiman Souhlal <suleiman(a)google.com>
[ Upstream commit 108ad0999085df2366dd9ef437573955cb3f5586 ]
When steal time exceeds the measured delta when updating clock_task, we
currently try to catch up the excess in future updates.
However, in some situations this results in inaccurate run times for
whatever uses clock_task afterwards, as it ends up being charged
additional steal time that did not actually happen.
This is because there is a window between reading the elapsed time in
update_rq_clock() and sampling the steal time in update_rq_clock_task().
If the VCPU gets preempted between those two points, any additional
steal time is accounted to the outgoing task even though the calculated
delta did not actually contain any of that "stolen" time.
When this race happens, we can end up with steal time that exceeds the
calculated delta, and the previous code would try to catch up that excess
steal time in future clock updates, which is given to the next,
incoming task, even though it did not actually have any time stolen.
This behavior is particularly bad when steal time can be very long,
which we've seen when trying to extend steal time to contain the duration
that the host was suspended [0]. When this happens, clock_task stays
frozen, during which the running task stays running for the whole
duration, since its run time doesn't increase.
However the race can happen even under normal operation.
Ideally we would read the elapsed cpu time and the steal time atomically,
to prevent this race from happening in the first place, but doing so
is non-trivial.
Since the time between those two points isn't otherwise accounted anywhere,
neither to the outgoing task nor the incoming task (because the "end of
outgoing task" and "start of incoming task" timestamps are the same),
I would argue that the right thing to do is to simply drop any excess steal
time, in order to prevent these issues.
[0] https://lore.kernel.org/kvm/20240820043543.837914-1-suleiman@google.com/
Signed-off-by: Suleiman Souhlal <suleiman(a)google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lore.kernel.org/r/20241118043745.1857272-1-suleiman@google.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/sched/core.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ed92b75f7e024..691af58c156e1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -642,13 +642,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
--
2.39.5
From: Suleiman Souhlal <suleiman(a)google.com>
[ Upstream commit 108ad0999085df2366dd9ef437573955cb3f5586 ]
When steal time exceeds the measured delta when updating clock_task, we
currently try to catch up the excess in future updates.
However, in some situations this results in inaccurate run times for
whatever uses clock_task afterwards, as it ends up being charged
additional steal time that did not actually happen.
This is because there is a window between reading the elapsed time in
update_rq_clock() and sampling the steal time in update_rq_clock_task().
If the VCPU gets preempted between those two points, any additional
steal time is accounted to the outgoing task even though the calculated
delta did not actually contain any of that "stolen" time.
When this race happens, we can end up with steal time that exceeds the
calculated delta, and the previous code would try to catch up that excess
steal time in future clock updates, which is given to the next,
incoming task, even though it did not actually have any time stolen.
This behavior is particularly bad when steal time can be very long,
which we've seen when trying to extend steal time to contain the duration
that the host was suspended [0]. When this happens, clock_task stays
frozen, during which the running task stays running for the whole
duration, since its run time doesn't increase.
However the race can happen even under normal operation.
Ideally we would read the elapsed cpu time and the steal time atomically,
to prevent this race from happening in the first place, but doing so
is non-trivial.
Since the time between those two points isn't otherwise accounted anywhere,
neither to the outgoing task nor the incoming task (because the "end of
outgoing task" and "start of incoming task" timestamps are the same),
I would argue that the right thing to do is to simply drop any excess steal
time, in order to prevent these issues.
[0] https://lore.kernel.org/kvm/20240820043543.837914-1-suleiman@google.com/
Signed-off-by: Suleiman Souhlal <suleiman(a)google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lore.kernel.org/r/20241118043745.1857272-1-suleiman@google.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/sched/core.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54af671e8d510..3927904984fd5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -704,13 +704,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
--
2.39.5
From: Suleiman Souhlal <suleiman(a)google.com>
[ Upstream commit 108ad0999085df2366dd9ef437573955cb3f5586 ]
When steal time exceeds the measured delta when updating clock_task, we
currently try to catch up the excess in future updates.
However, in some situations this results in inaccurate run times for
whatever uses clock_task afterwards, as it ends up being charged
additional steal time that did not actually happen.
This is because there is a window between reading the elapsed time in
update_rq_clock() and sampling the steal time in update_rq_clock_task().
If the VCPU gets preempted between those two points, any additional
steal time is accounted to the outgoing task even though the calculated
delta did not actually contain any of that "stolen" time.
When this race happens, we can end up with steal time that exceeds the
calculated delta, and the previous code would try to catch up that excess
steal time in future clock updates, which is given to the next,
incoming task, even though it did not actually have any time stolen.
This behavior is particularly bad when steal time can be very long,
which we've seen when trying to extend steal time to contain the duration
that the host was suspended [0]. When this happens, clock_task stays
frozen, during which the running task stays running for the whole
duration, since its run time doesn't increase.
However the race can happen even under normal operation.
Ideally we would read the elapsed cpu time and the steal time atomically,
to prevent this race from happening in the first place, but doing so
is non-trivial.
Since the time between those two points isn't otherwise accounted anywhere,
neither to the outgoing task nor the incoming task (because the "end of
outgoing task" and "start of incoming task" timestamps are the same),
I would argue that the right thing to do is to simply drop any excess steal
time, in order to prevent these issues.
[0] https://lore.kernel.org/kvm/20240820043543.837914-1-suleiman@google.com/
Signed-off-by: Suleiman Souhlal <suleiman(a)google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lore.kernel.org/r/20241118043745.1857272-1-suleiman@google.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/sched/core.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 86606fb9e6bc6..c686d826a91cf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -726,13 +726,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
--
2.39.5
From: Suleiman Souhlal <suleiman(a)google.com>
[ Upstream commit 108ad0999085df2366dd9ef437573955cb3f5586 ]
When steal time exceeds the measured delta when updating clock_task, we
currently try to catch up the excess in future updates.
However, in some situations this results in inaccurate run times for
whatever uses clock_task afterwards, as it ends up being charged
additional steal time that did not actually happen.
This is because there is a window between reading the elapsed time in
update_rq_clock() and sampling the steal time in update_rq_clock_task().
If the VCPU gets preempted between those two points, any additional
steal time is accounted to the outgoing task even though the calculated
delta did not actually contain any of that "stolen" time.
When this race happens, we can end up with steal time that exceeds the
calculated delta, and the previous code would try to catch up that excess
steal time in future clock updates, which is given to the next,
incoming task, even though it did not actually have any time stolen.
This behavior is particularly bad when steal time can be very long,
which we've seen when trying to extend steal time to contain the duration
that the host was suspended [0]. When this happens, clock_task stays
frozen, during which the running task stays running for the whole
duration, since its run time doesn't increase.
However the race can happen even under normal operation.
Ideally we would read the elapsed cpu time and the steal time atomically,
to prevent this race from happening in the first place, but doing so
is non-trivial.
Since the time between those two points isn't otherwise accounted anywhere,
neither to the outgoing task nor the incoming task (because the "end of
outgoing task" and "start of incoming task" timestamps are the same),
I would argue that the right thing to do is to simply drop any excess steal
time, in order to prevent these issues.
[0] https://lore.kernel.org/kvm/20240820043543.837914-1-suleiman@google.com/
Signed-off-by: Suleiman Souhlal <suleiman(a)google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lore.kernel.org/r/20241118043745.1857272-1-suleiman@google.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/sched/core.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d07dc87787dff..9015c792830c2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -766,13 +766,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
--
2.39.5
From: Suleiman Souhlal <suleiman(a)google.com>
[ Upstream commit 108ad0999085df2366dd9ef437573955cb3f5586 ]
When steal time exceeds the measured delta when updating clock_task, we
currently try to catch up the excess in future updates.
However, this results in inaccurate run times for the future things using
clock_task, in some situations, as they end up getting additional steal
time that did not actually happen.
This is because there is a window between reading the elapsed time in
update_rq_clock() and sampling the steal time in update_rq_clock_task().
If the VCPU gets preempted between those two points, any additional
steal time is accounted to the outgoing task even though the calculated
delta did not actually contain any of that "stolen" time.
When this race happens, we can end up with steal time that exceeds the
calculated delta, and the previous code would try to catch up that excess
steal time in future clock updates, which is given to the next,
incoming task, even though it did not actually have any time stolen.
This behavior is particularly bad when steal time can be very long,
which we've seen when trying to extend steal time to contain the duration
that the host was suspended [0]. When this happens, clock_task stays
frozen, during which the running task stays running for the whole
duration, since its run time doesn't increase.
However the race can happen even under normal operation.
Ideally we would read the elapsed cpu time and the steal time atomically,
to prevent this race from happening in the first place, but doing so
is non-trivial.
Since the time between those two points isn't otherwise accounted anywhere,
neither to the outgoing task nor the incoming task (because the "end of
outgoing task" and "start of incoming task" timestamps are the same),
I would argue that the right thing to do is to simply drop any excess steal
time, in order to prevent these issues.
[0] https://lore.kernel.org/kvm/20240820043543.837914-1-suleiman@google.com/
Signed-off-by: Suleiman Souhlal <suleiman(a)google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lore.kernel.org/r/20241118043745.1857272-1-suleiman@google.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/sched/core.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3e5a6bf587f91..296e77380318e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -766,13 +766,15 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
--
2.39.5
From: Josef Bacik <josef(a)toxicpanda.com>
[ Upstream commit 6a4730b325aaa48f7a5d5ba97aff0a955e2d9cec ]
This BUG_ON is meant to catch backref cache problems, but these can
arise from either bugs in the backref cache or corruption in the extent
tree. Fix it to be a proper error.
Reviewed-by: Boris Burkov <boris(a)bur.io>
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/btrfs/relocation.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 89ad7e12c08bb..062154c6a65f6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4789,8 +4789,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(!first_cow && level == 0);
node = rc->backref_cache.path[level];
- BUG_ON(node->bytenr != buf->start &&
- node->new_bytenr != buf->start);
+
+ /*
+ * If node->bytenr != buf->start and node->new_bytenr !=
+ * buf->start then we've got the wrong backref node for what we
+ * expected to see here and the cache is incorrect.
+ */
+ if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
+ btrfs_err(fs_info,
+"bytenr %llu was found but our backref cache was expecting %llu or %llu",
+ buf->start, node->bytenr, node->new_bytenr);
+ return -EUCLEAN;
+ }
drop_node_buffer(node);
extent_buffer_get(cow);
--
2.39.5
From: Josef Bacik <josef(a)toxicpanda.com>
[ Upstream commit 6a4730b325aaa48f7a5d5ba97aff0a955e2d9cec ]
This BUG_ON is meant to catch backref cache problems, but these can
arise from either bugs in the backref cache or corruption in the extent
tree. Fix it to be a proper error.
Reviewed-by: Boris Burkov <boris(a)bur.io>
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/btrfs/relocation.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 98e3b3749ec12..5b921e6ed94e2 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3976,8 +3976,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(!first_cow && level == 0);
node = rc->backref_cache.path[level];
- BUG_ON(node->bytenr != buf->start &&
- node->new_bytenr != buf->start);
+
+ /*
+ * If node->bytenr != buf->start and node->new_bytenr !=
+ * buf->start then we've got the wrong backref node for what we
+ * expected to see here and the cache is incorrect.
+ */
+ if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
+ btrfs_err(fs_info,
+"bytenr %llu was found but our backref cache was expecting %llu or %llu",
+ buf->start, node->bytenr, node->new_bytenr);
+ return -EUCLEAN;
+ }
btrfs_backref_drop_node_buffer(node);
atomic_inc(&cow->refs);
--
2.39.5
From: Hao-ran Zheng <zhenghaoran154(a)gmail.com>
[ Upstream commit 5324c4e10e9c2ce307a037e904c0d9671d7137d9 ]
A data race occurs when the function `insert_ordered_extent_file_extent()`
and the function `btrfs_inode_safe_disk_i_size_write()` are executed
concurrently. The function `insert_ordered_extent_file_extent()` takes no
lock when reading inode->disk_i_size, so it races with the concurrent
write of inode->disk_i_size in `btrfs_inode_safe_disk_i_size_write()`,
thus affecting the value of `modify_tree`.
The specific call stack that appears during testing is as follows:
============DATA_RACE============
btrfs_drop_extents+0x89a/0xa060 [btrfs]
insert_reserved_file_extent+0xb54/0x2960 [btrfs]
insert_ordered_extent_file_extent+0xff5/0x1760 [btrfs]
btrfs_finish_one_ordered+0x1b85/0x36a0 [btrfs]
btrfs_finish_ordered_io+0x37/0x60 [btrfs]
finish_ordered_fn+0x3e/0x50 [btrfs]
btrfs_work_helper+0x9c9/0x27a0 [btrfs]
process_scheduled_works+0x716/0xf10
worker_thread+0xb6a/0x1190
kthread+0x292/0x330
ret_from_fork+0x4d/0x80
ret_from_fork_asm+0x1a/0x30
============OTHER_INFO============
btrfs_inode_safe_disk_i_size_write+0x4ec/0x600 [btrfs]
btrfs_finish_one_ordered+0x24c7/0x36a0 [btrfs]
btrfs_finish_ordered_io+0x37/0x60 [btrfs]
finish_ordered_fn+0x3e/0x50 [btrfs]
btrfs_work_helper+0x9c9/0x27a0 [btrfs]
process_scheduled_works+0x716/0xf10
worker_thread+0xb6a/0x1190
kthread+0x292/0x330
ret_from_fork+0x4d/0x80
ret_from_fork_asm+0x1a/0x30
=================================
The main purpose of the check of the inode's disk_i_size is to avoid
taking write locks on a btree path when we have a write at or beyond
EOF, since in these cases we don't expect to find extent items in the
root to drop. However if we end up taking write locks due to a data
race on disk_i_size, everything is still correct, we only add extra
lock contention on the tree in case there's concurrency from other tasks.
If the race causes us to not take write locks when we actually need them,
then everything is functionally correct as well, since if we find out we
have extent items to drop and we took read locks (modify_tree set to 0),
we release the path and retry again with write locks.
Since this data race does not affect the correctness of the function,
it is a harmless data race; use data_race() to annotate the read of
inode->disk_i_size.
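For reference, a minimal sketch (not part of this patch) of what the
data_race() annotation expresses; the macro comes from
include/linux/compiler.h and only affects KCSAN reporting:

  /*
   * Sketch only: data_race() marks a plain access as intentionally
   * unsynchronized, so KCSAN does not flag it. The read may observe a
   * stale value, which is acceptable because, as explained above,
   * either outcome keeps btrfs_drop_extents() functionally correct.
   */
  u64 disk_i_size = data_race(inode->disk_i_size);  /* racy but benign */

  if (args->start >= disk_i_size && !args->replace_extent)
          modify_tree = 0;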
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: Hao-ran Zheng <zhenghaoran154(a)gmail.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/btrfs/file.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 44160d4ad53e0..31b25cb2f5cc3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -731,7 +731,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->drop_cache)
btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
- if (args->start >= inode->disk_i_size && !args->replace_extent)
+ if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
modify_tree = 0;
update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
--
2.39.5
The quilt patch titled
Subject: mm/compaction: fix UBSAN shift-out-of-bounds warning
has been removed from the -mm tree. Its filename was
mm-compaction-fix-ubsan-shift-out-of-bounds-warning.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Liu Shixin <liushixin2(a)huawei.com>
Subject: mm/compaction: fix UBSAN shift-out-of-bounds warning
Date: Thu, 23 Jan 2025 10:10:29 +0800
syzkaller reported a UBSAN shift-out-of-bounds warning of (1UL << order)
in isolate_freepages_block(). The bogus compound_order can be any value
because it is in a union with flags. Add back the MAX_PAGE_ORDER check to fix
the warning.
Link: https://lkml.kernel.org/r/20250123021029.2826736-1-liushixin2@huawei.com
Fixes: 3da0272a4c7d ("mm/compaction: correctly return failure with bogus compound_order in strict mode")
Signed-off-by: Liu Shixin <liushixin2(a)huawei.com>
Reviewed-by: Kemeng Shi <shikemeng(a)huaweicloud.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Kemeng Shi <shikemeng(a)huaweicloud.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Nanyong Sun <sunnanyong(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/compaction.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/mm/compaction.c~mm-compaction-fix-ubsan-shift-out-of-bounds-warning
+++ a/mm/compaction.c
@@ -631,7 +631,8 @@ static unsigned long isolate_freepages_b
if (PageCompound(page)) {
const unsigned int order = compound_order(page);
- if (blockpfn + (1UL << order) <= end_pfn) {
+ if ((order <= MAX_PAGE_ORDER) &&
+ (blockpfn + (1UL << order) <= end_pfn)) {
blockpfn += (1UL << order) - 1;
page += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
_
Patches currently in -mm which might be from liushixin2(a)huawei.com are
mm-page_isolation-avoid-call-folio_hstate-without-hugetlb_lock.patch
Also add a kunit testcase to make sure the function works correctly now
and doesn't regress in the future.
Signed-off-by: Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
---
Thomas Weißschuh (2):
of: address: Fix empty resource handling in __of_address_resource_bounds()
of: address: Add kunit test for __of_address_resource_bounds()
drivers/of/address.c | 17 +++----
drivers/of/of_private.h | 4 ++
drivers/of/of_test.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 132 insertions(+), 9 deletions(-)
---
base-commit: ffd294d346d185b70e28b1a28abe367bbfe53c04
change-id: 20250120-of-address-overflow-a59476362885
Best regards,
--
Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
From: Gabriele Monaco <gmonaco(a)redhat.com>
RV per-task monitors are implemented through a monitor structure
available for each task_struct. This structure is reset every time the
monitor is (re-)started, to avoid inconsistencies if the monitor was
activated previously.
To do so, we reset the monitor on all threads using the macro
for_each_process_thread. However, this macro excludes the idle tasks on
each CPU. Idle tasks can be considered tasks in their own right, and it
should be up to the model whether to ignore them or not.
Reset monitors also on the idle tasks for each present CPU whenever we
reset all per-task monitors.
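Schematically, the reset loop becomes (simplified sketch; da_get_monitor()
and da_monitor_reset() stand in for the generated da_get_monitor_##name()
and da_monitor_reset_##name() helpers):

  read_lock(&tasklist_lock);
  for_each_process_thread(g, p)           /* does not visit the idle tasks */
          da_monitor_reset(da_get_monitor(p));
  for_each_present_cpu(cpu)               /* so reset the idle ("swapper") tasks too */
          da_monitor_reset(da_get_monitor(idle_task(cpu)));
  read_unlock(&tasklist_lock);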
Cc: stable(a)vger.kernel.org
Cc: Juri Lelli <juri.lelli(a)redhat.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: John Kacur <jkacur(a)redhat.com>
Link: https://lore.kernel.org/20250115151547.605750-2-gmonaco@redhat.com
Fixes: 792575348ff7 ("rv/include: Add deterministic automata monitor definition via C macros")
Signed-off-by: Gabriele Monaco <gmonaco(a)redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
include/rv/da_monitor.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 9705b2a98e49..510c88bfabd4 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -14,6 +14,7 @@
#include <rv/automata.h>
#include <linux/rv.h>
#include <linux/bug.h>
+#include <linux/sched.h>
#ifdef CONFIG_RV_REACTORS
@@ -324,10 +325,13 @@ static inline struct da_monitor *da_get_monitor_##name(struct task_struct *tsk)
static void da_monitor_reset_all_##name(void) \
{ \
struct task_struct *g, *p; \
+ int cpu; \
\
read_lock(&tasklist_lock); \
for_each_process_thread(g, p) \
da_monitor_reset_##name(da_get_monitor_##name(p)); \
+ for_each_present_cpu(cpu) \
+ da_monitor_reset_##name(da_get_monitor_##name(idle_task(cpu))); \
read_unlock(&tasklist_lock); \
} \
\
--
2.45.2
From: Tomas Glozar <tglozar(a)redhat.com>
When using rtla timerlat with userspace threads (-u or -U), rtla
disables the OSNOISE_WORKLOAD option in
/sys/kernel/tracing/osnoise/options. This option is not re-enabled in a
subsequent run with kernel-space threads, leading to rtla collecting no
results if the previous run exited abnormally:
$ rtla timerlat top -u
^\Quit (core dumped)
$ rtla timerlat top -k -d 1s
Timer Latency
0 00:00:01 | IRQ Timer Latency (us) | Thread Timer Latency (us)
CPU COUNT | cur min avg max | cur min avg max
The issue persists until OSNOISE_WORKLOAD is set manually by running:
$ echo OSNOISE_WORKLOAD > /sys/kernel/tracing/osnoise/options
Set OSNOISE_WORKLOAD when running rtla with kernel-space threads if
available to fix the issue.
Cc: stable(a)vger.kernel.org
Cc: John Kacur <jkacur(a)redhat.com>
Cc: Luis Goncalves <lgoncalv(a)redhat.com>
Link: https://lore.kernel.org/20250107144823.239782-4-tglozar@redhat.com
Fixes: cdca4f4e5e8e ("rtla/timerlat_top: Add timerlat user-space support")
Signed-off-by: Tomas Glozar <tglozar(a)redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
tools/tracing/rtla/src/timerlat_top.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index d358cd39f360..f387597d3ac2 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -851,12 +851,15 @@ timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params *
}
}
- if (params->user_top) {
- retval = osnoise_set_workload(top->context, 0);
- if (retval) {
- err_msg("Failed to set OSNOISE_WORKLOAD option\n");
- goto out_err;
- }
+ /*
+ * Set workload according to type of thread if the kernel supports it.
+ * On kernels without support, user threads will have already failed
+ * on missing timerlat_fd, and kernel threads do not need it.
+ */
+ retval = osnoise_set_workload(top->context, params->kernel_workload);
+ if (retval < -1) {
+ err_msg("Failed to set OSNOISE_WORKLOAD option\n");
+ goto out_err;
}
if (isatty(STDOUT_FILENO) && !params->quiet)
--
2.45.2
From: Tomas Glozar <tglozar(a)redhat.com>
When using rtla timerlat with userspace threads (-u or -U), rtla
disables the OSNOISE_WORKLOAD option in
/sys/kernel/tracing/osnoise/options. This option is not re-enabled in a
subsequent run with kernel-space threads, leading to rtla collecting no
results if the previous run exited abnormally:
$ rtla timerlat hist -u
^\Quit (core dumped)
$ rtla timerlat hist -k -d 1s
Index
over:
count:
min:
avg:
max:
ALL: IRQ Thr Usr
count: 0 0 0
min: - - -
avg: - - -
max: - - -
The issue persists until OSNOISE_WORKLOAD is set manually by running:
$ echo OSNOISE_WORKLOAD > /sys/kernel/tracing/osnoise/options
Set OSNOISE_WORKLOAD when running rtla with kernel-space threads if
available to fix the issue.
Cc: stable(a)vger.kernel.org
Cc: John Kacur <jkacur(a)redhat.com>
Cc: Luis Goncalves <lgoncalv(a)redhat.com>
Link: https://lore.kernel.org/20250107144823.239782-3-tglozar@redhat.com
Fixes: ed774f7481fa ("rtla/timerlat_hist: Add timerlat user-space support")
Signed-off-by: Tomas Glozar <tglozar(a)redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
tools/tracing/rtla/src/timerlat_hist.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 53ded187b33a..d4bd02c01c53 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -1085,12 +1085,15 @@ timerlat_hist_apply_config(struct osnoise_tool *tool, struct timerlat_hist_param
}
}
- if (params->user_hist) {
- retval = osnoise_set_workload(tool->context, 0);
- if (retval) {
- err_msg("Failed to set OSNOISE_WORKLOAD option\n");
- goto out_err;
- }
+ /*
+ * Set workload according to type of thread if the kernel supports it.
+ * On kernels without support, user threads will have already failed
+ * on missing timerlat_fd, and kernel threads do not need it.
+ */
+ retval = osnoise_set_workload(tool->context, params->kernel_workload);
+ if (retval < -1) {
+ err_msg("Failed to set OSNOISE_WORKLOAD option\n");
+ goto out_err;
}
return 0;
--
2.45.2
From: Tomas Glozar <tglozar(a)redhat.com>
Currently, when either SIGINT from the user or SIGALRM from the duration
timer is caught by rtla-timerlat, stop_tracing is set to break out of
the main loop. This is not sufficient for cases where the timerlat
tracer is producing more data than rtla can consume, since in that case,
rtla loops indefinitely inside tracefs_iterate_raw_events, never
reaches the check of stop_tracing, and hangs.
In addition to setting stop_tracing, also stop the timerlat tracer on
received signal (SIGINT or SIGALRM). This will stop new samples so that
the existing samples may be processed and tracefs_iterate_raw_events
eventually exits.
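In short, the change follows this pattern (condensed sketch of the hunks
below):

  static int stop_tracing;
  static struct trace_instance *inst;     /* set once in the tool's main() */

  static void stop_handler(int sig)
  {
          stop_tracing = 1;                  /* break the main loop when it is reached */
          if (inst)
                  trace_instance_stop(inst); /* stop new samples right away */
  }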
Cc: stable(a)vger.kernel.org
Cc: John Kacur <jkacur(a)redhat.com>
Cc: Luis Goncalves <lgoncalv(a)redhat.com>
Cc: Gabriele Monaco <gmonaco(a)redhat.com>
Link: https://lore.kernel.org/20250116144931.649593-4-tglozar@redhat.com
Fixes: a828cd18bc4a ("rtla: Add timerlat tool and timelart top mode")
Signed-off-by: Tomas Glozar <tglozar(a)redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
tools/tracing/rtla/src/timerlat_top.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 059b468981e4..d21a21053917 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -900,9 +900,12 @@ static struct osnoise_tool
}
static int stop_tracing;
+static struct trace_instance *top_inst = NULL;
static void stop_top(int sig)
{
stop_tracing = 1;
+ if (top_inst)
+ trace_instance_stop(top_inst);
}
/*
@@ -950,6 +953,13 @@ int timerlat_top_main(int argc, char *argv[])
}
trace = &top->trace;
+ /*
+ * Save trace instance into global variable so that SIGINT can stop
+ * the timerlat tracer.
+ * Otherwise, rtla could loop indefinitely when overloaded.
+ */
+ top_inst = trace;
+
retval = enable_timerlat(trace);
if (retval) {
@@ -1131,7 +1141,7 @@ int timerlat_top_main(int argc, char *argv[])
return_value = 0;
- if (trace_is_off(&top->trace, &record->trace)) {
+ if (trace_is_off(&top->trace, &record->trace) && !stop_tracing) {
printf("rtla timerlat hit stop tracing\n");
if (!params->no_aa)
--
2.45.2
From: Tomas Glozar <tglozar(a)redhat.com>
Currently, when either SIGINT from the user or SIGALRM from the duration
timer is caught by rtla-timerlat, stop_tracing is set to break out of
the main loop. This is not sufficient for cases where the timerlat
tracer is producing more data than rtla can consume, since in that case,
rtla loops indefinitely inside tracefs_iterate_raw_events, never
reaches the check of stop_tracing, and hangs.
In addition to setting stop_tracing, also stop the timerlat tracer on
received signal (SIGINT or SIGALRM). This will stop new samples so that
the existing samples may be processed and tracefs_iterate_raw_events
eventually exits.
Cc: stable(a)vger.kernel.org
Cc: John Kacur <jkacur(a)redhat.com>
Cc: Luis Goncalves <lgoncalv(a)redhat.com>
Cc: Gabriele Monaco <gmonaco(a)redhat.com>
Link: https://lore.kernel.org/20250116144931.649593-3-tglozar@redhat.com
Fixes: 1eeb6328e8b3 ("rtla/timerlat: Add timerlat hist mode")
Signed-off-by: Tomas Glozar <tglozar(a)redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
tools/tracing/rtla/src/timerlat_hist.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 8b66387e5f35..f1edf1c8a7b0 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -1131,9 +1131,12 @@ static struct osnoise_tool
}
static int stop_tracing;
+static struct trace_instance *hist_inst = NULL;
static void stop_hist(int sig)
{
stop_tracing = 1;
+ if (hist_inst)
+ trace_instance_stop(hist_inst);
}
/*
@@ -1180,6 +1183,12 @@ int timerlat_hist_main(int argc, char *argv[])
}
trace = &tool->trace;
+ /*
+ * Save trace instance into global variable so that SIGINT can stop
+ * the timerlat tracer.
+ * Otherwise, rtla could loop indefinitely when overloaded.
+ */
+ hist_inst = trace;
retval = enable_timerlat(trace);
if (retval) {
@@ -1348,7 +1357,7 @@ int timerlat_hist_main(int argc, char *argv[])
return_value = 0;
- if (trace_is_off(&tool->trace, &record->trace)) {
+ if (trace_is_off(&tool->trace, &record->trace) && !stop_tracing) {
printf("rtla timerlat hit stop tracing\n");
if (!params->no_aa)
--
2.45.2
In the US country code, to avoid including 6 GHz rules in the 5 GHz rules
list, the number of 5 GHz rules is set to a default constant value of 4
(REG_US_5G_NUM_REG_RULES). However, if there are more than 4 valid 5 GHz
rules, the current logic will bypass the legitimate 6 GHz rules.
For example, if there are 5 valid 5 GHz rules and 1 valid 6 GHz rule, the
current logic will only consider 4 of the 5 GHz rules, treating the last
valid rule as a 6 GHz rule. Consequently, the actual 6 GHz rule is never
processed, leading to the eventual disabling of 6 GHz channels.
To fix this issue, instead of hardcoding the value to 4, use a helper
function to determine the number of 6 GHz rules present in the 5 GHz rules
list and ignore only those rules.
Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1
Cc: stable(a)vger.kernel.org
Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices")
Signed-off-by: Aditya Kumar Singh <aditya.kumar.singh(a)oss.qualcomm.com>
---
drivers/net/wireless/ath/ath12k/wmi.c | 61 ++++++++++++++++++++++++++---------
drivers/net/wireless/ath/ath12k/wmi.h | 1 -
2 files changed, 45 insertions(+), 17 deletions(-)
diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c
index a7625a3d1a3a9b1bbcf90cee30d3c707de03c439..296238c7ee0fd95045a3e0859d730e3bd73ec906 100644
--- a/drivers/net/wireless/ath/ath12k/wmi.c
+++ b/drivers/net/wireless/ath/ath12k/wmi.c
@@ -4852,6 +4852,22 @@ static struct ath12k_reg_rule
return reg_rule_ptr;
}
+static u8 ath12k_wmi_ignore_num_extra_rules(struct ath12k_wmi_reg_rule_ext_params *rule,
+ u32 num_reg_rules)
+{
+ u8 num_invalid_5ghz_rules = 0;
+ u32 count, start_freq;
+
+ for (count = 0; count < num_reg_rules; count++) {
+ start_freq = le32_get_bits(rule[count].freq_info, REG_RULE_START_FREQ);
+
+ if (start_freq >= ATH12K_MIN_6G_FREQ)
+ num_invalid_5ghz_rules++;
+ }
+
+ return num_invalid_5ghz_rules;
+}
+
static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab,
struct sk_buff *skb,
struct ath12k_reg_info *reg_info)
@@ -4862,6 +4878,7 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab,
u32 num_2g_reg_rules, num_5g_reg_rules;
u32 num_6g_reg_rules_ap[WMI_REG_CURRENT_MAX_AP_TYPE];
u32 num_6g_reg_rules_cl[WMI_REG_CURRENT_MAX_AP_TYPE][WMI_REG_MAX_CLIENT_TYPE];
+ u8 num_invalid_5ghz_ext_rules;
u32 total_reg_rules = 0;
int ret, i, j;
@@ -4955,20 +4972,6 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab,
memcpy(reg_info->alpha2, &ev->alpha2, REG_ALPHA2_LEN);
- /* FIXME: Currently FW includes 6G reg rule also in 5G rule
- * list for country US.
- * Having same 6G reg rule in 5G and 6G rules list causes
- * intersect check to be true, and same rules will be shown
- * multiple times in iw cmd. So added hack below to avoid
- * parsing 6G rule from 5G reg rule list, and this can be
- * removed later, after FW updates to remove 6G reg rule
- * from 5G rules list.
- */
- if (memcmp(reg_info->alpha2, "US", 2) == 0) {
- reg_info->num_5g_reg_rules = REG_US_5G_NUM_REG_RULES;
- num_5g_reg_rules = reg_info->num_5g_reg_rules;
- }
-
reg_info->dfs_region = le32_to_cpu(ev->dfs_region);
reg_info->phybitmap = le32_to_cpu(ev->phybitmap);
reg_info->num_phy = le32_to_cpu(ev->num_phy);
@@ -5071,8 +5074,29 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab,
}
}
+ ext_wmi_reg_rule += num_2g_reg_rules;
+
+ /* Firmware might include 6 GHz reg rule in 5 GHz rule list
+ * for few countries along with separate 6 GHz rule.
+ * Having same 6 GHz reg rule in 5 GHz and 6 GHz rules list
+ * causes intersect check to be true, and same rules will be
+ * shown multiple times in iw cmd.
+ * Hence, avoid parsing 6 GHz rule from 5 GHz reg rule list
+ */
+ num_invalid_5ghz_ext_rules = ath12k_wmi_ignore_num_extra_rules(ext_wmi_reg_rule,
+ num_5g_reg_rules);
+
+ if (num_invalid_5ghz_ext_rules) {
+ ath12k_dbg(ab, ATH12K_DBG_WMI,
+ "CC: %s 5 GHz reg rules number %d from fw, %d number of invalid 5 GHz rules",
+ reg_info->alpha2, reg_info->num_5g_reg_rules,
+ num_invalid_5ghz_ext_rules);
+
+ num_5g_reg_rules = num_5g_reg_rules - num_invalid_5ghz_ext_rules;
+ reg_info->num_5g_reg_rules = num_5g_reg_rules;
+ }
+
if (num_5g_reg_rules) {
- ext_wmi_reg_rule += num_2g_reg_rules;
reg_info->reg_rules_5g_ptr =
create_ext_reg_rules_from_wmi(num_5g_reg_rules,
ext_wmi_reg_rule);
@@ -5084,7 +5108,12 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab,
}
}
- ext_wmi_reg_rule += num_5g_reg_rules;
+ /* We have adjusted the number of 5 GHz reg rules above. But still those
+ * many rules needs to be adjusted in ext_wmi_reg_rule.
+ *
+ * NOTE: num_invalid_5ghz_ext_rules will be 0 for rest other cases.
+ */
+ ext_wmi_reg_rule += (num_5g_reg_rules + num_invalid_5ghz_ext_rules);
for (i = 0; i < WMI_REG_CURRENT_MAX_AP_TYPE; i++) {
reg_info->reg_rules_6g_ap_ptr[i] =
diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h
index 38aed18b99f4239e40d369181549b4bc78faa862..81248253a36872f6fd93db772b5394f3ef9d3bf9 100644
--- a/drivers/net/wireless/ath/ath12k/wmi.h
+++ b/drivers/net/wireless/ath/ath12k/wmi.h
@@ -4093,7 +4093,6 @@ struct ath12k_wmi_eht_rate_set_params {
#define MAX_REG_RULES 10
#define REG_ALPHA2_LEN 2
#define MAX_6G_REG_RULES 5
-#define REG_US_5G_NUM_REG_RULES 4
enum wmi_start_event_param {
WMI_VDEV_START_RESP_EVENT = 0,
---
base-commit: c4e4e37afc8b8d178b0f00f607c30b3b81253597
change-id: 20250123-fix_6ghz_rules_handling-ff85f1efb30b
This change is specific to Hyper-V VM users.
If the Virtual Machine Connection window is focused,
a Hyper-V VM user can unintentionally touch the keyboard/mouse
when the VM is hibernating or resuming, and consequently the
hibernation or resume operation can be aborted unexpectedly.
Fix the issue by no longer registering the keyboard/mouse as
wakeup devices (see the other two patches for the
changes to drivers/input/serio/hyperv-keyboard.c and
drivers/hid/hid-hyperv.c).
The keyboard/mouse were registered as wakeup devices because the
VM needs to be woken up from the Suspend-to-Idle state after
a user runs "echo freeze > /sys/power/state". It seems like
the Suspend-to-Idle feature has no real users in practice, so
let's no longer support it by returning -EOPNOTSUPP if a
user tries to use it.
Fixes: 1a06d017fb3f ("Drivers: hv: vmbus: Fix Suspend-to-Idle for Generation-2 VM")
Cc: stable(a)vger.kernel.org
Signed-off-by: Saurabh Sengar <ssengar(a)linux.microsoft.com>
Signed-off-by: Erni Sri Satya Vennela <ernis(a)linux.microsoft.com>
---
Changes in v4:
* No change
Changes in v3:
* Add "Cc: stable(a)vger.kernel.org" in sign-off area.
Changes in v2:
* Add "#define vmbus_freeze NULL" when CONFIG_PM_SLEEP is not
enabled.
---
drivers/hv/vmbus_drv.c | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 6d89d37b069a..4df6b12bf6a1 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -900,6 +900,19 @@ static void vmbus_shutdown(struct device *child_device)
}
#ifdef CONFIG_PM_SLEEP
+/*
+ * vmbus_freeze - Suspend-to-Idle
+ */
+static int vmbus_freeze(struct device *child_device)
+{
+/*
+ * Do not support Suspend-to-Idle ("echo freeze > /sys/power/state") as
+ * that would require registering the Hyper-V synthetic mouse/keyboard
+ * devices as wakeup devices, which can abort hibernation/resume unexpectedly.
+ */
+ return -EOPNOTSUPP;
+}
+
/*
* vmbus_suspend - Suspend a vmbus device
*/
@@ -938,6 +951,7 @@ static int vmbus_resume(struct device *child_device)
return drv->resume(dev);
}
#else
+#define vmbus_freeze NULL
#define vmbus_suspend NULL
#define vmbus_resume NULL
#endif /* CONFIG_PM_SLEEP */
@@ -969,7 +983,7 @@ static void vmbus_device_release(struct device *device)
*/
static const struct dev_pm_ops vmbus_pm = {
- .suspend_noirq = NULL,
+ .suspend_noirq = vmbus_freeze,
.resume_noirq = NULL,
.freeze_noirq = vmbus_suspend,
.thaw_noirq = vmbus_resume,
--
2.34.1
Hi,
This series has two main parts: 1) add error handling all around, and 2)
update the drivers according to latest (mostly non-public) information
from TI.
The "Update UB9702 init sequences" patch basically rewrites the init
sequence from scratch, and to make that patch easier to read, the
previous patch first removes the current init sequence. In the final
version these two patches need to be squashed together.
Tomi
Signed-off-by: Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
---
Changes in v2:
- Add new patch "media: i2c: ds90ub913: Fix returned fmt from .set_fmt()"
- Reformat the reg write in "Speed-up I2C watchdog timer"
- Add 'it' parameter to rxport for_each macros
- Move 'enable_sscg' module parameter to DT
- Change serializer 'i2c-addr' property to serializer 'reg' property
- Split ub953 registers to a header file
- Link to v1: https://lore.kernel.org/r/20250110-ub9xx-improvements-v1-0-e0b9a1f644da@ide…
---
Jai Luthra (6):
media: i2c: ds90ub953: Speed-up I2C watchdog timer
media: dt-bindings: ti,ds90ub960: Add ti,enable-sscg property
media: dt-bindings: ti,ds90ub960: Allow setting serializer address
media: i2c: ds90ub960: Enable SSCG for UB9702
media: i2c: ds90ub960: Configure serializer using back-channel
media: i2c: ds90ub9xx: Set serializer temperature ramp
Tomi Valkeinen (16):
media: i2c: ds90ub953: Fix error prints
media: i2c: ds90ub913: Fix returned fmt from .set_fmt()
media: i2c: ds90ub913: Align ub913_read() with other similar functions
media: i2c: ds90ub9xx: Add err parameter to read/write funcs
media: i2c: ds90ub960: Add error handling to multiple places
media: i2c: ds90ub953: Add error handling to ub953_log_status()
media: i2c: ds90ub913: Add error handling to ub913_log_status()
media: i2c: ds90ub960: Move UB9702 registers to a separate section
media: i2c: ds90ub960: Add UB9702 specific registers
media: i2c: ds90ub960: Split ub960_init_tx_ports()
media: i2c: ds90ub960: Refresh ub960_init_tx_ports_ub9702()
media: i2c: ds90ub960: Add RX port iteration support
media: i2c: ds90ub960: Move all RX port init code into ub960_init_rx_ports()
media: i2c: ds90ub960: Remove old ub9702 RX port init code (SQUASH)
media: i2c: ds90ub960: Update UB9702 init sequences
media: i2c: ds90ub953: Move reg defines to a header file
.../bindings/media/i2c/ti,ds90ub953.yaml | 77 +-
.../bindings/media/i2c/ti,ds90ub960.yaml | 21 +-
drivers/media/i2c/ds90ub913.c | 82 +-
drivers/media/i2c/ds90ub953.c | 242 +--
drivers/media/i2c/ds90ub953.h | 104 +
drivers/media/i2c/ds90ub960.c | 2264 +++++++++++++++-----
6 files changed, 2070 insertions(+), 720 deletions(-)
---
base-commit: c4b7779abc6633677e6edb79e2809f4f61fde157
change-id: 20250110-ub9xx-improvements-9172b44eb0bc
Best regards,
--
Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
If device_register(&child->dev) fails, call put_device() to explicitly
release child->dev. Otherwise, it could cause a double-free problem.
device_register() includes device_add(). As comment of device_add()
says, 'if device_add() succeeds, you should call device_del() when you
want to get rid of it. If device_add() has not succeeded, use only
put_device() to drop the reference count'.
Found by code review.
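For reference, the general shape of that rule (a minimal sketch, not
specific to PCI):

  ret = device_register(&child->dev);
  if (ret) {
          /*
           * device_register()/device_add() failed: the embedded kobject
           * still holds the initial reference, so drop it with
           * put_device() (which frees the object through its release()
           * callback) instead of calling device_del() or kfree().
           */
          put_device(&child->dev);
          return NULL;
  }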
Cc: stable(a)vger.kernel.org
Fixes: 4f535093cf8f ("PCI: Put pci_dev in device tree as early as possible")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
Changes in v2:
- added the bug description about the comment of device_add();
- fixed the patch as suggestions;
- added Cc and Fixes table.
---
drivers/pci/probe.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 2e81ab0f5a25..ae12f92c6a9d 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1174,7 +1174,10 @@ static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
add_dev:
pci_set_bus_msi_domain(child);
ret = device_register(&child->dev);
- WARN_ON(ret < 0);
+ if (WARN_ON(ret < 0)) {
+ put_device(&child->dev);
+ return NULL;
+ }
pcibios_add_bus(child);
--
2.25.1
devm_blk_crypto_profile_init() registers a cleanup handler to run when
the associated (platform-) device is being released. For UFS, the
crypto private data and pointers are stored as part of the ufs_hba's
data structure 'struct ufs_hba::crypto_profile'. This structure is
allocated as part of the underlying ufshd allocation.
During driver release or during error handling in ufshcd_pltfrm_init(),
this structure is released as part of ufshcd_dealloc_host() before the
(platform-) device associated with the crypto call above is released.
Once this device is released, the crypto cleanup code will run, using
the just-released 'struct ufs_hba::crypto_profile'. This causes a
use-after-free situation:
exynos-ufshc 14700000.ufs: ufshcd_pltfrm_init() failed -11
exynos-ufshc 14700000.ufs: probe with driver exynos-ufshc failed with error -11
Unable to handle kernel paging request at virtual address 01adafad6dadad88
Mem abort info:
ESR = 0x0000000096000004
EC = 0x25: DABT (current EL), IL = 32 bits
SET = 0, FnV = 0
EA = 0, S1PTW = 0
FSC = 0x04: level 0 translation fault
Data abort info:
ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
CM = 0, WnR = 0, TnD = 0, TagAccess = 0
GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[01adafad6dadad88] address between user and kernel address ranges
Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP
Modules linked in:
CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.13.0-rc5-next-20250106+ #70
Tainted: [W]=WARN
Hardware name: Oriole (DT)
pstate: 20400005 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : kfree+0x60/0x2d8
lr : kvfree+0x44/0x60
sp : ffff80008009ba80
x29: ffff80008009ba90 x28: 0000000000000000 x27: ffffbcc6591e0130
x26: ffffbcc659309960 x25: ffffbcc658f89c50 x24: ffffbcc659539d80
x23: ffff22e000940040 x22: ffff22e001539010 x21: ffffbcc65714b22c
x20: 6b6b6b6b6b6b6b6b x19: 01adafad6dadad80 x18: 0000000000000000
x17: ffffbcc6579fbac8 x16: ffffbcc657a04300 x15: ffffbcc657a027f4
x14: ffffbcc656f969cc x13: ffffbcc6579fdc80 x12: ffffbcc6579fb194
x11: ffffbcc6579fbc34 x10: 0000000000000000 x9 : ffffbcc65714b22c
x8 : ffff80008009b880 x7 : 0000000000000000 x6 : ffff80008009b940
x5 : ffff80008009b8c0 x4 : ffff22e000940518 x3 : ffff22e006f54f40
x2 : ffffbcc657a02268 x1 : ffff80007fffffff x0 : ffffc1ffc0000000
Call trace:
kfree+0x60/0x2d8 (P)
kvfree+0x44/0x60
blk_crypto_profile_destroy_callback+0x28/0x70
devm_action_release+0x1c/0x30
release_nodes+0x6c/0x108
devres_release_all+0x98/0x100
device_unbind_cleanup+0x20/0x70
really_probe+0x218/0x2d0
In other words, the initialisation code flow is:
platform-device probe
ufshcd_pltfrm_init()
ufshcd_alloc_host()
scsi_host_alloc()
allocation of struct ufs_hba
creation of scsi-host devices
devm_blk_crypto_profile_init()
devm registration of cleanup handler using platform-device
and during error handling of ufshcd_pltfrm_init() or during driver
removal:
ufshcd_dealloc_host()
scsi_host_put()
put_device(scsi-host)
release of struct ufs_hba
put_device(platform-device)
crypto cleanup handler
To fix this use-after-free, change ufshcd_alloc_host() to register a
devres action to automatically cleanup the underlying SCSI device on
ufshcd destruction, without requiring explicit calls to
ufshcd_dealloc_host(). This way:
* the crypto profile and all other ufs_hba-owned resources are
destroyed before SCSI (as they've been registered after)
* a memleak is plugged in tc-dwc-g210-pci.c remove() as a
side-effect
* EXPORT_SYMBOL_GPL(ufshcd_dealloc_host) can be removed fully as
it's not needed anymore
* no future drivers using ufshcd_alloc_host() could ever forget
adding the cleanup
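Sketch of the resulting ordering in ufshcd_alloc_host() (simplified from
the hunk below; devres actions run in reverse order of registration, so
an action registered here is released after everything the driver
registers later, including the crypto profile):

  host = scsi_host_alloc(&ufshcd_driver_template, sizeof(struct ufs_hba));
  if (!host)
          return -ENOMEM;

  /* runs last on hba->dev teardown, i.e. after the crypto profile cleanup */
  err = devm_add_action_or_reset(dev, ufshcd_devres_release, host);
  if (err)
          return err;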
Fixes: cb77cb5abe1f ("blk-crypto: rename blk_keyslot_manager to blk_crypto_profile")
Fixes: d76d9d7d1009 ("scsi: ufs: use devm_blk_ksm_init()")
Cc: stable(a)vger.kernel.org
Signed-off-by: André Draszik <andre.draszik(a)linaro.org>
---
Changes in v3:
- rename devres action handler to ufshcd_devres_release() (Bart)
- Link to v2: https://lore.kernel.org/r/20250114-ufshcd-fix-v2-1-2dc627590a4a@linaro.org
Changes in v2:
- completely new approach using devres action for Scsi_host cleanup, to
ensure ordering
- add Fixes: and CC: stable tags (Eric)
- Link to v1: https://lore.kernel.org/r/20250113-ufshcd-fix-v1-1-ca63d1d4bd55@linaro.org
---
In my case, as per above trace I initially encountered an error in
ufshcd_verify_dev_init(), which made me notice this problem both during
error handling and release. For reproducing, it'd be possible to change
that function to just return an error, or rmmod the platform glue
driver.
Other approaches for solving this issue I see are the following, but I
believe this one here is the cleanest:
* turn 'struct ufs_hba::crypto_profile' into a dynamically allocated
pointer, in which case it doesn't matter if cleanup runs after
scsi_host_put()
* add an explicit devm_blk_crypto_profile_deinit() to be called by API
users when necessary, e.g. before ufshcd_dealloc_host() in this case
* register the crypto cleanup handler against the scsi-host device
instead, like in v1 of this patch
---
drivers/ufs/core/ufshcd.c | 27 +++++++++++++++++----------
drivers/ufs/host/ufshcd-pci.c | 2 --
drivers/ufs/host/ufshcd-pltfrm.c | 11 ++++-------
include/ufs/ufshcd.h | 1 -
4 files changed, 21 insertions(+), 20 deletions(-)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 43ddae7318cb..8351795296bb 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -10279,16 +10279,6 @@ int ufshcd_system_thaw(struct device *dev)
EXPORT_SYMBOL_GPL(ufshcd_system_thaw);
#endif /* CONFIG_PM_SLEEP */
-/**
- * ufshcd_dealloc_host - deallocate Host Bus Adapter (HBA)
- * @hba: pointer to Host Bus Adapter (HBA)
- */
-void ufshcd_dealloc_host(struct ufs_hba *hba)
-{
- scsi_host_put(hba->host);
-}
-EXPORT_SYMBOL_GPL(ufshcd_dealloc_host);
-
/**
* ufshcd_set_dma_mask - Set dma mask based on the controller
* addressing capability
@@ -10307,6 +10297,16 @@ static int ufshcd_set_dma_mask(struct ufs_hba *hba)
return dma_set_mask_and_coherent(hba->dev, DMA_BIT_MASK(32));
}
+/**
+ * ufshcd_devres_release - devres cleanup handler, invoked during release of
+ * hba->dev
+ * @host: pointer to SCSI host
+ */
+static void ufshcd_devres_release(void *host)
+{
+ scsi_host_put(host);
+}
+
/**
* ufshcd_alloc_host - allocate Host Bus Adapter (HBA)
* @dev: pointer to device handle
@@ -10334,6 +10334,13 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle)
err = -ENOMEM;
goto out_error;
}
+
+ err = devm_add_action_or_reset(dev, ufshcd_devres_release,
+ host);
+ if (err)
+ return dev_err_probe(dev, err,
+ "failed to add ufshcd dealloc action\n");
+
host->nr_maps = HCTX_TYPE_POLL + 1;
hba = shost_priv(host);
hba->host = host;
diff --git a/drivers/ufs/host/ufshcd-pci.c b/drivers/ufs/host/ufshcd-pci.c
index ea39c5d5b8cf..9cfcaad23cf9 100644
--- a/drivers/ufs/host/ufshcd-pci.c
+++ b/drivers/ufs/host/ufshcd-pci.c
@@ -562,7 +562,6 @@ static void ufshcd_pci_remove(struct pci_dev *pdev)
pm_runtime_forbid(&pdev->dev);
pm_runtime_get_noresume(&pdev->dev);
ufshcd_remove(hba);
- ufshcd_dealloc_host(hba);
}
/**
@@ -605,7 +604,6 @@ ufshcd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
err = ufshcd_init(hba, mmio_base, pdev->irq);
if (err) {
dev_err(&pdev->dev, "Initialization failed\n");
- ufshcd_dealloc_host(hba);
return err;
}
diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c
index 505572d4fa87..adb0a65d9df5 100644
--- a/drivers/ufs/host/ufshcd-pltfrm.c
+++ b/drivers/ufs/host/ufshcd-pltfrm.c
@@ -488,13 +488,13 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
if (err) {
dev_err(dev, "%s: clock parse failed %d\n",
__func__, err);
- goto dealloc_host;
+ goto out;
}
err = ufshcd_parse_regulator_info(hba);
if (err) {
dev_err(dev, "%s: regulator init failed %d\n",
__func__, err);
- goto dealloc_host;
+ goto out;
}
ufshcd_init_lanes_per_dir(hba);
@@ -502,14 +502,14 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
err = ufshcd_parse_operating_points(hba);
if (err) {
dev_err(dev, "%s: OPP parse failed %d\n", __func__, err);
- goto dealloc_host;
+ goto out;
}
err = ufshcd_init(hba, mmio_base, irq);
if (err) {
dev_err_probe(dev, err, "Initialization failed with error %d\n",
err);
- goto dealloc_host;
+ goto out;
}
pm_runtime_set_active(dev);
@@ -517,8 +517,6 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
return 0;
-dealloc_host:
- ufshcd_dealloc_host(hba);
out:
return err;
}
@@ -534,7 +532,6 @@ void ufshcd_pltfrm_remove(struct platform_device *pdev)
pm_runtime_get_sync(&pdev->dev);
ufshcd_remove(hba);
- ufshcd_dealloc_host(hba);
pm_runtime_disable(&pdev->dev);
pm_runtime_put_noidle(&pdev->dev);
}
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index da0fa5c65081..58eb6e897827 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1311,7 +1311,6 @@ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg)
void ufshcd_enable_irq(struct ufs_hba *hba);
void ufshcd_disable_irq(struct ufs_hba *hba);
int ufshcd_alloc_host(struct device *, struct ufs_hba **);
-void ufshcd_dealloc_host(struct ufs_hba *);
int ufshcd_hba_enable(struct ufs_hba *hba);
int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int);
int ufshcd_link_recovery(struct ufs_hba *hba);
---
base-commit: 4e16367cfe0ce395f29d0482b78970cce8e1db73
change-id: 20250113-ufshcd-fix-52409f2d32ff
Best regards,
--
André Draszik <andre.draszik(a)linaro.org>
The patch titled
Subject: mm/compaction: fix UBSAN shift-out-of-bounds warning
has been added to the -mm mm-unstable branch. Its filename is
mm-compaction-fix-ubsan-shift-out-of-bounds-warning.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Liu Shixin <liushixin2(a)huawei.com>
Subject: mm/compaction: fix UBSAN shift-out-of-bounds warning
Date: Thu, 23 Jan 2025 10:10:29 +0800
syzkaller reported a UBSAN shift-out-of-bounds warning of (1UL << order)
in isolate_freepages_block(). The bogus compound_order can be any value
because it is in a union with flags. Add back the MAX_PAGE_ORDER check to fix
the warning.
Link: https://lkml.kernel.org/r/20250123021029.2826736-1-liushixin2@huawei.com
Fixes: 3da0272a4c7d ("mm/compaction: correctly return failure with bogus compound_order in strict mode")
Signed-off-by: Liu Shixin <liushixin2(a)huawei.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Kemeng Shi <shikemeng(a)huaweicloud.com>
Cc: Matthew Wilcox <willy(a)infradead.org> [English fixes]
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Nanyong Sun <sunnanyong(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/compaction.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/mm/compaction.c~mm-compaction-fix-ubsan-shift-out-of-bounds-warning
+++ a/mm/compaction.c
@@ -631,7 +631,8 @@ static unsigned long isolate_freepages_b
if (PageCompound(page)) {
const unsigned int order = compound_order(page);
- if (blockpfn + (1UL << order) <= end_pfn) {
+ if ((order <= MAX_PAGE_ORDER) &&
+ (blockpfn + (1UL << order) <= end_pfn)) {
blockpfn += (1UL << order) - 1;
page += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
_
Patches currently in -mm which might be from liushixin2(a)huawei.com are
mm-page_isolation-avoid-call-folio_hstate-without-hugetlb_lock.patch
mm-compaction-fix-ubsan-shift-out-of-bounds-warning.patch
Protect access to fore200e->available_cell_rate with the rate_mtx lock to
prevent a potential data race.
In this case, since the update depends on a prior read, a data race
could lead to a wrong fore200e.available_cell_rate value.
The field fore200e.available_cell_rate is generally protected by the lock
fore200e.rate_mtx when accessed. In all other read and write cases, this
field is consistently protected by the lock, except for this case and
during initialization.
This potential bug was detected by our experimental static analysis tool,
which analyzes locking APIs and paired functions to identify data races
and atomicity violations.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <2045gemini(a)gmail.com>
---
v2:
* Added a description of the data race hazard in fore200e_open(), as
suggested by Jakub Kicinski and Simon Horman.
---
drivers/atm/fore200e.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 4fea1149e003..f62e38571440 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -1374,7 +1374,9 @@ fore200e_open(struct atm_vcc *vcc)
vcc->dev_data = NULL;
+ mutex_lock(&fore200e->rate_mtx);
fore200e->available_cell_rate += vcc->qos.txtp.max_pcr;
+ mutex_unlock(&fore200e->rate_mtx);
kfree(fore200e_vcc);
return -EINVAL;
--
2.25.1
From: Rand Deeb <rand.sec96(a)gmail.com>
[ Upstream commit 789c17185fb0f39560496c2beab9b57ce1d0cbe7 ]
The ssb_device_uevent() function first attempts to convert the 'dev' pointer
to 'struct ssb_device *'. However, it mistakenly dereferences 'dev' before
performing the NULL check, potentially leading to a NULL pointer
dereference if 'dev' is NULL.
To fix this issue, move the NULL check before dereferencing the 'dev' pointer,
ensuring that the pointer is valid before attempting to use it.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Signed-off-by: Rand Deeb <rand.sec96(a)gmail.com>
Signed-off-by: Kalle Valo <kvalo(a)kernel.org>
Link: https://msgid.link/20240306123028.164155-1-rand.sec96@gmail.com
Signed-off-by: Imkanmod Khan <imkanmodkhan(a)gmail.com>
---
drivers/ssb/main.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/ssb/main.c b/drivers/ssb/main.c
index d52e91258e98..aae50a5dfb57 100644
--- a/drivers/ssb/main.c
+++ b/drivers/ssb/main.c
@@ -341,11 +341,13 @@ static int ssb_bus_match(struct device *dev, struct device_driver *drv)
static int ssb_device_uevent(struct device *dev, struct kobj_uevent_env *env)
{
- struct ssb_device *ssb_dev = dev_to_ssb_dev(dev);
+ struct ssb_device *ssb_dev;
if (!dev)
return -ENODEV;
+ ssb_dev = dev_to_ssb_dev(dev);
+
return add_uevent_var(env,
"MODALIAS=ssb:v%04Xid%04Xrev%02X",
ssb_dev->id.vendor, ssb_dev->id.coreid,
--
2.25.1
From: Lucas De Marchi <lucas.demarchi(a)intel.com>
Commit 70fb86a85dc9 ("drm/xe: Revert some changes that break a mesa
debug tool") partially reverted some changes to workaround breakage
caused to mesa tools. However, in doing so it also broke fetching the
GuC log via debugfs since xe_print_blob_ascii85() simply bails out.
The fix is to avoid the extra newlines: the devcoredump interface is
line-oriented and adding random newlines in the middle breaks it. If a
tool is able to parse it by looking at the data and checking for chars
that are out of the ascii85 space, it can still do so. A format change
that breaks the line-oriented output on devcoredump however needs better
coordination with existing tools.
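For illustration, a tool-side check of the kind mentioned above could be
as simple as the following (hypothetical helper, not part of this patch;
the kernel's ASCII85 encoder only emits 'z' and characters in the
'!'..'u' range):

  #include <stdbool.h>

  /* true if 'c' belongs to the ASCII85 alphabet used by the kernel encoder */
  static bool is_ascii85_payload(char c)
  {
          return c == 'z' || (c >= '!' && c <= 'u');
  }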
Reviewed-by: José Roberto de Souza <jose.souza(a)intel.com>
Cc: John Harrison <John.C.Harrison(a)Intel.com>
Cc: Julia Filipchuk <julia.filipchuk(a)intel.com>
Cc: José Roberto de Souza <jose.souza(a)intel.com>
Cc: stable(a)vger.kernel.org
Fixes: 70fb86a85dc9 ("drm/xe: Revert some changes that break a mesa debug tool")
Fixes: ec1455ce7e35 ("drm/xe/devcoredump: Add ASCII85 dump helper function")
Signed-off-by: Lucas De Marchi <lucas.demarchi(a)intel.com>
---
drivers/gpu/drm/xe/xe_devcoredump.c | 30 +++++++++--------------------
drivers/gpu/drm/xe/xe_devcoredump.h | 2 +-
drivers/gpu/drm/xe/xe_guc_ct.c | 3 ++-
drivers/gpu/drm/xe/xe_guc_log.c | 4 +++-
4 files changed, 15 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 81dc7795c0651..1c86e6456d60f 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -395,42 +395,30 @@ int xe_devcoredump_init(struct xe_device *xe)
/**
* xe_print_blob_ascii85 - print a BLOB to some useful location in ASCII85
*
- * The output is split to multiple lines because some print targets, e.g. dmesg
- * cannot handle arbitrarily long lines. Note also that printing to dmesg in
- * piece-meal fashion is not possible, each separate call to drm_puts() has a
- * line-feed automatically added! Therefore, the entire output line must be
- * constructed in a local buffer first, then printed in one atomic output call.
+ * The output is split to multiple print calls because some print targets, e.g.
+ * dmesg cannot handle arbitrarily long lines. These targets may add newline
+ * between calls.
*
* There is also a scheduler yield call to prevent the 'task has been stuck for
* 120s' kernel hang check feature from firing when printing to a slow target
* such as dmesg over a serial port.
*
- * TODO: Add compression prior to the ASCII85 encoding to shrink huge buffers down.
- *
* @p: the printer object to output to
* @prefix: optional prefix to add to output string
* @blob: the Binary Large OBject to dump out
* @offset: offset in bytes to skip from the front of the BLOB, must be a multiple of sizeof(u32)
* @size: the size in bytes of the BLOB, must be a multiple of sizeof(u32)
*/
-void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix,
+void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffix,
const void *blob, size_t offset, size_t size)
{
const u32 *blob32 = (const u32 *)blob;
char buff[ASCII85_BUFSZ], *line_buff;
size_t line_pos = 0;
- /*
- * Splitting blobs across multiple lines is not compatible with the mesa
- * debug decoder tool. Note that even dropping the explicit '\n' below
- * doesn't help because the GuC log is so big some underlying implementation
- * still splits the lines at 512K characters. So just bail completely for
- * the moment.
- */
- return;
-
#define DMESG_MAX_LINE_LEN 800
-#define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "\n\0" */
+ /* Always leave space for the suffix char and the \0 */
+#define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "<suffix>\0" */
if (size & 3)
drm_printf(p, "Size not word aligned: %zu", size);
@@ -462,7 +450,6 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix,
line_pos += strlen(line_buff + line_pos);
if ((line_pos + MIN_SPACE) >= DMESG_MAX_LINE_LEN) {
- line_buff[line_pos++] = '\n';
line_buff[line_pos++] = 0;
drm_puts(p, line_buff);
@@ -474,10 +461,11 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix,
}
}
+ if (suffix)
+ line_buff[line_pos++] = suffix;
+
if (line_pos) {
- line_buff[line_pos++] = '\n';
line_buff[line_pos++] = 0;
-
drm_puts(p, line_buff);
}
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h
index 6a17e6d601022..5391a80a4d1ba 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump.h
@@ -29,7 +29,7 @@ static inline int xe_devcoredump_init(struct xe_device *xe)
}
#endif
-void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix,
+void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffix,
const void *blob, size_t offset, size_t size);
#endif
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 8b65c5e959cc2..50c8076b51585 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -1724,7 +1724,8 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot,
snapshot->g2h_outstanding);
if (snapshot->ctb)
- xe_print_blob_ascii85(p, "CTB data", snapshot->ctb, 0, snapshot->ctb_size);
+ xe_print_blob_ascii85(p, "CTB data", '\n',
+ snapshot->ctb, 0, snapshot->ctb_size);
} else {
drm_puts(p, "CT disabled\n");
}
diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
index 80151ff6a71f8..44482ea919924 100644
--- a/drivers/gpu/drm/xe/xe_guc_log.c
+++ b/drivers/gpu/drm/xe/xe_guc_log.c
@@ -207,8 +207,10 @@ void xe_guc_log_snapshot_print(struct xe_guc_log_snapshot *snapshot, struct drm_
remain = snapshot->size;
for (i = 0; i < snapshot->num_chunks; i++) {
size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
+ const char *prefix = i ? NULL : "Log data";
+ char suffix = i == snapshot->num_chunks - 1 ? '\n' : 0;
- xe_print_blob_ascii85(p, i ? NULL : "Log data", snapshot->copy[i], 0, size);
+ xe_print_blob_ascii85(p, prefix, suffix, snapshot->copy[i], 0, size);
remain -= size;
}
}
--
2.48.1
Commit 70fb86a85dc9 ("drm/xe: Revert some changes that break a mesa
debug tool") partially reverted some changes to workaround breakage
caused to mesa tools. However, in doing so it also broke fetching the
GuC log via debugfs since xe_print_blob_ascii85() simply bails out.
The fix is to avoid the extra newlines: the devcoredump interface is
line-oriented and adding random newlines in the middle breaks it. If a
tool is able to parse it by looking at the data and checking for chars
that are out of the ascii85 space, it can still do so. A format change
that breaks the line-oriented output on devcoredump however needs better
coordination with existing tools.
Cc: John Harrison <John.C.Harrison(a)Intel.com>
Cc: Julia Filipchuk <julia.filipchuk(a)intel.com>
Cc: José Roberto de Souza <jose.souza(a)intel.com>
Cc: stable(a)vger.kernel.org
Fixes: 70fb86a85dc9 ("drm/xe: Revert some changes that break a mesa debug tool")
Fixes: ec1455ce7e35 ("drm/xe/devcoredump: Add ASCII85 dump helper function")
Signed-off-by: Lucas De Marchi <lucas.demarchi(a)intel.com>
---
drivers/gpu/drm/xe/xe_devcoredump.c | 30 +++++++++--------------------
drivers/gpu/drm/xe/xe_devcoredump.h | 2 +-
drivers/gpu/drm/xe/xe_guc_ct.c | 3 ++-
drivers/gpu/drm/xe/xe_guc_log.c | 4 +++-
4 files changed, 15 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index a7946a76777e7..d9b71bb690860 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -391,42 +391,30 @@ int xe_devcoredump_init(struct xe_device *xe)
/**
* xe_print_blob_ascii85 - print a BLOB to some useful location in ASCII85
*
- * The output is split to multiple lines because some print targets, e.g. dmesg
- * cannot handle arbitrarily long lines. Note also that printing to dmesg in
- * piece-meal fashion is not possible, each separate call to drm_puts() has a
- * line-feed automatically added! Therefore, the entire output line must be
- * constructed in a local buffer first, then printed in one atomic output call.
+ * The output is split to multiple print calls because some print targets, e.g.
+ * dmesg cannot handle arbitrarily long lines. These targets may add newline
+ * between calls.
*
* There is also a scheduler yield call to prevent the 'task has been stuck for
* 120s' kernel hang check feature from firing when printing to a slow target
* such as dmesg over a serial port.
*
- * TODO: Add compression prior to the ASCII85 encoding to shrink huge buffers down.
- *
* @p: the printer object to output to
* @prefix: optional prefix to add to output string
* @blob: the Binary Large OBject to dump out
* @offset: offset in bytes to skip from the front of the BLOB, must be a multiple of sizeof(u32)
* @size: the size in bytes of the BLOB, must be a multiple of sizeof(u32)
*/
-void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix,
+void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffix,
const void *blob, size_t offset, size_t size)
{
const u32 *blob32 = (const u32 *)blob;
char buff[ASCII85_BUFSZ], *line_buff;
size_t line_pos = 0;
- /*
- * Splitting blobs across multiple lines is not compatible with the mesa
- * debug decoder tool. Note that even dropping the explicit '\n' below
- * doesn't help because the GuC log is so big some underlying implementation
- * still splits the lines at 512K characters. So just bail completely for
- * the moment.
- */
- return;
-
#define DMESG_MAX_LINE_LEN 800
-#define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "\n\0" */
+ /* Always leave space for the suffix char and the \0 */
+#define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "<suffix>\0" */
if (size & 3)
drm_printf(p, "Size not word aligned: %zu", size);
@@ -458,7 +446,6 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix,
line_pos += strlen(line_buff + line_pos);
if ((line_pos + MIN_SPACE) >= DMESG_MAX_LINE_LEN) {
- line_buff[line_pos++] = '\n';
line_buff[line_pos++] = 0;
drm_puts(p, line_buff);
@@ -470,10 +457,11 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix,
}
}
+ if (suffix)
+ line_buff[line_pos++] = suffix;
+
if (line_pos) {
- line_buff[line_pos++] = '\n';
line_buff[line_pos++] = 0;
-
drm_puts(p, line_buff);
}
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h
index 6a17e6d601022..5391a80a4d1ba 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump.h
@@ -29,7 +29,7 @@ static inline int xe_devcoredump_init(struct xe_device *xe)
}
#endif
-void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix,
+void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffix,
const void *blob, size_t offset, size_t size);
#endif
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 8b65c5e959cc2..50c8076b51585 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -1724,7 +1724,8 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot,
snapshot->g2h_outstanding);
if (snapshot->ctb)
- xe_print_blob_ascii85(p, "CTB data", snapshot->ctb, 0, snapshot->ctb_size);
+ xe_print_blob_ascii85(p, "CTB data", '\n',
+ snapshot->ctb, 0, snapshot->ctb_size);
} else {
drm_puts(p, "CT disabled\n");
}
diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
index 80151ff6a71f8..44482ea919924 100644
--- a/drivers/gpu/drm/xe/xe_guc_log.c
+++ b/drivers/gpu/drm/xe/xe_guc_log.c
@@ -207,8 +207,10 @@ void xe_guc_log_snapshot_print(struct xe_guc_log_snapshot *snapshot, struct drm_
remain = snapshot->size;
for (i = 0; i < snapshot->num_chunks; i++) {
size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
+ const char *prefix = i ? NULL : "Log data";
+ char suffix = i == snapshot->num_chunks - 1 ? '\n' : 0;
- xe_print_blob_ascii85(p, i ? NULL : "Log data", snapshot->copy[i], 0, size);
+ xe_print_blob_ascii85(p, prefix, suffix, snapshot->copy[i], 0, size);
remain -= size;
}
}
--
2.48.0
The following commit has been merged into the timers/urgent branch of tip:
Commit-ID: 53dac345395c0d2493cbc2f4c85fe38aef5b63f5
Gitweb: https://git.kernel.org/tip/53dac345395c0d2493cbc2f4c85fe38aef5b63f5
Author: Frederic Weisbecker <frederic(a)kernel.org>
AuthorDate: Sat, 18 Jan 2025 00:24:33 +01:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Thu, 23 Jan 2025 20:06:35 +01:00
hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING
hrtimers are migrated away from the dying CPU to any online target at
the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
handling tasks involved in the CPU hotplug forward progress.
However wakeups can still be performed by the outgoing CPU after
CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers being
armed. Depending on several considerations (crystal ball power management
based election, earliest timer already enqueued, timer migration enabled or
not), the target may eventually be the current CPU even if offline. If that
happens, the timer is eventually ignored.
The most notable example is RCU, which had to deal with each and every one
of those wake-ups by deferring them to an online CPU, along with related
workarounds:
_ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
_ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
_ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
The problem isn't confined to RCU though as the stop machine kthread
(which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
of its work through cpu_stop_signal_done() and performs a wake up that
eventually arms the deadline server timer:
WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
Call Trace:
<TASK>
start_dl_timer
enqueue_dl_entity
dl_server_start
enqueue_task_fair
enqueue_task
ttwu_do_activate
try_to_wake_up
complete
cpu_stopper_thread
Instead of providing yet another band-aid to work around the situation, fix
it in the hrtimers infrastructure: always migrate a timer away to an online
target whenever it is enqueued from an offline CPU.
This will also allow reverting all of the above disgraceful RCU hacks.
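A condensed sketch of the approach, distilled from the diff below (helper
names as introduced by the patch; only the hotplug-related branch is shown):

    static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
    {
            if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
                    return true;
            return likely(base->online);
    }

    static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
    {
            /* Never pick the local base as a target once it has gone offline:
             * redirect the enqueue to any online housekeeping CPU instead.
             */
            if (!hrtimer_base_is_online(base)) {
                    int cpu = cpumask_any_and(cpu_online_mask,
                                              housekeeping_cpumask(HK_TYPE_TIMER));

                    return &per_cpu(hrtimer_bases, cpu);
            }

            /* ... otherwise the usual (possibly NOHZ-aware) selection applies ... */
            return base;
    }

If such a remotely enqueued timer turns out to be the first expiring one on
its new base, the patch kicks that CPU through the per-base csd so its clock
event device gets reprogrammed.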
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Vlad Poenaru <vlad.wing(a)gmail.com>
Reported-by: Usama Arif <usamaarif642(a)gmail.com>
Signed-off-by: Frederic Weisbecker <frederic(a)kernel.org>
Signed-off-by: Paul E. McKenney <paulmck(a)kernel.org>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Tested-by: Paul E. McKenney <paulmck(a)kernel.org>
Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org
Closes: 20241213203739.1519801-1-usamaarif642(a)gmail.com
---
include/linux/hrtimer_defs.h | 1 +-
kernel/time/hrtimer.c | 103 +++++++++++++++++++++++++++-------
2 files changed, 83 insertions(+), 21 deletions(-)
diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h
index c3b4b7e..84a5045 100644
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
+ call_single_data_t csd;
} ____cacheline_aligned;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4fb81f8..deb1aa3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -178,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will be issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -249,8 +287,8 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -259,8 +297,7 @@ again:
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -706,8 +743,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1195,6 +1230,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1206,10 +1242,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
/*
+ * Don't force local queuing if this enqueue happens on an unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
+ /*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
@@ -1238,8 +1280,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
commit aec8e6bf839101784f3ef037dcdb9432c3f32343 ("btrfs:
fix use-after-free of block device file in __btrfs_free_extra_devids()")
Mounting btrfs from two images (which have the same fsid and two
different dev_uuids) in a certain order may trigger a UAF on the
variable 'device->bdev_file' in __btrfs_free_extra_devids(). The
details follow:
1. Attach image_1 to loop0, attach image_2 to loop1, and scan btrfs
devices by ioctl(BTRFS_IOC_SCAN_DEV):
             / btrfs_device_1 → loop0
   fs_device
             \ btrfs_device_2 → loop1
2. mount /dev/loop0 /mnt
   btrfs_open_devices
    btrfs_device_1->bdev_file = btrfs_get_bdev_and_sb(loop0)
    btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
   btrfs_fill_super
    open_ctree
     fail: btrfs_close_devices // -ENOMEM
           btrfs_close_bdev(btrfs_device_1)
            fput(btrfs_device_1->bdev_file)
             // btrfs_device_1->bdev_file is freed
           btrfs_close_bdev(btrfs_device_2)
            fput(btrfs_device_2->bdev_file)
3. mount /dev/loop1 /mnt
   btrfs_open_devices
    btrfs_get_bdev_and_sb(&bdev_file)
     // EIO, btrfs_device_1->bdev_file is not assigned,
     // which points to a freed memory area
    btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
   btrfs_fill_super
    open_ctree
     btrfs_free_extra_devids
      if (btrfs_device_1->bdev_file)
       fput(btrfs_device_1->bdev_file) // UAF !
Fix it by setting 'device->bdev_file' to NULL after closing the
btrfs_device in btrfs_close_one_device().
Fixes: CVE-2024-50217
Fixes: 142388194191 ("btrfs: do not background blkdev_put()")
CC: stable(a)vger.kernel.org # 4.19+
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219408
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
(cherry picked from commit aec8e6bf839101784f3ef037dcdb9432c3f32343)
Signed-off-by: Shubham Pushpkar <spushpka(a)cisco.com>
---
fs/btrfs/volumes.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9a0b26d08e1..ab2412542ce5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1176,6 +1176,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
+ device->bdev_file = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
--
2.35.6
This reverts commit 27ab05e1b7e5c5ec9b4f658e1b2464c0908298a6.
I tried to upgrade a RasPi 3B+ with Waveshare 7inch HDMI LCD
from 6.1.y to 6.6.y but found that the display is broken with
this log message:
[ 17.776315] vc4-drm soc:gpu: bound 3f400000.hvs (ops vc4_drm_unregister [vc4])
[ 17.784034] platform 3f806000.vec: deferred probe pending
Some tests revealed that while 6.1.y works, 6.2-rc1 is already broken, as
are all newer kernels. A bisect led me to this patch.
I could repair several versions up to 6.13-rc7 by applying this revert.
Newer kernels just have to take care of
commit f702475b839c ("ARM: dts: bcm2835-rpi: Move duplicate firmware-clocks to bcm2835-rpi.dtsi")
but that is straightforward.
Fixes: 27ab05e1b7e5 ("ARM: dts: bcm2835-rpi: Use firmware clocks for display")
Signed-off-by: H. Nikolaus Schaller <hns(a)goldelico.com>
---
arch/arm/boot/dts/bcm2835-rpi-common.dtsi | 17 -----------------
1 file changed, 17 deletions(-)
diff --git a/arch/arm/boot/dts/bcm2835-rpi-common.dtsi b/arch/arm/boot/dts/bcm2835-rpi-common.dtsi
index 4e7b4a592da7c..8a55b6cded592 100644
--- a/arch/arm/boot/dts/bcm2835-rpi-common.dtsi
+++ b/arch/arm/boot/dts/bcm2835-rpi-common.dtsi
@@ -7,23 +7,6 @@
#include <dt-bindings/power/raspberrypi-power.h>
-&firmware {
- firmware_clocks: clocks {
- compatible = "raspberrypi,firmware-clocks";
- #clock-cells = <1>;
- };
-};
-
-&hdmi {
- clocks = <&firmware_clocks 9>,
- <&firmware_clocks 13>;
- clock-names = "pixel", "hdmi";
-};
-
&v3d {
power-domains = <&power RPI_POWER_DOMAIN_V3D>;
};
-
-&vec {
- clocks = <&firmware_clocks 15>;
-};
--
2.47.0
The following commit has been merged into the timers/urgent branch of tip:
Commit-ID: b7a110336261147ccb373f4100cc88271c54bd91
Gitweb: https://git.kernel.org/tip/b7a110336261147ccb373f4100cc88271c54bd91
Author: Frederic Weisbecker <frederic(a)kernel.org>
AuthorDate: Sat, 18 Jan 2025 00:24:33 +01:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Thu, 23 Jan 2025 11:47:23 +01:00
hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING
hrtimers are migrated away from the dying CPU to any online target at
the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
handling tasks involved in the CPU hotplug forward progress.
However, wakeups can still be performed by the outgoing CPU after
CPUHP_AP_HRTIMERS_DYING. Those can again result in bandwidth timers being
armed. Depending on several considerations (crystal ball power management
based election, earliest timer already enqueued, timer migration enabled or
not), the target may eventually be the current CPU even if offline. If that
happens, the timer is eventually ignored.
The most notable example is RCU, which had to deal with each and every one
of those wake-ups by deferring them to an online CPU, along with related
workarounds:
_ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
_ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
_ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
The problem isn't confined to RCU though as the stop machine kthread
(which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
of its work through cpu_stop_signal_done() and performs a wake up that
eventually arms the deadline server timer:
WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
Call Trace:
<TASK>
start_dl_timer
enqueue_dl_entity
dl_server_start
enqueue_task_fair
enqueue_task
ttwu_do_activate
try_to_wake_up
complete
cpu_stopper_thread
Instead of providing yet another band-aid to work around the situation, fix
it in the hrtimers infrastructure: always migrate a timer away to an online
target whenever it is enqueued from an offline CPU.
This will also allow reverting all of the above disgraceful RCU hacks.
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Vlad Poenaru <vlad.wing(a)gmail.com>
Reported-by: Usama Arif <usamaarif642(a)gmail.com>
Signed-off-by: Frederic Weisbecker <frederic(a)kernel.org>
Signed-off-by: Paul E. McKenney <paulmck(a)kernel.org>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Tested-by: Paul E. McKenney <paulmck(a)kernel.org>
Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org
Closes: 20241213203739.1519801-1-usamaarif642(a)gmail.com
---
include/linux/hrtimer_defs.h | 1 +-
kernel/time/hrtimer.c | 103 +++++++++++++++++++++++++++-------
2 files changed, 83 insertions(+), 21 deletions(-)
diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h
index c3b4b7e..84a5045 100644
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
+ call_single_data_t csd;
} ____cacheline_aligned;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 14bd09c..0feb38b 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -183,27 +194,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+ * elected new base. An IPI will be issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -254,8 +292,8 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -264,8 +302,7 @@ again:
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -716,8 +753,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1205,6 +1240,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1216,10 +1252,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
/*
+ * Don't force local queuing if this enqueue happens on an unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
+ /*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
@@ -1248,8 +1290,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
The NCSI state machine, as currently implemented, assumes that the
transition to the next logical state is performed either explicitly, by
calling `schedule_work(&ndp->work)` to re-queue itself, or implicitly,
after processing the predefined (ndp->pending_req_num) number of
replies. Thus, to keep the configuration FSM from advancing prematurely
and getting out of sync with the process, it is essential not to skip
waiting for a reply.
This patch makes the code wait for reception of the Deselect Package
response for the last package probed before proceeding to channel
configuration.
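Condensed, the change below moves the "last package" check to the point
where the probe FSM re-enters ncsi_dev_state_probe_package, i.e. only after
the Deselect Package (DP) response for the previous package has been
processed. A simplified excerpt of the resulting flow (not the full
handler):

    case ncsi_dev_state_probe_package:
            if (ndp->package_probe_id >= 8) {
                    /* Last package probed and its DP response handled:
                     * safe to finish probing now.
                     */
                    ndp->flags |= NCSI_DEV_PROBED;
                    break;
            }

            /* Otherwise probe the next package: send Select Package and
             * wait for exactly one reply before the FSM may advance.
             */
            ndp->pending_req_num = 1;
            nca.type = NCSI_PKT_CMD_SP;
            break;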
Thanks go to Potin Lai and Cosmo Chou for the initial investigation and
testing.
Fixes: 8e13f70be05e ("net/ncsi: Probe single packages to avoid conflict")
Cc: stable(a)vger.kernel.org
Signed-off-by: Paul Fertser <fercerpav(a)gmail.com>
---
net/ncsi/ncsi-manage.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 5cf55bde366d..bf8e27b84a66 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -1373,6 +1373,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
nd->state = ncsi_dev_state_probe_package;
break;
case ncsi_dev_state_probe_package:
+ if (ndp->package_probe_id >= 8) {
+ /* Last package probed, finishing */
+ ndp->flags |= NCSI_DEV_PROBED;
+ break;
+ }
+
ndp->pending_req_num = 1;
nca.type = NCSI_PKT_CMD_SP;
@@ -1489,13 +1495,8 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
if (ret)
goto error;
- /* Probe next package */
+ /* Probe next package after receiving response */
ndp->package_probe_id++;
- if (ndp->package_probe_id >= 8) {
- /* Probe finished */
- ndp->flags |= NCSI_DEV_PROBED;
- break;
- }
nd->state = ncsi_dev_state_probe_package;
ndp->active_package = NULL;
break;
--
2.34.1
On Wed, Jan 15, 2025 at 12:03:27PM -0700, Keith Busch wrote:
> On Wed, Jan 15, 2025 at 06:10:05PM +0100, Paolo Bonzini wrote:
> > You can implement something like pthread_once():
>
> ...
>
> > Where to put it I don't know. It doesn't belong in
> > include/linux/once.h. I'm okay with arch/x86/kvm/call_once.h and just
> > pull it with #include "call_once.h".
>
> Thanks for the suggestion, I can work with that. As to where to put it,
> I think the new 'struct once' needs to be a member of struct kvm_arch,
> so I've put it in arch/x86/include/asm/.
>
> Here's the result with that folded in. If this is okay, I'll send a v2,
> and can split out the call_once as a prep patch with your attribution if
> you like.
Has there been any progress here? I'm also affected by the crosvm
regression, and it's been backported to the LTS stable kernel.
(CCing the stable and regressions lists to make sure the regression is
tracked.)
#regzbot introduced: d96c77bd4eeb
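For readers following along, a minimal sketch of the pthread_once()-style
helper discussed above might look like the following; the names
(struct once, call_once()) and the header location come from the discussion
and are illustrative assumptions, not necessarily what was eventually
merged:

    /* Hypothetical call_once.h sketch: run the callback exactly once per
     * 'struct once' instance, no matter how many callers race on it.
     */
    struct once {
            atomic_t state;
            struct mutex lock;
    };

    #define ONCE_NOT_STARTED        0
    #define ONCE_COMPLETED          1

    static inline void init_once(struct once *once)
    {
            atomic_set(&once->state, ONCE_NOT_STARTED);
            mutex_init(&once->lock);
    }

    static inline void call_once(struct once *once, void (*cb)(struct once *))
    {
            /* Fast path: already done, no locking needed. */
            if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
                    return;

            mutex_lock(&once->lock);
            if (atomic_read(&once->state) == ONCE_NOT_STARTED) {
                    cb(once);
                    /* Publish completion only after the callback finished. */
                    atomic_set_release(&once->state, ONCE_COMPLETED);
            }
            mutex_unlock(&once->lock);
    }

An embedded 'struct once' in struct kvm_arch, initialized at VM creation,
would then let the expensive setup run lazily on first use.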
From: Nick Child <nnac123(a)linux.ibm.com>
From: Nick Child <nnac123(a)linux.ibm.com>
commit 0983d288caf984de0202c66641577b739caad561 upstream.
Below is a summary of how the driver stores a reference to an skb during
transmit:
tx_buff[free_map[consumer_index]]->skb = new_skb;
free_map[consumer_index] = IBMVNIC_INVALID_MAP;
consumer_index ++;
Where variable data looks like this:
free_map == [4, IBMVNIC_INVALID_MAP, IBMVNIC_INVALID_MAP, 0, 3]
consumer_index^
tx_buff == [skb=null, skb=<ptr>, skb=<ptr>, skb=null, skb=null]
The driver has checks to ensure that free_map[consumer_index] points to
a valid index, but there was no check to ensure that this index points
to an unused/NULL skb address. So if, by some chance, the free_map and
tx_buff lists become out of sync, we were previously risking an skb
memory leak. This could then cause TCP congestion control to stop
sending packets, eventually leading to ETIMEDOUT.
Therefore, add a conditional to ensure that the skb address is NULL. If
it is not, warn the user (because this is still a bug that should be
patched) and free the old pointer to prevent the memory leak and TCP
problems.
Signed-off-by: Nick Child <nnac123(a)linux.ibm.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
[Denis: minor fix to resolve merge conflict.]
Signed-off-by: Denis Arefev <arefev(a)swemel.ru>
---
Backport fix for CVE-2024-41066
Link: https://nvd.nist.gov/vuln/detail/CVE-2024-41066
---
drivers/net/ethernet/ibm/ibmvnic.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 84da6ccaf339..439796975cbf 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1625,6 +1625,18 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
(tx_pool->consumer_index + 1) % tx_pool->num_buffers;
tx_buff = &tx_pool->tx_buff[index];
+
+ /* Sanity checks on our free map to make sure it points to an index
+ * that is not being occupied by another skb. If skb memory is
+ * not freed then we see congestion control kick in and halt tx.
+ */
+ if (unlikely(tx_buff->skb)) {
+ dev_warn_ratelimited(dev, "TX free map points to untracked skb (%s %d idx=%d)\n",
+ skb_is_gso(skb) ? "tso_pool" : "tx_pool",
+ queue_num, bufidx);
+ dev_kfree_skb_any(tx_buff->skb);
+ }
+
tx_buff->skb = skb;
tx_buff->data_dma[0] = data_dma_addr;
tx_buff->data_len[0] = skb->len;
--
2.43.0
#regzbot introduced v6.6.69..v6.6.70
#regzbot introduced: ad91a2dacbf8c26a446658cdd55e8324dfeff1e7
We hit this regression when updating our guest vm kernel from 6.6.69
to 6.6.70 -- bisecting, this problem was introduced in
ad91a2dacbf8c26a446658cdd55e8324dfeff1e7 -- net: restrict SO_REUSEPORT
to inet sockets
We're getting a timeout when trying to connect to the vsocket in the
guest VM when launching a kata containers 3.10.1 agent which
unsurprisingly ... uses a vsocket to communicate back to the host.
We updated this commit, added an additional sk_is_vsock() check, and
recompiled; this works correctly for us.
- if (valbool && !sk_is_inet(sk))
+ if (valbool && !(sk_is_inet(sk) || sk_is_vsock(sk)))
My understanding is limited here so I've added Stefano as he is likely
to better understand what makes sense here.
This commit was backported from v6.13 to v6.12.8..6.12.9.
-Simon
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
[ Upstream commit a3616a3c02722d1edb95acc7fceade242f6553ba ]
In the fpsp040 code, when copyin or copyout fails, call
force_sigsegv(SIGSEGV) instead of do_exit(SIGSEGV).
This solves a couple of problems. Because do_exit embeds the
PTRACE_EVENT_EXIT ptrace stop, a complete stack frame needs to be present
for that to work correctly. The information needed for a ptrace stop is
always available where get_signal is called, so exiting with a signal
solves the ptrace issue.
Further, exiting with a signal ensures that all of the threads in a
process are killed, not just the thread that malfunctioned, which avoids
confusing userspace.
To make force_sigsegv(SIGSEGV) work in fpsp040_die modify the code to
save all of the registers and jump to ret_from_exception (which
ultimately calls get_signal) after fpsp040_die returns.
v2: Updated the branches to use gas's pseudo ops that automatically
calculate the best branch instruction to use for the purpose.
v1: https://lkml.kernel.org/r/87a6m8kgtx.fsf_-_@disp2133
Link: https://lkml.kernel.org/r/87tukghjfs.fsf_-_@disp2133
Acked-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
Signed-off-by: Finn Thain <fthain(a)linux-m68k.org>
---
arch/m68k/fpsp040/skeleton.S | 3 ++-
arch/m68k/kernel/traps.c | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/m68k/fpsp040/skeleton.S b/arch/m68k/fpsp040/skeleton.S
index 31a9c634c81e..081922c72daa 100644
--- a/arch/m68k/fpsp040/skeleton.S
+++ b/arch/m68k/fpsp040/skeleton.S
@@ -502,7 +502,8 @@ in_ea:
.section .fixup,"ax"
.even
1:
- jbra fpsp040_die
+ jbsr fpsp040_die
+ jbra .Lnotkern
.section __ex_table,"a"
.align 4
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index 35f706d836c5..c6f18dc5884b 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -1155,7 +1155,7 @@ asmlinkage void set_esp0(unsigned long ssp)
*/
asmlinkage void fpsp040_die(void)
{
- do_exit(SIGSEGV);
+ force_sigsegv(SIGSEGV);
}
#ifdef CONFIG_M68KFPU_EMU
From: Liam Howlett <liam.howlett(a)oracle.com>
[ Upstream commit f829b4b212a315b912cb23fd10aaf30534bb5ce9 ]
When the superuser flushes the entire cache, the mmap_read_lock() is not
taken, but mmap_read_unlock() is called. Add the missing
mmap_read_lock() call.
Fixes: cd2567b6850b1648 ("m68k: call find_vma with the mmap_sem held in sys_cacheflush()")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Link: https://lore.kernel.org/r/20210407200032.764445-1-Liam.Howlett@Oracle.com
Signed-off-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
[ mmap_read_lock() open-coded using down_read() as was done prior to v5.8 ]
Signed-off-by: Finn Thain <fthain(a)linux-m68k.org>
---
arch/m68k/kernel/sys_m68k.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c
index 6363ec83a290..38dcc1a2097d 100644
--- a/arch/m68k/kernel/sys_m68k.c
+++ b/arch/m68k/kernel/sys_m68k.c
@@ -388,6 +388,8 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len)
ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
goto out;
+
+ down_read(&current->mm->mmap_sem);
} else {
struct vm_area_struct *vma;
From: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi(a)menlosecurity.com>
[ Upstream commit d9ccb18f83ea2bb654289b6ecf014fd267cc988b ]
Soft lockups have been observed on a cluster of Linux-based edge routers
located in a highly dynamic environment. Using the `bird` service, these
routers continuously update BGP-advertised routes due to frequently
changing nexthop destinations, while also managing significant IPv6
traffic. The lockups occur during the traversal of the multipath
circular linked-list in the `fib6_select_path` function, particularly
while iterating through the siblings in the list. The issue typically
arises when the nodes of the linked list are unexpectedly deleted
concurrently on a different core—indicated by their 'next' and
'previous' elements pointing back to the node itself and their reference
count dropping to zero. This results in an infinite loop, leading to a
soft lockup that triggers a system panic via the watchdog timer.
Apply RCU primitives in the problematic code sections to resolve the
issue. Where necessary, update the references to fib6_siblings to
annotate or use the RCU APIs.
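The pattern boils down to pairing the RCU list-mutation helpers on the
writer side with the _rcu traversal on the reader side, roughly as in the
hunks below (simplified; in some of the lookup paths the RCU read-side
critical section is already held by the caller):

    /* Writer side (under the table lock): use the RCU-aware list helpers
     * so concurrent readers never observe half-updated list pointers.
     */
    list_add_tail_rcu(&rt->fib6_siblings, &sibling->fib6_siblings);
    /* ... and on removal ... */
    list_del_rcu(&rt->fib6_siblings);

    /* Reader side: walk the siblings inside an RCU read-side critical
     * section with the _rcu iterator instead of list_for_each_entry_safe(),
     * so a concurrent delete cannot leave the walker spinning forever.
     */
    rcu_read_lock();
    list_for_each_entry_rcu(sibling, &match->fib6_siblings, fib6_siblings) {
            /* ... inspect sibling->fib6_nh ... */
    }
    rcu_read_unlock();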
Include a test script that reproduces the issue. The script
periodically updates the routing table while generating a heavy load
of outgoing IPv6 traffic through multiple iperf3 clients. It
consistently induces infinite soft lockups within a couple of minutes.
Kernel log:
0 [ffffbd13003e8d30] machine_kexec at ffffffff8ceaf3eb
1 [ffffbd13003e8d90] __crash_kexec at ffffffff8d0120e3
2 [ffffbd13003e8e58] panic at ffffffff8cef65d4
3 [ffffbd13003e8ed8] watchdog_timer_fn at ffffffff8d05cb03
4 [ffffbd13003e8f08] __hrtimer_run_queues at ffffffff8cfec62f
5 [ffffbd13003e8f70] hrtimer_interrupt at ffffffff8cfed756
6 [ffffbd13003e8fd0] __sysvec_apic_timer_interrupt at ffffffff8cea01af
7 [ffffbd13003e8ff0] sysvec_apic_timer_interrupt at ffffffff8df1b83d
-- <IRQ stack> --
8 [ffffbd13003d3708] asm_sysvec_apic_timer_interrupt at ffffffff8e000ecb
[exception RIP: fib6_select_path+299]
RIP: ffffffff8ddafe7b RSP: ffffbd13003d37b8 RFLAGS: 00000287
RAX: ffff975850b43600 RBX: ffff975850b40200 RCX: 0000000000000000
RDX: 000000003fffffff RSI: 0000000051d383e4 RDI: ffff975850b43618
RBP: ffffbd13003d3800 R8: 0000000000000000 R9: ffff975850b40200
R10: 0000000000000000 R11: 0000000000000000 R12: ffffbd13003d3830
R13: ffff975850b436a8 R14: ffff975850b43600 R15: 0000000000000007
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
9 [ffffbd13003d3808] ip6_pol_route at ffffffff8ddb030c
10 [ffffbd13003d3888] ip6_pol_route_input at ffffffff8ddb068c
11 [ffffbd13003d3898] fib6_rule_lookup at ffffffff8ddf02b5
12 [ffffbd13003d3928] ip6_route_input at ffffffff8ddb0f47
13 [ffffbd13003d3a18] ip6_rcv_finish_core.constprop.0 at ffffffff8dd950d0
14 [ffffbd13003d3a30] ip6_list_rcv_finish.constprop.0 at ffffffff8dd96274
15 [ffffbd13003d3a98] ip6_sublist_rcv at ffffffff8dd96474
16 [ffffbd13003d3af8] ipv6_list_rcv at ffffffff8dd96615
17 [ffffbd13003d3b60] __netif_receive_skb_list_core at ffffffff8dc16fec
18 [ffffbd13003d3be0] netif_receive_skb_list_internal at ffffffff8dc176b3
19 [ffffbd13003d3c50] napi_gro_receive at ffffffff8dc565b9
20 [ffffbd13003d3c80] ice_receive_skb at ffffffffc087e4f5 [ice]
21 [ffffbd13003d3c90] ice_clean_rx_irq at ffffffffc0881b80 [ice]
22 [ffffbd13003d3d20] ice_napi_poll at ffffffffc088232f [ice]
23 [ffffbd13003d3d80] __napi_poll at ffffffff8dc18000
24 [ffffbd13003d3db8] net_rx_action at ffffffff8dc18581
25 [ffffbd13003d3e40] __do_softirq at ffffffff8df352e9
26 [ffffbd13003d3eb0] run_ksoftirqd at ffffffff8ceffe47
27 [ffffbd13003d3ec0] smpboot_thread_fn at ffffffff8cf36a30
28 [ffffbd13003d3ee8] kthread at ffffffff8cf2b39f
29 [ffffbd13003d3f28] ret_from_fork at ffffffff8ce5fa64
30 [ffffbd13003d3f50] ret_from_fork_asm at ffffffff8ce03cbb
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Reported-by: Adrian Oliver <kernel(a)aoliver.ca>
Signed-off-by: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi(a)menlosecurity.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Ido Schimmel <idosch(a)idosch.org>
Cc: Kuniyuki Iwashima <kuniyu(a)amazon.com>
Cc: Simon Horman <horms(a)kernel.org>
Reviewed-by: David Ahern <dsahern(a)kernel.org>
Link: https://patch.msgid.link/20241106010236.1239299-1-omid.ehtemamhaghighi@menl…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Rajani Kantha <rajanikantha(a)engineer.com>
---
net/ipv6/ip6_fib.c | 8 +-
net/ipv6/route.c | 45 ++-
tools/testing/selftests/net/Makefile | 1 +
.../net/ipv6_route_update_soft_lockup.sh | 262 ++++++++++++++++++
4 files changed, 297 insertions(+), 19 deletions(-)
create mode 100755 tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 0b45ef8b7ee2..b6a7cbd6bee0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1180,8 +1180,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
while (sibling) {
if (sibling->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(sibling)) {
- list_add_tail(&rt->fib6_siblings,
- &sibling->fib6_siblings);
+ list_add_tail_rcu(&rt->fib6_siblings,
+ &sibling->fib6_siblings);
break;
}
sibling = rcu_dereference_protected(sibling->fib6_next,
@@ -1242,7 +1242,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
fib6_siblings)
sibling->fib6_nsiblings--;
rt->fib6_nsiblings = 0;
- list_del_init(&rt->fib6_siblings);
+ list_del_rcu(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
return err;
}
@@ -1955,7 +1955,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
&rt->fib6_siblings, fib6_siblings)
sibling->fib6_nsiblings--;
rt->fib6_nsiblings = 0;
- list_del_init(&rt->fib6_siblings);
+ list_del_rcu(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5ae3ff6ffb7e..f3268bac9f19 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -420,8 +420,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
struct flowi6 *fl6, int oif, bool have_oif_match,
const struct sk_buff *skb, int strict)
{
- struct fib6_info *sibling, *next_sibling;
struct fib6_info *match = res->f6i;
+ struct fib6_info *sibling;
if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
goto out;
@@ -447,8 +447,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
goto out;
- list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
- fib6_siblings) {
+ list_for_each_entry_rcu(sibling, &match->fib6_siblings,
+ fib6_siblings) {
const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
@@ -5189,14 +5189,18 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
* nexthop. Since sibling routes are always added at the end of
* the list, find the first sibling of the last route appended
*/
+ rcu_read_lock();
+
if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
- rt = list_first_entry(&rt_last->fib6_siblings,
- struct fib6_info,
- fib6_siblings);
+ rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
}
if (rt)
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+
+ rcu_read_unlock();
}
static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
@@ -5541,17 +5545,21 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i)
nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
&nexthop_len);
} else {
- struct fib6_info *sibling, *next_sibling;
struct fib6_nh *nh = f6i->fib6_nh;
+ struct fib6_info *sibling;
nexthop_len = 0;
if (f6i->fib6_nsiblings) {
rt6_nh_nlmsg_size(nh, &nexthop_len);
- list_for_each_entry_safe(sibling, next_sibling,
- &f6i->fib6_siblings, fib6_siblings) {
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
+ fib6_siblings) {
rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
}
+
+ rcu_read_unlock();
}
nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
}
@@ -5715,7 +5723,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
goto nla_put_failure;
} else if (rt->fib6_nsiblings) {
- struct fib6_info *sibling, *next_sibling;
+ struct fib6_info *sibling;
struct nlattr *mp;
mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
@@ -5727,14 +5735,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
0) < 0)
goto nla_put_failure;
- list_for_each_entry_safe(sibling, next_sibling,
- &rt->fib6_siblings, fib6_siblings) {
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
+ fib6_siblings) {
if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
sibling->fib6_nh->fib_nh_weight,
- AF_INET6, 0) < 0)
+ AF_INET6, 0) < 0) {
+ rcu_read_unlock();
+
goto nla_put_failure;
+ }
}
+ rcu_read_unlock();
+
nla_nest_end(skb, mp);
} else if (rt->nh) {
if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
@@ -6171,7 +6186,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
- skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC);
if (!skb)
goto errout;
@@ -6184,7 +6199,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
goto errout;
}
rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
- info->nlh, gfp_any());
+ info->nlh, GFP_ATOMIC);
return;
errout:
if (err < 0)
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 48d1a68be1d5..080860a8826b 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -72,6 +72,7 @@ TEST_GEN_PROGS += sk_bind_sendto_listen
TEST_GEN_PROGS += sk_connect_zero_addr
TEST_PROGS += test_ingress_egress_chaining.sh
TEST_GEN_FILES += nat6to4.o
+TEST_PROGS += ipv6_route_update_soft_lockup.sh
TEST_FILES := settings
diff --git a/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
new file mode 100755
index 000000000000..a6b2b1f9c641
--- /dev/null
+++ b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Testing for potential kernel soft lockup during IPv6 routing table
+# refresh under heavy outgoing IPv6 traffic. If a kernel soft lockup
+# occurs, a kernel panic will be triggered to prevent associated issues.
+#
+#
+# Test Environment Layout
+#
+# ┌----------------┐ ┌----------------┐
+# | SOURCE_NS | | SINK_NS |
+# | NAMESPACE | | NAMESPACE |
+# |(iperf3 clients)| |(iperf3 servers)|
+# | | | |
+# | | | |
+# | ┌-----------| nexthops |---------┐ |
+# | |veth_source|<--------------------------------------->|veth_sink|<┐ |
+# | └-----------|2001:0DB8:1::0:1/96 2001:0DB8:1::1:1/96 |---------┘ | |
+# | | ^ 2001:0DB8:1::1:2/96 | | |
+# | | . . | fwd | |
+# | ┌---------┐ | . . | | |
+# | | IPv6 | | . . | V |
+# | | routing | | . 2001:0DB8:1::1:80/96| ┌-----┐ |
+# | | table | | . | | lo | |
+# | | nexthop | | . └--------┴-----┴-┘
+# | | update | | ............................> 2001:0DB8:2::1:1/128
+# | └-------- ┘ |
+# └----------------┘
+#
+# The test script sets up two network namespaces, source_ns and sink_ns,
+# connected via a veth link. Within source_ns, it continuously updates the
+# IPv6 routing table by flushing and inserting IPV6_NEXTHOP_ADDR_COUNT nexthop
+# IPs destined for SINK_LOOPBACK_IP_ADDR in sink_ns. This refresh occurs at a
+# rate of 1/ROUTING_TABLE_REFRESH_PERIOD per second for TEST_DURATION seconds.
+#
+# Simultaneously, multiple iperf3 clients within source_ns generate heavy
+# outgoing IPv6 traffic. Each client is assigned a unique port number starting
+# at 5000 and incrementing sequentially. Each client targets a unique iperf3
+# server running in sink_ns, connected to the SINK_LOOPBACK_IFACE interface
+# using the same port number.
+#
+# The number of iperf3 servers and clients is set to half of the total
+# available cores on each machine.
+#
+# NOTE: We have tested this script on machines with various CPU specifications,
+# ranging from lower to higher performance as listed below. The test script
+# effectively triggered a kernel soft lockup on machines running an unpatched
+# kernel in under a minute:
+#
+# - 1x Intel Xeon E-2278G 8-Core Processor @ 3.40GHz
+# - 1x Intel Xeon E-2378G Processor 8-Core @ 2.80GHz
+# - 1x AMD EPYC 7401P 24-Core Processor @ 2.00GHz
+# - 1x AMD EPYC 7402P 24-Core Processor @ 2.80GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 1x Ampere Altra Q80-30 80-Core Processor @ 3.00GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 2x Intel Xeon Silver 4214 24-Core Processor @ 2.20GHz
+# - 1x AMD EPYC 7502P 32-Core @ 2.50GHz
+# - 1x Intel Xeon Gold 6314U 32-Core Processor @ 2.30GHz
+# - 2x Intel Xeon Gold 6338 32-Core Processor @ 2.00GHz
+#
+# On less performant machines, you may need to increase the TEST_DURATION
+# parameter to enhance the likelihood of encountering a race condition leading
+# to a kernel soft lockup and avoid a false negative result.
+#
+# NOTE: The test may not produce the expected result in virtualized
+# environments (e.g., qemu) due to differences in timing and CPU handling,
+# which can affect the conditions needed to trigger a soft lockup.
+
+source lib.sh
+source net_helper.sh
+
+TEST_DURATION=300
+ROUTING_TABLE_REFRESH_PERIOD=0.01
+
+IPERF3_BITRATE="300m"
+
+
+IPV6_NEXTHOP_ADDR_COUNT="128"
+IPV6_NEXTHOP_ADDR_MASK="96"
+IPV6_NEXTHOP_PREFIX="2001:0DB8:1"
+
+
+SOURCE_TEST_IFACE="veth_source"
+SOURCE_TEST_IP_ADDR="2001:0DB8:1::0:1/96"
+
+SINK_TEST_IFACE="veth_sink"
+# ${SINK_TEST_IFACE} is populated with the following range of IPv6 addresses:
+# 2001:0DB8:1::1:1 to 2001:0DB8:1::1:${IPV6_NEXTHOP_ADDR_COUNT}
+SINK_LOOPBACK_IFACE="lo"
+SINK_LOOPBACK_IP_MASK="128"
+SINK_LOOPBACK_IP_ADDR="2001:0DB8:2::1:1"
+
+nexthop_ip_list=""
+termination_signal=""
+kernel_softlokup_panic_prev_val=""
+
+terminate_ns_processes_by_pattern() {
+ local ns=$1
+ local pattern=$2
+
+ for pid in $(ip netns pids ${ns}); do
+ [ -e /proc/$pid/cmdline ] && grep -qe "${pattern}" /proc/$pid/cmdline && kill -9 $pid
+ done
+}
+
+cleanup() {
+ echo "info: cleaning up namespaces and terminating all processes within them..."
+
+
+ # Terminate iperf3 instances running in the source_ns. To avoid race
+ # conditions, first iterate over the PIDs and terminate those
+ # associated with the bash shells running the
+ # `while true; do iperf3 -c ...; done` loops. In a second iteration,
+ # terminate the individual `iperf3 -c ...` instances.
+ terminate_ns_processes_by_pattern ${source_ns} while
+ terminate_ns_processes_by_pattern ${source_ns} iperf3
+
+ # Repeat the same process for sink_ns
+ terminate_ns_processes_by_pattern ${sink_ns} while
+ terminate_ns_processes_by_pattern ${sink_ns} iperf3
+
+ # Check if any iperf3 instances are still running. This could happen
+ # if a core has entered an infinite loop and the timeout for detecting
+ # the soft lockup has not expired, but either the test interval has
+ # already elapsed or the test was terminated manually (e.g., with ^C)
+ for pid in $(ip netns pids ${source_ns}); do
+ if [ -e /proc/$pid/cmdline ] && grep -qe 'iperf3' /proc/$pid/cmdline; then
+ echo "FAIL: unable to terminate some iperf3 instances. Soft lockup is underway. A kernel panic is on the way!"
+ exit ${ksft_fail}
+ fi
+ done
+
+ if [ "$termination_signal" == "SIGINT" ]; then
+ echo "SKIP: Termination due to ^C (SIGINT)"
+ elif [ "$termination_signal" == "SIGALRM" ]; then
+ echo "PASS: No kernel soft lockup occurred during this ${TEST_DURATION} second test"
+ fi
+
+ cleanup_ns ${source_ns} ${sink_ns}
+
+ sysctl -qw kernel.softlockup_panic=${kernel_softlokup_panic_prev_val}
+}
+
+setup_prepare() {
+ setup_ns source_ns sink_ns
+
+ ip -n ${source_ns} link add name ${SOURCE_TEST_IFACE} type veth peer name ${SINK_TEST_IFACE} netns ${sink_ns}
+
+ # Setting up the Source namespace
+ ip -n ${source_ns} addr add ${SOURCE_TEST_IP_ADDR} dev ${SOURCE_TEST_IFACE}
+ ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} qlen 10000
+ ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} up
+ ip netns exec ${source_ns} sysctl -qw net.ipv6.fib_multipath_hash_policy=1
+
+ # Setting up the Sink namespace
+ ip -n ${sink_ns} addr add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} dev ${SINK_LOOPBACK_IFACE}
+ ip -n ${sink_ns} link set dev ${SINK_LOOPBACK_IFACE} up
+ ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_LOOPBACK_IFACE}.forwarding=1
+
+ ip -n ${sink_ns} link set ${SINK_TEST_IFACE} up
+ ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_TEST_IFACE}.forwarding=1
+
+
+ # Populate nexthop IPv6 addresses on the test interface in the sink_ns
+ echo "info: populating ${IPV6_NEXTHOP_ADDR_COUNT} IPv6 addresses on the ${SINK_TEST_IFACE} interface ..."
+ for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+ ip -n ${sink_ns} addr add ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" "${IP}")/${IPV6_NEXTHOP_ADDR_MASK} dev ${SINK_TEST_IFACE};
+ done
+
+ # Preparing list of nexthops
+ for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+ nexthop_ip_list=$nexthop_ip_list" nexthop via ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" $IP) dev ${SOURCE_TEST_IFACE} weight 1"
+ done
+}
+
+
+test_soft_lockup_during_routing_table_refresh() {
+ # Start num_of_iperf_servers iperf3 servers in the sink_ns namespace,
+ # each listening on ports starting at 5001 and incrementing
+ # sequentially. Since iperf3 instances may terminate unexpectedly, a
+ # while loop is used to automatically restart them in such cases.
+ echo "info: starting ${num_of_iperf_servers} iperf3 servers in the sink_ns namespace ..."
+ for i in $(seq 1 ${num_of_iperf_servers}); do
+ cmd="iperf3 --bind ${SINK_LOOPBACK_IP_ADDR} -s -p $(printf '5%03d' ${i}) --rcv-timeout 200 &>/dev/null"
+ ip netns exec ${sink_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+ done
+
+ # Wait for the iperf3 servers to be ready
+ for i in $(seq ${num_of_iperf_servers}); do
+ port=$(printf '5%03d' ${i});
+ wait_local_port_listen ${sink_ns} ${port} tcp
+ done
+
+ # Continuously refresh the routing table in the background within
+ # the source_ns namespace
+ ip netns exec ${source_ns} bash -c "
+ while \$(ip netns list | grep -q ${source_ns}); do
+ ip -6 route add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} ${nexthop_ip_list};
+ sleep ${ROUTING_TABLE_REFRESH_PERIOD};
+ ip -6 route delete ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK};
+ done &"
+
+ # Start num_of_iperf_servers iperf3 clients in the source_ns namespace,
+ # each sending TCP traffic on sequential ports starting at 5001.
+ # Since iperf3 instances may terminate unexpectedly (e.g., if the route
+ # to the server is deleted in the background during a route refresh), a
+ # while loop is used to automatically restart them in such cases.
+ echo "info: starting ${num_of_iperf_servers} iperf3 clients in the source_ns namespace ..."
+ for i in $(seq 1 ${num_of_iperf_servers}); do
+ cmd="iperf3 -c ${SINK_LOOPBACK_IP_ADDR} -p $(printf '5%03d' ${i}) --length 64 --bitrate ${IPERF3_BITRATE} -t 0 --connect-timeout 150 &>/dev/null"
+ ip netns exec ${source_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+ done
+
+ echo "info: IPv6 routing table is being updated at the rate of $(echo "1/${ROUTING_TABLE_REFRESH_PERIOD}" | bc)/s for ${TEST_DURATION} seconds ..."
+ echo "info: A kernel soft lockup, if detected, results in a kernel panic!"
+
+ wait
+}
+
+# Make sure 'iperf3' is installed, skip the test otherwise
+if [ ! -x "$(command -v "iperf3")" ]; then
+ echo "SKIP: 'iperf3' is not installed. Skipping the test."
+ exit ${ksft_skip}
+fi
+
+# Determine the number of cores on the machine
+num_of_iperf_servers=$(( $(nproc)/2 ))
+
+# Check if we are running on a multi-core machine, skip the test otherwise
+if [ "${num_of_iperf_servers}" -eq 0 ]; then
+ echo "SKIP: This test is not valid on a single core machine!"
+ exit ${ksft_skip}
+fi
+
+# Since the kernel soft lockup we're testing causes at least one core to enter
+# an infinite loop, destabilizing the host and likely affecting subsequent
+# tests, we trigger a kernel panic instead of reporting a failure and
+# continuing
+kernel_softlokup_panic_prev_val=$(sysctl -n kernel.softlockup_panic)
+sysctl -qw kernel.softlockup_panic=1
+
+handle_sigint() {
+ termination_signal="SIGINT"
+ cleanup
+ exit ${ksft_skip}
+}
+
+handle_sigalrm() {
+ termination_signal="SIGALRM"
+ cleanup
+ exit ${ksft_pass}
+}
+
+trap handle_sigint SIGINT
+trap handle_sigalrm SIGALRM
+
+(sleep ${TEST_DURATION} && kill -s SIGALRM $$)&
+
+setup_prepare
+test_soft_lockup_during_routing_table_refresh
--
2.35.3
From: Selvin Xavier <selvin.xavier(a)broadcom.com>
[ Upstream commit 8be3e5b0c96beeefe9d5486b96575d104d3e7d17 ]
The driver waits indefinitely for the FIFO occupancy to go below a threshold
as soon as the pacing interrupt is received. This can cause a soft lockup on
one of the processors if the rate of doorbells (DB) is very high.
Add a loop count for FPGA and exit __wait_for_fifo_occupancy_below_th()
if the loop is taking too long. Pacing will continue until the
occupancy is below the threshold. This is ensured by the checks in
bnxt_re_pacing_timer_exp() and by further scheduling the pacing work based
on the FIFO occupancy.
Fixes: 2ad4e6303a6d ("RDMA/bnxt_re: Implement doorbell pacing algorithm")
Link: https://patch.msgid.link/r/1728373302-19530-7-git-send-email-selvin.xavier@…
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil(a)broadcom.com>
Reviewed-by: Chandramohan Akula <chandramohan.akula(a)broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier(a)broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
[ Add the declaration of variable pacing_data to make it work on 6.6.y ]
Signed-off-by: Alva Lan <alvalan9(a)foxmail.com>
---
drivers/infiniband/hw/bnxt_re/main.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index c7e51cc2ea26..082a383c4913 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -485,6 +485,8 @@ static void bnxt_re_set_default_pacing_data(struct bnxt_re_dev *rdev)
static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev)
{
u32 read_val, fifo_occup;
+ struct bnxt_qplib_db_pacing_data *pacing_data = rdev->qplib_res.pacing_data;
+ u32 retry_fifo_check = 1000;
/* loop shouldn't run infintely as the occupancy usually goes
* below pacing algo threshold as soon as pacing kicks in.
@@ -500,6 +502,14 @@ static void __wait_for_fifo_occupancy_below_th(struct bnxt_re_dev *rdev)
if (fifo_occup < rdev->qplib_res.pacing_data->pacing_th)
break;
+ if (!retry_fifo_check--) {
+ dev_info_once(rdev_to_dev(rdev),
+ "%s: fifo_occup = 0x%xfifo_max_depth = 0x%x pacing_th = 0x%x\n",
+ __func__, fifo_occup, pacing_data->fifo_max_depth,
+ pacing_data->pacing_th);
+ break;
+ }
+
}
}
--
2.43.0
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit 50e43a57334400668952f8e551c9d87d3ed2dfef ]
We get there when sigreturn has performed obscene acts on the kernel stack;
in particular, the location of pt_regs has shifted. We are about to call
syscall_trace(), which might stop for the tracer. If that happens, we'd
better have task_pt_regs() returning the correct result...
Fucked-up-by: Al Viro <viro(a)zeniv.linux.org.uk>
Fixes: bd6f56a75bb2 ("m68k: Missing syscall_trace() on sigreturn")
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Tested-by: Michael Schmitz <schmitzmic(a)gmail.com>
Reviewed-by: Michael Schmitz <schmitzmic(a)gmail.com>
Tested-by: Finn Thain <fthain(a)linux-m68k.org>
Link: https://lore.kernel.org/r/YP2dMWeV1LkHiOpr@zeniv-ca.linux.org.uk
Signed-off-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
Signed-off-by: Finn Thain <fthain(a)linux-m68k.org>
---
arch/m68k/kernel/entry.S | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
index 417d8f0e8962..0d03b4f2077b 100644
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -182,6 +182,8 @@ ENTRY(ret_from_signal)
movel %curptr@(TASK_STACK),%a1
tstb %a1@(TINFO_FLAGS+2)
jge 1f
+ lea %sp@(SWITCH_STACK_SIZE),%a1
+ movel %a1,%curptr@(TASK_THREAD+THREAD_ESP0)
jbsr syscall_trace
1: RESTORE_SWITCH_STACK
addql #4,%sp
From: Esben Haabendal <esben(a)geanix.com>
commit e533e4c62e9993e62e947ae9bbec34e4c7ae81c2 upstream.
By waiting at most 1 second for USR2_TXDC to be set, we avoid a potential
deadlock.
In case of the timeout, there is not much we can do, so we simply ignore
the transmitter state and optimistically try to continue.
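The bounded wait is expressed with the iopoll helper; annotated, the call
added in the hunk below reads roughly as follows (argument meanings per
include/linux/iopoll.h):

    /* Poll USR2 until the transmitter-complete bit is set, but give up
     * after one second instead of spinning forever. On timeout the return
     * value is ignored and the console write path simply carries on.
     */
    read_poll_timeout_atomic(imx_uart_readl,   /* read op, called as imx_uart_readl(sport, USR2) */
                             usr2,              /* variable receiving each read value */
                             usr2 & USR2_TXDC,  /* condition that ends the poll */
                             0,                 /* delay between reads, in us */
                             USEC_PER_SEC,      /* timeout: 1 second */
                             false,             /* no delay before the first read */
                             sport, USR2);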
Signed-off-by: Esben Haabendal <esben(a)geanix.com>
Acked-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
Link: https://lore.kernel.org/r/919647898c337a46604edcabaf13d42d80c0915d.17128376…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
[Denis: minor fix to resolve merge conflict.]
Signed-off-by: Denis Arefev <arefev(a)swemel.ru>
---
Backport fix for CVE-2024-40967
Link: https://nvd.nist.gov/vuln/detail/CVE-2024-40967
---
drivers/tty/serial/imx.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index 6e49928bb864..5abf6685fe3c 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -27,6 +27,7 @@
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/io.h>
+#include <linux/iopoll.h>
#include <linux/dma-mapping.h>
#include <asm/irq.h>
@@ -2006,8 +2007,8 @@ imx_uart_console_write(struct console *co, const char *s, unsigned int count)
{
struct imx_port *sport = imx_uart_ports[co->index];
struct imx_port_ucrs old_ucr;
- unsigned int ucr1;
- unsigned long flags = 0;
+ unsigned long flags;
+ unsigned int ucr1, usr2;
int locked = 1;
if (sport->port.sysrq)
@@ -2038,8 +2039,8 @@ imx_uart_console_write(struct console *co, const char *s, unsigned int count)
* Finally, wait for transmitter to become empty
* and restore UCR1/2/3
*/
- while (!(imx_uart_readl(sport, USR2) & USR2_TXDC));
-
+ read_poll_timeout_atomic(imx_uart_readl, usr2, usr2 & USR2_TXDC,
+ 0, USEC_PER_SEC, false, sport, USR2);
imx_uart_ucrs_restore(sport, &old_ucr);
if (locked)
--
2.43.0
From: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi(a)menlosecurity.com>
[ Upstream commit d9ccb18f83ea2bb654289b6ecf014fd267cc988b ]
Soft lockups have been observed on a cluster of Linux-based edge routers
located in a highly dynamic environment. Using the `bird` service, these
routers continuously update BGP-advertised routes due to frequently
changing nexthop destinations, while also managing significant IPv6
traffic. The lockups occur during the traversal of the multipath
circular linked-list in the `fib6_select_path` function, particularly
while iterating through the siblings in the list. The issue typically
arises when the nodes of the linked list are unexpectedly deleted
concurrently on a different core—indicated by their 'next' and
'previous' elements pointing back to the node itself and their reference
count dropping to zero. This results in an infinite loop, leading to a
soft lockup that triggers a system panic via the watchdog timer.
Apply RCU primitives in the problematic code sections to resolve the
issue. Where necessary, update the references to fib6_siblings to
annotate or use the RCU APIs.
Include a test script that reproduces the issue. The script
periodically updates the routing table while generating a heavy load
of outgoing IPv6 traffic through multiple iperf3 clients. It
consistently induces infinite soft lockups within a couple of minutes.
Kernel log:
0 [ffffbd13003e8d30] machine_kexec at ffffffff8ceaf3eb
1 [ffffbd13003e8d90] __crash_kexec at ffffffff8d0120e3
2 [ffffbd13003e8e58] panic at ffffffff8cef65d4
3 [ffffbd13003e8ed8] watchdog_timer_fn at ffffffff8d05cb03
4 [ffffbd13003e8f08] __hrtimer_run_queues at ffffffff8cfec62f
5 [ffffbd13003e8f70] hrtimer_interrupt at ffffffff8cfed756
6 [ffffbd13003e8fd0] __sysvec_apic_timer_interrupt at ffffffff8cea01af
7 [ffffbd13003e8ff0] sysvec_apic_timer_interrupt at ffffffff8df1b83d
-- <IRQ stack> --
8 [ffffbd13003d3708] asm_sysvec_apic_timer_interrupt at ffffffff8e000ecb
[exception RIP: fib6_select_path+299]
RIP: ffffffff8ddafe7b RSP: ffffbd13003d37b8 RFLAGS: 00000287
RAX: ffff975850b43600 RBX: ffff975850b40200 RCX: 0000000000000000
RDX: 000000003fffffff RSI: 0000000051d383e4 RDI: ffff975850b43618
RBP: ffffbd13003d3800 R8: 0000000000000000 R9: ffff975850b40200
R10: 0000000000000000 R11: 0000000000000000 R12: ffffbd13003d3830
R13: ffff975850b436a8 R14: ffff975850b43600 R15: 0000000000000007
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
9 [ffffbd13003d3808] ip6_pol_route at ffffffff8ddb030c
10 [ffffbd13003d3888] ip6_pol_route_input at ffffffff8ddb068c
11 [ffffbd13003d3898] fib6_rule_lookup at ffffffff8ddf02b5
12 [ffffbd13003d3928] ip6_route_input at ffffffff8ddb0f47
13 [ffffbd13003d3a18] ip6_rcv_finish_core.constprop.0 at ffffffff8dd950d0
14 [ffffbd13003d3a30] ip6_list_rcv_finish.constprop.0 at ffffffff8dd96274
15 [ffffbd13003d3a98] ip6_sublist_rcv at ffffffff8dd96474
16 [ffffbd13003d3af8] ipv6_list_rcv at ffffffff8dd96615
17 [ffffbd13003d3b60] __netif_receive_skb_list_core at ffffffff8dc16fec
18 [ffffbd13003d3be0] netif_receive_skb_list_internal at ffffffff8dc176b3
19 [ffffbd13003d3c50] napi_gro_receive at ffffffff8dc565b9
20 [ffffbd13003d3c80] ice_receive_skb at ffffffffc087e4f5 [ice]
21 [ffffbd13003d3c90] ice_clean_rx_irq at ffffffffc0881b80 [ice]
22 [ffffbd13003d3d20] ice_napi_poll at ffffffffc088232f [ice]
23 [ffffbd13003d3d80] __napi_poll at ffffffff8dc18000
24 [ffffbd13003d3db8] net_rx_action at ffffffff8dc18581
25 [ffffbd13003d3e40] __do_softirq at ffffffff8df352e9
26 [ffffbd13003d3eb0] run_ksoftirqd at ffffffff8ceffe47
27 [ffffbd13003d3ec0] smpboot_thread_fn at ffffffff8cf36a30
28 [ffffbd13003d3ee8] kthread at ffffffff8cf2b39f
29 [ffffbd13003d3f28] ret_from_fork at ffffffff8ce5fa64
30 [ffffbd13003d3f50] ret_from_fork_asm at ffffffff8ce03cbb
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Reported-by: Adrian Oliver <kernel(a)aoliver.ca>
Signed-off-by: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi(a)menlosecurity.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Ido Schimmel <idosch(a)idosch.org>
Cc: Kuniyuki Iwashima <kuniyu(a)amazon.com>
Cc: Simon Horman <horms(a)kernel.org>
Reviewed-by: David Ahern <dsahern(a)kernel.org>
Link: https://patch.msgid.link/20241106010236.1239299-1-omid.ehtemamhaghighi@menl…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
Signed-off-by: Rajani Kantha <rajanikantha(a)engineer.com>
---
net/ipv6/ip6_fib.c | 8 +-
net/ipv6/route.c | 45 ++-
tools/testing/selftests/net/Makefile | 1 +
.../net/ipv6_route_update_soft_lockup.sh | 262 ++++++++++++++++++
4 files changed, 297 insertions(+), 19 deletions(-)
create mode 100755 tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index afa9073567dc..023ac39041a2 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1179,8 +1179,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
while (sibling) {
if (sibling->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(sibling)) {
- list_add_tail(&rt->fib6_siblings,
- &sibling->fib6_siblings);
+ list_add_tail_rcu(&rt->fib6_siblings,
+ &sibling->fib6_siblings);
break;
}
sibling = rcu_dereference_protected(sibling->fib6_next,
@@ -1241,7 +1241,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
fib6_siblings)
sibling->fib6_nsiblings--;
rt->fib6_nsiblings = 0;
- list_del_init(&rt->fib6_siblings);
+ list_del_rcu(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
return err;
}
@@ -1954,7 +1954,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
&rt->fib6_siblings, fib6_siblings)
sibling->fib6_nsiblings--;
rt->fib6_nsiblings = 0;
- list_del_init(&rt->fib6_siblings);
+ list_del_rcu(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fc5c53462025..c5cee40a658b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -418,8 +418,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
struct flowi6 *fl6, int oif, bool have_oif_match,
const struct sk_buff *skb, int strict)
{
- struct fib6_info *sibling, *next_sibling;
struct fib6_info *match = res->f6i;
+ struct fib6_info *sibling;
if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
goto out;
@@ -445,8 +445,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
goto out;
- list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
- fib6_siblings) {
+ list_for_each_entry_rcu(sibling, &match->fib6_siblings,
+ fib6_siblings) {
const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
@@ -5186,14 +5186,18 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
* nexthop. Since sibling routes are always added at the end of
* the list, find the first sibling of the last route appended
*/
+ rcu_read_lock();
+
if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
- rt = list_first_entry(&rt_last->fib6_siblings,
- struct fib6_info,
- fib6_siblings);
+ rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
}
if (rt)
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+
+ rcu_read_unlock();
}
static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
@@ -5538,17 +5542,21 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i)
nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
&nexthop_len);
} else {
- struct fib6_info *sibling, *next_sibling;
struct fib6_nh *nh = f6i->fib6_nh;
+ struct fib6_info *sibling;
nexthop_len = 0;
if (f6i->fib6_nsiblings) {
rt6_nh_nlmsg_size(nh, &nexthop_len);
- list_for_each_entry_safe(sibling, next_sibling,
- &f6i->fib6_siblings, fib6_siblings) {
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
+ fib6_siblings) {
rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
}
+
+ rcu_read_unlock();
}
nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
}
@@ -5712,7 +5720,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
goto nla_put_failure;
} else if (rt->fib6_nsiblings) {
- struct fib6_info *sibling, *next_sibling;
+ struct fib6_info *sibling;
struct nlattr *mp;
mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
@@ -5724,14 +5732,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
0) < 0)
goto nla_put_failure;
- list_for_each_entry_safe(sibling, next_sibling,
- &rt->fib6_siblings, fib6_siblings) {
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
+ fib6_siblings) {
if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
sibling->fib6_nh->fib_nh_weight,
- AF_INET6, 0) < 0)
+ AF_INET6, 0) < 0) {
+ rcu_read_unlock();
+
goto nla_put_failure;
+ }
}
+ rcu_read_unlock();
+
nla_nest_end(skb, mp);
} else if (rt->nh) {
if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
@@ -6168,7 +6183,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
- skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC);
if (!skb)
goto errout;
@@ -6181,7 +6196,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
goto errout;
}
rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
- info->nlh, gfp_any());
+ info->nlh, GFP_ATOMIC);
return;
errout:
if (err < 0)
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 91a48efb140b..efaf0e0bc459 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -91,6 +91,7 @@ TEST_PROGS += test_vxlan_mdb.sh
TEST_PROGS += test_bridge_neigh_suppress.sh
TEST_PROGS += test_vxlan_nolocalbypass.sh
TEST_PROGS += test_bridge_backup_port.sh
+TEST_PROGS += ipv6_route_update_soft_lockup.sh
TEST_FILES := settings
TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
diff --git a/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
new file mode 100755
index 000000000000..a6b2b1f9c641
--- /dev/null
+++ b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Testing for potential kernel soft lockup during IPv6 routing table
+# refresh under heavy outgoing IPv6 traffic. If a kernel soft lockup
+# occurs, a kernel panic will be triggered to prevent associated issues.
+#
+#
+# Test Environment Layout
+#
+# ┌----------------┐ ┌----------------┐
+# | SOURCE_NS | | SINK_NS |
+# | NAMESPACE | | NAMESPACE |
+# |(iperf3 clients)| |(iperf3 servers)|
+# | | | |
+# | | | |
+# | ┌-----------| nexthops |---------┐ |
+# | |veth_source|<--------------------------------------->|veth_sink|<┐ |
+# | └-----------|2001:0DB8:1::0:1/96 2001:0DB8:1::1:1/96 |---------┘ | |
+# | | ^ 2001:0DB8:1::1:2/96 | | |
+# | | . . | fwd | |
+# | ┌---------┐ | . . | | |
+# | | IPv6 | | . . | V |
+# | | routing | | . 2001:0DB8:1::1:80/96| ┌-----┐ |
+# | | table | | . | | lo | |
+# | | nexthop | | . └--------┴-----┴-┘
+# | | update | | ............................> 2001:0DB8:2::1:1/128
+# | └-------- ┘ |
+# └----------------┘
+#
+# The test script sets up two network namespaces, source_ns and sink_ns,
+# connected via a veth link. Within source_ns, it continuously updates the
+# IPv6 routing table by flushing and inserting IPV6_NEXTHOP_ADDR_COUNT nexthop
+# IPs destined for SINK_LOOPBACK_IP_ADDR in sink_ns. This refresh occurs at a
+# rate of 1/ROUTING_TABLE_REFRESH_PERIOD per second for TEST_DURATION seconds.
+#
+# Simultaneously, multiple iperf3 clients within source_ns generate heavy
+# outgoing IPv6 traffic. Each client is assigned a unique port number starting
+# at 5000 and incrementing sequentially. Each client targets a unique iperf3
+# server running in sink_ns, connected to the SINK_LOOPBACK_IFACE interface
+# using the same port number.
+#
+# The number of iperf3 servers and clients is set to half of the total
+# available cores on each machine.
+#
+# NOTE: We have tested this script on machines with various CPU specifications,
+# ranging from lower to higher performance as listed below. The test script
+# effectively triggered a kernel soft lockup on machines running an unpatched
+# kernel in under a minute:
+#
+# - 1x Intel Xeon E-2278G 8-Core Processor @ 3.40GHz
+# - 1x Intel Xeon E-2378G Processor 8-Core @ 2.80GHz
+# - 1x AMD EPYC 7401P 24-Core Processor @ 2.00GHz
+# - 1x AMD EPYC 7402P 24-Core Processor @ 2.80GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 1x Ampere Altra Q80-30 80-Core Processor @ 3.00GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 2x Intel Xeon Silver 4214 24-Core Processor @ 2.20GHz
+# - 1x AMD EPYC 7502P 32-Core @ 2.50GHz
+# - 1x Intel Xeon Gold 6314U 32-Core Processor @ 2.30GHz
+# - 2x Intel Xeon Gold 6338 32-Core Processor @ 2.00GHz
+#
+# On less performant machines, you may need to increase the TEST_DURATION
+# parameter to enhance the likelihood of encountering a race condition leading
+# to a kernel soft lockup and avoid a false negative result.
+#
+# NOTE: The test may not produce the expected result in virtualized
+# environments (e.g., qemu) due to differences in timing and CPU handling,
+# which can affect the conditions needed to trigger a soft lockup.
+
+source lib.sh
+source net_helper.sh
+
+TEST_DURATION=300
+ROUTING_TABLE_REFRESH_PERIOD=0.01
+
+IPERF3_BITRATE="300m"
+
+
+IPV6_NEXTHOP_ADDR_COUNT="128"
+IPV6_NEXTHOP_ADDR_MASK="96"
+IPV6_NEXTHOP_PREFIX="2001:0DB8:1"
+
+
+SOURCE_TEST_IFACE="veth_source"
+SOURCE_TEST_IP_ADDR="2001:0DB8:1::0:1/96"
+
+SINK_TEST_IFACE="veth_sink"
+# ${SINK_TEST_IFACE} is populated with the following range of IPv6 addresses:
+# 2001:0DB8:1::1:1 to 2001:0DB8:1::1:${IPV6_NEXTHOP_ADDR_COUNT}
+SINK_LOOPBACK_IFACE="lo"
+SINK_LOOPBACK_IP_MASK="128"
+SINK_LOOPBACK_IP_ADDR="2001:0DB8:2::1:1"
+
+nexthop_ip_list=""
+termination_signal=""
+kernel_softlokup_panic_prev_val=""
+
+terminate_ns_processes_by_pattern() {
+ local ns=$1
+ local pattern=$2
+
+ for pid in $(ip netns pids ${ns}); do
+ [ -e /proc/$pid/cmdline ] && grep -qe "${pattern}" /proc/$pid/cmdline && kill -9 $pid
+ done
+}
+
+cleanup() {
+ echo "info: cleaning up namespaces and terminating all processes within them..."
+
+
+ # Terminate iperf3 instances running in the source_ns. To avoid race
+ # conditions, first iterate over the PIDs and terminate those
+ # associated with the bash shells running the
+ # `while true; do iperf3 -c ...; done` loops. In a second iteration,
+ # terminate the individual `iperf3 -c ...` instances.
+ terminate_ns_processes_by_pattern ${source_ns} while
+ terminate_ns_processes_by_pattern ${source_ns} iperf3
+
+ # Repeat the same process for sink_ns
+ terminate_ns_processes_by_pattern ${sink_ns} while
+ terminate_ns_processes_by_pattern ${sink_ns} iperf3
+
+ # Check if any iperf3 instances are still running. This could happen
+ # if a core has entered an infinite loop and the timeout for detecting
+ # the soft lockup has not expired, but either the test interval has
+ # already elapsed or the test was terminated manually (e.g., with ^C)
+ for pid in $(ip netns pids ${source_ns}); do
+ if [ -e /proc/$pid/cmdline ] && grep -qe 'iperf3' /proc/$pid/cmdline; then
+ echo "FAIL: unable to terminate some iperf3 instances. Soft lockup is underway. A kernel panic is on the way!"
+ exit ${ksft_fail}
+ fi
+ done
+
+ if [ "$termination_signal" == "SIGINT" ]; then
+ echo "SKIP: Termination due to ^C (SIGINT)"
+ elif [ "$termination_signal" == "SIGALRM" ]; then
+ echo "PASS: No kernel soft lockup occurred during this ${TEST_DURATION} second test"
+ fi
+
+ cleanup_ns ${source_ns} ${sink_ns}
+
+ sysctl -qw kernel.softlockup_panic=${kernel_softlokup_panic_prev_val}
+}
+
+setup_prepare() {
+ setup_ns source_ns sink_ns
+
+ ip -n ${source_ns} link add name ${SOURCE_TEST_IFACE} type veth peer name ${SINK_TEST_IFACE} netns ${sink_ns}
+
+ # Setting up the Source namespace
+ ip -n ${source_ns} addr add ${SOURCE_TEST_IP_ADDR} dev ${SOURCE_TEST_IFACE}
+ ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} qlen 10000
+ ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} up
+ ip netns exec ${source_ns} sysctl -qw net.ipv6.fib_multipath_hash_policy=1
+
+ # Setting up the Sink namespace
+ ip -n ${sink_ns} addr add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} dev ${SINK_LOOPBACK_IFACE}
+ ip -n ${sink_ns} link set dev ${SINK_LOOPBACK_IFACE} up
+ ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_LOOPBACK_IFACE}.forwarding=1
+
+ ip -n ${sink_ns} link set ${SINK_TEST_IFACE} up
+ ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_TEST_IFACE}.forwarding=1
+
+
+ # Populate nexthop IPv6 addresses on the test interface in the sink_ns
+ echo "info: populating ${IPV6_NEXTHOP_ADDR_COUNT} IPv6 addresses on the ${SINK_TEST_IFACE} interface ..."
+ for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+ ip -n ${sink_ns} addr add ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" "${IP}")/${IPV6_NEXTHOP_ADDR_MASK} dev ${SINK_TEST_IFACE};
+ done
+
+ # Preparing list of nexthops
+ for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+ nexthop_ip_list=$nexthop_ip_list" nexthop via ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" $IP) dev ${SOURCE_TEST_IFACE} weight 1"
+ done
+}
+
+
+test_soft_lockup_during_routing_table_refresh() {
+ # Start num_of_iperf_servers iperf3 servers in the sink_ns namespace,
+ # each listening on ports starting at 5001 and incrementing
+ # sequentially. Since iperf3 instances may terminate unexpectedly, a
+ # while loop is used to automatically restart them in such cases.
+ echo "info: starting ${num_of_iperf_servers} iperf3 servers in the sink_ns namespace ..."
+ for i in $(seq 1 ${num_of_iperf_servers}); do
+ cmd="iperf3 --bind ${SINK_LOOPBACK_IP_ADDR} -s -p $(printf '5%03d' ${i}) --rcv-timeout 200 &>/dev/null"
+ ip netns exec ${sink_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+ done
+
+ # Wait for the iperf3 servers to be ready
+ for i in $(seq ${num_of_iperf_servers}); do
+ port=$(printf '5%03d' ${i});
+ wait_local_port_listen ${sink_ns} ${port} tcp
+ done
+
+ # Continuously refresh the routing table in the background within
+ # the source_ns namespace
+ ip netns exec ${source_ns} bash -c "
+ while \$(ip netns list | grep -q ${source_ns}); do
+ ip -6 route add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} ${nexthop_ip_list};
+ sleep ${ROUTING_TABLE_REFRESH_PERIOD};
+ ip -6 route delete ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK};
+ done &"
+
+ # Start num_of_iperf_servers iperf3 clients in the source_ns namespace,
+ # each sending TCP traffic on sequential ports starting at 5001.
+ # Since iperf3 instances may terminate unexpectedly (e.g., if the route
+ # to the server is deleted in the background during a route refresh), a
+ # while loop is used to automatically restart them in such cases.
+ echo "info: starting ${num_of_iperf_servers} iperf3 clients in the source_ns namespace ..."
+ for i in $(seq 1 ${num_of_iperf_servers}); do
+ cmd="iperf3 -c ${SINK_LOOPBACK_IP_ADDR} -p $(printf '5%03d' ${i}) --length 64 --bitrate ${IPERF3_BITRATE} -t 0 --connect-timeout 150 &>/dev/null"
+ ip netns exec ${source_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+ done
+
+ echo "info: IPv6 routing table is being updated at the rate of $(echo "1/${ROUTING_TABLE_REFRESH_PERIOD}" | bc)/s for ${TEST_DURATION} seconds ..."
+ echo "info: A kernel soft lockup, if detected, results in a kernel panic!"
+
+ wait
+}
+
+# Make sure 'iperf3' is installed, skip the test otherwise
+if [ ! -x "$(command -v "iperf3")" ]; then
+ echo "SKIP: 'iperf3' is not installed. Skipping the test."
+ exit ${ksft_skip}
+fi
+
+# Determine the number of cores on the machine
+num_of_iperf_servers=$(( $(nproc)/2 ))
+
+# Check if we are running on a multi-core machine, skip the test otherwise
+if [ "${num_of_iperf_servers}" -eq 0 ]; then
+ echo "SKIP: This test is not valid on a single core machine!"
+ exit ${ksft_skip}
+fi
+
+# Since the kernel soft lockup we're testing causes at least one core to enter
+# an infinite loop, destabilizing the host and likely affecting subsequent
+# tests, we trigger a kernel panic instead of reporting a failure and
+# continuing
+kernel_softlokup_panic_prev_val=$(sysctl -n kernel.softlockup_panic)
+sysctl -qw kernel.softlockup_panic=1
+
+handle_sigint() {
+ termination_signal="SIGINT"
+ cleanup
+ exit ${ksft_skip}
+}
+
+handle_sigalrm() {
+ termination_signal="SIGALRM"
+ cleanup
+ exit ${ksft_pass}
+}
+
+trap handle_sigint SIGINT
+trap handle_sigalrm SIGALRM
+
+(sleep ${TEST_DURATION} && kill -s SIGALRM $$)&
+
+setup_prepare
+test_soft_lockup_during_routing_table_refresh
--
2.35.3
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
[ Upstream commit a3616a3c02722d1edb95acc7fceade242f6553ba ]
In the fpsp040 code when copyin or copyout fails call
force_sigsegv(SIGSEGV) instead of do_exit(SIGSEGV).
This solves a couple of problems. Because do_exit embeds the ptrace
stop PTRACE_EVENT_EXIT a complete stack frame needs to be present for
that to work correctly. There is always the information needed for a
ptrace stop where get_signal is called. So exiting with a signal
solves the ptrace issue.
Further exiting with a signal ensures that all of the threads in a
process are killed not just the thread that malfunctioned. Which
avoids confusing userspace.
To make force_sigsegv(SIGSEGV) work in fpsp040_die modify the code to
save all of the registers and jump to ret_from_exception (which
ultimately calls get_signal) after fpsp040_die returns.
v2: Updated the branches to use gas's pseudo ops that automatically
calculate the best branch instruction to use for the purpose.
v1: https://lkml.kernel.org/r/87a6m8kgtx.fsf_-_@disp2133
Link: https://lkml.kernel.org/r/87tukghjfs.fsf_-_@disp2133
Acked-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
Signed-off-by: Finn Thain <fthain(a)linux-m68k.org>
---
arch/m68k/fpsp040/skeleton.S | 3 ++-
arch/m68k/kernel/traps.c | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/m68k/fpsp040/skeleton.S b/arch/m68k/fpsp040/skeleton.S
index 31a9c634c81e..081922c72daa 100644
--- a/arch/m68k/fpsp040/skeleton.S
+++ b/arch/m68k/fpsp040/skeleton.S
@@ -502,7 +502,8 @@ in_ea:
.section .fixup,"ax"
.even
1:
- jbra fpsp040_die
+ jbsr fpsp040_die
+ jbra .Lnotkern
.section __ex_table,"a"
.align 4
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index 35f706d836c5..c6f18dc5884b 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -1155,7 +1155,7 @@ asmlinkage void set_esp0(unsigned long ssp)
*/
asmlinkage void fpsp040_die(void)
{
- do_exit(SIGSEGV);
+ force_sigsegv(SIGSEGV);
}
#ifdef CONFIG_M68KFPU_EMU
Since commit 25f39d3dcb48 ("s390/pci: Ignore RID for isolated VFs") PFs
which are not initially configured but in standby are considered
isolated. That is they create only a single function PCI domain. Due to
the PCI domains being created on discovery, this means that even if they
are configured later on, sibling PFs and their child VFs will not be
added to their PCI domain breaking SR-IOV expectations.
The reason the referenced commit ignored standby PFs for the creation of
multi-function PCI subhierarchies, was to work around a PCI domain
renumbering scenario on reboot. The renumbering would occur after
removing a previously in standby PF, whose domain number is used for its
configured sibling PFs and their child VFs, but which itself remained in
standby. When this is followed by a reboot, the sibling PF is used
instead to determine the PCI domain number of it and its child VFs.
In principle it is not possible to know which standby PFs will be
configured later and which may be removed. The PCI domain and root bus
are pre-requisites for hotplug slots so the decision of which functions
belong to which domain can not be postponed. With the renumbering
occurring only in rare circumstances and being generally benign, accept
it as an oddity and fix SR-IOV for initially standby PFs simply by
allowing them to create PCI domains.
Cc: stable(a)vger.kernel.org
Reviewed-by: Gerd Bayer <gbayer(a)linux.ibm.com>
Fixes: 25f39d3dcb48 ("s390/pci: Ignore RID for isolated VFs")
Signed-off-by: Niklas Schnelle <schnelle(a)linux.ibm.com>
---
Changes in v3:
- Add R-b from Gerd
- Add Cc: stable…
- Add commas (Sandy)
Changes in v2:
- Reword commit message
---
arch/s390/pci/pci_bus.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
index d5ace00d10f04285f899284481f1e426187d4ff4..857afbc4828f0c677f88cc80dd4a5fff104a615a 100644
--- a/arch/s390/pci/pci_bus.c
+++ b/arch/s390/pci/pci_bus.c
@@ -171,7 +171,6 @@ void zpci_bus_scan_busses(void)
static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev)
{
return !s390_pci_no_rid && zdev->rid_available &&
- zpci_is_device_configured(zdev) &&
!zdev->vfn;
}
---
base-commit: 6b7afe1a2b6905e42fe45bd7015f20baa856e28e
change-id: 20250116-fix_standby_pf-e1d51394e9b3
Best regards,
--
Niklas Schnelle
On pSeries, when a user attempts to use the same vfio container with a
different iommu group, spapr_tce_set_window() returns -EPERM and the
subsequent cleanup leads to the crash below.
Kernel attempted to read user page (308) - exploit attempt?
BUG: Kernel NULL pointer dereference on read at 0x00000308
Faulting instruction address: 0xc0000000001ce358
Oops: Kernel access of bad area, sig: 11 [#1]
NIP: c0000000001ce358 LR: c0000000001ce05c CTR: c00000000005add0
<snip>
NIP [c0000000001ce358] spapr_tce_unset_window+0x3b8/0x510
LR [c0000000001ce05c] spapr_tce_unset_window+0xbc/0x510
Call Trace:
spapr_tce_unset_window+0xbc/0x510 (unreliable)
tce_iommu_attach_group+0x24c/0x340 [vfio_iommu_spapr_tce]
vfio_container_attach_group+0xec/0x240 [vfio]
vfio_group_fops_unl_ioctl+0x548/0xb00 [vfio]
sys_ioctl+0x754/0x1580
system_call_exception+0x13c/0x330
system_call_vectored_common+0x15c/0x2ec
<snip>
--- interrupt: 3000
Fix this by adding a NULL check for the tbl passed to
spapr_tce_unset_window().
Fixes: f431a8cde7f1 ("powerpc/iommu: Reimplement the iommu_table_group_ops for pSeries")
Cc: stable(a)vger.kernel.org
Reported-by: Vaishnavi Bhat <vaish123(a)in.ibm.com>
Signed-off-by: Shivaprasad G Bhat <sbhat(a)linux.ibm.com>
---
arch/powerpc/platforms/pseries/iommu.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 534cd159e9ab..78b895b568b3 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -2205,6 +2205,9 @@ static long spapr_tce_unset_window(struct iommu_table_group *table_group, int nu
const char *win_name;
int ret = -ENODEV;
+ if (!tbl) /* The table was never created OR window was never opened */
+ return 0;
+
mutex_lock(&dma_win_init_mutex);
if ((num == 0) && is_default_window_table(table_group, tbl))
The PE Reset State "0" returned by RTAS calls
"ibm_read_slot_reset_[state|state2]" indicates that the reset is
deactivated and the PE is in a state where MMIO and DMA are allowed.
However, the current implementation of "pseries_eeh_get_state()" does
not reflect this, causing drivers to incorrectly assume that MMIO and
DMA operations cannot be resumed.
The userspace drivers as a part of EEH recovery using VFIO ioctls fail
to detect when the recovery process is complete. The VFIO_EEH_PE_GET_STATE
ioctl does not report the expected EEH_PE_STATE_NORMAL state, preventing
userspace drivers from functioning properly on pseries systems.
The patch addresses this issue by updating 'pseries_eeh_get_state()'
to include "EEH_STATE_MMIO_ENABLED" and "EEH_STATE_DMA_ENABLED" in
the result mask for PE Reset State "0". This ensures correct state
reporting to the callers, aligning the behavior with the PAPR specification
and fixing the bug in EEH recovery for VFIO user workflows.
Fixes: 00ba05a12b3c ("powerpc/pseries: Cleanup on pseries_eeh_get_state()")
Cc: <stable(a)vger.kernel.org>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list(a)gmail.com>
Signed-off-by: Narayana Murty N <nnmlinux(a)linux.ibm.com>
---
Changelog:
V1:https://lore.kernel.org/all/20241107042027.338065-1-nnmlinux@linux.ibm.c…
--added Fixes tag for "powerpc/pseries: Cleanup on
pseries_eeh_get_state()".
V2:https://lore.kernel.org/stable/20241212075044.10563-1-nnmlinux%40linux.i…
--Updated the patch description to include it in the stable kernel tree.
V3:https://lore.kernel.org/all/87v7vm8pwz.fsf@gmail.com/
--Updated commit description.
---
arch/powerpc/platforms/pseries/eeh_pseries.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 1893f66371fa..b12ef382fec7 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -580,8 +580,10 @@ static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay)
switch(rets[0]) {
case 0:
- result = EEH_STATE_MMIO_ACTIVE |
- EEH_STATE_DMA_ACTIVE;
+ result = EEH_STATE_MMIO_ACTIVE |
+ EEH_STATE_DMA_ACTIVE |
+ EEH_STATE_MMIO_ENABLED |
+ EEH_STATE_DMA_ENABLED;
break;
case 1:
result = EEH_STATE_RESET_ACTIVE |
--
2.47.1
There is a period of time after returning from a KVM_RUN ioctl where
userspace may use SVE without trapping, but the kernel can unexpectedly
discard the live SVE state. Eric Auger has observed this causing QEMU
crashes where SVE is used by memmove():
https://issues.redhat.com/browse/RHEL-68997
The only state discarded is the user SVE state of the task which issued
the KVM_RUN ioctl. Other tasks are unaffected, plain FPSIMD state is
unaffected, and kernel state is unaffected.
This happens because fpsimd_kvm_prepare() incorrectly manipulates the
FPSIMD/SVE state. When the vCPU is loaded, fpsimd_kvm_prepare()
unconditionally clears TIF_SVE but does not reconfigure CPACR_EL1.ZEN to
trap userspace SVE usage. If the vCPU does not use FPSIMD/SVE and hyp
does not save the host's FPSIMD/SVE state, the kernel may return to
userspace with TIF_SVE clear while SVE is still enabled in
CPACR_EL1.ZEN. Subsequent userspace usage of SVE will not be trapped,
and the next save of userspace FPSIMD/SVE state will only store the
FPSIMD portion due to TIF_SVE being clear, discarding any SVE state.
The broken logic was originally introduced in commit:
93ae6b01bafee8fa ("KVM: arm64: Discard any SVE state when entering KVM guests")
... though at the time fp_user_discard() would reconfigure CPACR_EL1.ZEN
to trap subsequent SVE usage, masking the issue until that logic was
removed in commit:
8c845e2731041f0f ("arm64/sve: Leave SVE enabled on syscall if we don't context switch")
Avoid this issue by reconfiguring CPACR_EL1.ZEN when clearing
TIF_SVE. At the same time, add a comment to explain why
current->thread.fp_type must be set even though the FPSIMD state is not
foreign. A similar issue exists when SME is enabled, and will require
further rework. As SME currently depends on BROKEN, a BUILD_BUG() and
comment are added for now, and this issue will need to be fixed properly
in a follow-up patch.
Commit 93ae6b01bafee8fa also introduced an unintended ptrace ABI change.
Unconditionally clearing TIF_SVE regardless of whether the state is
foreign discards saved SVE state created by ptrace after syscall entry.
Avoid this by only clearing TIF_SVE when the FPSIMD/SVE state is not
foreign. When the state is foreign, KVM hyp code does not need to save
any host state, and so this will not affect KVM.
There appear to be further issues with unintentional SVE state
discarding, largely impacting ptrace and signal handling, which will
need to be addressed in separate patches.
Reported-by: Eric Auger <eauger(a)redhat.com>
Reported-by: Wilco Dijkstra <wilco.dijkstra(a)arm.com>
Cc: stable(a)vger.kernel.org
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Florian Weimer <fweimer(a)redhat.com>
Cc: Jeremy Linton <jeremy.linton(a)arm.com>
Cc: Marc Zyngier <maz(a)kernel.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Oliver Upton <oliver.upton(a)linux.dev>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
---
arch/arm64/kernel/fpsimd.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
I believe there are some other issues in this area, but I'm sending this
out on its own because I believe the other issues are more complex while
this is self-contained, and people are actively hitting this case in
production.
I intend to follow-up with fixes for the other cases I mention in the
commit message, and for the SME case with the BUILD_BUG_ON().
Mark.
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 8c4c1a2186cc5..e4053a90ed240 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1711,8 +1711,24 @@ void fpsimd_kvm_prepare(void)
*/
get_cpu_fpsimd_context();
- if (test_and_clear_thread_flag(TIF_SVE)) {
- sve_to_fpsimd(current);
+ if (!test_thread_flag(TIF_FOREIGN_FPSTATE) &&
+ test_and_clear_thread_flag(TIF_SVE)) {
+ sve_user_disable();
+
+ /*
+ * The KVM hyp code doesn't set fp_type when saving the host's
+ * FPSIMD state. Set fp_type here in case the hyp code saves
+ * the host state.
+ *
+ * If hyp code does not save the host state, then the host
+ * state remains live on the CPU and saved fp_type is
+ * irrelevant until it is overwritten by a later call to
+ * fpsimd_save_user_state().
+ *
+ * This is *NOT* sufficient when CONFIG_ARM64_SME=y, where
+ * fp_type can be FP_STATE_SVE regardless of TIF_SVE.
+ */
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_ARM64_SME));
current->thread.fp_type = FP_STATE_FPSIMD;
}
--
2.30.2
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 2ca06a2f65310aeef30bb69b7405437a14766e4d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012037-siesta-sulfite-8b05@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2ca06a2f65310aeef30bb69b7405437a14766e4d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Mon, 13 Jan 2025 16:44:56 +0100
Subject: [PATCH] mptcp: be sure to send ack when mptcp-level window re-opens
mptcp_cleanup_rbuf() is responsible to send acks when the user-space
reads enough data to update the receive windows significantly.
It tries hard to avoid acquiring the subflow sockets locks by checking
conditions similar to the ones implemented at the TCP level.
To avoid too much code duplication - the MPTCP protocol can't reuse the
TCP helpers as part of the relevant status is maintained into the msk
socket - and multiple costly window size computation, mptcp_cleanup_rbuf
uses a rough estimate for the most recently advertised window size:
the MPTCP receive free space, as recorded as at last-ack time.
Unfortunately the above does not allow mptcp_cleanup_rbuf() to detect
a zero to non-zero win change in some corner cases, skipping the
tcp_cleanup_rbuf call and leaving the peer stuck.
After commit ea66758c1795 ("tcp: allow MPTCP to update the announced
window"), MPTCP has actually cheap access to the announced window value.
Use it in mptcp_cleanup_rbuf() for a more accurate ack generation.
Fixes: e3859603ba13 ("mptcp: better msk receive window updates")
Cc: stable(a)vger.kernel.org
Reported-by: Jakub Kicinski <kuba(a)kernel.org>
Closes: https://lore.kernel.org/20250107131845.5e5de3c5@kernel.org
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Link: https://patch.msgid.link/20250113-net-mptcp-connect-st-flakes-v1-1-0d986ee7…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index a62bc874bf1e..123f3f297284 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -607,7 +607,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
}
opts->ext_copy.use_ack = 1;
opts->suboptions = OPTION_MPTCP_DSS;
- WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
@@ -1288,7 +1287,7 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
}
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
}
- return;
+ goto update_wspace;
}
if (rcv_wnd_new != rcv_wnd_old) {
@@ -1313,6 +1312,9 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
th->window = htons(new_win);
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED);
}
+
+update_wspace:
+ WRITE_ONCE(msk->old_wspace, tp->rcv_wnd);
}
__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
Hi,
We noticed that patch 0f022d32c3ec should probably be ported to the 6.1
and 6.6 LTS trees, based on its bug-introducing commit, 3bcb846ca4cf. It
also applies cleanly to the latest versions of both branches without
conflicts. According to our
manual analysis, the vulnerability is a deadlock caused by recursive
locking of the qdisc lock (`sch->q.lock`) when packets are redirected
in a loop (e.g., mirroring or redirecting packets to the same device).
This happens because the same qdisc lock is attempted to be acquired
multiple times by the same CPU, leading to a deadlock. The commit
3bcb846ca4cf removes the `spin_trylock()` in `net_tx_action()` and
replaces it with `spin_lock()`. By doing so, it eliminates the
non-blocking lock attempt (`spin_trylock()`), which would fail if the
lock was already held, preventing recursive locking. The
`spin_lock()` will block (wait) if the lock is already held, allowing
for the possibility of the same CPU attempting to acquire the same
lock recursively, leading to a deadlock. The patch adds an `owner`
field to the `Qdisc` structure to track the CPU that currently owns
the qdisc. Before enqueueing a packet to the qdisc, it checks if the
current CPU is the owner. If so, it drops the packet to prevent the
recursive locking. This effectively prevents the deadlock by ensuring
that the same CPU doesn't attempt to acquire the lock recursively.
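As a rough sketch of the pattern described above (illustrative only; the
helper name and the drop handling below are ours, not the verbatim
upstream hunk):

  /* Sketch: recursion guard on the qdisc enqueue path. */
  static bool qdisc_owned_by_this_cpu(const struct Qdisc *q)
  {
          /* True if this CPU is already inside the enqueue path for q,
           * i.e. we were re-entered via a mirred/redirect loop and
           * taking sch->q.lock again would deadlock.
           */
          return READ_ONCE(q->owner) == smp_processor_id();
  }

  /* In the transmit path, before taking sch->q.lock: */
  if (unlikely(qdisc_owned_by_this_cpu(q))) {
          kfree_skb(skb);
          return NET_XMIT_DROP;
  }
  WRITE_ONCE(q->owner, smp_processor_id());
  /* ... enqueue as before; this takes and releases sch->q.lock ... */
  WRITE_ONCE(q->owner, -1);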
--
Yours sincerely,
Xingyu
Hi,
We noticed that patch 11a4d6f67cf5 should be ported to the 5.10 and 5.15
LTS trees, based on its bug-introducing commit, f25dcc7687d4. It also
applies cleanly to the latest versions of both branches without
conflicts. The kernel warning and stack
trace indicate a problem when sending a SYN message in TIPC
(Transparent Inter-Process Communication). The issue arises because
`copy_from_iter()` is being called with an uninitialized `iov_iter`
structure, leading to invalid memory operations. The commit
(`f25dcc7687d4`) introduces the vulnerability by replacing the old
data copying mechanisms with the new `copy_from_iter()` function
without ensuring that the `iov_iter` structure is properly initialized
in all code paths. The patch adds initialization of `iov_iter` with
"iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0);", which ensures
that even when there's no data to send, the `iov_iter` is correctly
set up, preventing the kernel warning/crash when `copy_from_iter()` is
called.
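In practice the fix boils down to initialising the iterator before any
copy can happen, roughly like this (a sketch based on the description
above, not the exact hunk from the patch):

  struct msghdr m = { };

  /* Give the dataless SYN a valid, zero-length source iterator so that
   * a later copy_from_iter() on m.msg_iter operates on initialised
   * state instead of stack garbage.
   */
  iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0);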
--
Yours sincerely,
Xingyu
Hi,
We noticed that patch 120f1c857a73 should be ported to the 5.4, 5.10,
and 5.15 LTS trees, based on its bug-introducing commit, 9b52e3f267a6.
It also applies cleanly to the latest versions of all three branches
without conflicts. According to our manual
analysis, commit 9b52e3f267a6 introduced a
`WARN_ON_ONCE(!net);` statement in the `__skb_flow_dissect` function
within `net/core/flow_dissector.c`. This change began triggering
warnings (splat messages) when `net` is `NULL`, which can happen in
legitimate use cases, such as when `__skb_get_hash()` is called by the
nftables tracing infrastructure to identify packets in traces. The
patch provided replaces this `WARN_ON_ONCE(!net);` with
`DEBUG_NET_WARN_ON_ONCE(!net);`, which is more appropriate for
situations where `net` can be `NULL` without it indicating a critical
issue. This change prevents unnecessary warning messages from
appearing, which can clutter logs and potentially mask real issues.
Therefore, the prior commit introduced the issue (the unnecessary
warnings when `net` is `NULL`), and the patch fixes this by adjusting
the warning mechanism.
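In code terms the change is the one-line swap described above
(illustrative; DEBUG_NET_WARN_ON_ONCE is meant to warn only on
CONFIG_DEBUG_NET builds, so production kernels stay quiet):

  /* net can legitimately be NULL here, e.g. when __skb_get_hash() is
   * called from the nftables tracing path, so only warn on debug
   * kernels instead of splatting in production.
   */
  DEBUG_NET_WARN_ON_ONCE(!net);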
--
Yours sincerely,
Xingyu
From: Cosmin Tanislav <demonsingur(a)gmail.com>
commit 3061e170381af96d1e66799d34264e6414d428a7 upstream.
This is a reland commit:
- f373a189817584d0af5f922e91cad40e45f12314 backported it incorrectly.
- 7aee9bcc5c56086af6c063b4aaef59d4e42e0a69 reverted the wrong commit.
- This re-backports it.
At the end of __regmap_init(), if dev is not NULL, regmap_attach_dev()
is called, which adds a devres reference to the regmap, to be able to
retrieve a dev's regmap by name using dev_get_regmap().
When calling regmap_exit, the opposite does not happen, and the
reference is kept until the dev is detached.
Add a regmap_detach_dev() function and call it in regmap_exit() to make
sure that the devres reference is not kept.
Cc: stable(a)vger.kernel.org
Fixes: 72b39f6f2b5a ("regmap: Implement dev_get_regmap()")
Signed-off-by: Cosmin Tanislav <demonsingur(a)gmail.com>
Rule: add
Link: https://lore.kernel.org/stable/20241128130554.362486-1-demonsingur%40gmail.…
Link: https://patch.msgid.link/20241128131625.363835-1-demonsingur@gmail.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Link: https://lore.kernel.org/r/20250115033314.2540588-1-tzungbi@kernel.org
Link: https://lore.kernel.org/r/20250115033244.2540522-1-tzungbi@kernel.org
Signed-off-by: Tzung-Bi Shih <tzungbi(a)kernel.org>
---
drivers/base/regmap/regmap.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 85d324fd6a87..6d94ad8bf1eb 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -663,6 +663,17 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
}
EXPORT_SYMBOL_GPL(regmap_attach_dev);
+static int dev_get_regmap_match(struct device *dev, void *res, void *data);
+
+static int regmap_detach_dev(struct device *dev, struct regmap *map)
+{
+ if (!dev)
+ return 0;
+
+ return devres_release(dev, dev_get_regmap_release,
+ dev_get_regmap_match, (void *)map->name);
+}
+
static enum regmap_endian regmap_get_reg_endian(const struct regmap_bus *bus,
const struct regmap_config *config)
{
@@ -1531,6 +1542,7 @@ void regmap_exit(struct regmap *map)
{
struct regmap_async *async;
+ regmap_detach_dev(map->dev, map);
regcache_exit(map);
regmap_debugfs_exit(map);
regmap_range_exit(map);
--
2.48.0.rc2.279.g1de40edade-goog
From: Cosmin Tanislav <demonsingur(a)gmail.com>
commit 3061e170381af96d1e66799d34264e6414d428a7 upstream.
This is a reland commit:
- 48dc44f3c1afa29390cb2fbc8badad1b1111cea4 backported it incorrectly.
- 276185236bd8281dca88863b751b481e027cada7 reverted the wrong commit.
- This re-backports it.
At the end of __regmap_init(), if dev is not NULL, regmap_attach_dev()
is called, which adds a devres reference to the regmap, to be able to
retrieve a dev's regmap by name using dev_get_regmap().
When calling regmap_exit, the opposite does not happen, and the
reference is kept until the dev is detached.
Add a regmap_detach_dev() function and call it in regmap_exit() to make
sure that the devres reference is not kept.
Cc: stable(a)vger.kernel.org
Fixes: 72b39f6f2b5a ("regmap: Implement dev_get_regmap()")
Signed-off-by: Cosmin Tanislav <demonsingur(a)gmail.com>
Rule: add
Link: https://lore.kernel.org/stable/20241128130554.362486-1-demonsingur%40gmail.…
Link: https://patch.msgid.link/20241128131625.363835-1-demonsingur@gmail.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Link: https://lore.kernel.org/r/20250115033244.2540522-1-tzungbi@kernel.org
Signed-off-by: Tzung-Bi Shih <tzungbi(a)kernel.org>
---
drivers/base/regmap/regmap.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 15b37a4163d3..f0e314abcafc 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -652,6 +652,17 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
}
EXPORT_SYMBOL_GPL(regmap_attach_dev);
+static int dev_get_regmap_match(struct device *dev, void *res, void *data);
+
+static int regmap_detach_dev(struct device *dev, struct regmap *map)
+{
+ if (!dev)
+ return 0;
+
+ return devres_release(dev, dev_get_regmap_release,
+ dev_get_regmap_match, (void *)map->name);
+}
+
static enum regmap_endian regmap_get_reg_endian(const struct regmap_bus *bus,
const struct regmap_config *config)
{
@@ -1536,6 +1547,7 @@ void regmap_exit(struct regmap *map)
{
struct regmap_async *async;
+ regmap_detach_dev(map->dev, map);
regcache_exit(map);
regmap_debugfs_exit(map);
regmap_range_exit(map);
--
2.48.0.rc2.279.g1de40edade-goog
Protect access to fore200e->available_cell_rate with rate_mtx lock to
prevent potential data race.
The field fore200e.available_cell_rate is generally protected by the lock
fore200e.rate_mtx when accessed. In all other read and write cases, this
field is consistently protected by the lock, except for this case and
during initialization.
This potential bug was detected by our experimental static analysis tool,
which analyzes locking APIs and paired functions to identify data races
and atomicity violations.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <2045gemini(a)gmail.com>
---
drivers/atm/fore200e.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 4fea1149e003..f62e38571440 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -1374,7 +1374,9 @@ fore200e_open(struct atm_vcc *vcc)
vcc->dev_data = NULL;
+ mutex_lock(&fore200e->rate_mtx);
fore200e->available_cell_rate += vcc->qos.txtp.max_pcr;
+ mutex_unlock(&fore200e->rate_mtx);
kfree(fore200e_vcc);
return -EINVAL;
--
2.25.1
From: Steven Rostedt <rostedt(a)goodmis.org>
Some architectures can not safely do atomic64 operations in NMI context.
Since the ring buffer relies on atomic64 operations to do its time
keeping, if an event is requested in NMI context, reject it for these
architectures.
Cc: stable(a)vger.kernel.org
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Link: https://lore.kernel.org/20250120235721.407068250@goodmis.org
Fixes: c84897c0ff592 ("ring-buffer: Remove 32bit timestamp logic")
Closes: https://lore.kernel.org/all/86fb4f86-a0e4-45a2-a2df-3154acc4f086@gaisler.co…
Reported-by: Ludwig Rydberg <ludwig.rydberg(a)gaisler.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ring_buffer.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6d61ff78926b..b8e0ae15ca5b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4398,8 +4398,13 @@ rb_reserve_next_event(struct trace_buffer *buffer,
int nr_loops = 0;
int add_ts_default;
- /* ring buffer does cmpxchg, make sure it is safe in NMI context */
- if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ /*
+ * ring buffer does cmpxchg as well as atomic64 operations
+ * (which some archs use locking for atomic64), make sure this
+ * is safe in NMI context
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) &&
(unlikely(in_nmi()))) {
return NULL;
}
--
2.45.2
Greg,
Per your request, here is a manual backport of the overlayfs fixes that
were applied in v6.6.72 and reverted in v6.6.73.
For the record, this overlayfs series from v6.7 [2] changes subtle
internal semantics across overlayfs code, which are not detectable by
build error and therefore are a backporting landmine.
This is exactly what happened with the automatic apply of a dependency
patch in v6.6.72.
I will try to be extra diligent about review of auto backports below
v6.7 from now on.
Luckily, the leaked mount reference was caught by a vfs assertion and
promptly reported by Ignat from Cloudflare team.
Thanks!
Amir.
[1] https://lore.kernel.org/stable/2025012123-cable-reburial-568e@gregkh/
[2] https://lore.kernel.org/linux-unionfs/20230816152334.924960-1-amir73il@gmai…
Amir Goldstein (3):
ovl: pass realinode to ovl_encode_real_fh() instead of realdentry
ovl: support encoding fid from inode with no alias
fs: relax assertions on failure to encode file handles
fs/notify/fdinfo.c | 4 +---
fs/overlayfs/copy_up.c | 16 ++++++-------
fs/overlayfs/export.c | 49 ++++++++++++++++++++++------------------
fs/overlayfs/namei.c | 4 ++--
fs/overlayfs/overlayfs.h | 2 +-
5 files changed, 39 insertions(+), 36 deletions(-)
--
2.34.1
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 91751e248256efc111e52e15115840c35d85abaf
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012045-irritably-duplex-5af0@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 91751e248256efc111e52e15115840c35d85abaf Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare(a)redhat.com>
Date: Fri, 10 Jan 2025 09:35:11 +0100
Subject: [PATCH] vsock: prevent null-ptr-deref in vsock_*[has_data|has_space]
Recent reports have shown how we sometimes call vsock_*_has_data()
when a vsock socket has been de-assigned from a transport (see attached
links), but we shouldn't.
Previous commits should have solved the real problems, but we may have
more in the future, so to avoid null-ptr-deref, we can return 0
(no space, no data available) but with a warning.
This way the code should continue to run in a nearly consistent state
and have a warning that allows us to debug future problems.
Fixes: c0cfa2d8a788 ("vsock: add multi-transports support")
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/netdev/Z2K%2FI4nlHdfMRTZC@v4bel-B760M-AORUS-ELITE-A…
Link: https://lore.kernel.org/netdev/5ca20d4c-1017-49c2-9516-f6f75fd331e9@rbox.co/
Link: https://lore.kernel.org/netdev/677f84a8.050a0220.25a300.01b3.GAE@google.com/
Co-developed-by: Hyunwoo Kim <v4bel(a)theori.io>
Signed-off-by: Hyunwoo Kim <v4bel(a)theori.io>
Co-developed-by: Wongi Lee <qwerty(a)theori.io>
Signed-off-by: Wongi Lee <qwerty(a)theori.io>
Signed-off-by: Stefano Garzarella <sgarzare(a)redhat.com>
Reviewed-by: Luigi Leonardi <leonardi(a)redhat.com>
Reviewed-by: Hyunwoo Kim <v4bel(a)theori.io>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 74d35a871644..fa9d1b49599b 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -879,6 +879,9 @@ EXPORT_SYMBOL_GPL(vsock_create_connected);
s64 vsock_stream_has_data(struct vsock_sock *vsk)
{
+ if (WARN_ON(!vsk->transport))
+ return 0;
+
return vsk->transport->stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);
@@ -887,6 +890,9 @@ s64 vsock_connectible_has_data(struct vsock_sock *vsk)
{
struct sock *sk = sk_vsock(vsk);
+ if (WARN_ON(!vsk->transport))
+ return 0;
+
if (sk->sk_type == SOCK_SEQPACKET)
return vsk->transport->seqpacket_has_data(vsk);
else
@@ -896,6 +902,9 @@ EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
+ if (WARN_ON(!vsk->transport))
+ return 0;
+
return vsk->transport->stream_has_space(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_space);
From: Suraj Sonawane <surajsonawane0215(a)gmail.com>
[ Upstream commit f10593ad9bc36921f623361c9e3dd96bd52d85ee ]
Fix a use-after-free bug in sg_release(), detected by syzbot with KASAN:
BUG: KASAN: slab-use-after-free in lock_release+0x151/0xa30
kernel/locking/lockdep.c:5838
__mutex_unlock_slowpath+0xe2/0x750 kernel/locking/mutex.c:912
sg_release+0x1f4/0x2e0 drivers/scsi/sg.c:407
In sg_release(), the function kref_put(&sfp->f_ref, sg_remove_sfp) is
called before releasing the open_rel_lock mutex. The kref_put() call may
decrement the reference count of sfp to zero, triggering its cleanup
through sg_remove_sfp(). This cleanup includes scheduling deferred work
via sg_remove_sfp_usercontext(), which ultimately frees sfp.
After kref_put(), sg_release() continues to unlock open_rel_lock and may
reference sfp or sdp. If sfp has already been freed, this results in a
slab-use-after-free error.
Move the kref_put(&sfp->f_ref, sg_remove_sfp) call after unlocking the
open_rel_lock mutex. This ensures:
- No references to sfp or sdp occur after the reference count is
decremented.
- Cleanup functions such as sg_remove_sfp() and
sg_remove_sfp_usercontext() can safely execute without impacting the
mutex handling in sg_release().
The fix has been tested and validated by syzbot. This patch closes the
bug reported at the following syzkaller link and ensures proper
sequencing of resource cleanup and mutex operations, eliminating the
risk of use-after-free errors in sg_release().
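As a minimal, generic sketch of the ordering rule described above -- not the actual sg.c code; the structure and names (session, device_state, open_lock) are hypothetical -- the final reference drop has to be the last thing done with the object, after any related locks are released:
```c
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct device_state {
	struct mutex open_lock;
	int open_cnt;
};

struct session {
	struct kref ref;
	/* ... per-open state ... */
};

static void session_release(struct kref *kref)
{
	kfree(container_of(kref, struct session, ref));
}

static int session_close(struct device_state *dev, struct session *s)
{
	mutex_lock(&dev->open_lock);
	dev->open_cnt--;		/* bookkeeping that only touches dev */
	mutex_unlock(&dev->open_lock);

	/*
	 * Drop the last reference only after the lock is released and after
	 * the last use of 's'; session_release() may free it immediately.
	 */
	kref_put(&s->ref, session_release);
	return 0;
}
```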
Reported-by: syzbot+7efb5850a17ba6ce098b(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=7efb5850a17ba6ce098b
Tested-by: syzbot+7efb5850a17ba6ce098b(a)syzkaller.appspotmail.com
Fixes: cc833acbee9d ("sg: O_EXCL and other lock handling")
Signed-off-by: Suraj Sonawane <surajsonawane0215(a)gmail.com>
Link: https://lore.kernel.org/r/20241120125944.88095-1-surajsonawane0215@gmail.com
Reviewed-by: Bart Van Assche <bvanassche(a)acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
Signed-off-by: BRUNO VERNAY <bruno.vernay(a)se.com>
Signed-off-by: Hugo SIMELIERE <hsimeliere.opensource(a)witekio.com>
---
drivers/scsi/sg.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 12344be14232..1946cc96c172 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -390,7 +390,6 @@ sg_release(struct inode *inode, struct file *filp)
mutex_lock(&sdp->open_rel_lock);
scsi_autopm_put_device(sdp->device);
- kref_put(&sfp->f_ref, sg_remove_sfp);
sdp->open_cnt--;
/* possibly many open()s waiting on exlude clearing, start many;
@@ -402,6 +401,7 @@ sg_release(struct inode *inode, struct file *filp)
wake_up_interruptible(&sdp->open_wait);
}
mutex_unlock(&sdp->open_rel_lock);
+ kref_put(&sfp->f_ref, sg_remove_sfp);
return 0;
}
--
2.43.0
Fixes a use-after-free and a struct without an initialiser
Stuart Hayhurst (2):
HID: corsair-void: Add missing delayed work cancel for headset status
HID: corsair-void: Initialise memory for psy_cfg
drivers/hid/hid-corsair-void.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--
2.47.1
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 12dcb0ef540629a281533f9dedc1b6b8e14cfb65
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012012-capacity-swiftly-0214@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 12dcb0ef540629a281533f9dedc1b6b8e14cfb65 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed(a)google.com>
Date: Wed, 8 Jan 2025 22:24:41 +0000
Subject: [PATCH] mm: zswap: properly synchronize freeing resources during CPU
hotunplug
In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the
current CPU at the beginning of the operation is retrieved and used
throughout. However, since neither preemption nor migration are disabled,
it is possible that the operation continues on a different CPU.
If the original CPU is hotunplugged while the acomp_ctx is still in use,
we run into a UAF bug as some of the resources attached to the acomp_ctx
are freed during hotunplug in zswap_cpu_comp_dead() (i.e.
acomp_ctx.buffer, acomp_ctx.req, or acomp_ctx.acomp).
The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use
crypto_acomp API for hardware acceleration") when the switch to the
crypto_acomp API was made. Prior to that, the per-CPU crypto_comp was
retrieved using get_cpu_ptr() which disables preemption and makes sure the
CPU cannot go away from under us. Preemption cannot be disabled with the
crypto_acomp API as a sleepable context is needed.
Use the acomp_ctx.mutex to synchronize CPU hotplug callbacks allocating
and freeing resources with compression/decompression paths. Make sure
that acomp_ctx.req is NULL when the resources are freed. In the
compression/decompression paths, check if acomp_ctx.req is NULL after
acquiring the mutex (meaning the CPU was offlined) and retry on the new
CPU.
The initialization of acomp_ctx.mutex is moved from the CPU hotplug
callback to the pool initialization where it belongs (where the mutex is
allocated). In addition to adding clarity, this makes sure that CPU
hotplug cannot reinitialize a mutex that is already locked by
compression/decompression.
Previously a fix was attempted by holding cpus_read_lock() [1]. This
would have caused a potential deadlock as it is possible for code already
holding the lock to fall into reclaim and enter zswap (causing a
deadlock). A fix was also attempted using SRCU for synchronization, but
Johannes pointed out that synchronize_srcu() cannot be used in CPU hotplug
notifiers [2].
Alternative fixes that were considered/attempted and could have worked:
- Refcounting the per-CPU acomp_ctx. This involves complexity in
handling the race between the refcount dropping to zero in
zswap_[de]compress() and the refcount being re-initialized when the
CPU is onlined.
- Disabling migration before getting the per-CPU acomp_ctx [3], but
that's discouraged and is a much bigger hammer than needed, and could
result in subtle performance issues.
[1]https://lkml.kernel.org/20241219212437.2714151-1-yosryahmed@google.com/
[2]https://lkml.kernel.org/20250107074724.1756696-2-yosryahmed@google.com/
[3]https://lkml.kernel.org/20250107222236.2715883-2-yosryahmed@google.com/
[yosryahmed(a)google.com: remove comment]
Link: https://lkml.kernel.org/r/CAJD7tkaxS1wjn+swugt8QCvQ-rVF5RZnjxwPGX17k8x9zSMa…
Link: https://lkml.kernel.org/r/20250108222441.3622031-1-yosryahmed@google.com
Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration")
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Reported-by: Johannes Weiner <hannes(a)cmpxchg.org>
Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/
Reported-by: Sam Sun <samsun1006219(a)gmail.com>
Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tP…
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chengming Zhou <chengming.zhou(a)linux.dev>
Cc: Kanchana P Sridhar <kanchana.p.sridhar(a)intel.com>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Vitaly Wool <vitalywool(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zswap.c b/mm/zswap.c
index f6316b66fb23..30f5a27a6862 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -251,7 +251,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- int ret;
+ int ret, cpu;
if (!zswap_has_pool) {
/* if either are unset, pool initialization failed, and we
@@ -285,6 +285,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
+ for_each_possible_cpu(cpu)
+ mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
+
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
if (ret)
@@ -821,11 +824,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
struct acomp_req *req;
int ret;
- mutex_init(&acomp_ctx->mutex);
-
+ mutex_lock(&acomp_ctx->mutex);
acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
+ if (!acomp_ctx->buffer) {
+ ret = -ENOMEM;
+ goto buffer_fail;
+ }
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR(acomp)) {
@@ -855,12 +859,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
req_fail:
crypto_free_acomp(acomp_ctx->acomp);
acomp_fail:
kfree(acomp_ctx->buffer);
+buffer_fail:
+ mutex_unlock(&acomp_ctx->mutex);
return ret;
}
@@ -869,17 +876,45 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ mutex_lock(&acomp_ctx->mutex);
if (!IS_ERR_OR_NULL(acomp_ctx)) {
if (!IS_ERR_OR_NULL(acomp_ctx->req))
acomp_request_free(acomp_ctx->req);
+ acomp_ctx->req = NULL;
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
crypto_free_acomp(acomp_ctx->acomp);
kfree(acomp_ctx->buffer);
}
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
}
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
+{
+ struct crypto_acomp_ctx *acomp_ctx;
+
+ for (;;) {
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+ if (likely(acomp_ctx->req))
+ return acomp_ctx;
+ /*
+ * It is possible that we were migrated to a different CPU after
+ * getting the per-CPU ctx but before the mutex was acquired. If
+ * the old CPU got offlined, zswap_cpu_comp_dead() could have
+ * already freed ctx->req (among other things) and set it to
+ * NULL. Just try again on the new CPU that we ended up on.
+ */
+ mutex_unlock(&acomp_ctx->mutex);
+ }
+}
+
+static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
+{
+ mutex_unlock(&acomp_ctx->mutex);
+}
+
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zswap_pool *pool)
{
@@ -893,10 +928,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
gfp_t gfp;
u8 *dst;
- acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
-
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -949,7 +981,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
else if (alloc_ret)
zswap_reject_alloc_fail++;
- mutex_unlock(&acomp_ctx->mutex);
+ acomp_ctx_put_unlock(acomp_ctx);
return comp_ret == 0 && alloc_ret == 0;
}
@@ -960,9 +992,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct crypto_acomp_ctx *acomp_ctx;
u8 *src;
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
/*
* If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
@@ -986,10 +1016,10 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
- mutex_unlock(&acomp_ctx->mutex);
if (src != acomp_ctx->buffer)
zpool_unmap_handle(zpool, entry->handle);
+ acomp_ctx_put_unlock(acomp_ctx);
}
/*********************************
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 12dcb0ef540629a281533f9dedc1b6b8e14cfb65
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012011-urgency-shredder-353e@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 12dcb0ef540629a281533f9dedc1b6b8e14cfb65 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed(a)google.com>
Date: Wed, 8 Jan 2025 22:24:41 +0000
Subject: [PATCH] mm: zswap: properly synchronize freeing resources during CPU
hotunplug
In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the
current CPU at the beginning of the operation is retrieved and used
throughout. However, since neither preemption nor migration are disabled,
it is possible that the operation continues on a different CPU.
If the original CPU is hotunplugged while the acomp_ctx is still in use,
we run into a UAF bug as some of the resources attached to the acomp_ctx
are freed during hotunplug in zswap_cpu_comp_dead() (i.e.
acomp_ctx.buffer, acomp_ctx.req, or acomp_ctx.acomp).
The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use
crypto_acomp API for hardware acceleration") when the switch to the
crypto_acomp API was made. Prior to that, the per-CPU crypto_comp was
retrieved using get_cpu_ptr() which disables preemption and makes sure the
CPU cannot go away from under us. Preemption cannot be disabled with the
crypto_acomp API as a sleepable context is needed.
Use the acomp_ctx.mutex to synchronize CPU hotplug callbacks allocating
and freeing resources with compression/decompression paths. Make sure
that acomp_ctx.req is NULL when the resources are freed. In the
compression/decompression paths, check if acomp_ctx.req is NULL after
acquiring the mutex (meaning the CPU was offlined) and retry on the new
CPU.
The initialization of acomp_ctx.mutex is moved from the CPU hotplug
callback to the pool initialization where it belongs (where the mutex is
allocated). In addition to adding clarity, this makes sure that CPU
hotplug cannot reinitialize a mutex that is already locked by
compression/decompression.
Previously a fix was attempted by holding cpus_read_lock() [1]. This
would have caused a potential deadlock as it is possible for code already
holding the lock to fall into reclaim and enter zswap (causing a
deadlock). A fix was also attempted using SRCU for synchronization, but
Johannes pointed out that synchronize_srcu() cannot be used in CPU hotplug
notifiers [2].
Alternative fixes that were considered/attempted and could have worked:
- Refcounting the per-CPU acomp_ctx. This involves complexity in
handling the race between the refcount dropping to zero in
zswap_[de]compress() and the refcount being re-initialized when the
CPU is onlined.
- Disabling migration before getting the per-CPU acomp_ctx [3], but
that's discouraged and is a much bigger hammer than needed, and could
result in subtle performance issues.
[1]https://lkml.kernel.org/20241219212437.2714151-1-yosryahmed@google.com/
[2]https://lkml.kernel.org/20250107074724.1756696-2-yosryahmed@google.com/
[3]https://lkml.kernel.org/20250107222236.2715883-2-yosryahmed@google.com/
[yosryahmed(a)google.com: remove comment]
Link: https://lkml.kernel.org/r/CAJD7tkaxS1wjn+swugt8QCvQ-rVF5RZnjxwPGX17k8x9zSMa…
Link: https://lkml.kernel.org/r/20250108222441.3622031-1-yosryahmed@google.com
Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration")
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Reported-by: Johannes Weiner <hannes(a)cmpxchg.org>
Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/
Reported-by: Sam Sun <samsun1006219(a)gmail.com>
Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tP…
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chengming Zhou <chengming.zhou(a)linux.dev>
Cc: Kanchana P Sridhar <kanchana.p.sridhar(a)intel.com>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Vitaly Wool <vitalywool(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zswap.c b/mm/zswap.c
index f6316b66fb23..30f5a27a6862 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -251,7 +251,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- int ret;
+ int ret, cpu;
if (!zswap_has_pool) {
/* if either are unset, pool initialization failed, and we
@@ -285,6 +285,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
+ for_each_possible_cpu(cpu)
+ mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
+
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
if (ret)
@@ -821,11 +824,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
struct acomp_req *req;
int ret;
- mutex_init(&acomp_ctx->mutex);
-
+ mutex_lock(&acomp_ctx->mutex);
acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
+ if (!acomp_ctx->buffer) {
+ ret = -ENOMEM;
+ goto buffer_fail;
+ }
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR(acomp)) {
@@ -855,12 +859,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
req_fail:
crypto_free_acomp(acomp_ctx->acomp);
acomp_fail:
kfree(acomp_ctx->buffer);
+buffer_fail:
+ mutex_unlock(&acomp_ctx->mutex);
return ret;
}
@@ -869,17 +876,45 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ mutex_lock(&acomp_ctx->mutex);
if (!IS_ERR_OR_NULL(acomp_ctx)) {
if (!IS_ERR_OR_NULL(acomp_ctx->req))
acomp_request_free(acomp_ctx->req);
+ acomp_ctx->req = NULL;
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
crypto_free_acomp(acomp_ctx->acomp);
kfree(acomp_ctx->buffer);
}
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
}
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
+{
+ struct crypto_acomp_ctx *acomp_ctx;
+
+ for (;;) {
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+ if (likely(acomp_ctx->req))
+ return acomp_ctx;
+ /*
+ * It is possible that we were migrated to a different CPU after
+ * getting the per-CPU ctx but before the mutex was acquired. If
+ * the old CPU got offlined, zswap_cpu_comp_dead() could have
+ * already freed ctx->req (among other things) and set it to
+ * NULL. Just try again on the new CPU that we ended up on.
+ */
+ mutex_unlock(&acomp_ctx->mutex);
+ }
+}
+
+static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
+{
+ mutex_unlock(&acomp_ctx->mutex);
+}
+
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zswap_pool *pool)
{
@@ -893,10 +928,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
gfp_t gfp;
u8 *dst;
- acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
-
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -949,7 +981,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
else if (alloc_ret)
zswap_reject_alloc_fail++;
- mutex_unlock(&acomp_ctx->mutex);
+ acomp_ctx_put_unlock(acomp_ctx);
return comp_ret == 0 && alloc_ret == 0;
}
@@ -960,9 +992,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct crypto_acomp_ctx *acomp_ctx;
u8 *src;
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
/*
* If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
@@ -986,10 +1016,10 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
- mutex_unlock(&acomp_ctx->mutex);
if (src != acomp_ctx->buffer)
zpool_unmap_handle(zpool, entry->handle);
+ acomp_ctx_put_unlock(acomp_ctx);
}
/*********************************
Hello!
On Tue 21-01-25 08:40:50, Xingyu Li wrote:
> We noticed that patch 6f861765464f should be probably ported to Linux 6.6
> LTS. Its bug introducing commit is probably 05bdb9965305.
The truth is we have always allowed writing to mounted block devices. This
is traditional Unix behavior and Linux has been following it. So in
principle any kernel before commit 6f861765464f or with
CONFIG_BLKDEV_WRITE_MOUNTED=y is prone to the problem. Because
unprivileged users are not generally allowed to write to *any* block
device, this is not a security problem. Also note that there are userspace
programs (such as filesystem management tools) that need to write to
mounted block devices so just disabling CONFIG_BLKDEV_WRITE_MOUNTED is not
a generally acceptable option (also for example older versions of mount
break if you do this). Hence backporting these changes to stable kernels
makes little sense as people are unlikely to be able to use them.
CONFIG_BLKDEV_WRITE_MOUNTED is generally useful only for setups doing
system fuzzing or tightly controlled locked-down systems where even the
system administrator is not supposed to get arbitrary privileges.
Honza
--
Jan Kara <jack(a)suse.com>
SUSE Labs, CR
Hi,
We noticed that patch 6f861765464f should probably be ported to Linux
6.6 LTS. Its bug-introducing commit is probably 05bdb9965305. So the
vulnerability exists in Linux 6.6 LTS, but the patch has not been ported
to 6.6 LTS. According to our manual analysis, the commit
(05bdb9965305) introduced a vulnerability by replacing `fmode_t` with
`blk_mode_t` without preserving the write restrictions on mounted
block devices. Specifically, the `sb_open_mode(flags)` macro was
changed from using `FMODE_READ` and `FMODE_WRITE` to `BLK_OPEN_READ`
and `BLK_OPEN_WRITE`:
```diff
#define sb_open_mode(flags) \
- (FMODE_READ | (((flags) & SB_RDONLY) ? 0 : FMODE_WRITE))
+ (BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
```
However, unlike `FMODE_WRITE`, the `BLK_OPEN_WRITE` flag does not
inherently prevent unsafe writes to block devices that are mounted by
filesystems. This oversight allowed for the possibility of writes
directly to the mounted block device, bypassing filesystem controls
and potentially leading to data corruption or security breaches.
The later patch (commit 6f861765464f43a71462d52026fbddfc858239a5)
addressed this vulnerability by introducing the
`BLK_OPEN_RESTRICT_WRITES` flag to the `sb_open_mode(flags)` macro:
```diff
#define sb_open_mode(flags) \
+ (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \
+ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))
```
By adding `BLK_OPEN_RESTRICT_WRITES`, the block layer is instructed to
block unsafe writes to block devices that are in use by filesystems,
restoring the necessary protection that was inadvertently removed in
the previous commit.
At the same time, we noticed that this patch fixes a bug reported on
syzkaller (https://syzkaller.appspot.com/bug?extid=c300ab283ba3bc072439);
the crash list of that bug contains one report on cbf3a2cb156a (between
6.6-rc4 and 6.6-rc5), which again confirms that this bug is present in
6.6 LTS.
--
Yours sincerely,
Xingyu
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 2cb7c756f605ec02ffe562fb26828e4bcc5fdfc1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012005-supervise-armband-ab52@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2cb7c756f605ec02ffe562fb26828e4bcc5fdfc1 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare(a)redhat.com>
Date: Fri, 10 Jan 2025 09:35:07 +0100
Subject: [PATCH] vsock/virtio: discard packets if the transport changes
If the socket has been de-assigned or assigned to another transport,
we must discard any packets received because they are not expected
and would cause issues when we access vsk->transport.
A possible scenario is described by Hyunwoo Kim in the attached link,
where after a first connect() interrupted by a signal, and a second
connect() failed, we can find `vsk->transport` at NULL, leading to a
NULL pointer dereference.
Fixes: c0cfa2d8a788 ("vsock: add multi-transports support")
Cc: stable(a)vger.kernel.org
Reported-by: Hyunwoo Kim <v4bel(a)theori.io>
Reported-by: Wongi Lee <qwerty(a)theori.io>
Closes: https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/
Signed-off-by: Stefano Garzarella <sgarzare(a)redhat.com>
Reviewed-by: Hyunwoo Kim <v4bel(a)theori.io>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 9acc13ab3f82..51a494b69be8 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1628,8 +1628,11 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
lock_sock(sk);
- /* Check if sk has been closed before lock_sock */
- if (sock_flag(sk, SOCK_DONE)) {
+ /* Check if sk has been closed or assigned to another transport before
+ * lock_sock (note: listener sockets are not assigned to any transport)
+ */
+ if (sock_flag(sk, SOCK_DONE) ||
+ (sk->sk_state != TCP_LISTEN && vsk->transport != &t->transport)) {
(void)virtio_transport_reset_no_sock(t, skb);
release_sock(sk);
sock_put(sk);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 2cb7c756f605ec02ffe562fb26828e4bcc5fdfc1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012004-rise-cavity-58aa@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2cb7c756f605ec02ffe562fb26828e4bcc5fdfc1 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare(a)redhat.com>
Date: Fri, 10 Jan 2025 09:35:07 +0100
Subject: [PATCH] vsock/virtio: discard packets if the transport changes
If the socket has been de-assigned or assigned to another transport,
we must discard any packets received because they are not expected
and would cause issues when we access vsk->transport.
A possible scenario is described by Hyunwoo Kim in the attached link,
where after a first connect() interrupted by a signal, and a second
connect() failed, we can find `vsk->transport` at NULL, leading to a
NULL pointer dereference.
Fixes: c0cfa2d8a788 ("vsock: add multi-transports support")
Cc: stable(a)vger.kernel.org
Reported-by: Hyunwoo Kim <v4bel(a)theori.io>
Reported-by: Wongi Lee <qwerty(a)theori.io>
Closes: https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/
Signed-off-by: Stefano Garzarella <sgarzare(a)redhat.com>
Reviewed-by: Hyunwoo Kim <v4bel(a)theori.io>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 9acc13ab3f82..51a494b69be8 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1628,8 +1628,11 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
lock_sock(sk);
- /* Check if sk has been closed before lock_sock */
- if (sock_flag(sk, SOCK_DONE)) {
+ /* Check if sk has been closed or assigned to another transport before
+ * lock_sock (note: listener sockets are not assigned to any transport)
+ */
+ if (sock_flag(sk, SOCK_DONE) ||
+ (sk->sk_state != TCP_LISTEN && vsk->transport != &t->transport)) {
(void)virtio_transport_reset_no_sock(t, skb);
release_sock(sk);
sock_put(sk);
From: Wei Fang <wei.fang(a)nxp.com>
commit c2e0c58b25a0a0c37ec643255558c5af4450c9f5 upstream.
There is a deadlock issue found in the sungem driver; please refer to
commit ac0a230f719b ("eth: sungem: remove .ndo_poll_controller to avoid
deadlocks"). The root cause of the issue is that netpoll runs in atomic
context and disable_irq() is called by the sungem driver's
.ndo_poll_controller interface, however disable_irq() might sleep. After
analyzing the implementation of fec_poll_controller(), the fec driver
should have the same issue. Because the fec driver uses NAPI for TX
completions, .ndo_poll_controller does not need to be implemented in the
fec driver, so fec_poll_controller() can be safely removed.
Fixes: 7f5c6addcdc0 ("net/fec: add poll controller function for fec nic")
Signed-off-by: Wei Fang <wei.fang(a)nxp.com>
Link: https://lore.kernel.org/r/20240511062009.652918-1-wei.fang@nxp.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
[Denis: minor fix to resolve merge conflict.]
Signed-off-by: Denis Arefev <arefev(a)swemel.ru>
---
Backport fix for CVE-2024-38553
Link: https://nvd.nist.gov/vuln/detail/cve-2024-38553
---
drivers/net/ethernet/freescale/fec_main.c | 26 -----------------------
1 file changed, 26 deletions(-)
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index a591ca0b3778..815062c23708 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3226,29 +3226,6 @@ fec_set_mac_address(struct net_device *ndev, void *p)
return 0;
}
-#ifdef CONFIG_NET_POLL_CONTROLLER
-/**
- * fec_poll_controller - FEC Poll controller function
- * @dev: The FEC network adapter
- *
- * Polled functionality used by netconsole and others in non interrupt mode
- *
- */
-static void fec_poll_controller(struct net_device *dev)
-{
- int i;
- struct fec_enet_private *fep = netdev_priv(dev);
-
- for (i = 0; i < FEC_IRQ_NUM; i++) {
- if (fep->irq[i] > 0) {
- disable_irq(fep->irq[i]);
- fec_enet_interrupt(fep->irq[i], dev);
- enable_irq(fep->irq[i]);
- }
- }
-}
-#endif
-
static inline void fec_enet_set_netdev_features(struct net_device *netdev,
netdev_features_t features)
{
@@ -3322,9 +3299,6 @@ static const struct net_device_ops fec_netdev_ops = {
.ndo_tx_timeout = fec_timeout,
.ndo_set_mac_address = fec_set_mac_address,
.ndo_do_ioctl = fec_enet_ioctl,
-#ifdef CONFIG_NET_POLL_CONTROLLER
- .ndo_poll_controller = fec_poll_controller,
-#endif
.ndo_set_features = fec_set_features,
};
--
2.43.0
Hi Sasha, Greg -
This is upstream commit 8e6e2ffa6569a205f1805cbaeca143b556581da6. I've
received a request for it to be applied specifically to v5.15.y. Can you
apply it to all LTS kernels back to v5.10?
-------- Forwarded Message --------
Subject: [PATCH] nfsd: add list_head nf_gc to struct nfsd_file
Date: Wed, 10 Jul 2024 10:40:35 -0400
From: Youzhong Yang <youzhong(a)gmail.com>
To: jlayton(a)kernel.org, chuck.lever(a)oracle.com,
linux-nfs(a)vger.kernel.org, youzhong(a)gmail.com
nfsd_file_put() in one thread can race with another thread doing
garbage collection (running nfsd_file_gc() -> list_lru_walk() ->
nfsd_file_lru_cb()):
* In nfsd_file_put(), nf->nf_ref is 1, so it tries to do
  nfsd_file_lru_add().
* nfsd_file_lru_add() returns true (with NFSD_FILE_REFERENCED bit set)
* garbage collector kicks in, nfsd_file_lru_cb() clears REFERENCED bit
  and returns LRU_ROTATE.
* garbage collector kicks in again, nfsd_file_lru_cb() now decrements
  nf->nf_ref to 0, runs nfsd_file_unhash(), removes it from the LRU and
  adds it to the dispose list [list_lru_isolate_move(lru, &nf->nf_lru, head)]
* nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it tries
  to remove the 'nf' from the LRU [if (!nfsd_file_lru_remove(nf))]. The
  'nf' has been added to the 'dispose' list by nfsd_file_lru_cb(), so
  nfsd_file_lru_remove(nf) simply treats it as part of the LRU and
  removes it, which leads to its removal from the 'dispose' list.
* At this moment, 'nf' is unhashed with its nf_ref being 0, and not on
  the LRU. nfsd_file_put() continues its execution
  [if (refcount_dec_and_test(&nf->nf_ref))]; as nf->nf_ref is already 0,
  nf->nf_ref is set to REFCOUNT_SATURATED, and the 'nf' gets no chance
  of being freed.
nfsd_file_put() can also race with nfsd_file_cond_queue():
* In nfsd_file_put(), nf->nf_ref is 1, so it tries to do
  nfsd_file_lru_add().
* nfsd_file_lru_add() sets the REFERENCED bit and returns true.
* Some userland application runs 'exportfs -f' or something like that,
  which triggers __nfsd_file_cache_purge() -> nfsd_file_cond_queue().
* In nfsd_file_cond_queue(), it runs [if (!nfsd_file_unhash(nf))],
  unhash is done successfully.
* nfsd_file_cond_queue() runs [if (!nfsd_file_get(nf))], now nf->nf_ref
  goes to 2.
* nfsd_file_cond_queue() runs [if (nfsd_file_lru_remove(nf))], it
  succeeds.
* nfsd_file_cond_queue() runs
  [if (refcount_sub_and_test(decrement, &nf->nf_ref))] (with "decrement"
  being 2), so nf->nf_ref goes to 0 and the 'nf' is added to the
  dispose list [list_add(&nf->nf_lru, dispose)]
* nfsd_file_put() detects NFSD_FILE_HASHED bit is cleared, so it tries
  to remove the 'nf' from the LRU [if (!nfsd_file_lru_remove(nf))];
  although the 'nf' is not in the LRU, it is linked in the 'dispose'
  list, so nfsd_file_lru_remove() simply treats it as part of the LRU
  and removes it. This leads to its removal from the 'dispose' list!
* Now nf->nf_ref is 0, unhashed. nfsd_file_put() continues its execution
  and sets nf->nf_ref to REFCOUNT_SATURATED.
As shown in the above analysis, using nf_lru for both the LRU list and
the dispose list can cause the leaks. This patch adds a new list_head
nf_gc to struct nfsd_file, and uses it for the dispose list. This does
not fix the nfsd_file leaking issue completely.
Signed-off-by: Youzhong Yang <youzhong(a)gmail.com>
---
fs/nfsd/filecache.c | 18 ++++++++++--------
fs/nfsd/filecache.h | 1 +
2 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ad9083ca144b..22ebd7fb8639 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -216,6 +216,7 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
return NULL;
INIT_LIST_HEAD(&nf->nf_lru);
+ INIT_LIST_HEAD(&nf->nf_gc);
nf->nf_birthtime = ktime_get();
nf->nf_file = NULL;
nf->nf_cred = get_current_cred();
@@ -393,8 +394,8 @@ nfsd_file_dispose_list(struct list_head *dispose)
struct nfsd_file *nf;
while (!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
+ nf = list_first_entry(dispose, struct nfsd_file, nf_gc);
+ list_del_init(&nf->nf_gc);
nfsd_file_free(nf);
}
}
@@ -411,12 +412,12 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
{
while(!list_empty(dispose)) {
struct nfsd_file *nf = list_first_entry(dispose,
- struct nfsd_file, nf_lru);
+ struct nfsd_file, nf_gc);
struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
struct nfsd_fcache_disposal *l = nn->fcache_disposal;
spin_lock(&l->lock);
- list_move_tail(&nf->nf_lru, &l->freeme);
+ list_move_tail(&nf->nf_gc, &l->freeme);
spin_unlock(&l->lock);
svc_wake_up(nn->nfsd_serv);
}
@@ -503,7 +504,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
/* Refcount went to zero. Unhash it and queue it to the dispose list */
nfsd_file_unhash(nf);
- list_lru_isolate_move(lru, &nf->nf_lru, head);
+ list_lru_isolate(lru, &nf->nf_lru);
+ list_add(&nf->nf_gc, head);
this_cpu_inc(nfsd_file_evictions);
trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
@@ -578,7 +580,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
/* If refcount goes to 0, then put on the dispose list */
if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
- list_add(&nf->nf_lru, dispose);
+ list_add(&nf->nf_gc, dispose);
trace_nfsd_file_closing(nf);
}
}
@@ -654,8 +656,8 @@ nfsd_file_close_inode_sync(struct inode *inode)
nfsd_file_queue_for_close(inode, &dispose);
while (!list_empty(&dispose)) {
- nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
+ nf = list_first_entry(&dispose, struct nfsd_file, nf_gc);
+ list_del_init(&nf->nf_gc);
nfsd_file_free(nf);
}
}
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index c61884def906..3fbec24eea6c 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -44,6 +44,7 @@ struct nfsd_file {
struct nfsd_file_mark *nf_mark;
struct list_head nf_lru;
+ struct list_head nf_gc;
struct rcu_head nf_rcu;
ktime_t nf_birthtime;
};
--
2.45.2
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x a2e740e216f5bf49ccb83b6d490c72a340558a43
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024122301-uncommon-enquirer-5f71@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a2e740e216f5bf49ccb83b6d490c72a340558a43 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Date: Wed, 11 Dec 2024 20:25:37 +0000
Subject: [PATCH] vmalloc: fix accounting with i915
If the caller of vmap() specifies VM_MAP_PUT_PAGES (currently only the
i915 driver), we will decrement nr_vmalloc_pages and MEMCG_VMALLOC in
vfree(). These counters are incremented by vmalloc() but not by vmap() so
this will cause an underflow. Check the VM_MAP_PUT_PAGES flag before
decrementing either counter.
Link: https://lkml.kernel.org/r/20241211202538.168311-1-willy@infradead.org
Fixes: b944afc9d64d ("mm: add a VM_MAP_PUT_PAGES flag for vmap")
Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Reviewed-by: Balbir Singh <balbirs(a)nvidia.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Roman Gushchin <roman.gushchin(a)linux.dev>
Cc: "Uladzislau Rezki (Sony)" <urezki(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f009b21705c1..5c88d0e90c20 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3374,7 +3374,8 @@ void vfree(const void *addr)
struct page *page = vm->pages[i];
BUG_ON(!page);
- mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+ if (!(vm->flags & VM_MAP_PUT_PAGES))
+ mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
/*
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
@@ -3382,7 +3383,8 @@ void vfree(const void *addr)
__free_page(page);
cond_resched();
}
- atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
+ if (!(vm->flags & VM_MAP_PUT_PAGES))
+ atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
kvfree(vm->pages);
kfree(vm);
}
Hi Greg, Sasha,
The original patch 73dae652dcac ("drm/amdgpu: rework resume handling for display (v2)") was only targeted at kernels 6.11 and newer. It did not apply cleanly to 6.12, so I backported it and that backport landed as 99a02eab8251 ("drm/amdgpu: rework resume handling for display (v2)"); however, there was a bug in the backport that was subsequently fixed in 063d380ca28e ("drm/amdgpu: fix backport of commit 73dae652dcac"). None of this was intended for kernels older than 6.11, yet the original backport eventually landed in 6.6, 6.1, and 5.15. Please revert the change from kernels 6.6, 6.1, and 5.15.
6.6.y:
Please revert 2daba7d857e4 ("drm/amdgpu: rework resume handling for display (v2)").
6.1.y:
Please revert c807ab3a861f ("drm/amdgpu: rework resume handling for display (v2)").
5.15.y:
Please revert d897650c5897 ("drm/amdgpu: rework resume handling for display (v2)").
Thanks,
Alex
UART controllers without flow control seem to behave unstably
when DMA is enabled. The issues were indicated in the message:
https://lore.kernel.org/linux-arm-kernel/CAMdYzYpXtMocCtCpZLU_xuWmOp2Ja_v0A…
On the PX30-uQ7 Ringneck SoM, it was noticed that after a couple
of hours of UART communication a CPU stall would occur,
leading to the system becoming unresponsive.
After disabling DMA, extensive UART communication tests of
up to two weeks were performed, and no further issues were
observed.
The flow control pins for uart5 are not available on PX30-uQ7
Ringneck, as configured by pinctrl-0, so the DMA nodes were
removed on SoM dtsi.
Cc: stable(a)vger.kernel.org
Fixes: c484cf93f61b ("arm64: dts: rockchip: add PX30-µQ7 (Ringneck) SoM with Haikou baseboard")
Reviewed-by: Quentin Schulz <quentin.schulz(a)cherry.de>
Signed-off-by: Lukasz Czechowski <lukasz.czechowski(a)thaumatec.com>
---
arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
index 2c87005c89bd3..e80412abec081 100644
--- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
+++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
@@ -397,6 +397,8 @@ &u2phy_host {
};
&uart5 {
+ /delete-property/ dmas;
+ /delete-property/ dma-names;
pinctrl-0 = <&uart5_xfer>;
};
--
2.43.0
In the PX30-uQ7 (Ringneck) SoM, the hardware CTS and RTS pins for
uart5 cannot be used for the UART CTS/RTS, because they are already
allocated for different purposes. The CTS pin is routed to the SUS_S3#
signal, while the RTS pin is used internally and is not available on
the Q7 connector. Move the definition of the pinctrl-0 property from
px30-ringneck-haikou.dts to px30-ringneck.dtsi.
This commit is a dependency of the next commit in the patch series,
which disables DMA for uart5.
Cc: stable(a)vger.kernel.org
Reviewed-by: Quentin Schulz <quentin.schulz(a)cherry.de>
Signed-off-by: Lukasz Czechowski <lukasz.czechowski(a)thaumatec.com>
---
arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts | 1 -
arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 4 ++++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts
index e4517f47d519c..eb9470a00e549 100644
--- a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts
+++ b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts
@@ -226,7 +226,6 @@ &uart0 {
};
&uart5 {
- pinctrl-0 = <&uart5_xfer>;
rts-gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
index ae050cc6cd050..2c87005c89bd3 100644
--- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
+++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi
@@ -396,6 +396,10 @@ &u2phy_host {
status = "okay";
};
+&uart5 {
+ pinctrl-0 = <&uart5_xfer>;
+};
+
/* Mule UCAN */
&usb_host0_ehci {
status = "okay";
--
2.43.0
Hello,
With v6.12 we encountered a kernel BUG (panic on our systems) that is
caused by a NULL pointer dereference inside apparmor's
profile_transition code. I've contacted John Johansen as the
maintainer of the apparmor system, and he pointed me to 17d0d04f3c99
as a fix for that issue. That commit has now landed in v6.13; would it
be possible to backport it to v6.12? The commit is
17d0d04f3c99 apparmor: allocate xmatch for nullpdb inside aa_alloc_null
Thanks,
Paul
I'm announcing the release of the 6.6.73 kernel.
Only users of overlayfs need to upgrade; this release reverts some
changes that were reported to be causing problems. Thanks to Ignat and
Amir for the quick reporting on the issue.
The updated 6.6.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-6.6.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2 -
fs/overlayfs/copy_up.c | 62 ++++++++++++++++++-----------------------------
fs/overlayfs/export.c | 49 ++++++++++++++++---------------------
fs/overlayfs/namei.c | 41 ++++++++-----------------------
fs/overlayfs/overlayfs.h | 28 ++++++---------------
fs/overlayfs/super.c | 20 ++++-----------
fs/overlayfs/util.c | 10 -------
7 files changed, 72 insertions(+), 140 deletions(-)
Greg Kroah-Hartman (4):
Revert "ovl: support encoding fid from inode with no alias"
Revert "ovl: pass realinode to ovl_encode_real_fh() instead of realdentry"
Revert "ovl: do not encode lower fh with upper sb_writers held"
Linux 6.6.73
From: Steven Rostedt <rostedt(a)goodmis.org>
Some architectures cannot safely do atomic64 operations in NMI context.
Since the ring buffer relies on atomic64 operations for its timekeeping,
if an event is requested in NMI context, reject it on these
architectures.
Cc: stable(a)vger.kernel.org
Fixes: c84897c0ff592 ("ring-buffer: Remove 32bit timestamp logic")
Closes: https://lore.kernel.org/all/86fb4f86-a0e4-45a2-a2df-3154acc4f086@gaisler.co…
Reported-by: Ludwig Rydberg <ludwig.rydberg(a)gaisler.com>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/ring_buffer.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6d61ff78926b..b8e0ae15ca5b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4398,8 +4398,13 @@ rb_reserve_next_event(struct trace_buffer *buffer,
int nr_loops = 0;
int add_ts_default;
- /* ring buffer does cmpxchg, make sure it is safe in NMI context */
- if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ /*
+ * ring buffer does cmpxchg as well as atomic64 operations
+ * (which some archs use locking for atomic64), make sure this
+ * is safe in NMI context
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) &&
(unlikely(in_nmi()))) {
return NULL;
}
--
2.45.2
If find_linux_pte fails, IRQs will not be restored. This is unlikely
to happen in practice since it would have been reported as hanging
hosts, but it should of course be fixed anyway.
Cc: stable(a)vger.kernel.org
Reported-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/powerpc/kvm/e500_mmu_host.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index e5a145b578a4..6824e8139801 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -479,7 +479,6 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
if (pte_present(pte)) {
wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
MAS2_WIMGE_MASK;
- local_irq_restore(flags);
} else {
local_irq_restore(flags);
pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
@@ -488,8 +487,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
goto out;
}
}
+ local_irq_restore(flags);
+
writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
-
kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
ref, gvaddr, stlbe);
--
2.47.1
Unpredictably large arguments passed to rose_setsockopt() and then
multiplied by additional values can lead to integer overflows.
Do the safest minimum and fix these issues by checking the contents
of 'opt' and returning -EINVAL if they are too large. Also, switch to
unsigned int and remove the now-useless check for negative 'opt' in
the ROSE_IDLE case.
Found by Linux Verification Center (linuxtesting.org) with static
analysis tool SVACE.
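For illustration, here is a standalone userspace sketch of the wraparound that the new 'opt > UINT_MAX / HZ' checks reject (the HZ value is an assumption; the kernel's HZ is configuration-dependent):
```c
#include <limits.h>
#include <stdio.h>

int main(void)
{
	const unsigned int HZ = 250;              /* assumed value for this sketch */
	unsigned int opt = UINT_MAX / HZ + 1;     /* smallest value the new check rejects */
	unsigned int ticks = opt * HZ;            /* wraps modulo 2^32 */

	printf("opt=%u  opt*HZ=%u (wrapped)\n", opt, ticks);
	/* Prints a tiny tick count (204 here), i.e. a timer far shorter than
	 * requested - exactly what the 'opt > UINT_MAX / HZ' check prevents. */
	return 0;
}
```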
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
---
net/rose/af_rose.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 59050caab65c..72c65d938a15 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -397,15 +397,15 @@ static int rose_setsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
struct rose_sock *rose = rose_sk(sk);
- int opt;
+ unsigned int opt;
if (level != SOL_ROSE)
return -ENOPROTOOPT;
- if (optlen < sizeof(int))
+ if (optlen < sizeof(unsigned int))
return -EINVAL;
- if (copy_from_sockptr(&opt, optval, sizeof(int)))
+ if (copy_from_sockptr(&opt, optval, sizeof(unsigned int)))
return -EFAULT;
switch (optname) {
@@ -414,31 +414,31 @@ static int rose_setsockopt(struct socket *sock, int level, int optname,
return 0;
case ROSE_T1:
- if (opt < 1)
+ if (opt < 1 || opt > UINT_MAX / HZ)
return -EINVAL;
rose->t1 = opt * HZ;
return 0;
case ROSE_T2:
- if (opt < 1)
+ if (opt < 1 || opt > UINT_MAX / HZ)
return -EINVAL;
rose->t2 = opt * HZ;
return 0;
case ROSE_T3:
- if (opt < 1)
+ if (opt < 1 || opt > UINT_MAX / HZ)
return -EINVAL;
rose->t3 = opt * HZ;
return 0;
case ROSE_HOLDBACK:
- if (opt < 1)
+ if (opt < 1 || opt > UINT_MAX / HZ)
return -EINVAL;
rose->hb = opt * HZ;
return 0;
case ROSE_IDLE:
- if (opt < 0)
+ if (opt > UINT_MAX / (60 * HZ))
return -EINVAL;
rose->idle = opt * 60 * HZ;
return 0;
pci-legacy systems do not use logic_pio to manage PIO
allocations, thus the generic pci_address_to_pio() won't work
when PCI_IOBASE is defined.
Override the function with an architecture-specific implementation
to fix the problem.
Cc: stable(a)vger.kernel.org
Fixes: 4bfb53e7d317 ("mips: add <asm-generic/io.h> including")
Reported-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Closes: https://lore.kernel.org/r/99f75c66-4c2d-45dc-a808-b5ba440c7551@app.fastmail…
Signed-off-by: Jiaxun Yang <jiaxun.yang(a)flygoat.com>
---
This is a quick fix for the fixes tree and for stable backporting.
In the long term, we should make logic_pio accept fixed mappings,
and make the PCI core code aware of platforms that do not use vmap
for PCI_IOBASE.
---
arch/mips/pci/pci-legacy.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/arch/mips/pci/pci-legacy.c b/arch/mips/pci/pci-legacy.c
index ec2567f8efd83bff7b106cbbd9ec7a6de0308c4c..66898fd182dc1fec1d1e9ae4c908873d59777182 100644
--- a/arch/mips/pci/pci-legacy.c
+++ b/arch/mips/pci/pci-legacy.c
@@ -29,6 +29,14 @@ static LIST_HEAD(controllers);
static int pci_initialized;
+unsigned long pci_address_to_pio(phys_addr_t address)
+{
+ if (address > IO_SPACE_LIMIT)
+ return (unsigned long)-1;
+
+ return (unsigned long) address;
+}
+
/*
* We need to avoid collisions with `mirrored' VGA ports
* and other strange ISA hardware, so we always want the
---
base-commit: dab2734f8e9ecba609d66d1dd087a392a7774c04
change-id: 20250114-malta-io-fixes-85e14b1b9f8b
Best regards,
--
Jiaxun Yang <jiaxun.yang(a)flygoat.com>
The comparison function cmpworker() violates the C standard's
requirements for qsort() comparison functions, which mandate symmetry
and transitivity:
Symmetry: If x < y, then y > x.
Transitivity: If x < y and y < z, then x < z.
In its current implementation, cmpworker() incorrectly returns 0 when
w1->tid < w2->tid, which breaks both symmetry and transitivity. This
violation causes undefined behavior, potentially leading to issues such
as memory corruption in glibc [1].
Fix the issue by returning -1 when w1->tid < w2->tid, ensuring
compliance with the C standard and preventing undefined behavior.
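A small standalone sketch (the worker struct here is a simplified stand-in, not the perf code itself) makes the asymmetry concrete:
```c
#include <stdio.h>

struct worker { int tid; };

/* Old comparator: returns only 0 or 1, so cmp(a, b) and cmp(b, a) can
 * both claim "not greater", violating symmetry. */
static int cmp_old(const struct worker *a, const struct worker *b)
{
	return a->tid > b->tid;
}

/* Fixed comparator: full three-way result, symmetric and transitive. */
static int cmp_new(const struct worker *a, const struct worker *b)
{
	if (a->tid > b->tid)
		return 1;
	if (a->tid < b->tid)
		return -1;
	return 0;
}

int main(void)
{
	struct worker w1 = { .tid = 3 }, w2 = { .tid = 5 };

	/* Old: cmp(w1, w2) == 0 ("equal") but cmp(w2, w1) == 1 ("greater"). */
	printf("old: %d vs %d\n", cmp_old(&w1, &w2), cmp_old(&w2, &w1));
	/* New: -1 and 1, as the C standard requires for qsort comparators. */
	printf("new: %d vs %d\n", cmp_new(&w1, &w2), cmp_new(&w2, &w1));
	return 0;
}
```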
Link: https://www.qualys.com/2024/01/30/qsort.txt [1]
Fixes: 121dd9ea0116 ("perf bench: Add epoll parallel epoll_wait benchmark")
Cc: stable(a)vger.kernel.org
Signed-off-by: Kuan-Wei Chiu <visitorckw(a)gmail.com>
---
Changes in v3:
- Perform a full comparison for clarity, as suggested by James.
tools/perf/bench/epoll-wait.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c
index ef5c4257844d..20fe4f72b4af 100644
--- a/tools/perf/bench/epoll-wait.c
+++ b/tools/perf/bench/epoll-wait.c
@@ -420,7 +420,12 @@ static int cmpworker(const void *p1, const void *p2)
struct worker *w1 = (struct worker *) p1;
struct worker *w2 = (struct worker *) p2;
- return w1->tid > w2->tid;
+
+ if (w1->tid > w2->tid)
+ return 1;
+ if (w1->tid < w2->tid)
+ return -1;
+ return 0;
}
int bench_epoll_wait(int argc, const char **argv)
--
2.34.1
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 9322d1915f9d976ee48c09d800fbd5169bc2ddcc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012049-connector-oxford-19f1@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9322d1915f9d976ee48c09d800fbd5169bc2ddcc Mon Sep 17 00:00:00 2001
From: Joe Hattori <joe(a)pf.is.s.u-tokyo.ac.jp>
Date: Sun, 15 Dec 2024 12:39:45 +0900
Subject: [PATCH] irqchip: Plug a OF node reference leak in
platform_irqchip_probe()
platform_irqchip_probe() leaks an OF node when irq_init_cb() fails. Fix it
by declaring par_np with the __free(device_node) cleanup construct.
This bug was found by an experimental static analysis tool that I am
developing.
Fixes: f8410e626569 ("irqchip: Add IRQCHIP_PLATFORM_DRIVER_BEGIN/END and IRQCHIP_MATCH helper macros")
Signed-off-by: Joe Hattori <joe(a)pf.is.s.u-tokyo.ac.jp>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20241215033945.3414223-1-joe@pf.is.s.u-tokyo.ac…
diff --git a/drivers/irqchip/irqchip.c b/drivers/irqchip/irqchip.c
index 1eeb0d0156ce..0ee7b6b71f5f 100644
--- a/drivers/irqchip/irqchip.c
+++ b/drivers/irqchip/irqchip.c
@@ -35,11 +35,10 @@ void __init irqchip_init(void)
int platform_irqchip_probe(struct platform_device *pdev)
{
struct device_node *np = pdev->dev.of_node;
- struct device_node *par_np = of_irq_find_parent(np);
+ struct device_node *par_np __free(device_node) = of_irq_find_parent(np);
of_irq_init_cb_t irq_init_cb = of_device_get_match_data(&pdev->dev);
if (!irq_init_cb) {
- of_node_put(par_np);
return -EINVAL;
}
@@ -55,7 +54,6 @@ int platform_irqchip_probe(struct platform_device *pdev)
* interrupt controller can check for specific domains as necessary.
*/
if (par_np && !irq_find_matching_host(par_np, DOMAIN_BUS_ANY)) {
- of_node_put(par_np);
return -EPROBE_DEFER;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 9322d1915f9d976ee48c09d800fbd5169bc2ddcc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012050-motor-prevalent-e802@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9322d1915f9d976ee48c09d800fbd5169bc2ddcc Mon Sep 17 00:00:00 2001
From: Joe Hattori <joe(a)pf.is.s.u-tokyo.ac.jp>
Date: Sun, 15 Dec 2024 12:39:45 +0900
Subject: [PATCH] irqchip: Plug a OF node reference leak in
platform_irqchip_probe()
platform_irqchip_probe() leaks an OF node when irq_init_cb() fails. Fix it
by declaring par_np with the __free(device_node) cleanup construct.
This bug was found by an experimental static analysis tool that I am
developing.
Fixes: f8410e626569 ("irqchip: Add IRQCHIP_PLATFORM_DRIVER_BEGIN/END and IRQCHIP_MATCH helper macros")
Signed-off-by: Joe Hattori <joe(a)pf.is.s.u-tokyo.ac.jp>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20241215033945.3414223-1-joe@pf.is.s.u-tokyo.ac…
diff --git a/drivers/irqchip/irqchip.c b/drivers/irqchip/irqchip.c
index 1eeb0d0156ce..0ee7b6b71f5f 100644
--- a/drivers/irqchip/irqchip.c
+++ b/drivers/irqchip/irqchip.c
@@ -35,11 +35,10 @@ void __init irqchip_init(void)
int platform_irqchip_probe(struct platform_device *pdev)
{
struct device_node *np = pdev->dev.of_node;
- struct device_node *par_np = of_irq_find_parent(np);
+ struct device_node *par_np __free(device_node) = of_irq_find_parent(np);
of_irq_init_cb_t irq_init_cb = of_device_get_match_data(&pdev->dev);
if (!irq_init_cb) {
- of_node_put(par_np);
return -EINVAL;
}
@@ -55,7 +54,6 @@ int platform_irqchip_probe(struct platform_device *pdev)
* interrupt controller can check for specific domains as necessary.
*/
if (par_np && !irq_find_matching_host(par_np, DOMAIN_BUS_ANY)) {
- of_node_put(par_np);
return -EPROBE_DEFER;
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012032-buffoon-cabbie-1e42@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts(a)arm.com>
Date: Tue, 7 Jan 2025 14:47:52 +0000
Subject: [PATCH] mm: clear uffd-wp PTE/PMD state on mremap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When mremap()ing a memory region previously registered with userfaultfd as
write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in
flag clearing leads to a mismatch between the vma flags (which have
uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp
cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to
trigger a warning in page_table_check_pte_flags() due to setting the pte
to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any
such mremap() so that the values are consistent with the existing clearing
of VM_UFFD_WP. Be careful to clear the logical flag regardless of its
physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE,
huge PMD and hugetlb paths.
Link: https://lkml.kernel.org/r/20250107144755.1871363-2-ryan.roberts@arm.com
Co-developed-by: Mikołaj Lenczewski <miko.lenczewski(a)arm.com>
Signed-off-by: Mikołaj Lenczewski <miko.lenczewski(a)arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts(a)arm.com>
Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.c…
Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl")
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index cb40f1a1d081..75342022d144 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -247,6 +247,13 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
vma_is_shmem(vma);
}
+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
+
+ return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0;
+}
+
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);
@@ -402,6 +409,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
return false;
}
+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+ return false;
+}
+
#endif /* CONFIG_USERFAULTFD */
static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e53d83b3e5cf..db64116a4f84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2206,6 +2206,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
return pmd;
}
+static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
+{
+ if (pmd_present(pmd))
+ pmd = pmd_clear_uffd_wp(pmd);
+ else if (is_swap_pmd(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
@@ -2244,6 +2254,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
pmd = move_soft_dirty_pmd(pmd);
+ if (vma_has_uffd_without_event_remap(vma))
+ pmd = clear_uffd_wp_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c498874a7170..eaaec19caa7c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5402,6 +5402,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
unsigned long sz)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
spinlock_t *src_ptl, *dst_ptl;
@@ -5418,7 +5419,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
- set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ huge_pte_clear(mm, new_addr, dst_pte, sz);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = huge_pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ }
if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
diff --git a/mm/mremap.c b/mm/mremap.c
index 60473413836b..cff7f552f909 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -138,6 +138,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
pmd_t dummy_pmdval;
@@ -216,7 +217,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
force_flush = true;
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
- set_pte_at(mm, new_addr, new_pte, pte);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ pte_clear(mm, new_addr, new_pte);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
}
arch_leave_lazy_mmu_mode();
@@ -278,6 +290,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
+ /* If this pmd belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -333,6 +354,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
+ /* If this pud belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
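To make the failure mode above concrete, here is an untested userspace sketch of the sequence the commit message describes: register a populated range for uffd-wp without UFFD_FEATURE_EVENT_REMAP, write-protect it, move it with mremap(), then make it writable. Before the fix, this kind of sequence could leave the pte-level uffd-wp bit behind and trip the page_table_check warning (observable only with CONFIG_PAGE_TABLE_CHECK). Error handling and privilege setup (vm.unprivileged_userfaultfd) are omitted.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4096;
	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(addr, 0, len);			/* populate the pte */

	/* a mapping right behind the range forces mremap() to move it */
	mmap((char *)addr + len, 4096, PROT_NONE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };

	ioctl(uffd, UFFDIO_API, &api);		/* note: no EVENT_REMAP */

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_WP,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
	};
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);	/* set uffd-wp on the pte */

	void *new = mremap(addr, len, len * 2, MREMAP_MAYMOVE);

	mprotect(new, len * 2, PROT_READ | PROT_WRITE);
	return 0;
}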
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012006-abnormal-unnoticed-6f89@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts(a)arm.com>
Date: Tue, 7 Jan 2025 14:47:52 +0000
Subject: [PATCH] mm: clear uffd-wp PTE/PMD state on mremap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When mremap()ing a memory region previously registered with userfaultfd as
write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in
flag clearing leads to a mismatch between the vma flags (which have
uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp
cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to
trigger a warning in page_table_check_pte_flags() due to setting the pte
to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any
such mremap() so that the values are consistent with the existing clearing
of VM_UFFD_WP. Be careful to clear the logical flag regardless of its
physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE,
huge PMD and hugetlb paths.
Link: https://lkml.kernel.org/r/20250107144755.1871363-2-ryan.roberts@arm.com
Co-developed-by: Mikołaj Lenczewski <miko.lenczewski(a)arm.com>
Signed-off-by: Mikołaj Lenczewski <miko.lenczewski(a)arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts(a)arm.com>
Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.c…
Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl")
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index cb40f1a1d081..75342022d144 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -247,6 +247,13 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
vma_is_shmem(vma);
}
+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
+
+ return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0;
+}
+
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);
@@ -402,6 +409,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
return false;
}
+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+ return false;
+}
+
#endif /* CONFIG_USERFAULTFD */
static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e53d83b3e5cf..db64116a4f84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2206,6 +2206,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
return pmd;
}
+static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
+{
+ if (pmd_present(pmd))
+ pmd = pmd_clear_uffd_wp(pmd);
+ else if (is_swap_pmd(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
@@ -2244,6 +2254,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
pmd = move_soft_dirty_pmd(pmd);
+ if (vma_has_uffd_without_event_remap(vma))
+ pmd = clear_uffd_wp_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c498874a7170..eaaec19caa7c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5402,6 +5402,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
unsigned long sz)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
spinlock_t *src_ptl, *dst_ptl;
@@ -5418,7 +5419,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
- set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ huge_pte_clear(mm, new_addr, dst_pte, sz);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = huge_pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ }
if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
diff --git a/mm/mremap.c b/mm/mremap.c
index 60473413836b..cff7f552f909 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -138,6 +138,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
pmd_t dummy_pmdval;
@@ -216,7 +217,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
force_flush = true;
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
- set_pte_at(mm, new_addr, new_pte, pte);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ pte_clear(mm, new_addr, new_pte);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
}
arch_leave_lazy_mmu_mode();
@@ -278,6 +290,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
+ /* If this pmd belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -333,6 +354,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
+ /* If this pud belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012005-persecute-bankroll-36e6@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts(a)arm.com>
Date: Tue, 7 Jan 2025 14:47:52 +0000
Subject: [PATCH] mm: clear uffd-wp PTE/PMD state on mremap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When mremap()ing a memory region previously registered with userfaultfd as
write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in
flag clearing leads to a mismatch between the vma flags (which have
uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp
cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to
trigger a warning in page_table_check_pte_flags() due to setting the pte
to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any
such mremap() so that the values are consistent with the existing clearing
of VM_UFFD_WP. Be careful to clear the logical flag regardless of its
physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE,
huge PMD and hugetlb paths.
Link: https://lkml.kernel.org/r/20250107144755.1871363-2-ryan.roberts@arm.com
Co-developed-by: Mikołaj Lenczewski <miko.lenczewski(a)arm.com>
Signed-off-by: Mikołaj Lenczewski <miko.lenczewski(a)arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts(a)arm.com>
Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.c…
Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl")
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index cb40f1a1d081..75342022d144 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -247,6 +247,13 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
vma_is_shmem(vma);
}
+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
+
+ return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0;
+}
+
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);
@@ -402,6 +409,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
return false;
}
+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+ return false;
+}
+
#endif /* CONFIG_USERFAULTFD */
static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e53d83b3e5cf..db64116a4f84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2206,6 +2206,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
return pmd;
}
+static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
+{
+ if (pmd_present(pmd))
+ pmd = pmd_clear_uffd_wp(pmd);
+ else if (is_swap_pmd(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
@@ -2244,6 +2254,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
pmd = move_soft_dirty_pmd(pmd);
+ if (vma_has_uffd_without_event_remap(vma))
+ pmd = clear_uffd_wp_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c498874a7170..eaaec19caa7c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5402,6 +5402,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
unsigned long sz)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
spinlock_t *src_ptl, *dst_ptl;
@@ -5418,7 +5419,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
- set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ huge_pte_clear(mm, new_addr, dst_pte, sz);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = huge_pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ }
if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
diff --git a/mm/mremap.c b/mm/mremap.c
index 60473413836b..cff7f552f909 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -138,6 +138,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
pmd_t dummy_pmdval;
@@ -216,7 +217,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
force_flush = true;
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
- set_pte_at(mm, new_addr, new_pte, pte);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ pte_clear(mm, new_addr, new_pte);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
}
arch_leave_lazy_mmu_mode();
@@ -278,6 +290,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
+ /* If this pmd belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -333,6 +354,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
+ /* If this pud belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012004-applaud-dipped-9aec@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts(a)arm.com>
Date: Tue, 7 Jan 2025 14:47:52 +0000
Subject: [PATCH] mm: clear uffd-wp PTE/PMD state on mremap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When mremap()ing a memory region previously registered with userfaultfd as
write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in
flag clearing leads to a mismatch between the vma flags (which have
uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp
cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to
trigger a warning in page_table_check_pte_flags() due to setting the pte
to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any
such mremap() so that the values are consistent with the existing clearing
of VM_UFFD_WP. Be careful to clear the logical flag regardless of its
physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE,
huge PMD and hugetlb paths.
Link: https://lkml.kernel.org/r/20250107144755.1871363-2-ryan.roberts@arm.com
Co-developed-by: Mikołaj Lenczewski <miko.lenczewski(a)arm.com>
Signed-off-by: Mikołaj Lenczewski <miko.lenczewski(a)arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts(a)arm.com>
Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.c…
Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl")
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index cb40f1a1d081..75342022d144 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -247,6 +247,13 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
vma_is_shmem(vma);
}
+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
+
+ return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0;
+}
+
extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);
@@ -402,6 +409,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
return false;
}
+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+ return false;
+}
+
#endif /* CONFIG_USERFAULTFD */
static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e53d83b3e5cf..db64116a4f84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2206,6 +2206,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
return pmd;
}
+static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
+{
+ if (pmd_present(pmd))
+ pmd = pmd_clear_uffd_wp(pmd);
+ else if (is_swap_pmd(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
@@ -2244,6 +2254,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
pmd = move_soft_dirty_pmd(pmd);
+ if (vma_has_uffd_without_event_remap(vma))
+ pmd = clear_uffd_wp_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c498874a7170..eaaec19caa7c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5402,6 +5402,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
unsigned long sz)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
spinlock_t *src_ptl, *dst_ptl;
@@ -5418,7 +5419,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
- set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ huge_pte_clear(mm, new_addr, dst_pte, sz);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = huge_pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ }
if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
diff --git a/mm/mremap.c b/mm/mremap.c
index 60473413836b..cff7f552f909 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -138,6 +138,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
pmd_t dummy_pmdval;
@@ -216,7 +217,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
force_flush = true;
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
- set_pte_at(mm, new_addr, new_pte, pte);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ pte_clear(mm, new_addr, new_pte);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
}
arch_leave_lazy_mmu_mode();
@@ -278,6 +290,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
+ /* If this pmd belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -333,6 +354,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
+ /* If this pud belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 35ca53b7b0f0ffd16c6675fd76abac9409cf83e0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012030-alongside-scenic-3cc9@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 35ca53b7b0f0ffd16c6675fd76abac9409cf83e0 Mon Sep 17 00:00:00 2001
From: Leo Li <sunpeng.li(a)amd.com>
Date: Wed, 11 Dec 2024 12:06:24 -0500
Subject: [PATCH] drm/amd/display: Do not elevate mem_type change to full
update
[Why]
There should not be any need to revalidate bandwidth on memory placement
change, since the fb is expected to be pinned to DCN-accessable memory
before scanout. For APU it's DRAM, and DGPU, it's VRAM. However, async
flips + memory type change needs to be rejected.
[How]
Do not set lock_and_validation_needed on mem_type change. Instead,
reject an async_flip request if the crtc's buffer(s) changed mem_type.
This may fix stuttering/corruption experienced with PSR SU and PSR1
panels, if the compositor allocates fbs in both VRAM carveout and GTT
and flips between them.
Fixes: a7c0cad0dc06 ("drm/amd/display: ensure async flips are only accepted for fast updates")
Reviewed-by: Tom Chung <chiahsuan.chung(a)amd.com>
Signed-off-by: Leo Li <sunpeng.li(a)amd.com>
Signed-off-by: Tom Chung <chiahsuan.chung(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 4caacd1671b7a013ad04cd8b6398f002540bdd4d)
Cc: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 56b47e02db0b..dcc5d8ded662 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -11379,6 +11379,25 @@ static int dm_crtc_get_cursor_mode(struct amdgpu_device *adev,
return 0;
}
+static bool amdgpu_dm_crtc_mem_type_changed(struct drm_device *dev,
+ struct drm_atomic_state *state,
+ struct drm_crtc_state *crtc_state)
+{
+ struct drm_plane *plane;
+ struct drm_plane_state *new_plane_state, *old_plane_state;
+
+ drm_for_each_plane_mask(plane, dev, crtc_state->plane_mask) {
+ new_plane_state = drm_atomic_get_plane_state(state, plane);
+ old_plane_state = drm_atomic_get_plane_state(state, plane);
+
+ if (old_plane_state->fb && new_plane_state->fb &&
+ get_mem_type(old_plane_state->fb) != get_mem_type(new_plane_state->fb))
+ return true;
+ }
+
+ return false;
+}
+
/**
* amdgpu_dm_atomic_check() - Atomic check implementation for AMDgpu DM.
*
@@ -11576,10 +11595,6 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
/* Remove exiting planes if they are modified */
for_each_oldnew_plane_in_descending_zpos(state, plane, old_plane_state, new_plane_state) {
- if (old_plane_state->fb && new_plane_state->fb &&
- get_mem_type(old_plane_state->fb) !=
- get_mem_type(new_plane_state->fb))
- lock_and_validation_needed = true;
ret = dm_update_plane_state(dc, state, plane,
old_plane_state,
@@ -11874,9 +11889,11 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
/*
* Only allow async flips for fast updates that don't change
- * the FB pitch, the DCC state, rotation, etc.
+ * the FB pitch, the DCC state, rotation, mem_type, etc.
*/
- if (new_crtc_state->async_flip && lock_and_validation_needed) {
+ if (new_crtc_state->async_flip &&
+ (lock_and_validation_needed ||
+ amdgpu_dm_crtc_mem_type_changed(dev, state, new_crtc_state))) {
drm_dbg_atomic(crtc->dev,
"[CRTC:%d:%s] async flips are only supported for fast updates\n",
crtc->base.id, crtc->name);
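As a standalone illustration of the policy the patch above switches to, the sketch below uses invented types (not DRM atomic-state APIs): a memory-placement change alone no longer forces a full validation pass, but an async flip is rejected whenever any plane's buffer moved between memory domains across the commit.

/*
 * Invented structures only; this is not the amdgpu/DRM code, just the
 * decision the patch encodes: async flips stay limited to fast updates,
 * and a framebuffer memory-type change counts against them.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum mem_type { MEM_VRAM, MEM_GTT };

struct plane_state { enum mem_type fb_mem; };

struct crtc_commit {
	bool async_flip;
	bool needs_full_validation;	/* pitch/DCC/rotation changes, etc. */
	int nr_planes;
	struct plane_state old_state[4];
	struct plane_state new_state[4];
};

static bool mem_type_changed(const struct crtc_commit *c)
{
	for (int i = 0; i < c->nr_planes; i++)
		if (c->old_state[i].fb_mem != c->new_state[i].fb_mem)
			return true;
	return false;
}

static int atomic_check(const struct crtc_commit *c)
{
	if (c->async_flip &&
	    (c->needs_full_validation || mem_type_changed(c)))
		return -EINVAL;	/* async flip only for fast updates */
	return 0;
}

int main(void)
{
	struct crtc_commit c = {
		.async_flip = true, .nr_planes = 1,
		.old_state = { { MEM_VRAM } }, .new_state = { { MEM_GTT } },
	};

	printf("atomic_check: %d\n", atomic_check(&c));
	return 0;
}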
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 35ca53b7b0f0ffd16c6675fd76abac9409cf83e0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012030-xerox-tremor-f7d9@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 35ca53b7b0f0ffd16c6675fd76abac9409cf83e0 Mon Sep 17 00:00:00 2001
From: Leo Li <sunpeng.li(a)amd.com>
Date: Wed, 11 Dec 2024 12:06:24 -0500
Subject: [PATCH] drm/amd/display: Do not elevate mem_type change to full
update
[Why]
There should not be any need to revalidate bandwidth on memory placement
change, since the fb is expected to be pinned to DCN-accessable memory
before scanout. For APU it's DRAM, and DGPU, it's VRAM. However, async
flips + memory type change needs to be rejected.
[How]
Do not set lock_and_validation_needed on mem_type change. Instead,
reject an async_flip request if the crtc's buffer(s) changed mem_type.
This may fix stuttering/corruption experienced with PSR SU and PSR1
panels, if the compositor allocates fbs in both VRAM carveout and GTT
and flips between them.
Fixes: a7c0cad0dc06 ("drm/amd/display: ensure async flips are only accepted for fast updates")
Reviewed-by: Tom Chung <chiahsuan.chung(a)amd.com>
Signed-off-by: Leo Li <sunpeng.li(a)amd.com>
Signed-off-by: Tom Chung <chiahsuan.chung(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 4caacd1671b7a013ad04cd8b6398f002540bdd4d)
Cc: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 56b47e02db0b..dcc5d8ded662 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -11379,6 +11379,25 @@ static int dm_crtc_get_cursor_mode(struct amdgpu_device *adev,
return 0;
}
+static bool amdgpu_dm_crtc_mem_type_changed(struct drm_device *dev,
+ struct drm_atomic_state *state,
+ struct drm_crtc_state *crtc_state)
+{
+ struct drm_plane *plane;
+ struct drm_plane_state *new_plane_state, *old_plane_state;
+
+ drm_for_each_plane_mask(plane, dev, crtc_state->plane_mask) {
+ new_plane_state = drm_atomic_get_plane_state(state, plane);
+ old_plane_state = drm_atomic_get_plane_state(state, plane);
+
+ if (old_plane_state->fb && new_plane_state->fb &&
+ get_mem_type(old_plane_state->fb) != get_mem_type(new_plane_state->fb))
+ return true;
+ }
+
+ return false;
+}
+
/**
* amdgpu_dm_atomic_check() - Atomic check implementation for AMDgpu DM.
*
@@ -11576,10 +11595,6 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
/* Remove exiting planes if they are modified */
for_each_oldnew_plane_in_descending_zpos(state, plane, old_plane_state, new_plane_state) {
- if (old_plane_state->fb && new_plane_state->fb &&
- get_mem_type(old_plane_state->fb) !=
- get_mem_type(new_plane_state->fb))
- lock_and_validation_needed = true;
ret = dm_update_plane_state(dc, state, plane,
old_plane_state,
@@ -11874,9 +11889,11 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
/*
* Only allow async flips for fast updates that don't change
- * the FB pitch, the DCC state, rotation, etc.
+ * the FB pitch, the DCC state, rotation, mem_type, etc.
*/
- if (new_crtc_state->async_flip && lock_and_validation_needed) {
+ if (new_crtc_state->async_flip &&
+ (lock_and_validation_needed ||
+ amdgpu_dm_crtc_mem_type_changed(dev, state, new_crtc_state))) {
drm_dbg_atomic(crtc->dev,
"[CRTC:%d:%s] async flips are only supported for fast updates\n",
crtc->base.id, crtc->name);
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 05c82ee363f64c64b87a0cfd744298e9333475f5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012056-marina-lagging-26cc@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 05c82ee363f64c64b87a0cfd744298e9333475f5 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb(a)google.com>
Date: Thu, 26 Dec 2024 13:16:39 -0800
Subject: [PATCH] alloc_tag: skip pgalloc_tag_swap if profiling is disabled
When memory allocation profiling is disabled, there is no need to swap
allocation tags during migration. Skip it to avoid unnecessary overhead.
Once I added these checks, the overhead of the mode when memory profiling
is enabled but turned off went down by about 50%.
Link: https://lkml.kernel.org/r/20241226211639.1357704-2-surenb@google.com
Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()")
Signed-off-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: David Wang <00107082(a)163.com>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Zhenhua Huang <quic_zhenhuah(a)quicinc.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 7dcebf118a3e..65e706e1bc19 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -195,6 +195,9 @@ void pgalloc_tag_swap(struct folio *new, struct folio *old)
union codetag_ref ref_old, ref_new;
struct alloc_tag *tag_old, *tag_new;
+ if (!mem_alloc_profiling_enabled())
+ return;
+
tag_old = pgalloc_tag_get(&old->page);
if (!tag_old)
return;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x b8ed9da102beb2d0926a1d7a7e652392190151c0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012032-tidal-chatting-cba2@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8ed9da102beb2d0926a1d7a7e652392190151c0 Mon Sep 17 00:00:00 2001
From: Meetakshi Setiya <msetiya(a)microsoft.com>
Date: Fri, 10 Jan 2025 07:10:27 -0500
Subject: [PATCH] cifs: support reconnect with alternate password for SMB1
SMB1 shares the mount and remount code paths with SMB2/3 and already
supports password rotation in some scenarios. This patch extends the
password rotation support to SMB1 reconnects as well.
Cc: stable(a)vger.kernel.org
Signed-off-by: Meetakshi Setiya <msetiya(a)microsoft.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 6cb1e81993f8..ab0b949924d7 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -152,8 +152,17 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc)
+ if (!rc) {
rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try alternate password for next reconnect if an alternate
+ * password is available.
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+ }
/* do we need to reconnect tcon? */
if (rc || !tcon->need_reconnect) {
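The core of the change above is the swap(): on an authentication-class failure the primary and alternate passwords trade places, so the next reconnect automatically tries the other credential while keeping both available for later rotations. A standalone sketch of that retry shape follows; the session struct and try_session_setup() are invented stand-ins, not cifs.ko APIs.

#include <errno.h>
#include <stdio.h>

struct session {
	const char *password;	/* secret used for the next attempt */
	const char *password2;	/* alternate secret, may be NULL */
};

static int try_session_setup(const struct session *ses)
{
	/* pretend the server only accepts the rotated secret */
	return (ses->password[0] == 'n') ? 0 : -EACCES;
}

static void swap_secrets(struct session *ses)
{
	const char *t = ses->password;

	ses->password = ses->password2;
	ses->password2 = t;
}

int main(void)
{
	struct session ses = { .password = "old", .password2 = "new" };
	int rc = try_session_setup(&ses);

	if ((rc == -EACCES || rc == -EHOSTDOWN || rc == -EKEYREVOKED) &&
	    ses.password2)
		swap_secrets(&ses);	/* next attempt uses the alternate */

	rc = try_session_setup(&ses);
	printf("second attempt rc=%d\n", rc);
	return 0;
}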
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x b8ed9da102beb2d0926a1d7a7e652392190151c0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012032-goggles-gory-004a@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8ed9da102beb2d0926a1d7a7e652392190151c0 Mon Sep 17 00:00:00 2001
From: Meetakshi Setiya <msetiya(a)microsoft.com>
Date: Fri, 10 Jan 2025 07:10:27 -0500
Subject: [PATCH] cifs: support reconnect with alternate password for SMB1
SMB1 shares the mount and remount code paths with SMB2/3 and already
supports password rotation in some scenarios. This patch extends the
password rotation support to SMB1 reconnects as well.
Cc: stable(a)vger.kernel.org
Signed-off-by: Meetakshi Setiya <msetiya(a)microsoft.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 6cb1e81993f8..ab0b949924d7 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -152,8 +152,17 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc)
+ if (!rc) {
rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try alternate password for next reconnect if an alternate
+ * password is available.
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+ }
/* do we need to reconnect tcon? */
if (rc || !tcon->need_reconnect) {
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x b8ed9da102beb2d0926a1d7a7e652392190151c0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012031-dealer-graded-ea0e@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8ed9da102beb2d0926a1d7a7e652392190151c0 Mon Sep 17 00:00:00 2001
From: Meetakshi Setiya <msetiya(a)microsoft.com>
Date: Fri, 10 Jan 2025 07:10:27 -0500
Subject: [PATCH] cifs: support reconnect with alternate password for SMB1
SMB1 shares the mount and remount code paths with SMB2/3 and already
supports password rotation in some scenarios. This patch extends the
password rotation support to SMB1 reconnects as well.
Cc: stable(a)vger.kernel.org
Signed-off-by: Meetakshi Setiya <msetiya(a)microsoft.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 6cb1e81993f8..ab0b949924d7 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -152,8 +152,17 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc)
+ if (!rc) {
rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try alternate password for next reconnect if an alternate
+ * password is available.
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+ }
/* do we need to reconnect tcon? */
if (rc || !tcon->need_reconnect) {
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b8ed9da102beb2d0926a1d7a7e652392190151c0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012031-partly-sputter-b363@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8ed9da102beb2d0926a1d7a7e652392190151c0 Mon Sep 17 00:00:00 2001
From: Meetakshi Setiya <msetiya(a)microsoft.com>
Date: Fri, 10 Jan 2025 07:10:27 -0500
Subject: [PATCH] cifs: support reconnect with alternate password for SMB1
SMB1 shares the mount and remount code paths with SMB2/3 and already
supports password rotation in some scenarios. This patch extends the
password rotation support to SMB1 reconnects as well.
Cc: stable(a)vger.kernel.org
Signed-off-by: Meetakshi Setiya <msetiya(a)microsoft.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 6cb1e81993f8..ab0b949924d7 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -152,8 +152,17 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc)
+ if (!rc) {
rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try alternate password for next reconnect if an alternate
+ * password is available.
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+ }
/* do we need to reconnect tcon? */
if (rc || !tcon->need_reconnect) {
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x b8ed9da102beb2d0926a1d7a7e652392190151c0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012030-scanning-undrilled-830d@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8ed9da102beb2d0926a1d7a7e652392190151c0 Mon Sep 17 00:00:00 2001
From: Meetakshi Setiya <msetiya(a)microsoft.com>
Date: Fri, 10 Jan 2025 07:10:27 -0500
Subject: [PATCH] cifs: support reconnect with alternate password for SMB1
SMB1 shares the mount and remount code paths with SMB2/3 and already
supports password rotation in some scenarios. This patch extends the
password rotation support to SMB1 reconnects as well.
Cc: stable(a)vger.kernel.org
Signed-off-by: Meetakshi Setiya <msetiya(a)microsoft.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 6cb1e81993f8..ab0b949924d7 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -152,8 +152,17 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc)
+ if (!rc) {
rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try alternate password for next reconnect if an alternate
+ * password is available.
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+ }
/* do we need to reconnect tcon? */
if (rc || !tcon->need_reconnect) {
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x b8ed9da102beb2d0926a1d7a7e652392190151c0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012030-print-avert-f15d@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8ed9da102beb2d0926a1d7a7e652392190151c0 Mon Sep 17 00:00:00 2001
From: Meetakshi Setiya <msetiya(a)microsoft.com>
Date: Fri, 10 Jan 2025 07:10:27 -0500
Subject: [PATCH] cifs: support reconnect with alternate password for SMB1
SMB1 shares the mount and remount code paths with SMB2/3 and already
supports password rotation in some scenarios. This patch extends the
password rotation support to SMB1 reconnects as well.
Cc: stable(a)vger.kernel.org
Signed-off-by: Meetakshi Setiya <msetiya(a)microsoft.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 6cb1e81993f8..ab0b949924d7 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -152,8 +152,17 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc)
+ if (!rc) {
rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try alternate password for next reconnect if an alternate
+ * password is available.
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+ }
/* do we need to reconnect tcon? */
if (rc || !tcon->need_reconnect) {
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 779b9955f64327c339a16f68055af98252fd3315
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012002-spent-fervor-ada9@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 779b9955f64327c339a16f68055af98252fd3315 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed(a)google.com>
Date: Mon, 13 Jan 2025 21:44:58 +0000
Subject: [PATCH] mm: zswap: move allocations during CPU init outside the lock
In zswap_cpu_comp_prepare(), allocations are made and assigned to various
members of acomp_ctx under acomp_ctx->mutex. However, allocations may
recurse into zswap through reclaim, trying to acquire the same mutex and
deadlocking.
Move the allocations before the mutex critical section. Only the
initialization of acomp_ctx needs to be done with the mutex held.
Link: https://lkml.kernel.org/r/20250113214458.2123410-1-yosryahmed@google.com
Fixes: 12dcb0ef5406 ("mm: zswap: properly synchronize freeing resources during CPU hotunplug")
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Reviewed-by: Chengming Zhou <chengming.zhou(a)linux.dev>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zswap.c b/mm/zswap.c
index 30f5a27a6862..b84c20d889b1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -820,15 +820,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp;
- struct acomp_req *req;
+ struct crypto_acomp *acomp = NULL;
+ struct acomp_req *req = NULL;
+ u8 *buffer = NULL;
int ret;
- mutex_lock(&acomp_ctx->mutex);
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer) {
+ buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!buffer) {
ret = -ENOMEM;
- goto buffer_fail;
+ goto fail;
}
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
@@ -836,21 +836,25 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
pr_err("could not alloc crypto acomp %s : %ld\n",
pool->tfm_name, PTR_ERR(acomp));
ret = PTR_ERR(acomp);
- goto acomp_fail;
+ goto fail;
}
- acomp_ctx->acomp = acomp;
- acomp_ctx->is_sleepable = acomp_is_async(acomp);
- req = acomp_request_alloc(acomp_ctx->acomp);
+ req = acomp_request_alloc(acomp);
if (!req) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
ret = -ENOMEM;
- goto req_fail;
+ goto fail;
}
- acomp_ctx->req = req;
+ /*
+ * Only hold the mutex after completing allocations, otherwise we may
+ * recurse into zswap through reclaim and attempt to hold the mutex
+ * again resulting in a deadlock.
+ */
+ mutex_lock(&acomp_ctx->mutex);
crypto_init_wait(&acomp_ctx->wait);
+
/*
* if the backend of acomp is async zip, crypto_req_done() will wakeup
* crypto_wait_req(); if the backend of acomp is scomp, the callback
@@ -859,15 +863,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
+ acomp_ctx->buffer = buffer;
+ acomp_ctx->acomp = acomp;
+ acomp_ctx->is_sleepable = acomp_is_async(acomp);
+ acomp_ctx->req = req;
mutex_unlock(&acomp_ctx->mutex);
return 0;
-req_fail:
- crypto_free_acomp(acomp_ctx->acomp);
-acomp_fail:
- kfree(acomp_ctx->buffer);
-buffer_fail:
- mutex_unlock(&acomp_ctx->mutex);
+fail:
+ if (acomp)
+ crypto_free_acomp(acomp);
+ kfree(buffer);
return ret;
}
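The shape of this fix generalizes beyond zswap: a GFP_KERNEL allocation can enter direct reclaim, so it must not be made while holding a lock that the reclaim path itself may need. Allocate first, then take the lock only long enough to publish the fully prepared result. A minimal kernel-style sketch of that ordering follows, with illustrative names rather than the zswap identifiers.

/* Illustrative sketch; names are not the real zswap identifiers. */
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct example_ctx {
        struct mutex lock;
        void *buffer;           /* only read/written under @lock */
};

static int example_ctx_prepare(struct example_ctx *ctx, size_t size)
{
        void *buf;

        /*
         * 1) Allocate outside the critical section: kmalloc(GFP_KERNEL)
         *    may recurse into reclaim, which could need @lock again.
         */
        buf = kmalloc(size, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        /* 2) Hold the mutex only to publish the prepared result. */
        mutex_lock(&ctx->lock);
        ctx->buffer = buf;
        mutex_unlock(&ctx->lock);

        return 0;
}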
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 779b9955f64327c339a16f68055af98252fd3315
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012002-overstep-numbness-cf60@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 779b9955f64327c339a16f68055af98252fd3315
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012001-elbow-configure-d7c7@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 779b9955f64327c339a16f68055af98252fd3315
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012057-carrousel-marbled-221b@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 12dcb0ef540629a281533f9dedc1b6b8e14cfb65
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012015-siren-backyard-e676@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 12dcb0ef540629a281533f9dedc1b6b8e14cfb65 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed(a)google.com>
Date: Wed, 8 Jan 2025 22:24:41 +0000
Subject: [PATCH] mm: zswap: properly synchronize freeing resources during CPU
hotunplug
In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the
current CPU at the beginning of the operation is retrieved and used
throughout. However, since neither preemption nor migration is disabled,
it is possible that the operation continues on a different CPU.
If the original CPU is hotunplugged while the acomp_ctx is still in use,
we run into a UAF bug as some of the resources attached to the acomp_ctx
are freed during hotunplug in zswap_cpu_comp_dead() (i.e.
acomp_ctx.buffer, acomp_ctx.req, or acomp_ctx.acomp).
The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use
crypto_acomp API for hardware acceleration") when the switch to the
crypto_acomp API was made. Prior to that, the per-CPU crypto_comp was
retrieved using get_cpu_ptr() which disables preemption and makes sure the
CPU cannot go away from under us. Preemption cannot be disabled with the
crypto_acomp API as a sleepable context is needed.
Use the acomp_ctx.mutex to synchronize CPU hotplug callbacks allocating
and freeing resources with compression/decompression paths. Make sure
that acomp_ctx.req is NULL when the resources are freed. In the
compression/decompression paths, check if acomp_ctx.req is NULL after
acquiring the mutex (meaning the CPU was offlined) and retry on the new
CPU.
The initialization of acomp_ctx.mutex is moved from the CPU hotplug
callback to the pool initialization where it belongs (where the mutex is
allocated). In addition to adding clarity, this makes sure that CPU
hotplug cannot reinitialize a mutex that is already locked by
compression/decompression.
Previously a fix was attempted by holding cpus_read_lock() [1]. This
would have caused a potential deadlock as it is possible for code already
holding the lock to fall into reclaim and enter zswap (causing a
deadlock). A fix was also attempted using SRCU for synchronization, but
Johannes pointed out that synchronize_srcu() cannot be used in CPU hotplug
notifiers [2].
Alternative fixes that were considered/attempted and could have worked:
- Refcounting the per-CPU acomp_ctx. This involves complexity in
handling the race between the refcount dropping to zero in
zswap_[de]compress() and the refcount being re-initialized when the
CPU is onlined.
- Disabling migration before getting the per-CPU acomp_ctx [3], but
that's discouraged and is a much bigger hammer than needed, and could
result in subtle performance issues.
[1]https://lkml.kernel.org/20241219212437.2714151-1-yosryahmed@google.com/
[2]https://lkml.kernel.org/20250107074724.1756696-2-yosryahmed@google.com/
[3]https://lkml.kernel.org/20250107222236.2715883-2-yosryahmed@google.com/
[yosryahmed(a)google.com: remove comment]
Link: https://lkml.kernel.org/r/CAJD7tkaxS1wjn+swugt8QCvQ-rVF5RZnjxwPGX17k8x9zSMa…
Link: https://lkml.kernel.org/r/20250108222441.3622031-1-yosryahmed@google.com
Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration")
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Reported-by: Johannes Weiner <hannes(a)cmpxchg.org>
Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/
Reported-by: Sam Sun <samsun1006219(a)gmail.com>
Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tP…
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chengming Zhou <chengming.zhou(a)linux.dev>
Cc: Kanchana P Sridhar <kanchana.p.sridhar(a)intel.com>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Vitaly Wool <vitalywool(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zswap.c b/mm/zswap.c
index f6316b66fb23..30f5a27a6862 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -251,7 +251,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- int ret;
+ int ret, cpu;
if (!zswap_has_pool) {
/* if either are unset, pool initialization failed, and we
@@ -285,6 +285,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
+ for_each_possible_cpu(cpu)
+ mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
+
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
if (ret)
@@ -821,11 +824,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
struct acomp_req *req;
int ret;
- mutex_init(&acomp_ctx->mutex);
-
+ mutex_lock(&acomp_ctx->mutex);
acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
+ if (!acomp_ctx->buffer) {
+ ret = -ENOMEM;
+ goto buffer_fail;
+ }
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR(acomp)) {
@@ -855,12 +859,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
req_fail:
crypto_free_acomp(acomp_ctx->acomp);
acomp_fail:
kfree(acomp_ctx->buffer);
+buffer_fail:
+ mutex_unlock(&acomp_ctx->mutex);
return ret;
}
@@ -869,17 +876,45 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ mutex_lock(&acomp_ctx->mutex);
if (!IS_ERR_OR_NULL(acomp_ctx)) {
if (!IS_ERR_OR_NULL(acomp_ctx->req))
acomp_request_free(acomp_ctx->req);
+ acomp_ctx->req = NULL;
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
crypto_free_acomp(acomp_ctx->acomp);
kfree(acomp_ctx->buffer);
}
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
}
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
+{
+ struct crypto_acomp_ctx *acomp_ctx;
+
+ for (;;) {
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+ if (likely(acomp_ctx->req))
+ return acomp_ctx;
+ /*
+ * It is possible that we were migrated to a different CPU after
+ * getting the per-CPU ctx but before the mutex was acquired. If
+ * the old CPU got offlined, zswap_cpu_comp_dead() could have
+ * already freed ctx->req (among other things) and set it to
+ * NULL. Just try again on the new CPU that we ended up on.
+ */
+ mutex_unlock(&acomp_ctx->mutex);
+ }
+}
+
+static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
+{
+ mutex_unlock(&acomp_ctx->mutex);
+}
+
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zswap_pool *pool)
{
@@ -893,10 +928,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
gfp_t gfp;
u8 *dst;
- acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
-
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -949,7 +981,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
else if (alloc_ret)
zswap_reject_alloc_fail++;
- mutex_unlock(&acomp_ctx->mutex);
+ acomp_ctx_put_unlock(acomp_ctx);
return comp_ret == 0 && alloc_ret == 0;
}
@@ -960,9 +992,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct crypto_acomp_ctx *acomp_ctx;
u8 *src;
- acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
/*
* If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
@@ -986,10 +1016,10 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
- mutex_unlock(&acomp_ctx->mutex);
if (src != acomp_ctx->buffer)
zpool_unmap_handle(zpool, entry->handle);
+ acomp_ctx_put_unlock(acomp_ctx);
}
/*********************************
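Setting the zswap specifics aside, the access pattern introduced above is: lock the per-CPU context's mutex, then test a field that the hotunplug callback clears under the same mutex before freeing; if it is NULL, the task was migrated off a CPU that has since gone offline, so unlock and retry on the current CPU. A hedged sketch of that loop with illustrative names, not the zswap code:

/* Illustrative sketch mirroring the retry loop above; not zswap code.
 * The hotunplug callback is assumed to NULL ->resource under ->lock
 * before freeing anything. */
#include <linux/mutex.h>
#include <linux/percpu.h>

struct example_pcpu_ctx {
        struct mutex lock;
        void *resource;         /* NULL once the owning CPU is offlined */
};

static struct example_pcpu_ctx *
example_pcpu_ctx_get_lock(struct example_pcpu_ctx __percpu *ctxs)
{
        struct example_pcpu_ctx *ctx;

        for (;;) {
                ctx = raw_cpu_ptr(ctxs);
                mutex_lock(&ctx->lock);
                if (likely(ctx->resource))
                        return ctx;     /* caller unlocks when finished */
                /*
                 * Migrated off a CPU that was then offlined: its resources
                 * are already freed, so retry on the CPU we run on now.
                 */
                mutex_unlock(&ctx->lock);
        }
}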