The patch titled
Subject: nilfs2: propagate directory read errors from nilfs_find_entry()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
nilfs2-propagate-directory-read-errors-from-nilfs_find_entry.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: propagate directory read errors from nilfs_find_entry()
Date: Fri, 4 Oct 2024 12:35:31 +0900
Syzbot reported that a task hang occurs in vcs_open() during a fuzzing
test for nilfs2.
The root cause of this problem is that in nilfs_find_entry(), which
searches for directory entries, ignores errors when loading a directory
page/folio via nilfs_get_folio() fails.
If the filesystem images is corrupted, and the i_size of the directory
inode is large, and the directory page/folio is successfully read but
fails the sanity check, for example when it is zero-filled,
nilfs_check_folio() may continue to spit out error messages in bursts.
Fix this issue by propagating the error to the callers when loading a
page/folio fails in nilfs_find_entry().
The current interface of nilfs_find_entry() and its callers is outdated
and cannot propagate error codes such as -EIO and -ENOMEM returned via
nilfs_find_entry(), so fix it together.
Link: https://lkml.kernel.org/r/20241004033640.6841-1-konishi.ryusuke@gmail.com
Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: Lizhi Xu <lizhi.xu(a)windriver.com>
Closes: https://lkml.kernel.org/r/20240927013806.3577931-1-lizhi.xu@windriver.com
Reported-by: syzbot+8a192e8d090fa9a31135(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=8a192e8d090fa9a31135
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/dir.c | 48 ++++++++++++++++++++++----------------------
fs/nilfs2/namei.c | 39 +++++++++++++++++++++++------------
fs/nilfs2/nilfs.h | 2 -
3 files changed, 52 insertions(+), 37 deletions(-)
--- a/fs/nilfs2/dir.c~nilfs2-propagate-directory-read-errors-from-nilfs_find_entry
+++ a/fs/nilfs2/dir.c
@@ -289,7 +289,7 @@ static int nilfs_readdir(struct file *fi
* The folio is mapped and unlocked. When the caller is finished with
* the entry, it should call folio_release_kmap().
*
- * On failure, returns NULL and the caller should ignore foliop.
+ * On failure, returns an error pointer and the caller should ignore foliop.
*/
struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir,
const struct qstr *qstr, struct folio **foliop)
@@ -312,22 +312,24 @@ struct nilfs_dir_entry *nilfs_find_entry
do {
char *kaddr = nilfs_get_folio(dir, n, foliop);
- if (!IS_ERR(kaddr)) {
- de = (struct nilfs_dir_entry *)kaddr;
- kaddr += nilfs_last_byte(dir, n) - reclen;
- while ((char *) de <= kaddr) {
- if (de->rec_len == 0) {
- nilfs_error(dir->i_sb,
- "zero-length directory entry");
- folio_release_kmap(*foliop, kaddr);
- goto out;
- }
- if (nilfs_match(namelen, name, de))
- goto found;
- de = nilfs_next_entry(de);
+ if (IS_ERR(kaddr))
+ return ERR_CAST(kaddr);
+
+ de = (struct nilfs_dir_entry *)kaddr;
+ kaddr += nilfs_last_byte(dir, n) - reclen;
+ while ((char *)de <= kaddr) {
+ if (de->rec_len == 0) {
+ nilfs_error(dir->i_sb,
+ "zero-length directory entry");
+ folio_release_kmap(*foliop, kaddr);
+ goto out;
}
- folio_release_kmap(*foliop, kaddr);
+ if (nilfs_match(namelen, name, de))
+ goto found;
+ de = nilfs_next_entry(de);
}
+ folio_release_kmap(*foliop, kaddr);
+
if (++n >= npages)
n = 0;
/* next folio is past the blocks we've got */
@@ -340,7 +342,7 @@ struct nilfs_dir_entry *nilfs_find_entry
}
} while (n != start);
out:
- return NULL;
+ return ERR_PTR(-ENOENT);
found:
ei->i_dir_start_lookup = n;
@@ -384,18 +386,18 @@ fail:
return NULL;
}
-ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
+int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino)
{
- ino_t res = 0;
struct nilfs_dir_entry *de;
struct folio *folio;
de = nilfs_find_entry(dir, qstr, &folio);
- if (de) {
- res = le64_to_cpu(de->inode);
- folio_release_kmap(folio, de);
- }
- return res;
+ if (IS_ERR(de))
+ return PTR_ERR(de);
+
+ *ino = le64_to_cpu(de->inode);
+ folio_release_kmap(folio, de);
+ return 0;
}
void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
--- a/fs/nilfs2/namei.c~nilfs2-propagate-directory-read-errors-from-nilfs_find_entry
+++ a/fs/nilfs2/namei.c
@@ -55,12 +55,20 @@ nilfs_lookup(struct inode *dir, struct d
{
struct inode *inode;
ino_t ino;
+ int res;
if (dentry->d_name.len > NILFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- ino = nilfs_inode_by_name(dir, &dentry->d_name);
- inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL;
+ res = nilfs_inode_by_name(dir, &dentry->d_name, &ino);
+ if (res) {
+ if (res != -ENOENT)
+ return ERR_PTR(res);
+ inode = NULL;
+ } else {
+ inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
+ }
+
return d_splice_alias(inode, dentry);
}
@@ -263,10 +271,11 @@ static int nilfs_do_unlink(struct inode
struct folio *folio;
int err;
- err = -ENOENT;
de = nilfs_find_entry(dir, &dentry->d_name, &folio);
- if (!de)
+ if (IS_ERR(de)) {
+ err = PTR_ERR(de);
goto out;
+ }
inode = d_inode(dentry);
err = -EIO;
@@ -362,10 +371,11 @@ static int nilfs_rename(struct mnt_idmap
if (unlikely(err))
return err;
- err = -ENOENT;
old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
- if (!old_de)
+ if (IS_ERR(old_de)) {
+ err = PTR_ERR(old_de);
goto out;
+ }
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
@@ -382,10 +392,12 @@ static int nilfs_rename(struct mnt_idmap
if (dir_de && !nilfs_empty_dir(new_inode))
goto out_dir;
- err = -ENOENT;
- new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
- if (!new_de)
+ new_de = nilfs_find_entry(new_dir, &new_dentry->d_name,
+ &new_folio);
+ if (IS_ERR(new_de)) {
+ err = PTR_ERR(new_de);
goto out_dir;
+ }
nilfs_set_link(new_dir, new_de, new_folio, old_inode);
folio_release_kmap(new_folio, new_de);
nilfs_mark_inode_dirty(new_dir);
@@ -440,12 +452,13 @@ out:
*/
static struct dentry *nilfs_get_parent(struct dentry *child)
{
- unsigned long ino;
+ ino_t ino;
+ int res;
struct nilfs_root *root;
- ino = nilfs_inode_by_name(d_inode(child), &dotdot_name);
- if (!ino)
- return ERR_PTR(-ENOENT);
+ res = nilfs_inode_by_name(d_inode(child), &dotdot_name, &ino);
+ if (res)
+ return ERR_PTR(res);
root = NILFS_I(d_inode(child))->i_root;
--- a/fs/nilfs2/nilfs.h~nilfs2-propagate-directory-read-errors-from-nilfs_find_entry
+++ a/fs/nilfs2/nilfs.h
@@ -254,7 +254,7 @@ static inline __u32 nilfs_mask_flags(umo
/* dir.c */
int nilfs_add_link(struct dentry *, struct inode *);
-ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
+int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino);
int nilfs_make_empty(struct inode *, struct inode *);
struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *,
struct folio **);
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
nilfs2-propagate-directory-read-errors-from-nilfs_find_entry.patch
The quilt patch titled
Subject: secretmem: disable memfd_secret() if arch cannot set direct map
has been removed from the -mm tree. Its filename was
secretmem-disable-memfd_secret-if-arch-cannot-set-direct-map.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Patrick Roy <roypat(a)amazon.co.uk>
Subject: secretmem: disable memfd_secret() if arch cannot set direct map
Date: Tue, 1 Oct 2024 09:00:41 +0100
Return -ENOSYS from memfd_secret() syscall if !can_set_direct_map(). This
is the case for example on some arm64 configurations, where marking 4k
PTEs in the direct map not present can only be done if the direct map is
set up at 4k granularity in the first place (as ARM's break-before-make
semantics do not easily allow breaking apart large/gigantic pages).
More precisely, on arm64 systems with !can_set_direct_map(),
set_direct_map_invalid_noflush() is a no-op, however it returns success
(0) instead of an error. This means that memfd_secret will seemingly
"work" (e.g. syscall succeeds, you can mmap the fd and fault in pages),
but it does not actually achieve its goal of removing its memory from the
direct map.
Note that with this patch, memfd_secret() will start erroring on systems
where can_set_direct_map() returns false (arm64 with
CONFIG_RODATA_FULL_DEFAULT_ENABLED=n, CONFIG_DEBUG_PAGEALLOC=n and
CONFIG_KFENCE=n), but that still seems better than the current silent
failure. Since CONFIG_RODATA_FULL_DEFAULT_ENABLED defaults to 'y', most
arm64 systems actually have a working memfd_secret() and aren't be
affected.
From going through the iterations of the original memfd_secret patch
series, it seems that disabling the syscall in these scenarios was the
intended behavior [1] (preferred over having
set_direct_map_invalid_noflush return an error as that would result in
SIGBUSes at page-fault time), however the check for it got dropped between
v16 [2] and v17 [3], when secretmem moved away from CMA allocations.
[1]: https://lore.kernel.org/lkml/20201124164930.GK8537@kernel.org/
[2]: https://lore.kernel.org/lkml/20210121122723.3446-11-rppt@kernel.org/#t
[3]: https://lore.kernel.org/lkml/20201125092208.12544-10-rppt@kernel.org/
Link: https://lkml.kernel.org/r/20241001080056.784735-1-roypat@amazon.co.uk
Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
Signed-off-by: Patrick Roy <roypat(a)amazon.co.uk>
Reviewed-by: Mike Rapoport (Microsoft) <rppt(a)kernel.org>
Cc: Alexander Graf <graf(a)amazon.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: James Gowans <jgowans(a)amazon.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/secretmem.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/secretmem.c~secretmem-disable-memfd_secret-if-arch-cannot-set-direct-map
+++ a/mm/secretmem.c
@@ -238,7 +238,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned i
/* make sure local flags do not confict with global fcntl.h */
BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
- if (!secretmem_enable)
+ if (!secretmem_enable || !can_set_direct_map())
return -ENOSYS;
if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
@@ -280,7 +280,7 @@ static struct file_system_type secretmem
static int __init secretmem_init(void)
{
- if (!secretmem_enable)
+ if (!secretmem_enable || !can_set_direct_map())
return 0;
secretmem_mnt = kern_mount(&secretmem_fs);
_
Patches currently in -mm which might be from roypat(a)amazon.co.uk are
The quilt patch titled
Subject: fs/proc/kcore.c: allow translation of physical memory addresses
has been removed from the -mm tree. Its filename was
fs-proc-kcorec-allow-translation-of-physical-memory-addresses.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Alexander Gordeev <agordeev(a)linux.ibm.com>
Subject: fs/proc/kcore.c: allow translation of physical memory addresses
Date: Mon, 30 Sep 2024 14:21:19 +0200
When /proc/kcore is read an attempt to read the first two pages results in
HW-specific page swap on s390 and another (so called prefix) pages are
accessed instead. That leads to a wrong read.
Allow architecture-specific translation of memory addresses using
kc_xlate_dev_mem_ptr() and kc_unxlate_dev_mem_ptr() callbacks similarily
to /dev/mem xlate_dev_mem_ptr() and unxlate_dev_mem_ptr() callbacks. That
way an architecture can deal with specific physical memory ranges.
Re-use the existing /dev/mem callback implementation on s390, which
handles the described prefix pages swapping correctly.
For other architectures the default callback is basically NOP. It is
expected the condition (vaddr == __va(__pa(vaddr))) always holds true for
KCORE_RAM memory type.
Link: https://lkml.kernel.org/r/20240930122119.1651546-1-agordeev@linux.ibm.com
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Suggested-by: Heiko Carstens <hca(a)linux.ibm.com>
Cc: Vasily Gorbik <gor(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/s390/include/asm/io.h | 2 +
fs/proc/kcore.c | 36 +++++++++++++++++++++++++++++++++--
2 files changed, 36 insertions(+), 2 deletions(-)
--- a/arch/s390/include/asm/io.h~fs-proc-kcorec-allow-translation-of-physical-memory-addresses
+++ a/arch/s390/include/asm/io.h
@@ -16,8 +16,10 @@
#include <asm/pci_io.h>
#define xlate_dev_mem_ptr xlate_dev_mem_ptr
+#define kc_xlate_dev_mem_ptr xlate_dev_mem_ptr
void *xlate_dev_mem_ptr(phys_addr_t phys);
#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
+#define kc_unxlate_dev_mem_ptr unxlate_dev_mem_ptr
void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
#define IO_SPACE_LIMIT 0
--- a/fs/proc/kcore.c~fs-proc-kcorec-allow-translation-of-physical-memory-addresses
+++ a/fs/proc/kcore.c
@@ -50,6 +50,20 @@ static struct proc_dir_entry *proc_root_
#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
#endif
+#ifndef kc_xlate_dev_mem_ptr
+#define kc_xlate_dev_mem_ptr kc_xlate_dev_mem_ptr
+static inline void *kc_xlate_dev_mem_ptr(phys_addr_t phys)
+{
+ return __va(phys);
+}
+#endif
+#ifndef kc_unxlate_dev_mem_ptr
+#define kc_unxlate_dev_mem_ptr kc_unxlate_dev_mem_ptr
+static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt)
+{
+}
+#endif
+
static LIST_HEAD(kclist_head);
static DECLARE_RWSEM(kclist_lock);
static int kcore_need_update = 1;
@@ -471,6 +485,8 @@ static ssize_t read_kcore_iter(struct ki
while (buflen) {
struct page *page;
unsigned long pfn;
+ phys_addr_t phys;
+ void *__start;
/*
* If this is the first iteration or the address is not within
@@ -537,7 +553,8 @@ static ssize_t read_kcore_iter(struct ki
}
break;
case KCORE_RAM:
- pfn = __pa(start) >> PAGE_SHIFT;
+ phys = __pa(start);
+ pfn = phys >> PAGE_SHIFT;
page = pfn_to_online_page(pfn);
/*
@@ -557,13 +574,28 @@ static ssize_t read_kcore_iter(struct ki
fallthrough;
case KCORE_VMEMMAP:
case KCORE_TEXT:
+ if (m->type == KCORE_RAM) {
+ __start = kc_xlate_dev_mem_ptr(phys);
+ if (!__start) {
+ ret = -ENOMEM;
+ if (iov_iter_zero(tsz, iter) != tsz)
+ ret = -EFAULT;
+ goto out;
+ }
+ } else {
+ __start = (void *)start;
+ }
+
/*
* Sadly we must use a bounce buffer here to be able to
* make use of copy_from_kernel_nofault(), as these
* memory regions might not always be mapped on all
* architectures.
*/
- if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
+ ret = copy_from_kernel_nofault(buf, __start, tsz);
+ if (m->type == KCORE_RAM)
+ kc_unxlate_dev_mem_ptr(phys, __start);
+ if (ret) {
if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
_
Patches currently in -mm which might be from agordeev(a)linux.ibm.com are
The quilt patch titled
Subject: selftests/mm: fix incorrect buffer->mirror size in hmm2 double_map test
has been removed from the -mm tree. Its filename was
selftests-mm-fixed-incorrect-buffer-mirror-size-in-hmm2-double_map-test.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Donet Tom <donettom(a)linux.ibm.com>
Subject: selftests/mm: fix incorrect buffer->mirror size in hmm2 double_map test
Date: Fri, 27 Sep 2024 00:07:52 -0500
The hmm2 double_map test was failing due to an incorrect buffer->mirror
size. The buffer->mirror size was 6, while buffer->ptr size was 6 *
PAGE_SIZE. The test failed because the kernel's copy_to_user function was
attempting to copy a 6 * PAGE_SIZE buffer to buffer->mirror. Since the
size of buffer->mirror was incorrect, copy_to_user failed.
This patch corrects the buffer->mirror size to 6 * PAGE_SIZE.
Test Result without this patch
==============================
# RUN hmm2.hmm2_device_private.double_map ...
# hmm-tests.c:1680:double_map:Expected ret (-14) == 0 (0)
# double_map: Test terminated by assertion
# FAIL hmm2.hmm2_device_private.double_map
not ok 53 hmm2.hmm2_device_private.double_map
Test Result with this patch
===========================
# RUN hmm2.hmm2_device_private.double_map ...
# OK hmm2.hmm2_device_private.double_map
ok 53 hmm2.hmm2_device_private.double_map
Link: https://lkml.kernel.org/r/20240927050752.51066-1-donettom@linux.ibm.com
Fixes: fee9f6d1b8df ("mm/hmm/test: add selftests for HMM")
Signed-off-by: Donet Tom <donettom(a)linux.ibm.com>
Reviewed-by: Muhammad Usama Anjum <usama.anjum(a)collabora.com>
Cc: J��r��me Glisse <jglisse(a)redhat.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Przemek Kitszel <przemyslaw.kitszel(a)intel.com>
Cc: Ritesh Harjani (IBM) <ritesh.list(a)gmail.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Jason Gunthorpe <jgg(a)mellanox.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/hmm-tests.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/tools/testing/selftests/mm/hmm-tests.c~selftests-mm-fixed-incorrect-buffer-mirror-size-in-hmm2-double_map-test
+++ a/tools/testing/selftests/mm/hmm-tests.c
@@ -1657,7 +1657,7 @@ TEST_F(hmm2, double_map)
buffer->fd = -1;
buffer->size = size;
- buffer->mirror = malloc(npages);
+ buffer->mirror = malloc(size);
ASSERT_NE(buffer->mirror, NULL);
/* Reserve a range of addresses. */
_
Patches currently in -mm which might be from donettom(a)linux.ibm.com are
The quilt patch titled
Subject: device-dax: correct pgoff align in dax_set_mapping()
has been removed from the -mm tree. Its filename was
device-dax-correct-pgoff-align-in-dax_set_mapping.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Kun(llfl)" <llfl(a)linux.alibaba.com>
Subject: device-dax: correct pgoff align in dax_set_mapping()
Date: Fri, 27 Sep 2024 15:45:09 +0800
pgoff should be aligned using ALIGN_DOWN() instead of ALIGN(). Otherwise,
vmf->address not aligned to fault_size will be aligned to the next
alignment, that can result in memory failure getting the wrong address.
It's a subtle situation that only can be observed in
page_mapped_in_vma() after the page is page fault handled by
dev_dax_huge_fault. Generally, there is little chance to perform
page_mapped_in_vma in dev-dax's page unless in specific error injection
to the dax device to trigger an MCE - memory-failure. In that case,
page_mapped_in_vma() will be triggered to determine which task is
accessing the failure address and kill that task in the end.
We used self-developed dax device (which is 2M aligned mapping) , to
perform error injection to random address. It turned out that error
injected to non-2M-aligned address was causing endless MCE until panic.
Because page_mapped_in_vma() kept resulting wrong address and the task
accessing the failure address was never killed properly:
[ 3783.719419] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3784.049006] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3784.049190] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3784.448042] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3784.448186] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3784.792026] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3784.792179] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3785.162502] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3785.162633] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3785.461116] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3785.461247] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3785.764730] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3785.764859] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3786.042128] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3786.042259] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3786.464293] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3786.464423] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3786.818090] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3786.818217] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
[ 3787.085297] mce: Uncorrected hardware memory error in user-access at
200c9742380
[ 3787.085424] Memory failure: 0x200c9742: recovery action for dax page:
Recovered
It took us several weeks to pinpoint this problem,�� but we eventually
used bpftrace to trace the page fault and mce address and successfully
identified the issue.
Joao added:
; Likely we never reproduce in production because we always pin
: device-dax regions in the region align they provide (Qemu does
: similarly with prealloc in hugetlb/file backed memory). I think this
: bug requires that we touch *unpinned* device-dax regions unaligned to
: the device-dax selected alignment (page size i.e. 4K/2M/1G)
Link: https://lkml.kernel.org/r/23c02a03e8d666fef11bbe13e85c69c8b4ca0624.17274216…
Fixes: b9b5777f09be ("device-dax: use ALIGN() for determining pgoff")
Signed-off-by: Kun(llfl) <llfl(a)linux.alibaba.com>
Tested-by: JianXiong Zhao <zhaojianxiong.zjx(a)alibaba-inc.com>
Reviewed-by: Joao Martins <joao.m.martins(a)oracle.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
drivers/dax/device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/dax/device.c~device-dax-correct-pgoff-align-in-dax_set_mapping
+++ a/drivers/dax/device.c
@@ -86,7 +86,7 @@ static void dax_set_mapping(struct vm_fa
nr_pages = 1;
pgoff = linear_page_index(vmf->vma,
- ALIGN(vmf->address, fault_size));
+ ALIGN_DOWN(vmf->address, fault_size));
for (i = 0; i < nr_pages; i++) {
struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
_
Patches currently in -mm which might be from llfl(a)linux.alibaba.com are
The quilt patch titled
Subject: kthread: unpark only parked kthread
has been removed from the -mm tree. Its filename was
kthread-unpark-only-parked-kthread.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Frederic Weisbecker <frederic(a)kernel.org>
Subject: kthread: unpark only parked kthread
Date: Fri, 13 Sep 2024 23:46:34 +0200
Calling into kthread unparking unconditionally is mostly harmless when
the kthread is already unparked. The wake up is then simply ignored
because the target is not in TASK_PARKED state.
However if the kthread is per CPU, the wake up is preceded by a call
to kthread_bind() which expects the task to be inactive and in
TASK_PARKED state, which obviously isn't the case if it is unparked.
As a result, calling kthread_stop() on an unparked per-cpu kthread
triggers such a warning:
WARNING: CPU: 0 PID: 11 at kernel/kthread.c:525 __kthread_bind_mask kernel/kthread.c:525
<TASK>
kthread_stop+0x17a/0x630 kernel/kthread.c:707
destroy_workqueue+0x136/0xc40 kernel/workqueue.c:5810
wg_destruct+0x1e2/0x2e0 drivers/net/wireguard/device.c:257
netdev_run_todo+0xe1a/0x1000 net/core/dev.c:10693
default_device_exit_batch+0xa14/0xa90 net/core/dev.c:11769
ops_exit_list net/core/net_namespace.c:178 [inline]
cleanup_net+0x89d/0xcc0 net/core/net_namespace.c:640
process_one_work kernel/workqueue.c:3231 [inline]
process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3312
worker_thread+0x86d/0xd70 kernel/workqueue.c:3393
kthread+0x2f0/0x390 kernel/kthread.c:389
ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
</TASK>
Fix this with skipping unecessary unparking while stopping a kthread.
Link: https://lkml.kernel.org/r/20240913214634.12557-1-frederic@kernel.org
Fixes: 5c25b5ff89f0 ("workqueue: Tag bound workers with KTHREAD_IS_PER_CPU")
Signed-off-by: Frederic Weisbecker <frederic(a)kernel.org>
Reported-by: syzbot+943d34fa3cf2191e3068(a)syzkaller.appspotmail.com
Tested-by: syzbot+943d34fa3cf2191e3068(a)syzkaller.appspotmail.com
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Tejun Heo <tj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/kthread.c | 2 ++
1 file changed, 2 insertions(+)
--- a/kernel/kthread.c~kthread-unpark-only-parked-kthread
+++ a/kernel/kthread.c
@@ -623,6 +623,8 @@ void kthread_unpark(struct task_struct *
{
struct kthread *kthread = to_kthread(k);
+ if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))
+ return;
/*
* Newly created kthread was parked when the CPU was offline.
* The binding was lost and we need to set it again.
_
Patches currently in -mm which might be from frederic(a)kernel.org are
(resending, sorry - I'd forgotten to add the mailing list)
After commit 379a58caa199 ("scsi: fnic: Move fnic_fnic_flush_tx() to a work
queue"), it can happen that a work item is sent to an uninitialized work
queue. This may has the effect that the item being queued is never
actually queued, and any further actions depending on it will not proceed.
The following warning is observed while the fnic driver is loaded:
kernel: WARNING: CPU: 11 PID: 0 at ../kernel/workqueue.c:1524 __queue_work+0x373/0x410
kernel: <IRQ>
kernel: queue_work_on+0x3a/0x50
kernel: fnic_wq_copy_cmpl_handler+0x54a/0x730 [fnic 62fbff0c42e7fb825c60a55cde2fb91facb2ed24]
kernel: fnic_isr_msix_wq_copy+0x2d/0x60 [fnic 62fbff0c42e7fb825c60a55cde2fb91facb2ed24]
kernel: __handle_irq_event_percpu+0x36/0x1a0
kernel: handle_irq_event_percpu+0x30/0x70
kernel: handle_irq_event+0x34/0x60
kernel: handle_edge_irq+0x7e/0x1a0
kernel: __common_interrupt+0x3b/0xb0
kernel: common_interrupt+0x58/0xa0
kernel: </IRQ>
It has been observed that this may break the rediscovery of fibre channel
devices after a temporary fabric failure.
This patch fixes it by moving the work queue initialization out of
an if block in fnic_probe().
Signed-off-by: Martin Wilck <mwilck(a)suse.com>
Fixes: 379a58caa199 ("scsi: fnic: Move fnic_fnic_flush_tx() to a work queue")
Cc: stable(a)vger.kernel.org
---
drivers/scsi/fnic/fnic_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/scsi/fnic/fnic_main.c b/drivers/scsi/fnic/fnic_main.c
index 0044717d4486..adec0df24bc4 100644
--- a/drivers/scsi/fnic/fnic_main.c
+++ b/drivers/scsi/fnic/fnic_main.c
@@ -830,7 +830,6 @@ static int fnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
spin_lock_init(&fnic->vlans_lock);
INIT_WORK(&fnic->fip_frame_work, fnic_handle_fip_frame);
INIT_WORK(&fnic->event_work, fnic_handle_event);
- INIT_WORK(&fnic->flush_work, fnic_flush_tx);
skb_queue_head_init(&fnic->fip_frame_queue);
INIT_LIST_HEAD(&fnic->evlist);
INIT_LIST_HEAD(&fnic->vlans);
@@ -948,6 +947,7 @@ static int fnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
INIT_WORK(&fnic->link_work, fnic_handle_link);
INIT_WORK(&fnic->frame_work, fnic_handle_frame);
+ INIT_WORK(&fnic->flush_work, fnic_flush_tx);
skb_queue_head_init(&fnic->frame_queue);
skb_queue_head_init(&fnic->tx_queue);
--
2.46.1
From: Aaron Thompson <dev(a)aaront.org>
If bt_debugfs is not created successfully, which happens if either
CONFIG_DEBUG_FS or CONFIG_DEBUG_FS_ALLOW_ALL is unset, then iso_init()
returns early and does not set iso_inited to true. This means that a
subsequent call to iso_init() will result in duplicate calls to
proto_register(), bt_sock_register(), etc.
With CONFIG_LIST_HARDENED and CONFIG_BUG_ON_DATA_CORRUPTION enabled, the
duplicate call to proto_register() triggers this BUG():
list_add double add: new=ffffffffc0b280d0, prev=ffffffffbab56250,
next=ffffffffc0b280d0.
------------[ cut here ]------------
kernel BUG at lib/list_debug.c:35!
Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
CPU: 2 PID: 887 Comm: bluetoothd Not tainted 6.10.11-1-ao-desktop #1
RIP: 0010:__list_add_valid_or_report+0x9a/0xa0
...
__list_add_valid_or_report+0x9a/0xa0
proto_register+0x2b5/0x340
iso_init+0x23/0x150 [bluetooth]
set_iso_socket_func+0x68/0x1b0 [bluetooth]
kmem_cache_free+0x308/0x330
hci_sock_sendmsg+0x990/0x9e0 [bluetooth]
__sock_sendmsg+0x7b/0x80
sock_write_iter+0x9a/0x110
do_iter_readv_writev+0x11d/0x220
vfs_writev+0x180/0x3e0
do_writev+0xca/0x100
...
This change removes the early return. The check for iso_debugfs being
NULL was unnecessary, it is always NULL when iso_inited is false.
Cc: stable(a)vger.kernel.org
Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type")
Signed-off-by: Aaron Thompson <dev(a)aaront.org>
---
net/bluetooth/iso.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index d5e00d0dd1a0..c9eefb43bf47 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -2301,13 +2301,9 @@ int iso_init(void)
hci_register_cb(&iso_cb);
- if (IS_ERR_OR_NULL(bt_debugfs))
- return 0;
-
- if (!iso_debugfs) {
+ if (!IS_ERR_OR_NULL(bt_debugfs))
iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs,
NULL, &iso_debugfs_fops);
- }
iso_inited = true;
--
2.39.5