The patch below does not apply to the 4.9, 4.14, 4.19, 5.4, or 5.10-stable
trees.
If someone wants it applied to any of them, or to any other stable or
longterm tree, then please email the backport, including the original git
commit id, to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ba0803050d610d5072666be727bca5e03e55b242 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 23 Aug 2022 02:10:56 -0500
Subject: [PATCH] smb3: missing inode locks in punch hole
smb3 fallocate punch hole was not grabbing the inode or filemap_invalidate
locks, so it could race with the fault path reinstantiating pages in the
punched range.
Cc: stable@vger.kernel.org
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index a6fe54281fd3..4810bd62266a 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -3384,7 +3384,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len)
{
- struct inode *inode;
+ struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
long rc;
@@ -3393,14 +3393,12 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
xid = get_xid();
- inode = d_inode(cfile->dentry);
-
+ inode_lock(inode);
/* Need to make file sparse, if not already, before freeing range. */
/* Consider adding equivalent for compressed since it could also work */
if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) {
rc = -EOPNOTSUPP;
- free_xid(xid);
- return rc;
+ goto out;
}
filemap_invalidate_lock(inode->i_mapping);
@@ -3420,8 +3418,10 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
(char *)&fsctl_buf,
sizeof(struct file_zero_data_information),
CIFSMaxBufSize, NULL, NULL);
- free_xid(xid);
filemap_invalidate_unlock(inode->i_mapping);
+out:
+ inode_unlock(inode);
+ free_xid(xid);
return rc;
}
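For reference, a minimal sketch of the locking pattern the fix establishes
(hypothetical filesystem code, not the actual cifs implementation;
example_send_hole_punch() is a made-up stand-in for the SMB2 ioctl that
performs the server-side hole punch):

	/* Hold both the inode lock and the mapping's invalidate lock across
	 * the page-cache truncation and the server call, so a concurrent
	 * page fault cannot reinstantiate pages in the punched range. */
	static long example_punch_hole(struct file *file, loff_t offset, loff_t len)
	{
		struct inode *inode = file_inode(file);
		long rc;

		inode_lock(inode);                         /* serialize writers */
		filemap_invalidate_lock(inode->i_mapping); /* block page faults */

		truncate_pagecache_range(inode, offset, offset + len - 1);
		rc = example_send_hole_punch(file, offset, len); /* hypothetical helper */

		filemap_invalidate_unlock(inode->i_mapping);
		inode_unlock(inode);
		return rc;
	}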
The patch below does not apply to the 4.9, 4.14, 4.19, 5.4, 5.10, or
5.15-stable trees.
If someone wants it applied to any of them, or to any other stable or
longterm tree, then please email the backport, including the original git
commit id, to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 26 Aug 2022 09:17:08 -0400
Subject: [PATCH] wait_on_bit: add an acquire memory barrier
There are several places in the kernel where wait_on_bit is not followed
by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read).
On architectures with weak memory ordering, memory accesses that follow
wait_on_bit may be reordered before it and return invalid data.
Fix this class of bugs by introducing a new function "test_bit_acquire"
that works like test_bit, but has acquire memory ordering semantics.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Acked-by: Will Deacon <will@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index d8b101c97031..edea4656c5c0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
- RMW operations that have a return value are fully ordered.
- - RMW operations that are conditional are unordered on FAILURE,
- otherwise the above rules apply. In the case of test_and_set_bit_lock(),
- if the bit in memory is unchanged by the operation then it is deemed to have
- failed.
+ - RMW operations that are conditional are fully ordered.
-Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
-clear_bit_unlock() which has RELEASE semantics.
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
+clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
+ACQUIRE semantics.
Since a platform only has a single means of achieving atomic operations
the same barriers as for atomic_t are used, see atomic_t.txt.
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 973c6bd17f98..0fe9de58af31 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
+static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile("testb %2,%1"
+ CC_SET(nz)
+ : CC_OUT(nz) (oldbit)
+ : "m" (((unsigned char *)addr)[nr >> 3]),
+ "i" (1 << (nr & 7))
+ :"memory");
+
+ return oldbit;
+}
+
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
bool oldbit;
@@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
+static __always_inline bool
+arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
+ variable_test_bit(nr, addr);
+}
+
/**
* __ffs - find first set bit in word
* @word: The word to search
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 3d5ebd24652b..564a8c675d85 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -4,6 +4,7 @@
#define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
#include <linux/bits.h>
+#include <asm/barrier.h>
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
@@ -127,6 +128,18 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
}
+/**
+ * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1)));
+}
+
/*
* const_*() definitions provide good compile-time optimizations when
* the passed arguments can be resolved at compile time.
@@ -137,6 +150,7 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
#define const___test_and_set_bit generic___test_and_set_bit
#define const___test_and_clear_bit generic___test_and_clear_bit
#define const___test_and_change_bit generic___test_and_change_bit
+#define const_test_bit_acquire generic_test_bit_acquire
/**
* const_test_bit - Determine whether a bit is set
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 988a3bbfba34..2b238b161a62 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -142,4 +142,16 @@ _test_bit(unsigned long nr, const volatile unsigned long *addr)
return arch_test_bit(nr, addr);
}
+/**
+ * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
+ return arch_test_bit_acquire(nr, addr);
+}
+
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 5c37ced343ae..71f8d54a5195 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -13,6 +13,7 @@
#define arch___test_and_change_bit generic___test_and_change_bit
#define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
#include <asm-generic/bitops/non-instrumented-non-atomic.h>
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index bdb9b1ffaee9..0ddc78dfc358 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -12,5 +12,6 @@
#define ___test_and_change_bit arch___test_and_change_bit
#define _test_bit arch_test_bit
+#define _test_bit_acquire arch_test_bit_acquire
#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf9bf65039f2..3b89c64bcfd8 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w);
#define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr)
#define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr)
#define test_bit(nr, addr) bitop(_test_bit, nr, addr)
+#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr)
/*
* Include this here because some architectures need generic_ffs/fls in
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index def8b8d30ccc..089c9ade4325 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
* make it consistent with folio_test_uptodate
* pairs with smp_mb__before_atomic in set_buffer_uptodate
*/
- return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+ return test_bit_acquire(BH_Uptodate, &bh->b_state);
}
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9..7725b7579b78 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -71,7 +71,7 @@ static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait,
@@ -96,7 +96,7 @@ static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait_io,
@@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
unsigned long timeout)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit_timeout(word, bit,
bit_wait_timeout,
@@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit, action, mode);
}
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index d4788f810b55..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
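To make the bug class concrete, here is a minimal, hypothetical sketch (not
taken from the kernel; struct widget and WIDGET_BUSY are invented) of the
publish/consume pattern wait_on_bit() serves, and why its fast path needs
acquire semantics:

	/* The producer fills in data, then clears a busy bit with RELEASE
	 * semantics; the consumer waits for the bit to clear, then reads the
	 * data.  Before this commit, wait_on_bit()'s fast path used a plain
	 * test_bit(), so on weakly ordered CPUs the load of w->payload could
	 * be reordered before the bit test and observe stale data. */

	#define WIDGET_BUSY	0	/* made-up flag bit */

	struct widget {
		unsigned long	flags;
		int		payload;
	};

	static void widget_publish(struct widget *w, int value)
	{
		w->payload = value;
		/* RELEASE: orders the payload store before the bit clear */
		clear_bit_unlock(WIDGET_BUSY, &w->flags);
		smp_mb__after_atomic();
		wake_up_bit(&w->flags, WIDGET_BUSY);
	}

	static int widget_read(struct widget *w)
	{
		/* With this commit the fast path uses test_bit_acquire(),
		 * so the payload load below cannot be hoisted above the
		 * bit test. */
		wait_on_bit(&w->flags, WIDGET_BUSY, TASK_UNINTERRUPTIBLE);
		return w->payload;
	}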
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Fri, 26 Aug 2022 09:17:08 -0400
Subject: [PATCH] wait_on_bit: add an acquire memory barrier
There are several places in the kernel where wait_on_bit is not followed
by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read).
On architectures with weak memory ordering, it may happen that memory
accesses that follow wait_on_bit are reordered before wait_on_bit and
they may return invalid data.
Fix this class of bugs by introducing a new function "test_bit_acquire"
that works like test_bit, but has acquire memory ordering semantics.
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Acked-by: Will Deacon <will(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index d8b101c97031..edea4656c5c0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
- RMW operations that have a return value are fully ordered.
- - RMW operations that are conditional are unordered on FAILURE,
- otherwise the above rules apply. In the case of test_and_set_bit_lock(),
- if the bit in memory is unchanged by the operation then it is deemed to have
- failed.
+ - RMW operations that are conditional are fully ordered.
-Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
-clear_bit_unlock() which has RELEASE semantics.
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
+clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
+ACQUIRE semantics.
Since a platform only has a single means of achieving atomic operations
the same barriers as for atomic_t are used, see atomic_t.txt.
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 973c6bd17f98..0fe9de58af31 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
+static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile("testb %2,%1"
+ CC_SET(nz)
+ : CC_OUT(nz) (oldbit)
+ : "m" (((unsigned char *)addr)[nr >> 3]),
+ "i" (1 << (nr & 7))
+ :"memory");
+
+ return oldbit;
+}
+
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
bool oldbit;
@@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
+static __always_inline bool
+arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
+ variable_test_bit(nr, addr);
+}
+
/**
* __ffs - find first set bit in word
* @word: The word to search
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 3d5ebd24652b..564a8c675d85 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -4,6 +4,7 @@
#define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
#include <linux/bits.h>
+#include <asm/barrier.h>
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
@@ -127,6 +128,18 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
}
+/**
+ * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1)));
+}
+
/*
* const_*() definitions provide good compile-time optimizations when
* the passed arguments can be resolved at compile time.
@@ -137,6 +150,7 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
#define const___test_and_set_bit generic___test_and_set_bit
#define const___test_and_clear_bit generic___test_and_clear_bit
#define const___test_and_change_bit generic___test_and_change_bit
+#define const_test_bit_acquire generic_test_bit_acquire
/**
* const_test_bit - Determine whether a bit is set
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 988a3bbfba34..2b238b161a62 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -142,4 +142,16 @@ _test_bit(unsigned long nr, const volatile unsigned long *addr)
return arch_test_bit(nr, addr);
}
+/**
+ * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
+ return arch_test_bit_acquire(nr, addr);
+}
+
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 5c37ced343ae..71f8d54a5195 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -13,6 +13,7 @@
#define arch___test_and_change_bit generic___test_and_change_bit
#define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
#include <asm-generic/bitops/non-instrumented-non-atomic.h>
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index bdb9b1ffaee9..0ddc78dfc358 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -12,5 +12,6 @@
#define ___test_and_change_bit arch___test_and_change_bit
#define _test_bit arch_test_bit
+#define _test_bit_acquire arch_test_bit_acquire
#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf9bf65039f2..3b89c64bcfd8 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w);
#define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr)
#define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr)
#define test_bit(nr, addr) bitop(_test_bit, nr, addr)
+#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr)
/*
* Include this here because some architectures need generic_ffs/fls in
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index def8b8d30ccc..089c9ade4325 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
* make it consistent with folio_test_uptodate
* pairs with smp_mb__before_atomic in set_buffer_uptodate
*/
- return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+ return test_bit_acquire(BH_Uptodate, &bh->b_state);
}
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9..7725b7579b78 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -71,7 +71,7 @@ static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait,
@@ -96,7 +96,7 @@ static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait_io,
@@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
unsigned long timeout)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit_timeout(word, bit,
bit_wait_timeout,
@@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit, action, mode);
}
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index d4788f810b55..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Fri, 26 Aug 2022 09:17:08 -0400
Subject: [PATCH] wait_on_bit: add an acquire memory barrier
There are several places in the kernel where wait_on_bit is not followed
by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read).
On architectures with weak memory ordering, it may happen that memory
accesses that follow wait_on_bit are reordered before wait_on_bit and
they may return invalid data.
Fix this class of bugs by introducing a new function "test_bit_acquire"
that works like test_bit, but has acquire memory ordering semantics.
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Acked-by: Will Deacon <will(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index d8b101c97031..edea4656c5c0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
- RMW operations that have a return value are fully ordered.
- - RMW operations that are conditional are unordered on FAILURE,
- otherwise the above rules apply. In the case of test_and_set_bit_lock(),
- if the bit in memory is unchanged by the operation then it is deemed to have
- failed.
+ - RMW operations that are conditional are fully ordered.
-Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
-clear_bit_unlock() which has RELEASE semantics.
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
+clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
+ACQUIRE semantics.
Since a platform only has a single means of achieving atomic operations
the same barriers as for atomic_t are used, see atomic_t.txt.
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 973c6bd17f98..0fe9de58af31 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
+static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile("testb %2,%1"
+ CC_SET(nz)
+ : CC_OUT(nz) (oldbit)
+ : "m" (((unsigned char *)addr)[nr >> 3]),
+ "i" (1 << (nr & 7))
+ :"memory");
+
+ return oldbit;
+}
+
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
bool oldbit;
@@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
+static __always_inline bool
+arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
+ variable_test_bit(nr, addr);
+}
+
/**
* __ffs - find first set bit in word
* @word: The word to search
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 3d5ebd24652b..564a8c675d85 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -4,6 +4,7 @@
#define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
#include <linux/bits.h>
+#include <asm/barrier.h>
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
@@ -127,6 +128,18 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
}
+/**
+ * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1)));
+}
+
/*
* const_*() definitions provide good compile-time optimizations when
* the passed arguments can be resolved at compile time.
@@ -137,6 +150,7 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
#define const___test_and_set_bit generic___test_and_set_bit
#define const___test_and_clear_bit generic___test_and_clear_bit
#define const___test_and_change_bit generic___test_and_change_bit
+#define const_test_bit_acquire generic_test_bit_acquire
/**
* const_test_bit - Determine whether a bit is set
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 988a3bbfba34..2b238b161a62 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -142,4 +142,16 @@ _test_bit(unsigned long nr, const volatile unsigned long *addr)
return arch_test_bit(nr, addr);
}
+/**
+ * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
+ return arch_test_bit_acquire(nr, addr);
+}
+
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 5c37ced343ae..71f8d54a5195 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -13,6 +13,7 @@
#define arch___test_and_change_bit generic___test_and_change_bit
#define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
#include <asm-generic/bitops/non-instrumented-non-atomic.h>
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index bdb9b1ffaee9..0ddc78dfc358 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -12,5 +12,6 @@
#define ___test_and_change_bit arch___test_and_change_bit
#define _test_bit arch_test_bit
+#define _test_bit_acquire arch_test_bit_acquire
#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf9bf65039f2..3b89c64bcfd8 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w);
#define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr)
#define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr)
#define test_bit(nr, addr) bitop(_test_bit, nr, addr)
+#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr)
/*
* Include this here because some architectures need generic_ffs/fls in
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index def8b8d30ccc..089c9ade4325 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
* make it consistent with folio_test_uptodate
* pairs with smp_mb__before_atomic in set_buffer_uptodate
*/
- return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+ return test_bit_acquire(BH_Uptodate, &bh->b_state);
}
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9..7725b7579b78 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -71,7 +71,7 @@ static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait,
@@ -96,7 +96,7 @@ static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait_io,
@@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
unsigned long timeout)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit_timeout(word, bit,
bit_wait_timeout,
@@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit, action, mode);
}
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index d4788f810b55..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Fri, 26 Aug 2022 09:17:08 -0400
Subject: [PATCH] wait_on_bit: add an acquire memory barrier
There are several places in the kernel where wait_on_bit is not followed
by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read).
On architectures with weak memory ordering, it may happen that memory
accesses that follow wait_on_bit are reordered before wait_on_bit and
they may return invalid data.
Fix this class of bugs by introducing a new function "test_bit_acquire"
that works like test_bit, but has acquire memory ordering semantics.
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Acked-by: Will Deacon <will(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index d8b101c97031..edea4656c5c0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
- RMW operations that have a return value are fully ordered.
- - RMW operations that are conditional are unordered on FAILURE,
- otherwise the above rules apply. In the case of test_and_set_bit_lock(),
- if the bit in memory is unchanged by the operation then it is deemed to have
- failed.
+ - RMW operations that are conditional are fully ordered.
-Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
-clear_bit_unlock() which has RELEASE semantics.
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
+clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
+ACQUIRE semantics.
Since a platform only has a single means of achieving atomic operations
the same barriers as for atomic_t are used, see atomic_t.txt.
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 973c6bd17f98..0fe9de58af31 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
+static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile("testb %2,%1"
+ CC_SET(nz)
+ : CC_OUT(nz) (oldbit)
+ : "m" (((unsigned char *)addr)[nr >> 3]),
+ "i" (1 << (nr & 7))
+ :"memory");
+
+ return oldbit;
+}
+
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
bool oldbit;
@@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
+static __always_inline bool
+arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
+ variable_test_bit(nr, addr);
+}
+
/**
* __ffs - find first set bit in word
* @word: The word to search
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 3d5ebd24652b..564a8c675d85 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -4,6 +4,7 @@
#define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
#include <linux/bits.h>
+#include <asm/barrier.h>
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
@@ -127,6 +128,18 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
}
+/**
+ * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1)));
+}
+
/*
* const_*() definitions provide good compile-time optimizations when
* the passed arguments can be resolved at compile time.
@@ -137,6 +150,7 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
#define const___test_and_set_bit generic___test_and_set_bit
#define const___test_and_clear_bit generic___test_and_clear_bit
#define const___test_and_change_bit generic___test_and_change_bit
+#define const_test_bit_acquire generic_test_bit_acquire
/**
* const_test_bit - Determine whether a bit is set
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 988a3bbfba34..2b238b161a62 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -142,4 +142,16 @@ _test_bit(unsigned long nr, const volatile unsigned long *addr)
return arch_test_bit(nr, addr);
}
+/**
+ * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
+ return arch_test_bit_acquire(nr, addr);
+}
+
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 5c37ced343ae..71f8d54a5195 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -13,6 +13,7 @@
#define arch___test_and_change_bit generic___test_and_change_bit
#define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
#include <asm-generic/bitops/non-instrumented-non-atomic.h>
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index bdb9b1ffaee9..0ddc78dfc358 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -12,5 +12,6 @@
#define ___test_and_change_bit arch___test_and_change_bit
#define _test_bit arch_test_bit
+#define _test_bit_acquire arch_test_bit_acquire
#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf9bf65039f2..3b89c64bcfd8 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w);
#define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr)
#define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr)
#define test_bit(nr, addr) bitop(_test_bit, nr, addr)
+#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr)
/*
* Include this here because some architectures need generic_ffs/fls in
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index def8b8d30ccc..089c9ade4325 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
* make it consistent with folio_test_uptodate
* pairs with smp_mb__before_atomic in set_buffer_uptodate
*/
- return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+ return test_bit_acquire(BH_Uptodate, &bh->b_state);
}
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9..7725b7579b78 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -71,7 +71,7 @@ static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait,
@@ -96,7 +96,7 @@ static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait_io,
@@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
unsigned long timeout)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit_timeout(word, bit,
bit_wait_timeout,
@@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit, action, mode);
}
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index d4788f810b55..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Fri, 26 Aug 2022 09:17:08 -0400
Subject: [PATCH] wait_on_bit: add an acquire memory barrier
There are several places in the kernel where wait_on_bit is not followed
by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read).
On architectures with weak memory ordering, it may happen that memory
accesses that follow wait_on_bit are reordered before wait_on_bit and
they may return invalid data.
Fix this class of bugs by introducing a new function "test_bit_acquire"
that works like test_bit, but has acquire memory ordering semantics.
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Acked-by: Will Deacon <will(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index d8b101c97031..edea4656c5c0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
- RMW operations that have a return value are fully ordered.
- - RMW operations that are conditional are unordered on FAILURE,
- otherwise the above rules apply. In the case of test_and_set_bit_lock(),
- if the bit in memory is unchanged by the operation then it is deemed to have
- failed.
+ - RMW operations that are conditional are fully ordered.
-Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
-clear_bit_unlock() which has RELEASE semantics.
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
+clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
+ACQUIRE semantics.
Since a platform only has a single means of achieving atomic operations
the same barriers as for atomic_t are used, see atomic_t.txt.
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 973c6bd17f98..0fe9de58af31 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
+static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile("testb %2,%1"
+ CC_SET(nz)
+ : CC_OUT(nz) (oldbit)
+ : "m" (((unsigned char *)addr)[nr >> 3]),
+ "i" (1 << (nr & 7))
+ :"memory");
+
+ return oldbit;
+}
+
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
bool oldbit;
@@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
+static __always_inline bool
+arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
+ variable_test_bit(nr, addr);
+}
+
/**
* __ffs - find first set bit in word
* @word: The word to search
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 3d5ebd24652b..564a8c675d85 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -4,6 +4,7 @@
#define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
#include <linux/bits.h>
+#include <asm/barrier.h>
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
@@ -127,6 +128,18 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
}
+/**
+ * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1)));
+}
+
/*
* const_*() definitions provide good compile-time optimizations when
* the passed arguments can be resolved at compile time.
@@ -137,6 +150,7 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
#define const___test_and_set_bit generic___test_and_set_bit
#define const___test_and_clear_bit generic___test_and_clear_bit
#define const___test_and_change_bit generic___test_and_change_bit
+#define const_test_bit_acquire generic_test_bit_acquire
/**
* const_test_bit - Determine whether a bit is set
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 988a3bbfba34..2b238b161a62 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -142,4 +142,16 @@ _test_bit(unsigned long nr, const volatile unsigned long *addr)
return arch_test_bit(nr, addr);
}
+/**
+ * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
+ return arch_test_bit_acquire(nr, addr);
+}
+
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 5c37ced343ae..71f8d54a5195 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -13,6 +13,7 @@
#define arch___test_and_change_bit generic___test_and_change_bit
#define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
#include <asm-generic/bitops/non-instrumented-non-atomic.h>
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index bdb9b1ffaee9..0ddc78dfc358 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -12,5 +12,6 @@
#define ___test_and_change_bit arch___test_and_change_bit
#define _test_bit arch_test_bit
+#define _test_bit_acquire arch_test_bit_acquire
#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf9bf65039f2..3b89c64bcfd8 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w);
#define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr)
#define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr)
#define test_bit(nr, addr) bitop(_test_bit, nr, addr)
+#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr)
/*
* Include this here because some architectures need generic_ffs/fls in
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index def8b8d30ccc..089c9ade4325 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
* make it consistent with folio_test_uptodate
* pairs with smp_mb__before_atomic in set_buffer_uptodate
*/
- return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+ return test_bit_acquire(BH_Uptodate, &bh->b_state);
}
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9..7725b7579b78 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -71,7 +71,7 @@ static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait,
@@ -96,7 +96,7 @@ static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait_io,
@@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
unsigned long timeout)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit_timeout(word, bit,
bit_wait_timeout,
@@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit, action, mode);
}
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index d4788f810b55..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
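[Editorial aside: a minimal sketch of the ordering the patch above guarantees. The struct, bit name, and helpers here are hypothetical, not from the patch; on a weakly ordered CPU, without the acquire in wait_on_bit() the load of o->data could be satisfied before the bit test and observe stale data.]

	/* Hypothetical flag/data pair used only for illustration. */
	struct obj {
		unsigned long	flags;
		int		data;
	};
	#define OBJ_BUSY	0	/* bit number in ->flags */

	static void obj_publish(struct obj *o, int v)
	{
		o->data = v;
		/* RELEASE: the data store is ordered before the bit clear. */
		clear_bit_unlock(OBJ_BUSY, &o->flags);
		/* Full barrier before waking bit waiters, as documented
		 * for wake_up_bit(). */
		smp_mb__after_atomic();
		wake_up_bit(&o->flags, OBJ_BUSY);
	}

	static int obj_consume(struct obj *o)
	{
		/* ACQUIRE (after this patch): the bit test cannot be
		 * reordered after the data load below. */
		wait_on_bit(&o->flags, OBJ_BUSY, TASK_UNINTERRUPTIBLE);
		return o->data;	/* observes the value published above */
	}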
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Fri, 26 Aug 2022 09:17:08 -0400
Subject: [PATCH] wait_on_bit: add an acquire memory barrier
There are several places in the kernel where wait_on_bit is not followed
by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read).
On architectures with weak memory ordering, it may happen that memory
accesses that follow wait_on_bit are reordered before wait_on_bit and
they may return invalid data.
Fix this class of bugs by introducing a new function "test_bit_acquire"
that works like test_bit, but has acquire memory ordering semantics.
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Acked-by: Will Deacon <will(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index d8b101c97031..edea4656c5c0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
- RMW operations that have a return value are fully ordered.
- - RMW operations that are conditional are unordered on FAILURE,
- otherwise the above rules apply. In the case of test_and_set_bit_lock(),
- if the bit in memory is unchanged by the operation then it is deemed to have
- failed.
+ - RMW operations that are conditional are fully ordered.
-Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
-clear_bit_unlock() which has RELEASE semantics.
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
+clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
+ACQUIRE semantics.
Since a platform only has a single means of achieving atomic operations
the same barriers as for atomic_t are used, see atomic_t.txt.
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 973c6bd17f98..0fe9de58af31 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
+static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile("testb %2,%1"
+ CC_SET(nz)
+ : CC_OUT(nz) (oldbit)
+ : "m" (((unsigned char *)addr)[nr >> 3]),
+ "i" (1 << (nr & 7))
+ :"memory");
+
+ return oldbit;
+}
+
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
bool oldbit;
@@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
+static __always_inline bool
+arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
+ variable_test_bit(nr, addr);
+}
+
/**
* __ffs - find first set bit in word
* @word: The word to search
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 3d5ebd24652b..564a8c675d85 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -4,6 +4,7 @@
#define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
#include <linux/bits.h>
+#include <asm/barrier.h>
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
@@ -127,6 +128,18 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
}
+/**
+ * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1)));
+}
+
/*
* const_*() definitions provide good compile-time optimizations when
* the passed arguments can be resolved at compile time.
@@ -137,6 +150,7 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
#define const___test_and_set_bit generic___test_and_set_bit
#define const___test_and_clear_bit generic___test_and_clear_bit
#define const___test_and_change_bit generic___test_and_change_bit
+#define const_test_bit_acquire generic_test_bit_acquire
/**
* const_test_bit - Determine whether a bit is set
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 988a3bbfba34..2b238b161a62 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -142,4 +142,16 @@ _test_bit(unsigned long nr, const volatile unsigned long *addr)
return arch_test_bit(nr, addr);
}
+/**
+ * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
+ return arch_test_bit_acquire(nr, addr);
+}
+
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 5c37ced343ae..71f8d54a5195 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -13,6 +13,7 @@
#define arch___test_and_change_bit generic___test_and_change_bit
#define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
#include <asm-generic/bitops/non-instrumented-non-atomic.h>
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index bdb9b1ffaee9..0ddc78dfc358 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -12,5 +12,6 @@
#define ___test_and_change_bit arch___test_and_change_bit
#define _test_bit arch_test_bit
+#define _test_bit_acquire arch_test_bit_acquire
#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf9bf65039f2..3b89c64bcfd8 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w);
#define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr)
#define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr)
#define test_bit(nr, addr) bitop(_test_bit, nr, addr)
+#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr)
/*
* Include this here because some architectures need generic_ffs/fls in
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index def8b8d30ccc..089c9ade4325 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
* make it consistent with folio_test_uptodate
* pairs with smp_mb__before_atomic in set_buffer_uptodate
*/
- return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+ return test_bit_acquire(BH_Uptodate, &bh->b_state);
}
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9..7725b7579b78 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -71,7 +71,7 @@ static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait,
@@ -96,7 +96,7 @@ static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait_io,
@@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
unsigned long timeout)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit_timeout(word, bit,
bit_wait_timeout,
@@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit, action, mode);
}
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index d4788f810b55..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
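[Editorial aside on the buffer_head hunk above: the new acquire load pairs with the smp_mb__before_atomic() in set_buffer_uptodate(). A condensed sketch of that pairing, simplified from the kernel sources; the reader body is illustrative.]

	/* Writer side, simplified set_buffer_uptodate(): order the
	 * buffer contents before the flag becomes visible. */
	smp_mb__before_atomic();
	set_bit(BH_Uptodate, &bh->b_state);

	/* Reader side, buffer_uptodate() after this patch: the acquire
	 * load of b_state orders the flag test before any later read
	 * of bh->b_data. */
	if (test_bit_acquire(BH_Uptodate, &bh->b_state))
		process_buffer(bh);	/* illustrative helper */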
The patch below does not apply to the 5.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Fri, 26 Aug 2022 09:17:08 -0400
Subject: [PATCH] wait_on_bit: add an acquire memory barrier
There are several places in the kernel where wait_on_bit is not followed
by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read).
On architectures with weak memory ordering, it may happen that memory
accesses that follow wait_on_bit are reordered before wait_on_bit and
they may return invalid data.
Fix this class of bugs by introducing a new function "test_bit_acquire"
that works like test_bit, but has acquire memory ordering semantics.
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Acked-by: Will Deacon <will(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index d8b101c97031..edea4656c5c0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
- RMW operations that have a return value are fully ordered.
- - RMW operations that are conditional are unordered on FAILURE,
- otherwise the above rules apply. In the case of test_and_set_bit_lock(),
- if the bit in memory is unchanged by the operation then it is deemed to have
- failed.
+ - RMW operations that are conditional are fully ordered.
-Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
-clear_bit_unlock() which has RELEASE semantics.
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
+clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
+ACQUIRE semantics.
Since a platform only has a single means of achieving atomic operations
the same barriers as for atomic_t are used, see atomic_t.txt.
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 973c6bd17f98..0fe9de58af31 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
+static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile("testb %2,%1"
+ CC_SET(nz)
+ : CC_OUT(nz) (oldbit)
+ : "m" (((unsigned char *)addr)[nr >> 3]),
+ "i" (1 << (nr & 7))
+ :"memory");
+
+ return oldbit;
+}
+
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
bool oldbit;
@@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
+static __always_inline bool
+arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
+ variable_test_bit(nr, addr);
+}
+
/**
* __ffs - find first set bit in word
* @word: The word to search
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 3d5ebd24652b..564a8c675d85 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -4,6 +4,7 @@
#define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
#include <linux/bits.h>
+#include <asm/barrier.h>
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
@@ -127,6 +128,18 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
}
+/**
+ * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1)));
+}
+
/*
* const_*() definitions provide good compile-time optimizations when
* the passed arguments can be resolved at compile time.
@@ -137,6 +150,7 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
#define const___test_and_set_bit generic___test_and_set_bit
#define const___test_and_clear_bit generic___test_and_clear_bit
#define const___test_and_change_bit generic___test_and_change_bit
+#define const_test_bit_acquire generic_test_bit_acquire
/**
* const_test_bit - Determine whether a bit is set
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 988a3bbfba34..2b238b161a62 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -142,4 +142,16 @@ _test_bit(unsigned long nr, const volatile unsigned long *addr)
return arch_test_bit(nr, addr);
}
+/**
+ * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+ instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
+ return arch_test_bit_acquire(nr, addr);
+}
+
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 5c37ced343ae..71f8d54a5195 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -13,6 +13,7 @@
#define arch___test_and_change_bit generic___test_and_change_bit
#define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
#include <asm-generic/bitops/non-instrumented-non-atomic.h>
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index bdb9b1ffaee9..0ddc78dfc358 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -12,5 +12,6 @@
#define ___test_and_change_bit arch___test_and_change_bit
#define _test_bit arch_test_bit
+#define _test_bit_acquire arch_test_bit_acquire
#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf9bf65039f2..3b89c64bcfd8 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w);
#define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr)
#define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr)
#define test_bit(nr, addr) bitop(_test_bit, nr, addr)
+#define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr)
/*
* Include this here because some architectures need generic_ffs/fls in
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index def8b8d30ccc..089c9ade4325 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
* make it consistent with folio_test_uptodate
* pairs with smp_mb__before_atomic in set_buffer_uptodate
*/
- return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+ return test_bit_acquire(BH_Uptodate, &bh->b_state);
}
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9..7725b7579b78 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -71,7 +71,7 @@ static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait,
@@ -96,7 +96,7 @@ static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait_io,
@@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
unsigned long timeout)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit_timeout(word, bit,
bit_wait_timeout,
@@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode)
{
might_sleep();
- if (!test_bit(bit, word))
+ if (!test_bit_acquire(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit, action, mode);
}
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index d4788f810b55..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e3aa9238277597c6c7624f302d81a7b568b6f2d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Tue, 16 Aug 2022 14:28:36 +0200
Subject: [PATCH] x86/nospec: Unwreck the RSB stuffing
Commit 2b1299322016 ("x86/speculation: Add RSB VM Exit protections")
made a right mess of the RSB stuffing, rewrite the whole thing to not
suck.
Thanks to Andrew for the enlightening comment about Post-Barrier RSB
things so we can make this code less magical.
Cc: stable(a)vger.kernel.org
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lkml.kernel.org/r/YvuNdDWoUZSBjYcm@worktop.programming.kicks-ass.net
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index e64fd20778b6..10731ccfed37 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -35,33 +35,44 @@
#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
/*
+ * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN.
+ */
+#define __FILL_RETURN_SLOT \
+ ANNOTATE_INTRA_FUNCTION_CALL; \
+ call 772f; \
+ int3; \
+772:
+
+/*
+ * Stuff the entire RSB.
+ *
* Google experimented with loop-unrolling and this turned out to be
* the optimal version - two calls, each with their own speculation
* trap should their return address end up getting used, in a loop.
*/
-#define __FILL_RETURN_BUFFER(reg, nr, sp) \
- mov $(nr/2), reg; \
-771: \
- ANNOTATE_INTRA_FUNCTION_CALL; \
- call 772f; \
-773: /* speculation trap */ \
- UNWIND_HINT_EMPTY; \
- pause; \
- lfence; \
- jmp 773b; \
-772: \
- ANNOTATE_INTRA_FUNCTION_CALL; \
- call 774f; \
-775: /* speculation trap */ \
- UNWIND_HINT_EMPTY; \
- pause; \
- lfence; \
- jmp 775b; \
-774: \
- add $(BITS_PER_LONG/8) * 2, sp; \
- dec reg; \
- jnz 771b; \
- /* barrier for jnz misprediction */ \
+#define __FILL_RETURN_BUFFER(reg, nr) \
+ mov $(nr/2), reg; \
+771: \
+ __FILL_RETURN_SLOT \
+ __FILL_RETURN_SLOT \
+ add $(BITS_PER_LONG/8) * 2, %_ASM_SP; \
+ dec reg; \
+ jnz 771b; \
+ /* barrier for jnz misprediction */ \
+ lfence;
+
+/*
+ * Stuff a single RSB slot.
+ *
+ * To mitigate Post-Barrier RSB speculation, one CALL instruction must be
+ * forced to retire before letting a RET instruction execute.
+ *
+ * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed
+ * before this point.
+ */
+#define __FILL_ONE_RETURN \
+ __FILL_RETURN_SLOT \
+ add $(BITS_PER_LONG/8), %_ASM_SP; \
lfence;
#ifdef __ASSEMBLY__
@@ -132,28 +143,15 @@
#endif
.endm
-.macro ISSUE_UNBALANCED_RET_GUARD
- ANNOTATE_INTRA_FUNCTION_CALL
- call .Lunbalanced_ret_guard_\@
- int3
-.Lunbalanced_ret_guard_\@:
- add $(BITS_PER_LONG/8), %_ASM_SP
- lfence
-.endm
-
/*
* A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
* monstrosity above, manually.
*/
-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2
-.ifb \ftr2
- ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr
-.else
- ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2
-.endif
- __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)
-.Lunbalanced_\@:
- ISSUE_UNBALANCED_RET_GUARD
+.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS)
+ ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \
+ __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \
+ __stringify(__FILL_ONE_RETURN), \ftr2
+
.Lskip_rsb_\@:
.endm
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e3aa9238277597c6c7624f302d81a7b568b6f2d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Tue, 16 Aug 2022 14:28:36 +0200
Subject: [PATCH] x86/nospec: Unwreck the RSB stuffing
Commit 2b1299322016 ("x86/speculation: Add RSB VM Exit protections")
made a right mess of the RSB stuffing, rewrite the whole thing to not
suck.
Thanks to Andrew for the enlightening comment about Post-Barrier RSB
things so we can make this code less magical.
Cc: stable(a)vger.kernel.org
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lkml.kernel.org/r/YvuNdDWoUZSBjYcm@worktop.programming.kicks-ass.net
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index e64fd20778b6..10731ccfed37 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -35,33 +35,44 @@
#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
/*
+ * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN.
+ */
+#define __FILL_RETURN_SLOT \
+ ANNOTATE_INTRA_FUNCTION_CALL; \
+ call 772f; \
+ int3; \
+772:
+
+/*
+ * Stuff the entire RSB.
+ *
* Google experimented with loop-unrolling and this turned out to be
* the optimal version - two calls, each with their own speculation
* trap should their return address end up getting used, in a loop.
*/
-#define __FILL_RETURN_BUFFER(reg, nr, sp) \
- mov $(nr/2), reg; \
-771: \
- ANNOTATE_INTRA_FUNCTION_CALL; \
- call 772f; \
-773: /* speculation trap */ \
- UNWIND_HINT_EMPTY; \
- pause; \
- lfence; \
- jmp 773b; \
-772: \
- ANNOTATE_INTRA_FUNCTION_CALL; \
- call 774f; \
-775: /* speculation trap */ \
- UNWIND_HINT_EMPTY; \
- pause; \
- lfence; \
- jmp 775b; \
-774: \
- add $(BITS_PER_LONG/8) * 2, sp; \
- dec reg; \
- jnz 771b; \
- /* barrier for jnz misprediction */ \
+#define __FILL_RETURN_BUFFER(reg, nr) \
+ mov $(nr/2), reg; \
+771: \
+ __FILL_RETURN_SLOT \
+ __FILL_RETURN_SLOT \
+ add $(BITS_PER_LONG/8) * 2, %_ASM_SP; \
+ dec reg; \
+ jnz 771b; \
+ /* barrier for jnz misprediction */ \
+ lfence;
+
+/*
+ * Stuff a single RSB slot.
+ *
+ * To mitigate Post-Barrier RSB speculation, one CALL instruction must be
+ * forced to retire before letting a RET instruction execute.
+ *
+ * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed
+ * before this point.
+ */
+#define __FILL_ONE_RETURN \
+ __FILL_RETURN_SLOT \
+ add $(BITS_PER_LONG/8), %_ASM_SP; \
lfence;
#ifdef __ASSEMBLY__
@@ -132,28 +143,15 @@
#endif
.endm
-.macro ISSUE_UNBALANCED_RET_GUARD
- ANNOTATE_INTRA_FUNCTION_CALL
- call .Lunbalanced_ret_guard_\@
- int3
-.Lunbalanced_ret_guard_\@:
- add $(BITS_PER_LONG/8), %_ASM_SP
- lfence
-.endm
-
/*
* A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
* monstrosity above, manually.
*/
-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2
-.ifb \ftr2
- ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr
-.else
- ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2
-.endif
- __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)
-.Lunbalanced_\@:
- ISSUE_UNBALANCED_RET_GUARD
+.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS)
+ ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \
+ __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \
+ __stringify(__FILL_ONE_RETURN), \ftr2
+
.Lskip_rsb_\@:
.endm
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4e3aa9238277597c6c7624f302d81a7b568b6f2d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Tue, 16 Aug 2022 14:28:36 +0200
Subject: [PATCH] x86/nospec: Unwreck the RSB stuffing
Commit 2b1299322016 ("x86/speculation: Add RSB VM Exit protections")
made a right mess of the RSB stuffing, rewrite the whole thing to not
suck.
Thanks to Andrew for the enlightening comment about Post-Barrier RSB
things so we can make this code less magical.
Cc: stable(a)vger.kernel.org
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://lkml.kernel.org/r/YvuNdDWoUZSBjYcm@worktop.programming.kicks-ass.net
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index e64fd20778b6..10731ccfed37 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -35,33 +35,44 @@
#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
/*
+ * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN.
+ */
+#define __FILL_RETURN_SLOT \
+ ANNOTATE_INTRA_FUNCTION_CALL; \
+ call 772f; \
+ int3; \
+772:
+
+/*
+ * Stuff the entire RSB.
+ *
* Google experimented with loop-unrolling and this turned out to be
* the optimal version - two calls, each with their own speculation
* trap should their return address end up getting used, in a loop.
*/
-#define __FILL_RETURN_BUFFER(reg, nr, sp) \
- mov $(nr/2), reg; \
-771: \
- ANNOTATE_INTRA_FUNCTION_CALL; \
- call 772f; \
-773: /* speculation trap */ \
- UNWIND_HINT_EMPTY; \
- pause; \
- lfence; \
- jmp 773b; \
-772: \
- ANNOTATE_INTRA_FUNCTION_CALL; \
- call 774f; \
-775: /* speculation trap */ \
- UNWIND_HINT_EMPTY; \
- pause; \
- lfence; \
- jmp 775b; \
-774: \
- add $(BITS_PER_LONG/8) * 2, sp; \
- dec reg; \
- jnz 771b; \
- /* barrier for jnz misprediction */ \
+#define __FILL_RETURN_BUFFER(reg, nr) \
+ mov $(nr/2), reg; \
+771: \
+ __FILL_RETURN_SLOT \
+ __FILL_RETURN_SLOT \
+ add $(BITS_PER_LONG/8) * 2, %_ASM_SP; \
+ dec reg; \
+ jnz 771b; \
+ /* barrier for jnz misprediction */ \
+ lfence;
+
+/*
+ * Stuff a single RSB slot.
+ *
+ * To mitigate Post-Barrier RSB speculation, one CALL instruction must be
+ * forced to retire before letting a RET instruction execute.
+ *
+ * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed
+ * before this point.
+ */
+#define __FILL_ONE_RETURN \
+ __FILL_RETURN_SLOT \
+ add $(BITS_PER_LONG/8), %_ASM_SP; \
lfence;
#ifdef __ASSEMBLY__
@@ -132,28 +143,15 @@
#endif
.endm
-.macro ISSUE_UNBALANCED_RET_GUARD
- ANNOTATE_INTRA_FUNCTION_CALL
- call .Lunbalanced_ret_guard_\@
- int3
-.Lunbalanced_ret_guard_\@:
- add $(BITS_PER_LONG/8), %_ASM_SP
- lfence
-.endm
-
/*
* A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
* monstrosity above, manually.
*/
-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2
-.ifb \ftr2
- ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr
-.else
- ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2
-.endif
- __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)
-.Lunbalanced_\@:
- ISSUE_UNBALANCED_RET_GUARD
+.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS)
+ ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \
+ __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \
+ __stringify(__FILL_ONE_RETURN), \ftr2
+
.Lskip_rsb_\@:
.endm
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cde643ff75bc20c538dfae787ca3b587bab16b50 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang(a)linux.intel.com>
Date: Thu, 18 Aug 2022 11:44:29 -0700
Subject: [PATCH] perf/x86/intel: Fix pebs event constraints for ADL
According to the latest event list, the LOAD_LATENCY PEBS event works
only on GP counters 0 and 1 for ADL and RPL.
Update the pebs event constraints table accordingly; the last argument of
INTEL_HYBRID_LAT_CONSTRAINT is the allowed-counter bitmask, so tightening
it from 0xf to 0x3 restricts the 0x5d0 event to counters 0 and 1.
Fixes: f83d2f91d259 ("perf/x86/intel: Add Alder Lake Hybrid support")
Reported-by: Ammy Yi <ammy.yi(a)intel.com>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20220818184429.2355857-1-kan.liang@linux.intel.com
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e5b587499122..de1f55d51784 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -830,7 +830,7 @@ struct event_constraint intel_glm_pebs_event_constraints[] = {
struct event_constraint intel_grt_pebs_event_constraints[] = {
/* Allow all events as PEBS with no flags */
- INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xf),
+ INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0x3),
INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf),
EVENT_CONSTRAINT_END
};
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ced8ecf026fd8084cf175530ff85c76d6085d715 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov(a)fb.com>
Date: Tue, 23 Aug 2022 11:28:13 -0700
Subject: [PATCH] btrfs: fix space cache corruption and potential double
allocations
When testing space_cache v2 on a large set of machines, we encountered a
few symptoms:
1. "unable to add free space :-17" (EEXIST) errors.
2. Missing free space info items, sometimes caught with a "missing free
space info for X" error.
3. Double-accounted space: ranges that were allocated in the extent tree
and also marked as free in the free space tree, ranges that were
marked as allocated twice in the extent tree, or ranges that were
marked as free twice in the free space tree. If the latter made it
onto disk, the next reboot would hit the BUG_ON() in
add_new_free_space().
4. On some hosts with no on-disk corruption or error messages, the
in-memory space cache (dumped with drgn) disagreed with the free
space tree.
All of these symptoms have the same underlying cause: a race between
caching the free space for a block group and returning free space to the
in-memory space cache for pinned extents causes us to double-add a free
range to the space cache. This race exists when free space is cached
from the free space tree (space_cache=v2) or the extent tree
(nospace_cache, or space_cache=v1 if the cache needs to be regenerated).
struct btrfs_block_group::last_byte_to_unpin and struct
btrfs_block_group::progress are supposed to protect against this race,
but commit d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when
waiting for a transaction commit") subtly broke this by allowing
multiple transactions to be unpinning extents at the same time.
Specifically, the race is as follows:
1. An extent is deleted from an uncached block group in transaction A.
2. btrfs_commit_transaction() is called for transaction A.
3. btrfs_run_delayed_refs() -> __btrfs_free_extent() runs the delayed
ref for the deleted extent.
4. __btrfs_free_extent() -> do_free_extent_accounting() ->
add_to_free_space_tree() adds the deleted extent back to the free
space tree.
5. do_free_extent_accounting() -> btrfs_update_block_group() ->
btrfs_cache_block_group() queues up the block group to get cached.
block_group->progress is set to block_group->start.
6. btrfs_commit_transaction() for transaction A calls
switch_commit_roots(). It sets block_group->last_byte_to_unpin to
block_group->progress, which is block_group->start because the block
group hasn't been cached yet.
7. The caching thread gets to our block group. Since the commit roots
were already switched, load_free_space_tree() sees the deleted extent
as free and adds it to the space cache. It finishes caching and sets
block_group->progress to U64_MAX.
8. btrfs_commit_transaction() advances transaction A to
TRANS_STATE_SUPER_COMMITTED.
9. fsync calls btrfs_commit_transaction() for transaction B. Since
transaction A is already in TRANS_STATE_SUPER_COMMITTED and the
commit is for fsync, it advances.
10. btrfs_commit_transaction() for transaction B calls
switch_commit_roots(). This time, the block group has already been
cached, so it sets block_group->last_byte_to_unpin to U64_MAX.
11. btrfs_commit_transaction() for transaction A calls
btrfs_finish_extent_commit(), which calls unpin_extent_range() for
the deleted extent. It sees last_byte_to_unpin set to U64_MAX (by
transaction B!), so it adds the deleted extent to the space cache
again!
This explains all of our symptoms above:
* If the sequence of events is exactly as described above, when the free
space is re-added in step 11, it will fail with EEXIST.
* If another thread reallocates the deleted extent in between steps 7
and 11, then step 11 will silently re-add that space to the space
cache as free even though it is actually allocated. Then, if that
space is allocated *again*, the free space tree will be corrupted
(namely, the wrong item will be deleted).
* If we don't catch this free space tree corruption, it will continue
to get worse as extents are deleted and reallocated.
The v1 space_cache is synchronously loaded when an extent is deleted
(btrfs_update_block_group() with alloc=0 calls btrfs_cache_block_group()
with load_cache_only=1), so it is not normally affected by this bug.
However, as noted above, if we fail to load the space cache, we will
fall back to caching from the extent tree and may hit this bug.
The easiest fix for this race is to also make caching from the free
space tree or extent tree synchronous. Josef tested this and found no
performance regressions.
A few extra changes fall out of this change. Namely, this fix does the
following, with step 2 being the crucial fix:
1. Factor btrfs_caching_ctl_wait_done() out of
btrfs_wait_block_group_cache_done() to allow waiting on a caching_ctl
that we already hold a reference to.
2. Change the call in btrfs_cache_block_group() of
btrfs_wait_space_cache_v1_finished() to
btrfs_caching_ctl_wait_done(), which makes us wait regardless of the
space_cache option.
3. Delete the now unused btrfs_wait_space_cache_v1_finished() and
space_cache_v1_done().
4. Change btrfs_cache_block_group()'s `int load_cache_only` parameter to
`bool wait` to more accurately describe its new meaning.
5. Change a few callers which had a separate call to
btrfs_wait_block_group_cache_done() to use wait = true instead (see
the sketch after this list).
6. Make btrfs_wait_block_group_cache_done() static now that it's not
used outside of block-group.c anymore.
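[A minimal sketch of the caller-side simplification from items 2 and 5, as it lands in the btrfs_trim_fs() hunk below: one call now both starts caching and waits for it to finish, regardless of the space_cache option.]

	if (!btrfs_block_group_done(cache)) {
		/* Caches the block group and waits for completion in one
		 * step; the old separate
		 * btrfs_wait_block_group_cache_done() call is gone. */
		ret = btrfs_cache_block_group(cache, true);
		if (ret) {
			bg_failed++;
			bg_ret = ret;
			continue;
		}
	}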
Fixes: d0c2f4fa555e ("btrfs: make concurrent fsyncs wait less when waiting for a transaction commit")
CC: stable(a)vger.kernel.org # 5.12+
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 993aca2f1e18..e0375ba9d0fe 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -440,39 +440,26 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
btrfs_put_caching_control(caching_ctl);
}
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
+static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
+ return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
+}
+
+static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
- int ret = 0;
+ int ret;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
-
- wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
- if (cache->cached == BTRFS_CACHE_ERROR)
- ret = -EIO;
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
btrfs_put_caching_control(caching_ctl);
return ret;
}
-static bool space_cache_v1_done(struct btrfs_block_group *cache)
-{
- bool ret;
-
- spin_lock(&cache->lock);
- ret = cache->cached != BTRFS_CACHE_FAST;
- spin_unlock(&cache->lock);
-
- return ret;
-}
-
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
- struct btrfs_caching_control *caching_ctl)
-{
- wait_event(caching_ctl->wait, space_cache_v1_done(cache));
-}
-
#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
@@ -750,9 +737,8 @@ done:
btrfs_put_block_group(block_group);
}
-int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
- DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
@@ -785,10 +771,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
}
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
- if (btrfs_test_opt(fs_info, SPACE_CACHE))
- cache->cached = BTRFS_CACHE_FAST;
- else
- cache->cached = BTRFS_CACHE_STARTED;
+ cache->cached = BTRFS_CACHE_STARTED;
cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
@@ -801,8 +784,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
- if (load_cache_only && caching_ctl)
- btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
+ if (wait && caching_ctl)
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
if (caching_ctl)
btrfs_put_caching_control(caching_ctl);
@@ -3312,7 +3295,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, 1);
+ btrfs_cache_block_group(cache, true);
byte_in_group = bytenr - cache->start;
WARN_ON(byte_in_group > cache->length);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 35e0e860cc0b..6b3cdc4cbc41 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -263,9 +263,7 @@ void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
-int btrfs_cache_block_group(struct btrfs_block_group *cache,
- int load_cache_only);
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4edb4bfb2166..9ef162dbd4bc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -505,7 +505,6 @@ struct btrfs_free_cluster {
enum btrfs_caching_type {
BTRFS_CACHE_NO,
BTRFS_CACHE_STARTED,
- BTRFS_CACHE_FAST,
BTRFS_CACHE_FINISHED,
BTRFS_CACHE_ERROR,
};
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ab944d1f94ef..6914cd8024ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2551,17 +2551,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
return -EINVAL;
/*
- * pull in the free space cache (if any) so that our pin
- * removes the free space from the cache. We have load_only set
- * to one because the slow code to read in the free extents does check
- * the pinned extents.
+ * Fully cache the free space first so that our pin removes the free space
+ * from the cache.
*/
- btrfs_cache_block_group(cache, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
- */
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret)
goto out;
@@ -2584,12 +2577,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
if (!block_group)
return -EINVAL;
- btrfs_cache_block_group(block_group, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
- */
- ret = btrfs_wait_block_group_cache_done(block_group);
+ ret = btrfs_cache_block_group(block_group, true);
if (ret)
goto out;
@@ -4399,7 +4387,7 @@ have_block_group:
ffe_ctl->cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl->cached)) {
ffe_ctl->have_caching_bg = true;
- ret = btrfs_cache_block_group(block_group, 0);
+ ret = btrfs_cache_block_group(block_group, false);
/*
* If we get ENOMEM here or something else we want to
@@ -6169,13 +6157,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!btrfs_block_group_done(cache)) {
- ret = btrfs_cache_block_group(cache, 0);
- if (ret) {
- bg_failed++;
- bg_ret = ret;
- continue;
- }
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret) {
bg_failed++;
bg_ret = ret;
The ISO OUT endpoint is enabled when the first usb request is queued in
the transfer ring, and disabled when the controller reports TRBERR.
After a TRBERR, and before the next transfer is added to the transfer
ring, the driver must re-enable the endpoint, but it does not.
To solve this issue, the driver must set the EP_UPDATE_EP_TRBADDR flag
in the priv_ep->flags field while processing the TRBERR event.
Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver")
cc: <stable(a)vger.kernel.org>
Signed-off-by: Pawel Laszczak <pawell(a)cadence.com>
---
drivers/usb/cdns3/cdns3-gadget.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c
index 682ceba25765..fa8263951e63 100644
--- a/drivers/usb/cdns3/cdns3-gadget.c
+++ b/drivers/usb/cdns3/cdns3-gadget.c
@@ -1689,6 +1689,7 @@ static int cdns3_check_ep_interrupt_proceed(struct cdns3_endpoint *priv_ep)
ep_cfg &= ~EP_CFG_ENABLE;
writel(ep_cfg, &priv_dev->regs->ep_cfg);
priv_ep->flags &= ~EP_QUIRK_ISO_OUT_EN;
+ priv_ep->flags |= EP_UPDATE_EP_TRBADDR;
}
cdns3_transfer_completed(priv_dev, priv_ep);
} else if (!(priv_ep->flags & EP_STALLED) &&
--
2.25.1
The TRB_SMM flag indicates that DMA has completed the TD service with
this TRB; usually it is the last TRB in a TD. In the case of an ISOC
transfer with bInterval > 1, each transfer contains more than one TD
associated with the usb request (one TD per ITP). In that case the
TRB_SMM flag will be set in every TD, and the driver will recognize the
end of the transfer after processing the first TD with TRB_SMM set. As a
result the driver stops updating request->actual and returns an
incorrect actual length.
To fix this issue the driver must additionally check TRB_CHAIN, which is
not used for isochronous transfers.
Fixes: 249f0a25e8be ("usb: cdns3: gadget: handle sg list use case at completion correctly")
cc: <stable(a)vger.kernel.org>
Signed-off-by: Pawel Laszczak <pawell(a)cadence.com>
---
drivers/usb/cdns3/cdns3-gadget.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c
index fa8263951e63..a6618a922c61 100644
--- a/drivers/usb/cdns3/cdns3-gadget.c
+++ b/drivers/usb/cdns3/cdns3-gadget.c
@@ -1529,7 +1529,8 @@ static void cdns3_transfer_completed(struct cdns3_device *priv_dev,
TRB_LEN(le32_to_cpu(trb->length));
if (priv_req->num_of_trb > 1 &&
- le32_to_cpu(trb->control) & TRB_SMM)
+ le32_to_cpu(trb->control) & TRB_SMM &&
+ le32_to_cpu(trb->control) & TRB_CHAIN)
transfer_end = true;
cdns3_ep_inc_deq(priv_ep);
--
2.25.1
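[For readability, the resulting completion test from the hunk above, written out as a sketch. Identifiers are as in the driver; the interpretation in the comment follows the commit message.]

	u32 control = le32_to_cpu(trb->control);

	/* TRB_SMM marks the TRB with which DMA completed the TD service.
	 * Isochronous TDs never set TRB_CHAIN, so requiring both flags
	 * keeps a per-ITP TD from ending the request early, while chained
	 * (sg) transfers still complete on the TRB_SMM TRB. */
	if (priv_req->num_of_trb > 1 &&
	    (control & TRB_SMM) && (control & TRB_CHAIN))
		transfer_end = true;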
The quilt patch titled
Subject: mm/damon/dbgfs: avoid duplicate context directory creation
has been removed from the -mm tree. Its filename was
mm-damon-dbgfs-avoid-duplicate-context-directory-creation.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Badari Pulavarty <badari.pulavarty(a)intel.com>
Subject: mm/damon/dbgfs: avoid duplicate context directory creation
Date: Sun, 21 Aug 2022 18:08:53 +0000
When a user tries to create a DAMON context via the DAMON debugfs
interface with the name of an already existing context, the context
directory creation fails, but a new context is nevertheless created and
added to the internal data structure because the directory creation
result is not checked. As a result, memory could leak and DAMON cannot
be turned on. An example test case is as below:
# cd /sys/kernel/debug/damon/
# echo "off" > monitor_on
# echo paddr > target_ids
# echo "abc" > mk_context
# echo "abc" > mk_context
# echo $$ > abc/target_ids
# echo "on" > monitor_on <<< fails
The return value of 'debugfs_create_dir()' is expected to be ignored in
general, but this is an exceptional case: the DAMON feature depends on
the debugfs functionality and has this potential duplicate-name issue.
This commit therefore fixes the issue by checking for directory creation
failure and immediately returning the error in that case.
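[A condensed sketch of the added check, matching the hunk below; debugfs_create_dir() returns an ERR_PTR on failure, -EEXIST in the duplicated-name case.]

	new_dir = debugfs_create_dir(name, root);
	/* Bail out before a new context is allocated and registered, so
	 * a duplicated name can no longer leak memory or leave DAMON
	 * unable to turn on. */
	if (IS_ERR(new_dir))
		return PTR_ERR(new_dir);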
Link: https://lkml.kernel.org/r/20220821180853.2400-1-sj@kernel.org
Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts")
Signed-off-by: Badari Pulavarty <badari.pulavarty(a)intel.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [ 5.15.x]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/dbgfs.c | 3 +++
1 file changed, 3 insertions(+)
--- a/mm/damon/dbgfs.c~mm-damon-dbgfs-avoid-duplicate-context-directory-creation
+++ a/mm/damon/dbgfs.c
@@ -818,6 +818,9 @@ static int dbgfs_mk_context(char *name)
return -ENOENT;
new_dir = debugfs_create_dir(name, root);
+ /* Below check is required for a potential duplicated name case */
+ if (IS_ERR(new_dir))
+ return PTR_ERR(new_dir);
dbgfs_dirs[dbgfs_nr_ctxs] = new_dir;
new_ctx = dbgfs_new_ctx();
_
Patches currently in -mm which might be from badari.pulavarty(a)intel.com are
The quilt patch titled
Subject: asm-generic: sections: refactor memory_intersects
has been removed from the -mm tree. Its filename was
asm-generic-sections-refactor-memory_intersects.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Quanyang Wang <quanyang.wang(a)windriver.com>
Subject: asm-generic: sections: refactor memory_intersects
Date: Fri, 19 Aug 2022 16:11:45 +0800
There are two problems with the current code of memory_intersects:
First, it doesn't check whether the region (begin, end) falls inside the
region (virt, vend), that is (virt < begin && vend > end).
The second problem is that if vend is equal to begin, it will return
true, which is wrong since vend (virt + size) is not the last address of
the memory region; (virt + size - 1) is. For example, with begin =
0x1000, end = 0x2000, virt = 0x800 and size = 0x800, vend equals begin
and the old check returns true even though [0x800, 0x1000) and
[0x1000, 0x2000) do not overlap. This wrong determination triggers
misreporting when check_for_illegal_area() calls memory_intersects() to
check whether the DMA region intersects with the stext region.
The misreporting is as below (stext is at 0x80100000):
WARNING: CPU: 0 PID: 77 at kernel/dma/debug.c:1073 check_for_illegal_area+0x130/0x168
DMA-API: chipidea-usb2 e0002000.usb: device driver maps memory from kernel text or rodata [addr=800f0000] [len=65536]
Modules linked in:
CPU: 1 PID: 77 Comm: usb-storage Not tainted 5.19.0-yocto-standard #5
Hardware name: Xilinx Zynq Platform
unwind_backtrace from show_stack+0x18/0x1c
show_stack from dump_stack_lvl+0x58/0x70
dump_stack_lvl from __warn+0xb0/0x198
__warn from warn_slowpath_fmt+0x80/0xb4
warn_slowpath_fmt from check_for_illegal_area+0x130/0x168
check_for_illegal_area from debug_dma_map_sg+0x94/0x368
debug_dma_map_sg from __dma_map_sg_attrs+0x114/0x128
__dma_map_sg_attrs from dma_map_sg_attrs+0x18/0x24
dma_map_sg_attrs from usb_hcd_map_urb_for_dma+0x250/0x3b4
usb_hcd_map_urb_for_dma from usb_hcd_submit_urb+0x194/0x214
usb_hcd_submit_urb from usb_sg_wait+0xa4/0x118
usb_sg_wait from usb_stor_bulk_transfer_sglist+0xa0/0xec
usb_stor_bulk_transfer_sglist from usb_stor_bulk_srb+0x38/0x70
usb_stor_bulk_srb from usb_stor_Bulk_transport+0x150/0x360
usb_stor_Bulk_transport from usb_stor_invoke_transport+0x38/0x440
usb_stor_invoke_transport from usb_stor_control_thread+0x1e0/0x238
usb_stor_control_thread from kthread+0xf8/0x104
kthread from ret_from_fork+0x14/0x2c
Refactor memory_intersects to fix the two problems above.
Before commit 1d7db834a027e ("dma-debug: use memory_intersects()
directly"), memory_intersects was called only by printk_late_init
(printk_late_init -> init_section_intersects -> memory_intersects), so
there were few callers. Once that commit was merged and
CONFIG_DMA_API_DEBUG is enabled, the DMA subsystem uses
memory_intersects to check for an illegal area, and the calltrace above
is triggered.
[akpm(a)linux-foundation.org: fix nearby comment typo]
Link: https://lkml.kernel.org/r/20220819081145.948016-1-quanyang.wang@windriver.c…
Fixes: 979559362516 ("asm/sections: add helpers to check for section data")
Signed-off-by: Quanyang Wang <quanyang.wang(a)windriver.com>
Cc: Ard Biesheuvel <ardb(a)kernel.org>
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Thierry Reding <treding(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/asm-generic/sections.h | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
--- a/include/asm-generic/sections.h~asm-generic-sections-refactor-memory_intersects
+++ a/include/asm-generic/sections.h
@@ -97,7 +97,7 @@ static inline bool memory_contains(void
/**
* memory_intersects - checks if the region occupied by an object intersects
* with another memory region
- * @begin: virtual address of the beginning of the memory regien
+ * @begin: virtual address of the beginning of the memory region
* @end: virtual address of the end of the memory region
* @virt: virtual address of the memory object
* @size: size of the memory object
@@ -110,7 +110,10 @@ static inline bool memory_intersects(voi
{
void *vend = virt + size;
- return (virt >= begin && virt < end) || (vend >= begin && vend < end);
+ if (virt < end && vend > begin)
+ return true;
+
+ return false;
}
/**
_
Patches currently in -mm which might be from quanyang.wang(a)windriver.com are
The quilt patch titled
Subject: bootmem: remove the vmemmap pages from kmemleak in put_page_bootmem
has been removed from the -mm tree. Its filename was
bootmem-remove-the-vmemmap-pages-from-kmemleak-in-put_page_bootmem.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Liu Shixin <liushixin2(a)huawei.com>
Subject: bootmem: remove the vmemmap pages from kmemleak in put_page_bootmem
Date: Fri, 19 Aug 2022 17:40:05 +0800
The vmemmap pages are marked by kmemleak when allocated from memblock.
Remove them from kmemleak when freeing the page. Otherwise, when we
reuse the page, kmemleak may report an error such as the one below and
then stop working.
kmemleak: Cannot insert 0xffff98fb6eab3d40 into the object search tree (overlaps existing)
kmemleak: Kernel memory leak detector disabled
kmemleak: Object 0xffff98fb6be00000 (size 335544320):
kmemleak: comm "swapper", pid 0, jiffies 4294892296
kmemleak: min_count = 0
kmemleak: count = 0
kmemleak: flags = 0x1
kmemleak: checksum = 0
kmemleak: backtrace:
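[The fix, shown in the hunk below, unregisters the range before the page is recycled; a condensed sketch of the free path with the reasoning as comments.]

	/* In put_page_bootmem(): the vmemmap page came from memblock and
	 * was registered with kmemleak at allocation time. Drop that
	 * record before free_reserved_page() hands the page back, so a
	 * later registration of the same range cannot overlap a stale
	 * object and disable kmemleak. */
	kmemleak_free_part(page_to_virt(page), PAGE_SIZE);
	free_reserved_page(page);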
Link: https://lkml.kernel.org/r/20220819094005.2928241-1-liushixin2@huawei.com
Fixes: f41f2ed43ca5 (mm: hugetlb: free the vmemmap pages associated with each HugeTLB page)
Signed-off-by: Liu Shixin <liushixin2(a)huawei.com>
Reviewed-by: Muchun Song <songmuchun(a)bytedance.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/bootmem_info.c | 2 ++
1 file changed, 2 insertions(+)
--- a/mm/bootmem_info.c~bootmem-remove-the-vmemmap-pages-from-kmemleak-in-put_page_bootmem
+++ a/mm/bootmem_info.c
@@ -12,6 +12,7 @@
#include <linux/memblock.h>
#include <linux/bootmem_info.h>
#include <linux/memory_hotplug.h>
+#include <linux/kmemleak.h>
void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
{
@@ -33,6 +34,7 @@ void put_page_bootmem(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
+ kmemleak_free_part(page_to_virt(page), PAGE_SIZE);
free_reserved_page(page);
}
}
_
Patches currently in -mm which might be from liushixin2(a)huawei.com are
frontswap-skip-frontswap_ops-init-if-zswap-init-failed.patch
frontswap-invoke-ops-init-for-online-swap-device-in-frontswap_register_ops.patch
mm-zswap-replace-zswap_init_started-failed-with-zswap_init_state.patch
mm-zswap-delay-the-initializaton-of-zswap-until-the-first-enablement.patch
mm-zswap-skip-confusing-print-info.patch
The quilt patch titled
Subject: ocfs2: fix freeing uninitialized resource on ocfs2_dlm_shutdown
has been removed from the -mm tree. Its filename was
ocfs2-fix-freeing-uninitialized-resource-on-ocfs2_dlm_shutdown.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Heming Zhao <ocfs2-devel(a)oss.oracle.com>
Subject: ocfs2: fix freeing uninitialized resource on ocfs2_dlm_shutdown
Date: Mon, 15 Aug 2022 16:57:54 +0800
After commit 0737e01de9c4 ("ocfs2: ocfs2_mount_volume does cleanup job
before return error"), a failure of any procedure after ocfs2_dlm_init()
will trigger a crash when calling ocfs2_dlm_shutdown().
I.e.: in local mount mode, no DLM resource is initialized.  If
ocfs2_mount_volume() fails in ocfs2_find_slot(), the error handling
calls ocfs2_dlm_shutdown(), which then performs the DLM resource
cleanup and triggers a kernel crash.
Fix this by bypassing uninitialized resources in ocfs2_dlm_shutdown().
Link: https://lkml.kernel.org/r/20220815085754.20417-1-heming.zhao@suse.com
Fixes: 0737e01de9c4 ("ocfs2: ocfs2_mount_volume does cleanup job before return error")
Signed-off-by: Heming Zhao <heming.zhao(a)suse.com>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/dlmglue.c | 8 +++++---
fs/ocfs2/super.c | 3 +--
2 files changed, 6 insertions(+), 5 deletions(-)
--- a/fs/ocfs2/dlmglue.c~ocfs2-fix-freeing-uninitialized-resource-on-ocfs2_dlm_shutdown
+++ a/fs/ocfs2/dlmglue.c
@@ -3403,10 +3403,12 @@ void ocfs2_dlm_shutdown(struct ocfs2_sup
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
- ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
- osb->cconn = NULL;
+ if (osb->cconn) {
+ ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+ osb->cconn = NULL;
- ocfs2_dlm_shutdown_debug(osb);
+ ocfs2_dlm_shutdown_debug(osb);
+ }
}
static int ocfs2_drop_lock(struct ocfs2_super *osb,
--- a/fs/ocfs2/super.c~ocfs2-fix-freeing-uninitialized-resource-on-ocfs2_dlm_shutdown
+++ a/fs/ocfs2/super.c
@@ -1914,8 +1914,7 @@ static void ocfs2_dismount_volume(struct
!ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
- if (osb->cconn)
- ocfs2_dlm_shutdown(osb, hangup_needed);
+ ocfs2_dlm_shutdown(osb, hangup_needed);
ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
debugfs_remove_recursive(osb->osb_debug_root);
_
Patches currently in -mm which might be from ocfs2-devel(a)oss.oracle.com are
The quilt patch titled
Subject: Revert "memcg: cleanup racy sum avoidance code"
has been removed from the -mm tree. Its filename was
revert-memcg-cleanup-racy-sum-avoidance-code.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Shakeel Butt <shakeelb(a)google.com>
Subject: Revert "memcg: cleanup racy sum avoidance code"
Date: Wed, 17 Aug 2022 17:21:39 +0000
This reverts commit 96e51ccf1af33e82f429a0d6baebba29c6448d0f.
Recently we started running the kernel with the rstat infrastructure on
production traffic and began to see negative memcg stat values.  In
particular, the 'sock' stat is the one we observed having a negative
value.
$ grep "sock " /mnt/memory/job/memory.stat
sock 253952
total_sock 18446744073708724224
Re-run after a couple of seconds:
$ grep "sock " /mnt/memory/job/memory.stat
sock 253952
total_sock 53248
(For reference, 18446744073708724224 is -827392 when interpreted as a
signed 64-bit value, i.e. a small negative count printed as unsigned.)
For now we are only seeing this issue on large machines (256 CPUs) and
only with the 'sock' stat.  I think the networking stack increases the
stat on one CPU and decreases it on another CPU much more often there.
So, this negative sock is due to the rstat flusher flushing the stats on
the CPU that has seen the decrement of sock but missing the CPU that has
the increments.  A typical race condition.
For an easy stable backport, a revert is the simplest solution.  For a
long-term solution, I am thinking of two directions.  The first is to
reduce the race window by optimizing the rstat flusher.  The second is,
if the reader sees a negative stat value, to force a flush and restart
the stat collection.  Basically a retry, but limited.
Link: https://lkml.kernel.org/r/20220817172139.3141101-1-shakeelb@google.com
Fixes: 96e51ccf1af33e8 ("memcg: cleanup racy sum avoidance code")
Signed-off-by: Shakeel Butt <shakeelb(a)google.com>
Cc: "Michal Koutn��" <mkoutny(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Roman Gushchin <roman.gushchin(a)linux.dev>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: Greg Thelen <gthelen(a)google.com>
Cc: <stable(a)vger.kernel.org> [5.15]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/memcontrol.h | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
--- a/include/linux/memcontrol.h~revert-memcg-cleanup-racy-sum-avoidance-code
+++ a/include/linux/memcontrol.h
@@ -987,19 +987,30 @@ static inline void mod_memcg_page_state(
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
- return READ_ONCE(memcg->vmstats.state[idx]);
+ long x = READ_ONCE(memcg->vmstats.state[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
{
struct mem_cgroup_per_node *pn;
+ long x;
if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- return READ_ONCE(pn->lruvec_stats.state[idx]);
+ x = READ_ONCE(pn->lruvec_stats.state[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
}
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
_
Patches currently in -mm which might be from shakeelb(a)google.com are
mm-page_counter-remove-unneeded-atomic-ops-for-low-min.patch
mm-page_counter-rearrange-struct-page_counter-fields.patch
memcg-increase-memcg_charge_batch-to-64.patch
mm-deduplicate-cacheline-padding-code.patch
The quilt patch titled
Subject: writeback: avoid use-after-free after removing device
has been removed from the -mm tree. Its filename was
writeback-avoid-use-after-free-after-removing-device.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Khazhismel Kumykov <khazhy(a)chromium.org>
Subject: writeback: avoid use-after-free after removing device
Date: Mon, 1 Aug 2022 08:50:34 -0700
When a disk is removed, bdi_unregister() gets called to stop further
writeback and wait for associated delayed work to complete.  However,
wb_inode_writeback_end() may schedule the bandwidth estimation dwork
after this has completed, which can result in the timer attempting to
access the just-freed bdi_writeback.
Fix this by checking if the bdi_writeback is alive, similar to when
scheduling writeback work.
Since this requires wb->work_lock, and wb_inode_writeback_end() may get
called from interrupt context, switch wb->work_lock to an irq-safe lock.
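A condensed sketch of the rule being applied (fragments as in the hunks
below): a lock that can be taken in hard-interrupt context must be held
with interrupts disabled by every other acquirer on that CPU, so the
process-context sites use spin_lock_irq() and the site reachable from
interrupt uses spin_lock_irqsave():
	/* Process context: _bh protection is no longer enough. */
	spin_lock_irq(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_irq(&wb->work_lock);
	/* May run in interrupt context: save and restore the irq state. */
	unsigned long flags;
	spin_lock_irqsave(&wb->work_lock, flags);
	if (test_bit(WB_registered, &wb->state))
		queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
	spin_unlock_irqrestore(&wb->work_lock, flags);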
Link: https://lkml.kernel.org/r/20220801155034.3772543-1-khazhy@google.com
Fixes: 45a2966fd641 ("writeback: fix bandwidth estimate for spiky workload")
Signed-off-by: Khazhismel Kumykov <khazhy(a)google.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Cc: Michael Stapelberg <stapelberg+linux(a)google.com>
Cc: Wu Fengguang <fengguang.wu(a)intel.com>
Cc: Alexander Viro <viro(a)zeniv.linux.org.uk>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/fs-writeback.c | 12 ++++++------
mm/backing-dev.c | 10 +++++-----
mm/page-writeback.c | 6 +++++-
3 files changed, 16 insertions(+), 12 deletions(-)
--- a/fs/fs-writeback.c~writeback-avoid-use-after-free-after-removing-device
+++ a/fs/fs-writeback.c
@@ -134,10 +134,10 @@ static bool inode_io_list_move_locked(st
static void wb_wakeup(struct bdi_writeback *wb)
{
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (test_bit(WB_registered, &wb->state))
mod_delayed_work(bdi_wq, &wb->dwork, 0);
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
}
static void finish_writeback_work(struct bdi_writeback *wb,
@@ -164,7 +164,7 @@ static void wb_queue_work(struct bdi_wri
if (work->done)
atomic_inc(&work->done->cnt);
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (test_bit(WB_registered, &wb->state)) {
list_add_tail(&work->list, &wb->work_list);
@@ -172,7 +172,7 @@ static void wb_queue_work(struct bdi_wri
} else
finish_writeback_work(wb, work);
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
}
/**
@@ -2082,13 +2082,13 @@ static struct wb_writeback_work *get_nex
{
struct wb_writeback_work *work = NULL;
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (!list_empty(&wb->work_list)) {
work = list_entry(wb->work_list.next,
struct wb_writeback_work, list);
list_del_init(&work->list);
}
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
return work;
}
--- a/mm/backing-dev.c~writeback-avoid-use-after-free-after-removing-device
+++ a/mm/backing-dev.c
@@ -260,10 +260,10 @@ void wb_wakeup_delayed(struct bdi_writeb
unsigned long timeout;
timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (test_bit(WB_registered, &wb->state))
queue_delayed_work(bdi_wq, &wb->dwork, timeout);
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
}
static void wb_update_bandwidth_workfn(struct work_struct *work)
@@ -334,12 +334,12 @@ static void cgwb_remove_from_bdi_list(st
static void wb_shutdown(struct bdi_writeback *wb)
{
/* Make sure nobody queues further work */
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
return;
}
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
cgwb_remove_from_bdi_list(wb);
/*
--- a/mm/page-writeback.c~writeback-avoid-use-after-free-after-removing-device
+++ a/mm/page-writeback.c
@@ -2892,6 +2892,7 @@ static void wb_inode_writeback_start(str
static void wb_inode_writeback_end(struct bdi_writeback *wb)
{
+ unsigned long flags;
atomic_dec(&wb->writeback_inodes);
/*
* Make sure estimate of writeback throughput gets updated after
@@ -2900,7 +2901,10 @@ static void wb_inode_writeback_end(struc
* that if multiple inodes end writeback at a similar time, they get
* batched into one bandwidth update.
*/
- queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
+ spin_lock_irqsave(&wb->work_lock, flags);
+ if (test_bit(WB_registered, &wb->state))
+ queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
+ spin_unlock_irqrestore(&wb->work_lock, flags);
}
bool __folio_end_writeback(struct folio *folio)
_
Patches currently in -mm which might be from khazhy(a)chromium.org are
The quilt patch titled
Subject: shmem: update folio if shmem_replace_page() updates the page
has been removed from the -mm tree. Its filename was
shmem-update-folio-if-shmem_replace_page-updates-the-page.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Matthew Wilcox (Oracle)" <willy(a)infradead.org>
Subject: shmem: update folio if shmem_replace_page() updates the page
Date: Sat, 30 Jul 2022 05:25:18 +0100
If we allocate a new page, we need to make sure that our folio matches
that new page.
If we do end up in this code path, we store the wrong page in the shmem
inode's page cache, and I would rather imagine that data corruption
ensues.
This will be solved by changing shmem_replace_page() to
shmem_replace_folio(), but this is the minimal fix.
Link: https://lkml.kernel.org/r/20220730042518.1264767-1-willy@infradead.org
Fixes: da08e9b79323 ("mm/shmem: convert shmem_swapin_page() to shmem_swapin_folio()")
Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Reviewed-by: William Kucharski <william.kucharski(a)oracle.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 1 +
1 file changed, 1 insertion(+)
--- a/mm/shmem.c~shmem-update-folio-if-shmem_replace_page-updates-the-page
+++ a/mm/shmem.c
@@ -1782,6 +1782,7 @@ static int shmem_swapin_folio(struct ino
if (shmem_should_replace_folio(folio, gfp)) {
error = shmem_replace_page(&page, gfp, info, index);
+ folio = page_folio(page);
if (error)
goto failed;
}
_
Patches currently in -mm which might be from willy(a)infradead.org are
support-highmem-pages-in-vmap_pages_range_noflush.patch
mm-add-vma-iterator.patch
mmap-use-the-vma-iterator-in-count_vma_pages_range.patch
proc-remove-vma-rbtree-use-from-nommu.patch
arm64-remove-mmap-linked-list-from-vdso.patch
parisc-remove-mmap-linked-list-from-cache-handling.patch
powerpc-remove-mmap-linked-list-walks.patch
s390-remove-vma-linked-list-walks.patch
x86-remove-vma-linked-list-walks.patch
xtensa-remove-vma-linked-list-walks.patch
cxl-remove-vma-linked-list-walk.patch
optee-remove-vma-linked-list-walk.patch
um-remove-vma-linked-list-walk.patch
coredump-remove-vma-linked-list-walk.patch
exec-use-vma-iterator-instead-of-linked-list.patch
fs-proc-task_mmu-stop-using-linked-list-and-highest_vm_end.patch
acct-use-vma-iterator-instead-of-linked-list.patch
perf-use-vma-iterator.patch
sched-use-maple-tree-iterator-to-walk-vmas.patch
fork-use-vma-iterator.patch
mm-khugepaged-stop-using-vma-linked-list.patch
mm-ksm-use-vma-iterators-instead-of-vma-linked-list.patch
mm-mlock-use-vma-iterator-and-maple-state-instead-of-vma-linked-list.patch
mm-pagewalk-use-vma_find-instead-of-vma-linked-list.patch
i915-use-the-vma-iterator.patch
nommu-remove-uses-of-vma-linked-list.patch
The quilt patch titled
Subject: mm/hugetlb: avoid corrupting page->mapping in hugetlb_mcopy_atomic_pte
has been removed from the -mm tree. Its filename was
mm-hugetlb-avoid-corrupting-page-mapping-in-hugetlb_mcopy_atomic_pte.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Miaohe Lin <linmiaohe(a)huawei.com>
Subject: mm/hugetlb: avoid corrupting page->mapping in hugetlb_mcopy_atomic_pte
Date: Tue, 12 Jul 2022 21:05:42 +0800
In the MCOPY_ATOMIC_CONTINUE case with a non-shared VMA, pages in the page
cache are installed in the ptes. But hugepage_add_new_anon_rmap is called
for them mistakenly because they're not vm_shared. This will corrupt the
page->mapping used by page cache code.
Link: https://lkml.kernel.org/r/20220712130542.18836-1-linmiaohe@huawei.com
Fixes: f619147104c8 ("userfaultfd: add UFFDIO_CONTINUE ioctl")
Signed-off-by: Miaohe Lin <linmiaohe(a)huawei.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/hugetlb.c~mm-hugetlb-avoid-corrupting-page-mapping-in-hugetlb_mcopy_atomic_pte
+++ a/mm/hugetlb.c
@@ -6041,7 +6041,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_s
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (vm_shared) {
+ if (page_in_pagecache) {
page_dup_file_rmap(page, true);
} else {
ClearHPageRestoreReserve(page);
_
Patches currently in -mm which might be from linmiaohe(a)huawei.com are
mm-page_alloc-minor-clean-up-for-memmap_init_compound.patch
hugetlb_cgroup-remove-unneeded-nr_pages-0-check.patch
hugetlb_cgroup-hugetlbfs-use-helper-macro-sz_1kmg.patch
hugetlb_cgroup-remove-unneeded-return-value.patch
hugetlb_cgroup-use-helper-macro-numa_no_node.patch
hugetlb_cgroup-use-helper-for_each_hstate-and-hstate_index.patch
mm-hugetlb-fix-incorrect-update-of-max_huge_pages.patch
mm-hugetlb-fix-warn_onkobj-in-sysfs_create_group.patch
mm-hugetlb-fix-missing-call-to-restore_reserve_on_error.patch
mm-hugetlb-fix-missing-call-to-restore_reserve_on_error-v2.patch
mm-hugetlb_vmemmap-add-missing-smp_wmb-before-set_pte_at.patch
mm-hugetlb-fix-sysfs-group-leak-in-hugetlb_unregister_node.patch
mm-hugetlb-make-detecting-shared-pte-more-reliable.patch
mm-hwpoison-fix-page-refcnt-leaking-in-try_memory_failure_hugetlb.patch
mm-hwpoison-fix-page-refcnt-leaking-in-unpoison_memory.patch
mm-hwpoison-fix-extra-put_page-in-soft_offline_page.patch
mm-hwpoison-fix-possible-use-after-free-in-mf_dax_kill_procs-v2.patch
mm-hwpoison-kill-procs-if-unmap-fails.patch
mm-hwpoison-kill-procs-if-unmap-fails-v2.patch
mm-hwpoison-avoid-trying-to-unpoison-reserved-page.patch
During cdrom emulation, the response to read_toc command must contain
the cdrom address as the number of sectors (2048 byte sized blocks)
represented either as an absolute value (when MSF bit is '0') or in
terms of PMin/PSec/PFrame (when MSF bit is set to '1'). In case of
cdrom, the fsg_lun_open() call sets the sector size to 2048 bytes.
When MAC OS sends a read_toc request with MSF set to '1', the
store_cdrom_address assumes that the address being provided is the
LUN size represented in 512 byte sized blocks instead of 2048. It
tries to modify the address further to convert it to 2048 byte sized
blocks and store it in MSF format. This results in data transfer
failures as the cdrom address being provided in the read_toc response
is incorrect.
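As a sketch of the arithmetic (the first half mirrors the fixed function
in the diff below; the seconds/minutes tail is inferred from the
standard MSF layout): with the sector size already 2048 bytes, addr is
already a frame count, so the removed "addr >>= 2" was a spurious second
512-to-2048-byte conversion that shrank every reported address by a
factor of four:
	/* addr is in 2048-byte frames; 75 frames per second, 60 s per minute */
	addr += 2 * 75;		/* lead-in occupies 2 seconds */
	dest[3] = addr % 75;	/* frames */
	addr /= 75;
	dest[2] = addr % 60;	/* seconds */
	addr /= 60;
	dest[1] = addr;		/* minutes */
	dest[0] = 0;		/* reserved */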
Fixes: 3f565a363cee ("usb: gadget: storage: adapt logic block size to bound block devices")
Cc: stable(a)vger.kernel.org
Signed-off-by: Krishna Kurapati <quic_kriskura(a)quicinc.com>
Acked-by: Alan Stern <stern(a)rowland.harvard.edu>
---
drivers/usb/gadget/function/storage_common.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/gadget/function/storage_common.c b/drivers/usb/gadget/function/storage_common.c
index 03035db..208c6a9 100644
--- a/drivers/usb/gadget/function/storage_common.c
+++ b/drivers/usb/gadget/function/storage_common.c
@@ -294,8 +294,10 @@ EXPORT_SYMBOL_GPL(fsg_lun_fsync_sub);
void store_cdrom_address(u8 *dest, int msf, u32 addr)
{
if (msf) {
- /* Convert to Minutes-Seconds-Frames */
- addr >>= 2; /* Convert to 2048-byte frames */
+ /*
+ * Convert to Minutes-Seconds-Frames.
+ * Sector size is already set to 2048 bytes.
+ */
addr += 2*75; /* Lead-in occupies 2 seconds */
dest[3] = addr % 75; /* Frames */
addr /= 75;
--
2.7.4
The quilt patch titled
Subject: mm/migrate_device.c: copy pte dirty bit to page
has been removed from the -mm tree. Its filename was
mm-migrate_devicec-copy-pte-dirty-bit-to-page.patch
This patch was dropped because an updated version will be merged
------------------------------------------------------
From: Alistair Popple <apopple(a)nvidia.com>
Subject: mm/migrate_device.c: copy pte dirty bit to page
Date: Tue, 16 Aug 2022 17:39:24 +1000
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over to
the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings.
That means loss of the dirty bit usually doesn't result in data loss
because these pages are typically not file-backed. However pages may be
backed by swap storage which can result in data loss if an attempt is made
to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to
match what try_to_migrate_one() does.
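In short (fragment matching the hunk below): the pte is cleared and read
back in one atomic step, so hardware cannot mark it dirty after the
value has been sampled, and the dirty state is then transferred to the
folio before the migration entry is installed:
	flush_cache_page(vma, addr, pte_pfn(*ptep));
	pte = ptep_clear_flush(vma, addr, ptep);
	/* Set the dirty flag on the folio now the pte is gone. */
	if (pte_dirty(pte))
		folio_mark_dirty(page_folio(page));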
Link: https://lkml.kernel.org/r/6e77914685ede036c419fa65b6adc27f25a6c3e9.16606350…
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reported-by: Huang Ying <ying.huang(a)intel.com>
Reviewed-by: Huang Ying <ying.huang(a)intel.com>
Cc: Alex Sierra <alex.sierra(a)amd.com>
Cc: Ben Skeggs <bskeggs(a)redhat.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Felix Kuehling <felix.kuehling(a)amd.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Paul Mackerras <paulus(a)ozlabs.org>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/migrate_device.c | 21 ++++++++-------------
1 file changed, 8 insertions(+), 13 deletions(-)
--- a/mm/migrate_device.c~mm-migrate_devicec-copy-pte-dirty-bit-to-page
+++ a/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -61,7 +62,7 @@ static int migrate_vma_collect_pmd(pmd_t
struct migrate_vma *migrate = walk->private;
struct vm_area_struct *vma = walk->vma;
struct mm_struct *mm = vma->vm_mm;
- unsigned long addr = start, unmapped = 0;
+ unsigned long addr = start;
spinlock_t *ptl;
pte_t *ptep;
@@ -193,11 +194,10 @@ again:
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ pte = ptep_clear_flush(vma, addr, ptep);
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
-
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
unlock_page(page);
@@ -205,12 +205,14 @@ again:
mpfn = 0;
goto next;
}
- } else {
- ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
@@ -242,9 +244,6 @@ again:
*/
page_remove_rmap(page, vma, false);
put_page(page);
-
- if (pte_present(pte))
- unmapped++;
} else {
put_page(page);
mpfn = 0;
@@ -257,10 +256,6 @@ next:
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(ptep - 1, ptl);
- /* Only flush the TLB if we actually modified any entries */
- if (unmapped)
- flush_tlb_range(walk->vma, start, end);
-
return 0;
}
_
Patches currently in -mm which might be from apopple(a)nvidia.com are
mm-gupc-simplify-and-fix-check_and_migrate_movable_pages-return-codes.patch
selftests-hmm-tests-add-test-for-dirty-bits.patch
mm-gupc-dont-pass-gup_flags-to-check_and_migrate_movable_pages.patch
mm-gupc-refactor-check_and_migrate_movable_pages.patch
When clearing a PTE the TLB should be flushed whilst still holding the
PTL to avoid a potential race with madvise/munmap/etc. For example
consider the following sequence:
CPU0 CPU1
---- ----
migrate_vma_collect_pmd()
pte_unmap_unlock()
madvise(MADV_DONTNEED)
-> zap_pte_range()
pte_offset_map_lock()
[ PTE not present, TLB not flushed ]
pte_unmap_unlock()
[ page is still accessible via stale TLB ]
flush_tlb_range()
In this case the page may still be accessed via the stale TLB entry
after madvise returns. Fix this by flushing the TLB while holding the
PTL.
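The fix is purely a reordering (fragment, as in the hunk below): the
range flush happens while the page-table lock is still held, so a
concurrent zap_pte_range() can never observe a cleared pte whose stale
TLB entry is still live:
	/* Only flush the TLB if we actually modified any entries */
	if (unmapped)
		flush_tlb_range(walk->vma, start, end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(ptep - 1, ptl);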
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reported-by: Nadav Amit <nadav.amit(a)gmail.com>
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Cc: stable(a)vger.kernel.org
---
Changes for v3:
- New for v3
---
mm/migrate_device.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d..6a5ef9f 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -254,13 +254,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
--
git-series 0.9.1
Hi,
Here's the relevant bit of the build log:
drivers/net/usb/smsc95xx.c: In function ‘smsc95xx_status’:
drivers/net/usb/smsc95xx.c:625:3: error: implicit declaration of function ‘generic_handle_domain_irq’; did you mean ‘generic_handle_irq’? [-Werror=implicit-function-declaration]
  625 |  generic_handle_domain_irq(pdata->irqdomain, PHY_HWIRQ);
      |  ^~~~~~~~~~~~~~~~~~~~~~~~~
      |  generic_handle_irq
drivers/net/usb/smsc95xx.c: In function ‘smsc95xx_bind’:
drivers/net/usb/smsc95xx.c:1136:21: error: implicit declaration of function ‘irq_domain_alloc_named_fwnode’ [-Werror=implicit-function-declaration]
 1136 |  pdata->irqfwnode = irq_domain_alloc_named_fwnode(usb_path);
      |                     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/usb/smsc95xx.c:1136:19: warning: assignment to ‘struct fwnode_handle *’ from ‘int’ makes pointer from integer without a cast [-Wint-conversion]
 1136 |  pdata->irqfwnode = irq_domain_alloc_named_fwnode(usb_path);
      |                   ^
drivers/net/usb/smsc95xx.c:1142:21: error: implicit declaration of function ‘irq_domain_create_linear’ [-Werror=implicit-function-declaration]
 1142 |  pdata->irqdomain = irq_domain_create_linear(pdata->irqfwnode,
      |                     ^~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/usb/smsc95xx.c:1144:12: error: ‘irq_domain_simple_ops’ undeclared (first use in this function); did you mean ‘irq_domain_ops’?
 1144 |            &irq_domain_simple_ops,
      |            ^~~~~~~~~~~~~~~~~~~~~
      |            irq_domain_ops
drivers/net/usb/smsc95xx.c:1144:12: note: each undeclared identifier is reported only once for each function it appears in
drivers/net/usb/smsc95xx.c:1151:12: error: implicit declaration of function ‘irq_create_mapping’; did you mean ‘irq_dispose_mapping’? [-Werror=implicit-function-declaration]
 1151 |  phy_irq = irq_create_mapping(pdata->irqdomain, PHY_HWIRQ);
      |            ^~~~~~~~~~~~~~~~~~
      |            irq_dispose_mapping
drivers/net/usb/smsc95xx.c:1245:2: error: implicit declaration of function ‘irq_domain_remove’ [-Werror=implicit-function-declaration]
 1245 |  irq_domain_remove(pdata->irqdomain);
      |  ^~~~~~~~~~~~~~~~~
drivers/net/usb/smsc95xx.c:1248:2: error: implicit declaration of function ‘irq_domain_free_fwnode’; did you mean ‘irq_domain_get_of_node’? [-Werror=implicit-function-declaration]
 1248 |  irq_domain_free_fwnode(pdata->irqfwnode);
      |  ^~~~~~~~~~~~~~~~~~~~~~
      |  irq_domain_get_of_node
drivers/net/usb/smsc95xx.c: In function ‘smsc95xx_unbind’:
drivers/net/usb/smsc95xx.c:1262:22: error: implicit declaration of function ‘irq_find_mapping’; did you mean ‘irq_dispose_mapping’? [-Werror=implicit-function-declaration]
 1262 |  irq_dispose_mapping(irq_find_mapping(pdata->irqdomain, PHY_HWIRQ));
      |                      ^~~~~~~~~~~~~~~~
      |                      irq_dispose_mapping
The build is for 32-bit x86, the defconfig can be found here:
https://github.com/urjaman/i586con/blob/master/brext/board/linux.config
The build failure also happens with 5.15.62 and 63.
--
Urja Rannikko
From: Eric Biggers <ebiggers(a)google.com>
CRYPTO_LIB_CHACHA_GENERIC doesn't need to select XOR_BLOCKS. It perhaps
was thought that it's needed for __crypto_xor, but that's not the case.
Enabling XOR_BLOCKS is problematic because the XOR_BLOCKS code runs a
benchmark when it is initialized. That causes a boot time regression on
systems that didn't have it enabled before.
Therefore, remove this unnecessary and problematic selection.
Fixes: e56e18985596 ("lib/crypto: add prompts back to crypto libraries")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
I've separated this fix out from the larger patch
https://lore.kernel.org/r/20220725183636.97326-3-ebiggers@kernel.org
that is currently queued in cryptodev.
lib/crypto/Kconfig | 1 -
1 file changed, 1 deletion(-)
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 9ff549f63540fa..47816af9a9d7e1 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -33,7 +33,6 @@ config CRYPTO_ARCH_HAVE_LIB_CHACHA
config CRYPTO_LIB_CHACHA_GENERIC
tristate
- select XOR_BLOCKS
help
This symbol can be depended upon by arch implementations of the
ChaCha library interface that require the generic code as a
base-commit: 4c612826bec1441214816827979b62f84a097e91
--
2.37.2
The error exit of privcmd_ioctl_dm_op() is calling unlock_pages()
potentially with pages being NULL, leading to a NULL dereference.
Additionally lock_pages() doesn't check for pin_user_pages_fast()
having been completely successful, resulting in potentially not
locking all pages into memory. This could result in sporadic failures
when using the related memory in user mode.
Fix all of that by calling unlock_pages() always with the real number
of pinned pages, which will be zero in case pages is NULL, and by
checking that the number of pages pinned by pin_user_pages_fast()
matches the expected number of pages.
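The resulting loop shape (a simplified sketch of the patch below;
pages_for() and user_ptr() are illustrative stand-ins for the
DIV_ROUND_UP and cast expressions of the real code) retries the same
buffer at a page offset after a partial pin and only advances to the
next buffer once it is fully pinned:
	for (i = 0; i < num; ) {
		/* pages_for()/user_ptr(): hypothetical helpers, see above */
		unsigned int requested = pages_for(&kbufs[i]) - off;
		int count = pin_user_pages_fast(user_ptr(&kbufs[i]) + off * PAGE_SIZE,
						requested, FOLL_WRITE, pages);
		if (count <= 0)
			return count ? : -EFAULT;	/* zero pages pinned */
		*pinned += count;
		pages += count;
		/* Partial pin: stay on buffer i, continue at the new offset. */
		off = (requested == count) ? 0 : off + count;
		i += !off;
	}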
Cc: <stable(a)vger.kernel.org>
Fixes: ab520be8cd5d ("xen/privcmd: Add IOCTL_PRIVCMD_DM_OP")
Reported-by: Rustam Subkhankulov <subkhankulov(a)ispras.ru>
Signed-off-by: Juergen Gross <jgross(a)suse.com>
---
V2:
- use "pinned" as parameter for unlock_pages() (Jan Beulich)
- drop label "unlock" again (Jan Beulich)
- add check for complete success of pin_user_pages_fast()
V3:
- continue after partial success of pin_user_pages_fast() (Jan Beulich)
V4:
- fix case of multiple partial successes for one buffer (Jan Beulich)
---
drivers/xen/privcmd.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 3369734108af..e88e8f6f0a33 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -581,27 +581,30 @@ static int lock_pages(
struct privcmd_dm_op_buf kbufs[], unsigned int num,
struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
- unsigned int i;
+ unsigned int i, off = 0;
- for (i = 0; i < num; i++) {
+ for (i = 0; i < num; ) {
unsigned int requested;
int page_count;
requested = DIV_ROUND_UP(
offset_in_page(kbufs[i].uptr) + kbufs[i].size,
- PAGE_SIZE);
+ PAGE_SIZE) - off;
if (requested > nr_pages)
return -ENOSPC;
page_count = pin_user_pages_fast(
- (unsigned long) kbufs[i].uptr,
+ (unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
requested, FOLL_WRITE, pages);
- if (page_count < 0)
- return page_count;
+ if (page_count <= 0)
+ return page_count ? : -EFAULT;
*pinned += page_count;
nr_pages -= page_count;
pages += page_count;
+
+ off = (requested == page_count) ? 0 : off + page_count;
+ i += !off;
}
return 0;
@@ -677,10 +680,8 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
}
rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
- if (rc < 0) {
- nr_pages = pinned;
+ if (rc < 0)
goto out;
- }
for (i = 0; i < kdata.num; i++) {
set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
@@ -692,7 +693,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
xen_preemptible_hcall_end();
out:
- unlock_pages(pages, nr_pages);
+ unlock_pages(pages, pinned);
kfree(xbufs);
kfree(pages);
kfree(kbufs);
--
2.35.3
Detailed description: when testing with selftests/net/icmp_redirect.sh, a redirect exception FAIL occurred for IPv4, which is not in line with expectations.
r1 changes the route to the destination network 172.16.2.0/24 from 10.1.1.2 to 172.16.1.254. After h1 sends the ping packet, h1 continues to obtain the route to 172.16.2.2, and the result is not as expected.
This flaw was introduced by commit 747c14307214b55dbd8250e1ab44cad8305756f1; when this commit is reverted, the test passes.
bug commit: 747c14307214b55dbd8250e1ab44cad8305756f1
Anolis bugzilla link: https://bugzilla.openanolis.cn/show_bug.cgi?id=1843
uname -mi: aarch64 aarch64
uname -r: 6.0.0-rc2-00159-g4c612826bec1
cat /proc/version: Linux version 6.0.0-rc2-00159-g4c612826bec1 (root@iZbp1abkyymykg1xs81hoeZ) (gcc (GCC) 8.5.0 20210514 (Anolis 8.5.0-10.0.1), GNU ld version 2.30-113.0.1.an8)
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# redirect test
#
# .253 +----+
# +----| r1 |
# | +----+
# +----+ | |.1
# | h1 |--------------+ | 10.1.1.0/30 2001:db8:1::0/126
# +----+ .1 | |.2
# 172.16.1/24 | +----+ +----+
# 2001:db8:16:1/64 +----| r2 |-------------------| h2 |
# .254 +----+ .254 .2 +----+
# 172.16.2/24
# 2001:db8:16:2/64
#
# Route from h1 to h2 goes through r1, eth1 - connection between r1 and r2.
# Route on r1 changed to go to r2 via eth0. This causes a redirect to be sent
# from r1 to h1 telling h1 to use r2 when talking to h2.
-------------------
From 747c14307214b55dbd8250e1ab44cad8305756f1 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
Date: Wed, 13 Jul 2022 13:48:52 +0200
Subject: [PATCH] ip: fix dflt addr selection for connected nexthop
When a nexthop is added, without a gw address, the default scope was set
to 'host'. Thus, when a source address is selected, 127.0.0.1 may be chosen
but rejected when the route is used.
When using a route without a nexthop id, the scope can be configured in the
route, thus the problem doesn't exist.
To explain more deeply: when a user creates a nexthop, it cannot specify
the scope. To create it, the function nh_create_ipv4() calls fib_check_nh()
with scope set to 0. fib_check_nh() calls fib_check_nh_nongw() wich was
setting scope to 'host'. Then, nh_create_ipv4() calls
fib_info_update_nhc_saddr() with scope set to 'host'. The src addr is
chosen before the route is inserted.
When a 'standard' route (ie without a reference to a nexthop) is added,
fib_create_info() calls fib_info_update_nhc_saddr() with the scope set by
the user. iproute2 set the scope to 'link' by default.
Here is a way to reproduce the problem:
ip netns add foo
ip -n foo link set lo up
ip netns add bar
ip -n bar link set lo up
sleep 1
ip -n foo link add name eth0 type dummy
ip -n foo link set eth0 up
ip -n foo address add 192.168.0.1/24 dev eth0
ip -n foo link add name veth0 type veth peer name veth1 netns bar
ip -n foo link set veth0 up
ip -n bar link set veth1 up
ip -n bar address add 192.168.1.1/32 dev veth1
ip -n bar route add default dev veth1
ip -n foo nexthop add id 1 dev veth0
ip -n foo route add 192.168.1.1 nhid 1
Try to get/use the route:
> $ ip -n foo route get 192.168.1.1
> RTNETLINK answers: Invalid argument
> $ ip netns exec foo ping -c1 192.168.1.1
> ping: connect: Invalid argument
Try without nexthop group (iproute2 sets scope to 'link' by dflt):
ip -n foo route del 192.168.1.1
ip -n foo route add 192.168.1.1 dev veth0
Try to get/use the route:
> $ ip -n foo route get 192.168.1.1
> 192.168.1.1 dev veth0 src 192.168.0.1 uid 0
> cache
> $ ip netns exec foo ping -c1 192.168.1.1
> PING 192.168.1.1 (192.168.1.1) 56(84) bytes of data.
> 64 bytes from 192.168.1.1: icmp_seq=1 ttl=64 time=0.039 ms
>
> --- 192.168.1.1 ping statistics ---
> 1 packets transmitted, 1 received, 0% packet loss, time 0ms
> rtt min/avg/max/mdev = 0.039/0.039/0.039/0.000 ms
CC: stable(a)vger.kernel.org
Fixes: 597cfe4fc339 ("nexthop: Add support for IPv4 nexthops")
Reported-by: Edwin Brossette <edwin.brossette(a)6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
Link: https://lore.kernel.org/r/20220713114853.29406-1-nicolas.dichtel@6wind.com
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
---
net/ipv4/fib_semantics.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 16dbd5075284..d9fdcbae16ee 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1230,7 +1230,7 @@ static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
nh->fib_nh_dev = in_dev->dev;
dev_hold_track(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
- nh->fib_nh_scope = RT_SCOPE_HOST;
+ nh->fib_nh_scope = RT_SCOPE_LINK;
if (!netif_carrier_ok(nh->fib_nh_dev))
nh->fib_nh_flags |= RTNH_F_LINKDOWN;
err = 0;
--
2.31.1
From: Kyle Huey <me(a)kylehuey.com>
When management of the PKRU register was moved away from XSTATE, emulation
of PKRU's existence in XSTATE was added for APIs that read XSTATE, but not
for APIs that write XSTATE. This can be seen by running gdb and executing
`p $pkru`, `set $pkru = 42`, and `p $pkru`. On affected kernels (5.14+) the
write to the PKRU register (which gdb performs through ptrace) is ignored.
There are three relevant APIs: PTRACE_SETREGSET with NT_X86_XSTATE,
sigreturn, and KVM_SET_XSAVE. KVM_SET_XSAVE has its own special handling to
make PKRU writes take effect (in fpu_copy_uabi_to_guest_fpstate). Push that
down into copy_uabi_to_xstate and have PTRACE_SETREGSET with NT_X86_XSTATE
and sigreturn pass in pointers to the appropriate PKRU value.
This also adds code to initialize the PKRU value to the hardware init value
(namely 0) if the PKRU bit is not set in the XSTATE header to match XRSTOR.
This is a change to the current KVM_SET_XSAVE behavior.
Changelog since v4:
- Selftest additionally checks PKRU readbacks through ptrace.
- Selftest flips all PKRU bits (except the key used for PROT_EXEC).
Changelog since v3:
- The v3 patch is now part 1 of 2.
- Adds a selftest in part 2 of 2.
Changelog since v2:
- Removed now unused variables in fpu_copy_uabi_to_guest_fpstate
Changelog since v1:
- Handles the error case of copy_to_buffer().
Signed-off-by: Kyle Huey <me(a)kylehuey.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Borislav Petkov <bp(a)suse.de>
Cc: kvm(a)vger.kernel.org # For edge case behavior of KVM_SET_XSAVE
Cc: stable(a)vger.kernel.org # 5.14+
Fixes: e84ba47e313d ("x86/fpu: Hook up PKRU into ptrace()")
---
arch/x86/kernel/fpu/core.c | 13 +------------
arch/x86/kernel/fpu/regset.c | 2 +-
arch/x86/kernel/fpu/signal.c | 2 +-
arch/x86/kernel/fpu/xstate.c | 28 +++++++++++++++++++++++-----
arch/x86/kernel/fpu/xstate.h | 4 ++--
5 files changed, 28 insertions(+), 21 deletions(-)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 3b28c5b25e12..46b935bc87c8 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -391,8 +391,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
{
struct fpstate *kstate = gfpu->fpstate;
const union fpregs_state *ustate = buf;
- struct pkru_state *xpkru;
- int ret;
if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
@@ -406,16 +404,7 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
if (ustate->xsave.header.xfeatures & ~xcr0)
return -EINVAL;
- ret = copy_uabi_from_kernel_to_xstate(kstate, ustate);
- if (ret)
- return ret;
-
- /* Retrieve PKRU if not in init state */
- if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) {
- xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU);
- *vpkru = xpkru->pkru;
- }
- return 0;
+ return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
}
EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 75ffaef8c299..6d056b68f4ed 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -167,7 +167,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
}
fpu_force_restore(fpu);
- ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf);
+ ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru);
out:
vfree(tmpbuf);
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 91d4b6de58ab..558076dbde5b 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -396,7 +396,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
fpregs = &fpu->fpstate->regs;
if (use_xsave() && !fx_only) {
- if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx))
+ if (copy_sigframe_from_user_to_xstate(tsk, buf_fx))
return false;
} else {
if (__copy_from_user(&fpregs->fxsave, buf_fx,
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c8340156bfd2..e01d3514ae68 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1197,7 +1197,7 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
- const void __user *ubuf)
+ const void __user *ubuf, u32 *pkru)
{
struct xregs_state *xsave = &fpstate->regs.xsave;
unsigned int offset, size;
@@ -1235,6 +1235,24 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
for (i = 0; i < XFEATURE_MAX; i++) {
mask = BIT_ULL(i);
+ if (i == XFEATURE_PKRU) {
+ /*
+ * Retrieve PKRU if not in init state, otherwise
+ * initialize it.
+ */
+ if (hdr.xfeatures & mask) {
+ struct pkru_state xpkru = {0};
+
+ if (copy_from_buffer(&xpkru, xstate_offsets[i],
+ sizeof(xpkru), kbuf, ubuf))
+ return -EFAULT;
+
+ *pkru = xpkru.pkru;
+ } else {
+ *pkru = 0;
+ }
+ }
+
if (hdr.xfeatures & mask) {
void *dst = __raw_xsave_addr(xsave, i);
@@ -1264,9 +1282,9 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
* Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
* format and copy to the target thread. Used by ptrace and KVM.
*/
-int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
+int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
- return copy_uabi_to_xstate(fpstate, kbuf, NULL);
+ return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}
/*
@@ -1274,10 +1292,10 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
* XSAVE[S] format and copy to the target thread. This is called from the
* sigreturn() and rt_sigreturn() system calls.
*/
-int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
+int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
const void __user *ubuf)
{
- return copy_uabi_to_xstate(fpstate, NULL, ubuf);
+ return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
}
static bool validate_independent_components(u64 mask)
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 5ad47031383b..a4ecb04d8d64 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -46,8 +46,8 @@ extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
u32 pkru_val, enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode mode);
-extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf);
-extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf);
+extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
+extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);
extern void fpu__init_cpu_xstate(void);
--
2.37.1
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over
to the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous
mappings. That means loss of the dirty bit usually doesn't result in
data loss because these pages are typically not file-backed. However
pages may be backed by swap storage which can result in data loss if an
attempt is made to migrate a dirty page that doesn't yet have the
PageDirty flag set.
In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte
to match what try_to_migrate_one() does.
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reported-by: Huang Ying <ying.huang(a)intel.com>
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Cc: stable(a)vger.kernel.org
---
Changes for v2:
- Fixed up Reported-by tag.
- Added Peter's Acked-by.
- Atomically read and clear the pte to prevent the dirty bit getting
set after reading it.
- Added fixes tag
---
mm/migrate_device.c | 21 ++++++++-------------
1 file changed, 8 insertions(+), 13 deletions(-)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d..e2d09e5 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -61,7 +62,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
struct migrate_vma *migrate = walk->private;
struct vm_area_struct *vma = walk->vma;
struct mm_struct *mm = vma->vm_mm;
- unsigned long addr = start, unmapped = 0;
+ unsigned long addr = start;
spinlock_t *ptl;
pte_t *ptep;
@@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ pte = ptep_clear_flush(vma, addr, ptep);
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
-
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
unlock_page(page);
@@ -205,12 +205,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
mpfn = 0;
goto next;
}
- } else {
- ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
@@ -242,9 +244,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
*/
page_remove_rmap(page, vma, false);
put_page(page);
-
- if (pte_present(pte))
- unmapped++;
} else {
put_page(page);
mpfn = 0;
@@ -257,10 +256,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(ptep - 1, ptl);
- /* Only flush the TLB if we actually modified any entries */
- if (unmapped)
- flush_tlb_range(walk->vma, start, end);
-
return 0;
}
base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
--
git-series 0.9.1
device_register() always calls device_initialize() so calling device_del()
is safe even if device_register() fails. Implement the following advice
from the comment block above device_register(): "NOTE: _Never_ directly free
@dev after calling this function, even if it returned an error! Always use
put_device() to give up the reference initialized in this function instead."
Keep the kfree() call in the error path since srp_release_dev() does not
free the host.
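The pattern, condensed (fragment as applied below): after
device_register(), successful or not, the embedded kobject holds the
reference taken by device_initialize(), so every error path funnels
through device_del() plus put_device() instead of unregistering or
freeing directly:
	if (device_register(&host->dev))
		goto put_host;
	if (device_create_file(&host->dev, &dev_attr_add_target))
		goto put_host;
	return host;
put_host:
	device_del(&host->dev);	/* safe even if device_register() failed */
	put_device(&host->dev);	/* drop the device_initialize() reference */
	kfree(host);		/* srp_release_dev() does not free host */
	return NULL;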
Cc: stable(a)vger.kernel.org
Signed-off-by: Bart Van Assche <bvanassche(a)acm.org>
---
drivers/infiniband/ulp/srp/ib_srp.c | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 7720ea270ed8..8fd6a88f7a9c 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -3909,20 +3909,19 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port)
port);
if (device_register(&host->dev))
- goto free_host;
+ goto put_host;
if (device_create_file(&host->dev, &dev_attr_add_target))
- goto err_class;
+ goto put_host;
if (device_create_file(&host->dev, &dev_attr_ibdev))
- goto err_class;
+ goto put_host;
if (device_create_file(&host->dev, &dev_attr_port))
- goto err_class;
+ goto put_host;
return host;
-err_class:
- device_unregister(&host->dev);
-
-free_host:
+put_host:
+ device_del(&host->dev);
+ put_device(&host->dev);
kfree(host);
return NULL;
From: Dmitry Osipenko <dmitry.osipenko(a)collabora.com>
The busy status bit may never de-assert if the number of programmed skip
bits is incorrect, resulting in a kernel hang because the bit is polled
endlessly in the code. Fix it by adding a timeout to the bit polling.
This problem is reproducible by setting the data_bit_offset field of
the HEVC slice params to a wrong value by userspace.
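Sketched before/after (the bounded wait uses the driver's existing
cedrus_wait_for() helper, as in the diff below):
	/* Before: spins forever if the busy bit never de-asserts */
	while (cedrus_read(dev, VE_DEC_H265_STATUS) & VE_DEC_H265_STATUS_VLD_BUSY)
		udelay(1);
	/* After: bounded wait; report the error instead of hanging */
	if (cedrus_wait_for(dev, VE_DEC_H265_STATUS, VE_DEC_H265_STATUS_VLD_BUSY))
		dev_err_ratelimited(dev->dev, "timed out waiting to skip bits\n");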
Cc: stable(a)vger.kernel.org
Reported-by: Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
Signed-off-by: Dmitry Osipenko <dmitry.osipenko(a)collabora.com>
Signed-off-by: Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
---
drivers/staging/media/sunxi/cedrus/cedrus_h265.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_h265.c b/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
index f703c585d91c5..f0bc118021b0a 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_h265.c
@@ -227,6 +227,7 @@ static void cedrus_h265_pred_weight_write(struct cedrus_dev *dev,
static void cedrus_h265_skip_bits(struct cedrus_dev *dev, int num)
{
int count = 0;
+ u32 reg;
while (count < num) {
int tmp = min(num - count, 32);
@@ -234,8 +235,9 @@ static void cedrus_h265_skip_bits(struct cedrus_dev *dev, int num)
cedrus_write(dev, VE_DEC_H265_TRIGGER,
VE_DEC_H265_TRIGGER_FLUSH_BITS |
VE_DEC_H265_TRIGGER_TYPE_N_BITS(tmp));
- while (cedrus_read(dev, VE_DEC_H265_STATUS) & VE_DEC_H265_STATUS_VLD_BUSY)
- udelay(1);
+
+ if (cedrus_wait_for(dev, VE_DEC_H265_STATUS, VE_DEC_H265_STATUS_VLD_BUSY))
+ dev_err_ratelimited(dev->dev, "timed out waiting to skip bits\n");
count += tmp;
}
--
2.37.2
The watchdog needs to be scheduled before we trigger the decode
operation, otherwise there is a risk that the decoder IRQ will be
called before we have scheduled the watchdog. As a side effect, the
watchdog would never be cancelled and its function would be called
at an inappropriate time.
This was observed while running Fluster with GStreamer as a backend.
Some programming error would cause the decoder IRQ to be called very
quickly after the trigger. Later calls into the driver would deadlock
due to the unbalanced state.
Cc: stable(a)vger.kernel.org
Fixes: 7c38a551bda1 ("media: cedrus: Add watchdog for job completion")
Signed-off-by: Nicolas Dufresne <nicolas.dufresne(a)collabora.com>
---
drivers/staging/media/sunxi/cedrus/cedrus_dec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
index 3b6aa78a2985f..e7f7602a5ab40 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
@@ -106,11 +106,11 @@ void cedrus_device_run(void *priv)
/* Trigger decoding if setup went well, bail out otherwise. */
if (!error) {
- dev->dec_ops[ctx->current_codec]->trigger(ctx);
-
/* Start the watchdog timer. */
schedule_delayed_work(&dev->watchdog_work,
msecs_to_jiffies(2000));
+
+ dev->dec_ops[ctx->current_codec]->trigger(ctx);
} else {
v4l2_m2m_buf_done_and_job_finish(ctx->dev->m2m_dev,
ctx->fh.m2m_ctx,
--
2.37.2
From: Dongliang Mu <mudongliangabcd(a)gmail.com>
In alloc_inode(), inode_init_always() can return -ENOMEM if
security_inode_alloc() fails, which leaves inode->i_private
uninitialized. nilfs_is_metadata_file_inode() then returns
true and nilfs_free_inode() wrongly calls nilfs_mdt_destroy(),
which frees the uninitialized inode->i_private
and leads to crashes (e.g., UAF/GPF).
Fix this by moving security_inode_alloc() just prior to
this_cpu_inc(nr_inodes), so a failure can no longer leave a
partially initialized inode behind.
Link: https://lkml.kernel.org/r/CAFcO6XOcf1Jj2SeGt=jJV59wmhESeSKpfR0omdFRq+J9nD1v…
Reported-by: butt3rflyh4ck <butterflyhuangxx(a)gmail.com>
Reported-by: Hao Sun <sunhao.th(a)gmail.com>
Reported-by: Jiacheng Xu <stitch(a)zju.edu.cn>
Signed-off-by: Dongliang Mu <mudongliangabcd(a)gmail.com>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: stable(a)vger.kernel.org
---
v1->v2: move security_inode_alloc at the very end according to Al Viro
other than initializing i_private before security_inode_alloc.
fs/inode.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 6462276dfdf0..49d1eb91728c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -192,8 +192,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_wb_frn_history = 0;
#endif
- if (security_inode_alloc(inode))
- goto out;
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -228,6 +226,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fsnotify_mask = 0;
#endif
inode->i_flctx = NULL;
+
+ if (security_inode_alloc(inode))
+ goto out;
this_cpu_inc(nr_inodes);
return 0;
--
2.35.1
After the xHC controller is started, either in probe or resume, it can take
a while before any of the connected usb devices are visible to the roothub
due to link training.
It's possible the xhci driver loads, sees no activity and suspends the host
before the USB device is visible.
In one testcase with a hotplugged xHC controller the host finally detected
the connected USB device and generated a wake 500ms after the host's
initial start.
If the host didn't suspend the device during training it probably wouldn't
take up to 500ms to detect it, but looking at the specs reveals USB3 link
training has a couple of long timeout values, such as the 120ms
RxDetectQuietTimeout and the 360ms PollingLFPSTimeout.
So add a 500ms grace period that keeps polling the roothub for 500ms after
start, preventing runtime suspend until USB devices are detected.
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci-hub.c | 11 +++++++++++
drivers/usb/host/xhci.c | 4 +++-
drivers/usb/host/xhci.h | 2 +-
3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index 0fdc014c9401..b30298986a69 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c
@@ -1648,6 +1648,17 @@ int xhci_hub_status_data(struct usb_hcd *hcd, char *buf)
status = bus_state->resuming_ports;
+ /*
+ * SS devices are only visible to roothub after link training completes.
+ * Keep polling roothubs for a grace period after xHC start
+ */
+ if (xhci->run_graceperiod) {
+ if (time_before(jiffies, xhci->run_graceperiod))
+ status = 1;
+ else
+ xhci->run_graceperiod = 0;
+ }
+
mask = PORT_CSC | PORT_PEC | PORT_OCC | PORT_PLC | PORT_WRC | PORT_CEC;
/* For each port, did anything change? If so, set that bit in buf. */
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 65858f607437..1afd32beec99 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -151,9 +151,11 @@ int xhci_start(struct xhci_hcd *xhci)
xhci_err(xhci, "Host took too long to start, "
"waited %u microseconds.\n",
XHCI_MAX_HALT_USEC);
- if (!ret)
+ if (!ret) {
/* clear state flags. Including dying, halted or removing */
xhci->xhc_state = 0;
+ xhci->run_graceperiod = jiffies + msecs_to_jiffies(500);
+ }
return ret;
}
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 1960b47acfb2..df6f2ebaff18 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1826,7 +1826,7 @@ struct xhci_hcd {
/* Host controller watchdog timer structures */
unsigned int xhc_state;
-
+ unsigned long run_graceperiod;
u32 command;
struct s3_save s3;
/* Host controller is dying - not responding to commands. "I'm not dead yet!"
--
2.25.1
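The time_before() check above is wraparound-safe because jiffies
comparisons are done on the signed difference rather than on the raw
counter values. A small self-contained illustration (userspace code
mirroring the kernel macro from include/linux/jiffies.h):
#include <stdio.h>

/* Mirrors the kernel: time_before(a, b) == time_after(b, a). */
#define time_before(a, b) ((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long now = (unsigned long)-100;   /* counter about to wrap */
	unsigned long deadline = now + 500;        /* wraps past zero */

	/* A naive "now < deadline" is false across the wrap; the */
	/* signed-difference form still orders the two correctly. */
	printf("naive: %d, wrap-safe: %d\n",
	       now < deadline, time_before(now, deadline));
	return 0;
}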
When a CFS task that was boosted by a SCHED_DEADLINE
task boosts another CFS task (nested priority inheritance),
a kernel panic is observed.
Fixing priority inheritance changes how sched_deadline
attributes are inherited from the original donor task.
Additional supporting patches are added to fix throttling of
boosted tasks.
Daniel Bristot de Oliveira (1):
sched/deadline: Unthrottle PI boosted threads while enqueuing
Lucas Stach (1):
sched/deadline: Fix stale throttling on de-/boosted tasks
Juri Lelli (1):
sched/deadline: Fix priority inheritance with multiple scheduling
classes
Hui Su (1):
kernel/sched: Remove dl_boosted flag comment
include/linux/sched.h | 13 ++--
kernel/sched/core.c | 11 ++--
kernel/sched/deadline.c | 131 +++++++++++++++++++++++++---------------
3 files changed, 96 insertions(+), 59 deletions(-)
--
2.34.1
The error exit of privcmd_ioctl_dm_op() can call unlock_pages()
with pages being NULL, leading to a NULL dereference.
Additionally, lock_pages() doesn't check whether pin_user_pages_fast()
was completely successful, potentially resulting in not
locking all pages into memory. This could cause sporadic failures
when the related memory is used in user mode.
Fix all of that by always calling unlock_pages() with the real number
of pinned pages, which will be zero in case pages is NULL, and by
checking that the number of pages pinned by pin_user_pages_fast()
matches the expected number of pages.
Cc: <stable(a)vger.kernel.org>
Fixes: ab520be8cd5d ("xen/privcmd: Add IOCTL_PRIVCMD_DM_OP")
Reported-by: Rustam Subkhankulov <subkhankulov(a)ispras.ru>
Signed-off-by: Juergen Gross <jgross(a)suse.com>
---
V2:
- use "pinned" as parameter for unlock_pages() (Jan Beulich)
- drop label "unlock" again (Jan Beulich)
- add check for complete success of pin_user_pages_fast()
V3:
- continue after partial success of pin_user_pages_fast() (Jan Beulich)
---
drivers/xen/privcmd.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 3369734108af..1ca7e3ea6fd4 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -581,7 +581,7 @@ static int lock_pages(
struct privcmd_dm_op_buf kbufs[], unsigned int num,
struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
{
- unsigned int i;
+ unsigned int i, off = 0;
for (i = 0; i < num; i++) {
unsigned int requested;
@@ -589,19 +589,23 @@ static int lock_pages(
requested = DIV_ROUND_UP(
offset_in_page(kbufs[i].uptr) + kbufs[i].size,
- PAGE_SIZE);
+ PAGE_SIZE) - off;
if (requested > nr_pages)
return -ENOSPC;
page_count = pin_user_pages_fast(
- (unsigned long) kbufs[i].uptr,
+ (unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
requested, FOLL_WRITE, pages);
- if (page_count < 0)
- return page_count;
+ if (page_count <= 0)
+ return page_count ? : -EFAULT;
*pinned += page_count;
nr_pages -= page_count;
pages += page_count;
+
+ off = requested - page_count;
+ if (off)
+ i--;
}
return 0;
@@ -677,10 +681,8 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
}
rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
- if (rc < 0) {
- nr_pages = pinned;
+ if (rc < 0)
goto out;
- }
for (i = 0; i < kdata.num; i++) {
set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
@@ -692,7 +694,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
xen_preemptible_hcall_end();
out:
- unlock_pages(pages, nr_pages);
+ unlock_pages(pages, pinned);
kfree(xbufs);
kfree(pages);
kfree(kbufs);
--
2.35.3
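The core of the V3 change, as a simplified sketch (illustrative only;
the real lock_pages() also carries per-buffer offsets across the
kbufs[] entries): keep pinning the not-yet-covered tail of a buffer
until it is fully pinned, and account every page so the cleanup path
can unpin exactly that many.
static int pin_all_sketch(unsigned long start, unsigned int total,
			  struct page **pages, unsigned int *pinned)
{
	unsigned int done = 0;

	while (done < total) {
		int got = pin_user_pages_fast(start + done * PAGE_SIZE,
					      total - done, FOLL_WRITE,
					      pages + done);
		/* Exact reason for a zero pin isn't known, EFAULT is one possibility. */
		if (got <= 0)
			return got ? : -EFAULT;
		done += got;
		*pinned += got;
	}
	return 0;
}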
Hi Greg,
This 5.10.y backport series contains fixes from v5.17 release.
All the patches in this series have already been applied to 5.15.y
except for patch 2 which you have queued to 5.15.y yesterday [1].
Yesterday's 5.15.y series contains mostly fixes from v5.18/v5.19.
The applicable fixes from that series will be included in the next
5.10.y series.
Thanks,
Amir.
Changes from [v1]:
- Added Acked-by Darrick
- CC stable
[1] https://lore.kernel.org/linux-xfs/YwSADLkBPNe7hZQs@kroah.com/
[v1] https://lore.kernel.org/linux-xfs/20220822162802.1661512-1-amir73il@gmail.c…
Christoph Hellwig (1):
fs: remove __sync_filesystem
Dan Carpenter (1):
xfs: prevent a WARN_ONCE() in xfs_ioc_attr_list()
Darrick J. Wong (4):
xfs: reject crazy array sizes being fed to XFS_IOC_GETBMAP*
vfs: make sync_filesystem return errors from ->sync_fs
xfs: return errors in xfs_fs_sync_fs
xfs: only bother with sync_filesystem during readonly remount
fs/sync.c | 48 ++++++++++++++++++++++++----------------------
fs/xfs/xfs_ioctl.c | 4 ++--
fs/xfs/xfs_ioctl.h | 5 +++--
fs/xfs/xfs_super.c | 13 ++++++++++---
4 files changed, 40 insertions(+), 30 deletions(-)
--
2.25.1
Hi Greg,
stable kernels 5.18 and 5.15 seem to be missing upstream patch
c64cc2802a78 ("x86/entry: Move CLD to the start of the idtentry macro").
This is a prerequisite patch for 64cbd0acb582 ("x86/entry: Don't call
error_entry() for XENPV"), which is included in 5.15.y and 5.18.y.
Could you please take c64cc2802a78 for 5.15 and 5.18?
Juergen
Rajesh reports [1] that the test_align BPF selftest is broken in
5.4.210. Three patches were added since 5.4.209:
(A) 7c1134c7da99 ("bpf: Verifer, adjust_scalar_min_max_vals to always call update_reg_bounds()")
(B) 6a9b3f0f3bad ("selftests/bpf: Fix test_align verifier log patterns")
(C) 6098562ed9df ("selftests/bpf: Fix "dubious pointer arithmetic" test")
(A) fixes an issue in the BPF verifier, which changes the verifier trace
output. (B) fixes those trace changes in the selftests.
Unfortunately (B) also addresses changes to the verifier output from other
patches that weren't backported to v5.4, so the test now fails.
(C) also addresses a different verifier change that is not in v5.4.
Therefore revert (C), and partially revert (B).
[1] https://lore.kernel.org/all/CAPXMrf-C5XEUfOJd3GCtgtHOkc8DxDGbLxE5=GFmr+Py0z…
Jean-Philippe Brucker (2):
Revert "selftests/bpf: Fix "dubious pointer arithmetic" test"
Revert "selftests/bpf: Fix test_align verifier log patterns"
tools/testing/selftests/bpf/test_align.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
--
2.37.1
The error exit of privcmd_ioctl_dm_op() can call unlock_pages()
with pages being NULL, leading to a NULL dereference.
Additionally, lock_pages() doesn't check whether pin_user_pages_fast()
was completely successful, potentially resulting in not
locking all pages into memory. This could cause sporadic failures
when the related memory is used in user mode.
Fix all of that by always calling unlock_pages() with the real number
of pinned pages, which will be zero in case pages is NULL, and by
checking that the number of pages pinned by pin_user_pages_fast()
matches the expected number of pages.
Cc: <stable(a)vger.kernel.org>
Fixes: ab520be8cd5d ("xen/privcmd: Add IOCTL_PRIVCMD_DM_OP")
Reported-by: Rustam Subkhankulov <subkhankulov(a)ispras.ru>
Signed-off-by: Juergen Gross <jgross(a)suse.com>
---
V2:
- use "pinned" as parameter for unlock_pages() (Jan Beulich)
- drop label "unlock" again (Jan Beulich)
- add check for complete success of pin_user_pages_fast()
---
drivers/xen/privcmd.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 3369734108af..7dc62510635e 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -602,6 +602,10 @@ static int lock_pages(
*pinned += page_count;
nr_pages -= page_count;
pages += page_count;
+
+ /* Exact reason isn't known, EFAULT is one possibility. */
+ if (page_count < requested)
+ return -EFAULT;
}
return 0;
@@ -677,10 +681,8 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
}
rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
- if (rc < 0) {
- nr_pages = pinned;
+ if (rc < 0)
goto out;
- }
for (i = 0; i < kdata.num; i++) {
set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
@@ -692,7 +694,7 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
xen_preemptible_hcall_end();
out:
- unlock_pages(pages, nr_pages);
+ unlock_pages(pages, pinned);
kfree(xbufs);
kfree(pages);
kfree(kbufs);
--
2.35.3
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f96f7a40874d7c746680c0b9f57cef2262ae551f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david(a)redhat.com>
Date: Thu, 11 Aug 2022 12:34:34 +0200
Subject: [PATCH] mm/hugetlb: fix hugetlb not supporting softdirty tracking
Patch series "mm/hugetlb: fix write-fault handling for shared mappings", v2.
I observed that hugetlb does not support/expect write-faults in shared
mappings that would have to map the R/O-mapped page writable -- and I
found two cases where we could currently get such faults and would
erroneously map an anon page into a shared mapping.
Reproducers are part of the patches.
I propose to backport both fixes to stable trees. The first fix needs a
small adjustment.
This patch (of 2):
Staring at hugetlb_wp(), one might wonder where all the logic for shared
mappings is when stumbling over a write-protected page in a shared
mapping. In fact, there is none, and so far we thought we could get away
with that because e.g., mprotect() should always do the right thing and
map all pages directly writable.
Looks like we were wrong:
--------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#define HUGETLB_SIZE (2 * 1024 * 1024u)
static void clear_softdirty(void)
{
int fd = open("/proc/self/clear_refs", O_WRONLY);
const char *ctrl = "4";
int ret;
if (fd < 0) {
fprintf(stderr, "open(clear_refs) failed\n");
exit(1);
}
ret = write(fd, ctrl, strlen(ctrl));
if (ret != strlen(ctrl)) {
fprintf(stderr, "write(clear_refs) failed\n");
exit(1);
}
close(fd);
}
int main(int argc, char **argv)
{
char *map;
int fd;
fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
if (!fd) {
fprintf(stderr, "open() failed\n");
return -errno;
}
if (ftruncate(fd, HUGETLB_SIZE)) {
fprintf(stderr, "ftruncate() failed\n");
return -errno;
}
map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return -errno;
}
*map = 0;
if (mprotect(map, HUGETLB_SIZE, PROT_READ)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
clear_softdirty();
if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
*map = 0;
return 0;
}
--------------------------------------------------------------------------
Above test fails with SIGBUS when there is only a single free hugetlb page.
# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
Bus error (core dumped)
And worse, with sufficient free hugetlb pages it will map an anonymous page
into a shared mapping, for example, messing up accounting during unmap
and breaking MAP_SHARED semantics:
# echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
# cat /proc/meminfo | grep HugePages_
HugePages_Total: 2
HugePages_Free: 1
HugePages_Rsvd: 18446744073709551615
HugePages_Surp: 0
Reason in this particular case is that vma_wants_writenotify() will
return "true", removing VM_SHARED in vma_set_page_prot() to map pages
write-protected. Let's teach vma_wants_writenotify() that hugetlb does not
support softdirty tracking.
Link: https://lkml.kernel.org/r/20220811103435.188481-1-david@redhat.com
Link: https://lkml.kernel.org/r/20220811103435.188481-2-david@redhat.com
Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Peter Feiner <pfeiner(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Cyrill Gorcunov <gorcunov(a)openvz.org>
Cc: Pavel Emelyanov <xemul(a)parallels.com>
Cc: Jamie Liu <jamieliu(a)google.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [3.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index c035020d0c89..9d780f415be3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (vma_soft_dirty_enabled(vma))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f96f7a40874d7c746680c0b9f57cef2262ae551f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david(a)redhat.com>
Date: Thu, 11 Aug 2022 12:34:34 +0200
Subject: [PATCH] mm/hugetlb: fix hugetlb not supporting softdirty tracking
Patch series "mm/hugetlb: fix write-fault handling for shared mappings", v2.
I observed that hugetlb does not support/expect write-faults in shared
mappings that would have to map the R/O-mapped page writable -- and I
found two cases where we could currently get such faults and would
erroneously map an anon page into a shared mapping.
Reproducers are part of the patches.
I propose to backport both fixes to stable trees. The first fix needs a
small adjustment.
This patch (of 2):
Staring at hugetlb_wp(), one might wonder where all the logic for shared
mappings is when stumbling over a write-protected page in a shared
mapping. In fact, there is none, and so far we thought we could get away
with that because e.g., mprotect() should always do the right thing and
map all pages directly writable.
Looks like we were wrong:
--------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#define HUGETLB_SIZE (2 * 1024 * 1024u)
static void clear_softdirty(void)
{
int fd = open("/proc/self/clear_refs", O_WRONLY);
const char *ctrl = "4";
int ret;
if (fd < 0) {
fprintf(stderr, "open(clear_refs) failed\n");
exit(1);
}
ret = write(fd, ctrl, strlen(ctrl));
if (ret != strlen(ctrl)) {
fprintf(stderr, "write(clear_refs) failed\n");
exit(1);
}
close(fd);
}
int main(int argc, char **argv)
{
char *map;
int fd;
fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
if (!fd) {
fprintf(stderr, "open() failed\n");
return -errno;
}
if (ftruncate(fd, HUGETLB_SIZE)) {
fprintf(stderr, "ftruncate() failed\n");
return -errno;
}
map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return -errno;
}
*map = 0;
if (mprotect(map, HUGETLB_SIZE, PROT_READ)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
clear_softdirty();
if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
*map = 0;
return 0;
}
--------------------------------------------------------------------------
Above test fails with SIGBUS when there is only a single free hugetlb page.
# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
Bus error (core dumped)
And worse, with sufficient free hugetlb pages it will map an anonymous page
into a shared mapping, for example, messing up accounting during unmap
and breaking MAP_SHARED semantics:
# echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
# cat /proc/meminfo | grep HugePages_
HugePages_Total: 2
HugePages_Free: 1
HugePages_Rsvd: 18446744073709551615
HugePages_Surp: 0
Reason in this particular case is that vma_wants_writenotify() will
return "true", removing VM_SHARED in vma_set_page_prot() to map pages
write-protected. Let's teach vma_wants_writenotify() that hugetlb does not
support softdirty tracking.
Link: https://lkml.kernel.org/r/20220811103435.188481-1-david@redhat.com
Link: https://lkml.kernel.org/r/20220811103435.188481-2-david@redhat.com
Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Peter Feiner <pfeiner(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Cyrill Gorcunov <gorcunov(a)openvz.org>
Cc: Pavel Emelyanov <xemul(a)parallels.com>
Cc: Jamie Liu <jamieliu(a)google.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [3.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index c035020d0c89..9d780f415be3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (vma_soft_dirty_enabled(vma))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f96f7a40874d7c746680c0b9f57cef2262ae551f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david(a)redhat.com>
Date: Thu, 11 Aug 2022 12:34:34 +0200
Subject: [PATCH] mm/hugetlb: fix hugetlb not supporting softdirty tracking
Patch series "mm/hugetlb: fix write-fault handling for shared mappings", v2.
I observed that hugetlb does not support/expect write-faults in shared
mappings that would have to map the R/O-mapped page writable -- and I
found two cases where we could currently get such faults and would
erroneously map an anon page into a shared mapping.
Reproducers are part of the patches.
I propose to backport both fixes to stable trees. The first fix needs a
small adjustment.
This patch (of 2):
Staring at hugetlb_wp(), one might wonder where all the logic for shared
mappings is when stumbling over a write-protected page in a shared
mapping. In fact, there is none, and so far we thought we could get away
with that because e.g., mprotect() should always do the right thing and
map all pages directly writable.
Looks like we were wrong:
--------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#define HUGETLB_SIZE (2 * 1024 * 1024u)
static void clear_softdirty(void)
{
int fd = open("/proc/self/clear_refs", O_WRONLY);
const char *ctrl = "4";
int ret;
if (fd < 0) {
fprintf(stderr, "open(clear_refs) failed\n");
exit(1);
}
ret = write(fd, ctrl, strlen(ctrl));
if (ret != strlen(ctrl)) {
fprintf(stderr, "write(clear_refs) failed\n");
exit(1);
}
close(fd);
}
int main(int argc, char **argv)
{
char *map;
int fd;
fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
if (!fd) {
fprintf(stderr, "open() failed\n");
return -errno;
}
if (ftruncate(fd, HUGETLB_SIZE)) {
fprintf(stderr, "ftruncate() failed\n");
return -errno;
}
map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return -errno;
}
*map = 0;
if (mprotect(map, HUGETLB_SIZE, PROT_READ)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
clear_softdirty();
if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
*map = 0;
return 0;
}
--------------------------------------------------------------------------
Above test fails with SIGBUS when there is only a single free hugetlb page.
# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
Bus error (core dumped)
And worse, with sufficient free hugetlb pages it will map an anonymous page
into a shared mapping, for example, messing up accounting during unmap
and breaking MAP_SHARED semantics:
# echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
# cat /proc/meminfo | grep HugePages_
HugePages_Total: 2
HugePages_Free: 1
HugePages_Rsvd: 18446744073709551615
HugePages_Surp: 0
Reason in this particular case is that vma_wants_writenotify() will
return "true", removing VM_SHARED in vma_set_page_prot() to map pages
write-protected. Let's teach vma_wants_writenotify() that hugetlb does not
support softdirty tracking.
Link: https://lkml.kernel.org/r/20220811103435.188481-1-david@redhat.com
Link: https://lkml.kernel.org/r/20220811103435.188481-2-david@redhat.com
Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Peter Feiner <pfeiner(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Cyrill Gorcunov <gorcunov(a)openvz.org>
Cc: Pavel Emelyanov <xemul(a)parallels.com>
Cc: Jamie Liu <jamieliu(a)google.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [3.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index c035020d0c89..9d780f415be3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (vma_soft_dirty_enabled(vma))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f96f7a40874d7c746680c0b9f57cef2262ae551f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david(a)redhat.com>
Date: Thu, 11 Aug 2022 12:34:34 +0200
Subject: [PATCH] mm/hugetlb: fix hugetlb not supporting softdirty tracking
Patch series "mm/hugetlb: fix write-fault handling for shared mappings", v2.
I observed that hugetlb does not support/expect write-faults in shared
mappings that would have to map the R/O-mapped page writable -- and I
found two cases where we could currently get such faults and would
erroneously map an anon page into a shared mapping.
Reproducers are part of the patches.
I propose to backport both fixes to stable trees. The first fix needs a
small adjustment.
This patch (of 2):
Staring at hugetlb_wp(), one might wonder where all the logic for shared
mappings is when stumbling over a write-protected page in a shared
mapping. In fact, there is none, and so far we thought we could get away
with that because e.g., mprotect() should always do the right thing and
map all pages directly writable.
Looks like we were wrong:
--------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#define HUGETLB_SIZE (2 * 1024 * 1024u)
static void clear_softdirty(void)
{
int fd = open("/proc/self/clear_refs", O_WRONLY);
const char *ctrl = "4";
int ret;
if (fd < 0) {
fprintf(stderr, "open(clear_refs) failed\n");
exit(1);
}
ret = write(fd, ctrl, strlen(ctrl));
if (ret != strlen(ctrl)) {
fprintf(stderr, "write(clear_refs) failed\n");
exit(1);
}
close(fd);
}
int main(int argc, char **argv)
{
char *map;
int fd;
fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
if (!fd) {
fprintf(stderr, "open() failed\n");
return -errno;
}
if (ftruncate(fd, HUGETLB_SIZE)) {
fprintf(stderr, "ftruncate() failed\n");
return -errno;
}
map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return -errno;
}
*map = 0;
if (mprotect(map, HUGETLB_SIZE, PROT_READ)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
clear_softdirty();
if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
*map = 0;
return 0;
}
--------------------------------------------------------------------------
Above test fails with SIGBUS when there is only a single free hugetlb page.
# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
Bus error (core dumped)
And worse, with sufficient free hugetlb pages it will map an anonymous page
into a shared mapping, for example, messing up accounting during unmap
and breaking MAP_SHARED semantics:
# echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
# cat /proc/meminfo | grep HugePages_
HugePages_Total: 2
HugePages_Free: 1
HugePages_Rsvd: 18446744073709551615
HugePages_Surp: 0
Reason in this particular case is that vma_wants_writenotify() will
return "true", removing VM_SHARED in vma_set_page_prot() to map pages
write-protected. Let's teach vma_wants_writenotify() that hugetlb does not
support softdirty tracking.
Link: https://lkml.kernel.org/r/20220811103435.188481-1-david@redhat.com
Link: https://lkml.kernel.org/r/20220811103435.188481-2-david@redhat.com
Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Peter Feiner <pfeiner(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Cyrill Gorcunov <gorcunov(a)openvz.org>
Cc: Pavel Emelyanov <xemul(a)parallels.com>
Cc: Jamie Liu <jamieliu(a)google.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [3.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index c035020d0c89..9d780f415be3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (vma_soft_dirty_enabled(vma))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f96f7a40874d7c746680c0b9f57cef2262ae551f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david(a)redhat.com>
Date: Thu, 11 Aug 2022 12:34:34 +0200
Subject: [PATCH] mm/hugetlb: fix hugetlb not supporting softdirty tracking
Patch series "mm/hugetlb: fix write-fault handling for shared mappings", v2.
I observed that hugetlb does not support/expect write-faults in shared
mappings that would have to map the R/O-mapped page writable -- and I
found two cases where we could currently get such faults and would
erroneously map an anon page into a shared mapping.
Reproducers are part of the patches.
I propose to backport both fixes to stable trees. The first fix needs a
small adjustment.
This patch (of 2):
Staring at hugetlb_wp(), one might wonder where all the logic for shared
mappings is when stumbling over a write-protected page in a shared
mapping. In fact, there is none, and so far we thought we could get away
with that because e.g., mprotect() should always do the right thing and
map all pages directly writable.
Looks like we were wrong:
--------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#define HUGETLB_SIZE (2 * 1024 * 1024u)
static void clear_softdirty(void)
{
int fd = open("/proc/self/clear_refs", O_WRONLY);
const char *ctrl = "4";
int ret;
if (fd < 0) {
fprintf(stderr, "open(clear_refs) failed\n");
exit(1);
}
ret = write(fd, ctrl, strlen(ctrl));
if (ret != strlen(ctrl)) {
fprintf(stderr, "write(clear_refs) failed\n");
exit(1);
}
close(fd);
}
int main(int argc, char **argv)
{
char *map;
int fd;
fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
if (!fd) {
fprintf(stderr, "open() failed\n");
return -errno;
}
if (ftruncate(fd, HUGETLB_SIZE)) {
fprintf(stderr, "ftruncate() failed\n");
return -errno;
}
map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return -errno;
}
*map = 0;
if (mprotect(map, HUGETLB_SIZE, PROT_READ)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
clear_softdirty();
if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
*map = 0;
return 0;
}
--------------------------------------------------------------------------
Above test fails with SIGBUS when there is only a single free hugetlb page.
# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
Bus error (core dumped)
And worse, with sufficient free hugetlb pages it will map an anonymous page
into a shared mapping, for example, messing up accounting during unmap
and breaking MAP_SHARED semantics:
# echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
# cat /proc/meminfo | grep HugePages_
HugePages_Total: 2
HugePages_Free: 1
HugePages_Rsvd: 18446744073709551615
HugePages_Surp: 0
Reason in this particular case is that vma_wants_writenotify() will
return "true", removing VM_SHARED in vma_set_page_prot() to map pages
write-protected. Let's teach vma_wants_writenotify() that hugetlb does not
support softdirty tracking.
Link: https://lkml.kernel.org/r/20220811103435.188481-1-david@redhat.com
Link: https://lkml.kernel.org/r/20220811103435.188481-2-david@redhat.com
Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Peter Feiner <pfeiner(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Cyrill Gorcunov <gorcunov(a)openvz.org>
Cc: Pavel Emelyanov <xemul(a)parallels.com>
Cc: Jamie Liu <jamieliu(a)google.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [3.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index c035020d0c89..9d780f415be3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (vma_soft_dirty_enabled(vma))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f96f7a40874d7c746680c0b9f57cef2262ae551f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david(a)redhat.com>
Date: Thu, 11 Aug 2022 12:34:34 +0200
Subject: [PATCH] mm/hugetlb: fix hugetlb not supporting softdirty tracking
Patch series "mm/hugetlb: fix write-fault handling for shared mappings", v2.
I observed that hugetlb does not support/expect write-faults in shared
mappings that would have to map the R/O-mapped page writable -- and I
found two cases where we could currently get such faults and would
erroneously map an anon page into a shared mapping.
Reproducers are part of the patches.
I propose to backport both fixes to stable trees. The first fix needs a
small adjustment.
This patch (of 2):
Staring at hugetlb_wp(), one might wonder where all the logic for shared
mappings is when stumbling over a write-protected page in a shared
mapping. In fact, there is none, and so far we thought we could get away
with that because e.g., mprotect() should always do the right thing and
map all pages directly writable.
Looks like we were wrong:
--------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#define HUGETLB_SIZE (2 * 1024 * 1024u)
static void clear_softdirty(void)
{
int fd = open("/proc/self/clear_refs", O_WRONLY);
const char *ctrl = "4";
int ret;
if (fd < 0) {
fprintf(stderr, "open(clear_refs) failed\n");
exit(1);
}
ret = write(fd, ctrl, strlen(ctrl));
if (ret != strlen(ctrl)) {
fprintf(stderr, "write(clear_refs) failed\n");
exit(1);
}
close(fd);
}
int main(int argc, char **argv)
{
char *map;
int fd;
fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
if (!fd) {
fprintf(stderr, "open() failed\n");
return -errno;
}
if (ftruncate(fd, HUGETLB_SIZE)) {
fprintf(stderr, "ftruncate() failed\n");
return -errno;
}
map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return -errno;
}
*map = 0;
if (mprotect(map, HUGETLB_SIZE, PROT_READ)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
clear_softdirty();
if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) {
fprintf(stderr, "mmprotect() failed\n");
return -errno;
}
*map = 0;
return 0;
}
--------------------------------------------------------------------------
Above test fails with SIGBUS when there is only a single free hugetlb page.
# echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
Bus error (core dumped)
And worse, with sufficient free hugetlb pages it will map an anonymous page
into a shared mapping, for example, messing up accounting during unmap
and breaking MAP_SHARED semantics:
# echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
# ./test
# cat /proc/meminfo | grep HugePages_
HugePages_Total: 2
HugePages_Free: 1
HugePages_Rsvd: 18446744073709551615
HugePages_Surp: 0
Reason in this particular case is that vma_wants_writenotify() will
return "true", removing VM_SHARED in vma_set_page_prot() to map pages
write-protected. Let's teach vma_wants_writenotify() that hugetlb does not
support softdirty tracking.
Link: https://lkml.kernel.org/r/20220811103435.188481-1-david@redhat.com
Link: https://lkml.kernel.org/r/20220811103435.188481-2-david@redhat.com
Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Peter Feiner <pfeiner(a)google.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Cyrill Gorcunov <gorcunov(a)openvz.org>
Cc: Pavel Emelyanov <xemul(a)parallels.com>
Cc: Jamie Liu <jamieliu(a)google.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [3.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/mmap.c b/mm/mmap.c
index c035020d0c89..9d780f415be3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (vma_soft_dirty_enabled(vma))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a8faed3a02eeb75857a3b5d660fa80fe79db77a3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap(a)infradead.org>
Date: Sun, 7 Aug 2022 15:09:34 -0700
Subject: [PATCH] kernel/sys_ni: add compat entry for fadvise64_64
When CONFIG_ADVISE_SYSCALLS is not set/enabled and CONFIG_COMPAT is
set/enabled, the riscv compat_syscall_table references
'compat_sys_fadvise64_64', which is not defined:
riscv64-linux-ld: arch/riscv/kernel/compat_syscall_table.o:(.rodata+0x6f8):
undefined reference to `compat_sys_fadvise64_64'
Add 'fadvise64_64' to kernel/sys_ni.c as a conditional COMPAT function so
that when CONFIG_ADVISE_SYSCALLS is not set, there is a fallback function
available.
Link: https://lkml.kernel.org/r/20220807220934.5689-1-rdunlap@infradead.org
Fixes: d3ac21cacc24 ("mm: Support compiling out madvise and fadvise")
Signed-off-by: Randy Dunlap <rdunlap(a)infradead.org>
Suggested-by: Arnd Bergmann <arnd(a)arndb.de>
Reviewed-by: Arnd Bergmann <arnd(a)arndb.de>
Cc: Josh Triplett <josh(a)joshtriplett.org>
Cc: Paul Walmsley <paul.walmsley(a)sifive.com>
Cc: Palmer Dabbelt <palmer(a)dabbelt.com>
Cc: Albert Ou <aou(a)eecs.berkeley.edu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a492f159624f..860b2dcf3ac4 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self);
/* mm/fadvise.c */
COND_SYSCALL(fadvise64_64);
+COND_SYSCALL_COMPAT(fadvise64_64);
/* mm/, CONFIG_MMU only */
COND_SYSCALL(swapon);
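Conceptually (a hedged sketch, not the kernel's exact macro expansion),
COND_SYSCALL()/COND_SYSCALL_COMPAT() provide weak stubs that return
-ENOSYS, so a syscall table can always be linked even when the real
implementation is compiled out:
/* Illustration of the weak-symbol fallback idea only. */
long __attribute__((weak)) compat_sys_fadvise64_64(void)
{
	return -ENOSYS;	/* overridden when mm/fadvise.c is built in */
}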
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a8faed3a02eeb75857a3b5d660fa80fe79db77a3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap(a)infradead.org>
Date: Sun, 7 Aug 2022 15:09:34 -0700
Subject: [PATCH] kernel/sys_ni: add compat entry for fadvise64_64
When CONFIG_ADVISE_SYSCALLS is not set/enabled and CONFIG_COMPAT is
set/enabled, the riscv compat_syscall_table references
'compat_sys_fadvise64_64', which is not defined:
riscv64-linux-ld: arch/riscv/kernel/compat_syscall_table.o:(.rodata+0x6f8):
undefined reference to `compat_sys_fadvise64_64'
Add 'fadvise64_64' to kernel/sys_ni.c as a conditional COMPAT function so
that when CONFIG_ADVISE_SYSCALLS is not set, there is a fallback function
available.
Link: https://lkml.kernel.org/r/20220807220934.5689-1-rdunlap@infradead.org
Fixes: d3ac21cacc24 ("mm: Support compiling out madvise and fadvise")
Signed-off-by: Randy Dunlap <rdunlap(a)infradead.org>
Suggested-by: Arnd Bergmann <arnd(a)arndb.de>
Reviewed-by: Arnd Bergmann <arnd(a)arndb.de>
Cc: Josh Triplett <josh(a)joshtriplett.org>
Cc: Paul Walmsley <paul.walmsley(a)sifive.com>
Cc: Palmer Dabbelt <palmer(a)dabbelt.com>
Cc: Albert Ou <aou(a)eecs.berkeley.edu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a492f159624f..860b2dcf3ac4 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self);
/* mm/fadvise.c */
COND_SYSCALL(fadvise64_64);
+COND_SYSCALL_COMPAT(fadvise64_64);
/* mm/, CONFIG_MMU only */
COND_SYSCALL(swapon);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 05c7b7a92cc87ff8d7fde189d0fade250697573c Mon Sep 17 00:00:00 2001
From: Zhang Qiao <zhangqiao22(a)huawei.com>
Date: Fri, 21 Jan 2022 18:12:10 +0800
Subject: [PATCH] cgroup/cpuset: Fix a race between cpuset_attach() and cpu
hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
As previously discussed (https://lkml.org/lkml/2022/1/20/51),
cpuset_attach() is affected by a similar cpu hotplug race,
as in the following scenario:
 cpuset_attach()                           cpu hotplug
 ---------------------------               ----------------------
 down_write(cpuset_rwsem)
 guarantee_online_cpus() // (load cpus_attach)
                                           sched_cpu_deactivate
                                             set_cpu_active()
                                             // will change cpu_active_mask
 set_cpus_allowed_ptr(cpus_attach)
   __set_cpus_allowed_ptr_locked()
   // (if the intersection of cpus_attach and
   //  cpu_active_mask is empty, will return -EINVAL)
 up_write(cpuset_rwsem)
To avoid races such as described above, protect cpuset_attach() call
with cpu_hotplug_lock.
Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time")
Cc: stable(a)vger.kernel.org # v2.6.32+
Reported-by: Zhao Gongyi <zhaogongyi(a)huawei.com>
Signed-off-by: Zhang Qiao <zhangqiao22(a)huawei.com>
Acked-by: Waiman Long <longman(a)redhat.com>
Reviewed-by: Michal Koutný <mkoutny(a)suse.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4c7254e8f49a..97c53f3cc917 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2289,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2342,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
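The invariant this relies on, sketched with assumed simplifications
(names and error handling elided): while cpus_read_lock() is held, no
CPU can complete deactivation, so a mask derived from cpu_active_mask
stays valid until the matching unlock and set_cpus_allowed_ptr() cannot
hit the empty-intersection -EINVAL.
/* Sketch only, not the cpuset code itself. */
cpus_read_lock();
cpumask_and(&allowed, &requested, cpu_active_mask); /* snapshot stays valid */
WARN_ON(set_cpus_allowed_ptr(task, &allowed));      /* cannot race hotplug */
cpus_read_unlock();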
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 05c7b7a92cc87ff8d7fde189d0fade250697573c Mon Sep 17 00:00:00 2001
From: Zhang Qiao <zhangqiao22(a)huawei.com>
Date: Fri, 21 Jan 2022 18:12:10 +0800
Subject: [PATCH] cgroup/cpuset: Fix a race between cpuset_attach() and cpu
hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
As previously discussed (https://lkml.org/lkml/2022/1/20/51),
cpuset_attach() is affected by a similar cpu hotplug race,
as in the following scenario:
 cpuset_attach()                           cpu hotplug
 ---------------------------               ----------------------
 down_write(cpuset_rwsem)
 guarantee_online_cpus() // (load cpus_attach)
                                           sched_cpu_deactivate
                                             set_cpu_active()
                                             // will change cpu_active_mask
 set_cpus_allowed_ptr(cpus_attach)
   __set_cpus_allowed_ptr_locked()
   // (if the intersection of cpus_attach and
   //  cpu_active_mask is empty, will return -EINVAL)
 up_write(cpuset_rwsem)
To avoid races such as described above, protect cpuset_attach() call
with cpu_hotplug_lock.
Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time")
Cc: stable(a)vger.kernel.org # v2.6.32+
Reported-by: Zhao Gongyi <zhaogongyi(a)huawei.com>
Signed-off-by: Zhang Qiao <zhangqiao22(a)huawei.com>
Acked-by: Waiman Long <longman(a)redhat.com>
Reviewed-by: Michal Koutný <mkoutny(a)suse.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4c7254e8f49a..97c53f3cc917 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2289,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2342,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 05c7b7a92cc87ff8d7fde189d0fade250697573c Mon Sep 17 00:00:00 2001
From: Zhang Qiao <zhangqiao22(a)huawei.com>
Date: Fri, 21 Jan 2022 18:12:10 +0800
Subject: [PATCH] cgroup/cpuset: Fix a race between cpuset_attach() and cpu
hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
As previously discussed (https://lkml.org/lkml/2022/1/20/51),
cpuset_attach() is affected by a similar cpu hotplug race,
as in the following scenario:
 cpuset_attach()                           cpu hotplug
 ---------------------------               ----------------------
 down_write(cpuset_rwsem)
 guarantee_online_cpus() // (load cpus_attach)
                                           sched_cpu_deactivate
                                             set_cpu_active()
                                             // will change cpu_active_mask
 set_cpus_allowed_ptr(cpus_attach)
   __set_cpus_allowed_ptr_locked()
   // (if the intersection of cpus_attach and
   //  cpu_active_mask is empty, will return -EINVAL)
 up_write(cpuset_rwsem)
To avoid races such as described above, protect cpuset_attach() call
with cpu_hotplug_lock.
Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time")
Cc: stable(a)vger.kernel.org # v2.6.32+
Reported-by: Zhao Gongyi <zhaogongyi(a)huawei.com>
Signed-off-by: Zhang Qiao <zhangqiao22(a)huawei.com>
Acked-by: Waiman Long <longman(a)redhat.com>
Reviewed-by: Michal Koutný <mkoutny(a)suse.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4c7254e8f49a..97c53f3cc917 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2289,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2342,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 05c7b7a92cc87ff8d7fde189d0fade250697573c Mon Sep 17 00:00:00 2001
From: Zhang Qiao <zhangqiao22(a)huawei.com>
Date: Fri, 21 Jan 2022 18:12:10 +0800
Subject: [PATCH] cgroup/cpuset: Fix a race between cpuset_attach() and cpu
hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
As previously discussed (https://lkml.org/lkml/2022/1/20/51),
cpuset_attach() is affected by a similar cpu hotplug race,
as in the following scenario:
 cpuset_attach()                           cpu hotplug
 ---------------------------               ----------------------
 down_write(cpuset_rwsem)
 guarantee_online_cpus() // (load cpus_attach)
                                           sched_cpu_deactivate
                                             set_cpu_active()
                                             // will change cpu_active_mask
 set_cpus_allowed_ptr(cpus_attach)
   __set_cpus_allowed_ptr_locked()
   // (if the intersection of cpus_attach and
   //  cpu_active_mask is empty, will return -EINVAL)
 up_write(cpuset_rwsem)
To avoid races such as described above, protect cpuset_attach() call
with cpu_hotplug_lock.
Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time")
Cc: stable(a)vger.kernel.org # v2.6.32+
Reported-by: Zhao Gongyi <zhaogongyi(a)huawei.com>
Signed-off-by: Zhang Qiao <zhangqiao22(a)huawei.com>
Acked-by: Waiman Long <longman(a)redhat.com>
Reviewed-by: Michal Koutný <mkoutny(a)suse.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4c7254e8f49a..97c53f3cc917 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2289,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2342,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 05c7b7a92cc87ff8d7fde189d0fade250697573c Mon Sep 17 00:00:00 2001
From: Zhang Qiao <zhangqiao22(a)huawei.com>
Date: Fri, 21 Jan 2022 18:12:10 +0800
Subject: [PATCH] cgroup/cpuset: Fix a race between cpuset_attach() and cpu
hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
As previously discussed (https://lkml.org/lkml/2022/1/20/51),
cpuset_attach() is affected by a similar cpu hotplug race,
as in the following scenario:
 cpuset_attach()                           cpu hotplug
 ---------------------------               ----------------------
 down_write(cpuset_rwsem)
 guarantee_online_cpus() // (load cpus_attach)
                                           sched_cpu_deactivate
                                             set_cpu_active()
                                             // will change cpu_active_mask
 set_cpus_allowed_ptr(cpus_attach)
   __set_cpus_allowed_ptr_locked()
   // (if the intersection of cpus_attach and
   //  cpu_active_mask is empty, will return -EINVAL)
 up_write(cpuset_rwsem)
To avoid races such as described above, protect cpuset_attach() call
with cpu_hotplug_lock.
Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time")
Cc: stable(a)vger.kernel.org # v2.6.32+
Reported-by: Zhao Gongyi <zhaogongyi(a)huawei.com>
Signed-off-by: Zhang Qiao <zhangqiao22(a)huawei.com>
Acked-by: Waiman Long <longman(a)redhat.com>
Reviewed-by: Michal Koutný <mkoutny(a)suse.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4c7254e8f49a..97c53f3cc917 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2289,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2342,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
The patch below does not apply to the 5.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 05c7b7a92cc87ff8d7fde189d0fade250697573c Mon Sep 17 00:00:00 2001
From: Zhang Qiao <zhangqiao22(a)huawei.com>
Date: Fri, 21 Jan 2022 18:12:10 +0800
Subject: [PATCH] cgroup/cpuset: Fix a race between cpuset_attach() and cpu
hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
As previously discussed (https://lkml.org/lkml/2022/1/20/51),
cpuset_attach() is affected by a similar cpu hotplug race,
as in the following scenario:
 cpuset_attach()                           cpu hotplug
 ---------------------------               ----------------------
 down_write(cpuset_rwsem)
 guarantee_online_cpus() // (load cpus_attach)
                                           sched_cpu_deactivate
                                             set_cpu_active()
                                             // will change cpu_active_mask
 set_cpus_allowed_ptr(cpus_attach)
   __set_cpus_allowed_ptr_locked()
   // (if the intersection of cpus_attach and
   //  cpu_active_mask is empty, will return -EINVAL)
 up_write(cpuset_rwsem)
To avoid races such as described above, protect cpuset_attach() call
with cpu_hotplug_lock.
Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time")
Cc: stable(a)vger.kernel.org # v2.6.32+
Reported-by: Zhao Gongyi <zhaogongyi(a)huawei.com>
Signed-off-by: Zhang Qiao <zhangqiao22(a)huawei.com>
Acked-by: Waiman Long <longman(a)redhat.com>
Reviewed-by: Michal Koutný <mkoutny(a)suse.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4c7254e8f49a..97c53f3cc917 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2289,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2342,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 67f4b5dc49913abcdb5cc736e73674e2f352f81d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust(a)hammerspace.com>
Date: Sat, 13 Aug 2022 08:22:25 -0400
Subject: [PATCH] NFS: Fix another fsync() issue after a server reboot
Currently, when the writeback code detects a server reboot, it redirties
any pages that were not committed to disk, and it sets the flag
NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor
that dirtied the file. While this allows the file descriptor in question
to redrive its own writes, it violates the fsync() requirement that we
should be synchronising all writes to disk.
While the problem is infrequent, we do see corner cases where an
untimely server reboot causes the fsync() call to abandon its attempt to
sync data to disk, causing data corruption issues due to missed error
conditions or similar.
In order to tighten up the client's ability to deal with this situation
without introducing livelocks, add a counter that records the number of
times pages are redirtied due to a server reboot-like condition, and use
that in fsync() to redrive the sync to disk.
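The fix is a snapshot-and-retry loop over a monotonically increasing
counter: snapshot nfsi->redirtied_pages before each commit pass and
redrive the sync if the counter moved while we were working. A rough
sketch of the idiom (sync_and_commit() is a stand-in for the
write/commit sequence inside nfs_file_fsync(), not a real helper):
long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages);
long nredirtied;
int ret;
for (;;) {
	ret = sync_and_commit(inode);   /* writeback + COMMIT, stand-in */
	if (ret != 0)
		break;
	nredirtied = atomic_long_read(&nfsi->redirtied_pages);
	if (nredirtied == save_nredirtied)
		break;                  /* nothing was redirtied: done */
	save_nredirtied = nredirtied;   /* reboot redirtied pages: retry */
}
Because the counter only ever increases, a concurrent reboot detected by
any file descriptor forces one more pass, without the livelock risk of
a flag that each fsync() caller clears for itself.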
Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted")
Cc: stable(a)vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 54237a231687..749e5487df50 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -221,8 +221,10 @@ nfs_file_fsync_commit(struct file *file, int datasync)
int
nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct nfs_open_context *ctx = nfs_file_open_context(file);
struct inode *inode = file_inode(file);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ long nredirtied;
int ret;
trace_nfs_fsync_enter(inode);
@@ -237,15 +239,10 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = pnfs_sync_inode(inode, !!datasync);
if (ret != 0)
break;
- if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags))
+ nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ if (nredirtied == save_nredirtied)
break;
- /*
- * If nfs_file_fsync_commit detected a server reboot, then
- * resend all dirty pages that might have been covered by
- * the NFS_CONTEXT_RESEND_WRITES flag
- */
- start = 0;
- end = LLONG_MAX;
+ save_nredirtied = nredirtied;
}
trace_nfs_fsync_exit(inode, ret);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b4e46b0ffa2d..bea7c005119c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -426,6 +426,7 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
static void nfs_inode_init_regular(struct nfs_inode *nfsi)
{
atomic_long_set(&nfsi->nrequests, 0);
+ atomic_long_set(&nfsi->redirtied_pages, 0);
INIT_LIST_HEAD(&nfsi->commit_info.list);
atomic_long_set(&nfsi->commit_info.ncommit, 0);
atomic_set(&nfsi->commit_info.rpcs_out, 0);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4a3796811b4b..989c734cf91d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1420,10 +1420,12 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
*/
static void nfs_redirty_request(struct nfs_page *req)
{
+ struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host);
+
/* Bump the transmission count */
req->wb_nio++;
nfs_mark_request_dirty(req);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+ atomic_long_inc(&nfsi->redirtied_pages);
nfs_end_page_writeback(req);
nfs_release_request(req);
}
@@ -1904,7 +1906,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* We have a mismatch. Write the page again */
dprintk_cont(" mismatch\n");
nfs_mark_request_dirty(req);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+ atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
next:
nfs_unlock_and_release_request(req);
/* Latency breaker */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b32ed68e7dc4..f08e581f0161 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -182,6 +182,7 @@ struct nfs_inode {
/* Regular file */
struct {
atomic_long_t nrequests;
+ atomic_long_t redirtied_pages;
struct nfs_mds_commit_info commit_info;
struct mutex commit_mutex;
};
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 67f4b5dc49913abcdb5cc736e73674e2f352f81d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust(a)hammerspace.com>
Date: Sat, 13 Aug 2022 08:22:25 -0400
Subject: [PATCH] NFS: Fix another fsync() issue after a server reboot
Currently, when the writeback code detects a server reboot, it redirties
any pages that were not committed to disk, and it sets the flag
NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor
that dirtied the file. While this allows the file descriptor in question
to redrive its own writes, it violates the fsync() requirement that we
should be synchronising all writes to disk.
While the problem is infrequent, we do see corner cases where an
untimely server reboot causes the fsync() call to abandon its attempt to
sync data to disk, causing data corruption issues due to missed error
conditions or similar.
In order to tighten up the client's ability to deal with this situation
without introducing livelocks, add a counter that records the number of
times pages are redirtied due to a server reboot-like condition, and use
that in fsync() to redrive the sync to disk.
Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted")
Cc: stable(a)vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 54237a231687..749e5487df50 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -221,8 +221,10 @@ nfs_file_fsync_commit(struct file *file, int datasync)
int
nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct nfs_open_context *ctx = nfs_file_open_context(file);
struct inode *inode = file_inode(file);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ long nredirtied;
int ret;
trace_nfs_fsync_enter(inode);
@@ -237,15 +239,10 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = pnfs_sync_inode(inode, !!datasync);
if (ret != 0)
break;
- if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags))
+ nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ if (nredirtied == save_nredirtied)
break;
- /*
- * If nfs_file_fsync_commit detected a server reboot, then
- * resend all dirty pages that might have been covered by
- * the NFS_CONTEXT_RESEND_WRITES flag
- */
- start = 0;
- end = LLONG_MAX;
+ save_nredirtied = nredirtied;
}
trace_nfs_fsync_exit(inode, ret);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b4e46b0ffa2d..bea7c005119c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -426,6 +426,7 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
static void nfs_inode_init_regular(struct nfs_inode *nfsi)
{
atomic_long_set(&nfsi->nrequests, 0);
+ atomic_long_set(&nfsi->redirtied_pages, 0);
INIT_LIST_HEAD(&nfsi->commit_info.list);
atomic_long_set(&nfsi->commit_info.ncommit, 0);
atomic_set(&nfsi->commit_info.rpcs_out, 0);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4a3796811b4b..989c734cf91d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1420,10 +1420,12 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
*/
static void nfs_redirty_request(struct nfs_page *req)
{
+ struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host);
+
/* Bump the transmission count */
req->wb_nio++;
nfs_mark_request_dirty(req);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+ atomic_long_inc(&nfsi->redirtied_pages);
nfs_end_page_writeback(req);
nfs_release_request(req);
}
@@ -1904,7 +1906,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* We have a mismatch. Write the page again */
dprintk_cont(" mismatch\n");
nfs_mark_request_dirty(req);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+ atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
next:
nfs_unlock_and_release_request(req);
/* Latency breaker */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b32ed68e7dc4..f08e581f0161 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -182,6 +182,7 @@ struct nfs_inode {
/* Regular file */
struct {
atomic_long_t nrequests;
+ atomic_long_t redirtied_pages;
struct nfs_mds_commit_info commit_info;
struct mutex commit_mutex;
};
Hi Greg,
v5.15.y riscv builds fail with gcc-12. Can I please request to add the
following to the queue:
ee3db469dd31 ("wifi: rtlwifi: remove always-true condition pointed out by GCC 12")
32329216ca1d ("eth: sun: cassini: remove dead code")
--
Regards
Sudip
On Thu, Aug 25, 2022 at 05:20:46PM +0800, zhouzhixiu wrote:
>
> On 2022/8/23 16:21, Greg Kroah-Hartman wrote:
> > This is the start of the stable review cycle for the 5.4.211 release.
> > There are 389 patches in this series, all will be posted as a response
> > to this one. If anyone has any issues with these being applied, please
> > let me know. Responses should be made by Thu, 25 Aug 2022 08:00:15
> > +0000. Anything received after that time might be too late. The whole
> > patch series can be found in one patch at: https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.4.211-rc…
> > or in the git tree and branch at:
> > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> > linux-5.4.y and the diffstat can be found below. thanks, greg k-h
>
> Tested on arm64 and x86 for 5.4.211-rc1,
>
> Kernel repo:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
>
> Branch: linux-5.10.y
> Version: 5.4.211-rc1
> Commit: 1cece69eaa889a27cf3e9f2051fcc57eda957271
> Compiler: gcc version 7.3.0 (GCC)
>
> arm64:
> --------------------------------------------------------------------
> Testcase Result Summary:
> total: 9017
> passed: 9017
> failed: 0
> timeout: 0
> --------------------------------------------------------------------
>
> x86:
> --------------------------------------------------------------------
> Testcase Result Summary:
> total: 9017
> passed: 9017
> failed: 0
> timeout: 0
> --------------------------------------------------------------------
>
> Tested-by: Hulk Robot <hulkrobot(a)huawei.com>
Your emails are being sent in html format and are not being accepted by
the mailing list.
Also this response is very odd :(
From: Juergen Gross <jgross(a)suse.com>
[ Upstream commit 7b6670b03641ac308aaa6fa2e6f964ac993b5ea3 ]
When booting under KVM the following error messages are issued:
hypfs.7f5705: The hardware system does not support hypfs
hypfs.7a79f0: Initialization of hypfs failed with rc=-61
Demote the severity of the first message from "error" to "info" and issue
the second message only in other error cases.
Signed-off-by: Juergen Gross <jgross(a)suse.com>
Acked-by: Heiko Carstens <hca(a)linux.ibm.com>
Acked-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20220620094534.18967-1-jgross@suse.com
[arch/s390/hypfs/hypfs_diag.c changed description]
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/s390/hypfs/hypfs_diag.c | 2 +-
arch/s390/hypfs/inode.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 794bebb43d23..64448c0998eb 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -436,7 +436,7 @@ __init int hypfs_diag_init(void)
int rc;
if (diag204_probe()) {
- pr_err("The hardware system does not support hypfs\n");
+ pr_info("The hardware system does not support hypfs\n");
return -ENODATA;
}
if (diag204_info_type == DIAG204_INFO_EXT) {
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 224aeda1e8cc..d73d2d001a62 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -493,9 +493,9 @@ static int __init hypfs_init(void)
hypfs_vm_exit();
fail_hypfs_diag_exit:
hypfs_diag_exit();
+ pr_err("Initialization of hypfs failed with rc=%i\n", rc);
fail_dbfs_exit:
hypfs_dbfs_exit();
- pr_err("Initialization of hypfs failed with rc=%i\n", rc);
return rc;
}
--
2.35.1
From: "Denis V. Lunev" <den(a)openvz.org>
[ Upstream commit 66ba215cb51323e4e55e38fd5f250e0fae0cbc94 ]
Normal processing of an ARP request (usually an Ethernet broadcast
packet) arriving at the host looks like the following:
* the packet comes to arp_process() call and is passed through routing
procedure
* the request is put into the queue using pneigh_enqueue() if
corresponding ARP record is not local (common case for container
records on the host)
* the request is processed by timer (within 80 jiffies by default) and
ARP reply is sent from the same arp_process() using
NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED condition (flag is set inside
pneigh_enqueue())
And here the problem arises: the Linux kernel calls pneigh_queue_purge(),
which destroys the whole queue of ARP requests, on ANY network interface
start/stop event, through __neigh_ifdown().
This was actually not a problem in the original world, as network
interface start/stop was accessible to the host 'root' only, who could
do far more destructive things. But the world has changed and Linux
containers are now available. A container's 'root' has access to this
API and should be considered an untrusted user from the hosting side.
Thus there is an attack vector against other containers on the node
when a container's root endlessly starts/stops interfaces. We have
observed such a situation on a real production node, where a docker
container doing exactly that made the other containers on the node
inaccessible.
The patch proposed does a very simple thing: in pneigh_queue_purge(),
it drops only the packets from the namespace in which the network
interface state change was detected. This is enough to prevent the
problem for the whole node while preserving the original semantics of
the code.
v2:
- do del_timer_sync() if queue is empty after pneigh_queue_purge()
v3:
- rebase to net tree
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Jakub Kicinski <kuba(a)kernel.org>
Cc: Paolo Abeni <pabeni(a)redhat.com>
Cc: Daniel Borkmann <daniel(a)iogearbox.net>
Cc: David Ahern <dsahern(a)kernel.org>
Cc: Yajun Deng <yajun.deng(a)linux.dev>
Cc: Roopa Prabhu <roopa(a)nvidia.com>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: netdev(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: Alexey Kuznetsov <kuznet(a)ms2.inr.ac.ru>
Cc: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Cc: Konstantin Khorenko <khorenko(a)virtuozzo.com>
Cc: kernel(a)openvz.org
Cc: devel(a)openvz.org
Investigated-by: Alexander Mikhalitsyn <alexander.mikhalitsyn(a)virtuozzo.com>
Signed-off-by: Denis V. Lunev <den(a)openvz.org>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
net/core/neighbour.c | 25 +++++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 358e84af0210..8af9761768e0 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -222,14 +222,23 @@ static int neigh_del_timer(struct neighbour *n)
return 0;
}
-static void pneigh_queue_purge(struct sk_buff_head *list)
+static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net)
{
+ unsigned long flags;
struct sk_buff *skb;
- while ((skb = skb_dequeue(list)) != NULL) {
- dev_put(skb->dev);
- kfree_skb(skb);
+ spin_lock_irqsave(&list->lock, flags);
+ skb = skb_peek(list);
+ while (skb != NULL) {
+ struct sk_buff *skb_next = skb_peek_next(skb, list);
+ if (net == NULL || net_eq(dev_net(skb->dev), net)) {
+ __skb_unlink(skb, list);
+ dev_put(skb->dev);
+ kfree_skb(skb);
+ }
+ skb = skb_next;
}
+ spin_unlock_irqrestore(&list->lock, flags);
}
static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
@@ -295,9 +304,9 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
pneigh_ifdown_and_unlock(tbl, dev);
-
- del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+ pneigh_queue_purge(&tbl->proxy_queue, dev_net(dev));
+ if (skb_queue_empty_lockless(&tbl->proxy_queue))
+ del_timer_sync(&tbl->proxy_timer);
return 0;
}
EXPORT_SYMBOL(neigh_ifdown);
@@ -1609,7 +1618,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl)
/* It is not clean... Fix it to unload IPv6 module safely */
cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+ pneigh_queue_purge(&tbl->proxy_queue, NULL);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
pr_crit("neighbour leakage\n");
--
2.35.1
From: Fudong Wang <Fudong.Wang(a)amd.com>
[ Upstream commit b2a93490201300a749ad261b5c5d05cb50179c44 ]
[Why]
After the ODM clock is turned off, the optc underflow bit stays set
forever and clearing it no longer works. We need to clear it before
turning the clock off.
[How]
Clear the underflow bit, if it is set, when turning the clock off.
Reviewed-by: Alvin Lee <alvin.lee2(a)amd.com>
Acked-by: Tom Chung <chiahsuan.chung(a)amd.com>
Signed-off-by: Fudong Wang <Fudong.Wang(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c
index 411f89218e01..cb5c44b339e0 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c
@@ -452,6 +452,11 @@ void optc1_enable_optc_clock(struct timing_generator *optc, bool enable)
OTG_CLOCK_ON, 1,
1, 1000);
} else {
+
+ //last chance to clear underflow, otherwise, it will always there due to clock is off.
+ if (optc->funcs->is_optc_underflow_occurred(optc) == true)
+ optc->funcs->clear_optc_underflow(optc);
+
REG_UPDATE_2(OTG_CLOCK_CONTROL,
OTG_CLOCK_GATE_DIS, 0,
OTG_CLOCK_EN, 0);
--
2.35.1
From: Josip Pavic <Josip.Pavic(a)amd.com>
[ Upstream commit 8de297dc046c180651c0500f8611663ae1c3828a ]
[why]
In some cases the MPC tree bottom pipe ends up pointing to itself. This
causes iteration from top to bottom to hang the system in an infinite loop.
[how]
When advancing to the next MPC bottom pipe, check that the pointer is
not the same as the current one, to avoid an infinite loop.
Reviewed-by: Josip Pavic <Josip.Pavic(a)amd.com>
Reviewed-by: Jun Lei <Jun.Lei(a)amd.com>
Acked-by: Alex Hung <alex.hung(a)amd.com>
Signed-off-by: Aric Cyr <aric.cyr(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c | 6 ++++++
drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c | 6 ++++++
2 files changed, 12 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c
index 8b2f29f6dabd..068e79fa3490 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c
@@ -118,6 +118,12 @@ struct mpcc *mpc1_get_mpcc_for_dpp(struct mpc_tree *tree, int dpp_id)
while (tmp_mpcc != NULL) {
if (tmp_mpcc->dpp_id == dpp_id)
return tmp_mpcc;
+
+ /* avoid circular linked list */
+ ASSERT(tmp_mpcc != tmp_mpcc->mpcc_bot);
+ if (tmp_mpcc == tmp_mpcc->mpcc_bot)
+ break;
+
tmp_mpcc = tmp_mpcc->mpcc_bot;
}
return NULL;
diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c
index 5a188b2bc033..0a00bd8e00ab 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c
@@ -488,6 +488,12 @@ struct mpcc *mpc2_get_mpcc_for_dpp(struct mpc_tree *tree, int dpp_id)
while (tmp_mpcc != NULL) {
if (tmp_mpcc->dpp_id == 0xf || tmp_mpcc->dpp_id == dpp_id)
return tmp_mpcc;
+
+ /* avoid circular linked list */
+ ASSERT(tmp_mpcc != tmp_mpcc->mpcc_bot);
+ if (tmp_mpcc == tmp_mpcc->mpcc_bot)
+ break;
+
tmp_mpcc = tmp_mpcc->mpcc_bot;
}
return NULL;
--
2.35.1
From: Josip Pavic <Josip.Pavic(a)amd.com>
[ Upstream commit 8de297dc046c180651c0500f8611663ae1c3828a ]
[why]
In some cases the MPC tree bottom pipe ends up pointing to itself. This
causes iteration from top to bottom to hang the system in an infinite loop.
[how]
When advancing to the next MPC bottom pipe, check that the pointer is
not the same as the current one, to avoid an infinite loop.
Reviewed-by: Josip Pavic <Josip.Pavic(a)amd.com>
Reviewed-by: Jun Lei <Jun.Lei(a)amd.com>
Acked-by: Alex Hung <alex.hung(a)amd.com>
Signed-off-by: Aric Cyr <aric.cyr(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c | 6 ++++++
drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c | 6 ++++++
2 files changed, 12 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c
index 3fcd408e9103..855682590c1b 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c
@@ -125,6 +125,12 @@ struct mpcc *mpc1_get_mpcc_for_dpp(struct mpc_tree *tree, int dpp_id)
while (tmp_mpcc != NULL) {
if (tmp_mpcc->dpp_id == dpp_id)
return tmp_mpcc;
+
+ /* avoid circular linked list */
+ ASSERT(tmp_mpcc != tmp_mpcc->mpcc_bot);
+ if (tmp_mpcc == tmp_mpcc->mpcc_bot)
+ break;
+
tmp_mpcc = tmp_mpcc->mpcc_bot;
}
return NULL;
diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c
index 99cc095dc33c..a701ea56c0aa 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c
@@ -533,6 +533,12 @@ struct mpcc *mpc2_get_mpcc_for_dpp(struct mpc_tree *tree, int dpp_id)
while (tmp_mpcc != NULL) {
if (tmp_mpcc->dpp_id == 0xf || tmp_mpcc->dpp_id == dpp_id)
return tmp_mpcc;
+
+ /* avoid circular linked list */
+ ASSERT(tmp_mpcc != tmp_mpcc->mpcc_bot);
+ if (tmp_mpcc == tmp_mpcc->mpcc_bot)
+ break;
+
tmp_mpcc = tmp_mpcc->mpcc_bot;
}
return NULL;
--
2.35.1
This is the start of the stable review cycle for the 4.9.326 release.
There are 98 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Fri, 26 Aug 2022 07:24:55 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.9.326-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.9.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.9.326-rc2
Nathan Chancellor <nathan(a)kernel.org>
MIPS: tlbex: Explicitly compare _PAGE_NO_EXEC against 0
Zheyu Ma <zheyuma97(a)gmail.com>
video: fbdev: i740fb: Check the argument of i740_calc_vclk()
Zhouyi Zhou <zhouzhouyi(a)gmail.com>
powerpc/64: Init jump labels before parse_early_param()
Takashi Iwai <tiwai(a)suse.de>
ALSA: timer: Use deferred fasync helper
Takashi Iwai <tiwai(a)suse.de>
ALSA: core: Add async signal helpers
Liang He <windhl(a)126.com>
mips: cavium-octeon: Fix missing of_node_put() in octeon2_usb_clocks_start
Schspa Shi <schspa(a)gmail.com>
vfio: Clear the caps->buf to NULL after free
Liang He <windhl(a)126.com>
tty: serial: Fix refcount leak bug in ucc_uart.c
Kiselev, Oleg <okiselev(a)amazon.com>
ext4: avoid resizing to a partial cluster size
Ye Bin <yebin10(a)huawei.com>
ext4: avoid remove directory when directory is corrupted
Wentao_Liang <Wentao_Liang_g(a)163.com>
drivers:md:fix a potential use-after-free bug
Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>
cxl: Fix a memory leak in an error handling path
Jozef Martiniak <jomajm(a)gmail.com>
gadgetfs: ep_io - wait until IRQ finishes
Liang He <windhl(a)126.com>
usb: host: ohci-ppc-of: Fix refcount leak bug
Sai Prakash Ranjan <quic_saipraka(a)quicinc.com>
irqchip/tegra: Fix overflow implicit truncation warnings
Csókás Bence <csokas.bence(a)prolan.hu>
fec: Fix timer capture timing in `fec_ptp_enable_pps()`
Pablo Neira Ayuso <pablo(a)netfilter.org>
netfilter: nf_tables: really skip inactive sets when allocating name
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: add force_successful_syscall_return()
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: restarts apply only to the first sigframe we build...
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: fix syscall restart checks
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: traced syscall does need to check the syscall number
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: don't leave NULLs in sys_call_table[]
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: page fault et.al. are *not* restartable syscalls...
Duoming Zhou <duoming(a)zju.edu.cn>
atm: idt77252: fix use-after-free bugs caused by tst_timer
Dan Carpenter <dan.carpenter(a)oracle.com>
xen/xenbus: fix return type in xenbus_file_read()
Peilin Ye <peilin.ye(a)bytedance.com>
vsock: Fix memory leak in vsock_connect()
Nikita Travkin <nikita(a)trvn.ru>
pinctrl: qcom: msm8916: Allow CAMSS GP clocks to be muxed
Miaoqian Lin <linmq006(a)gmail.com>
pinctrl: nomadik: Fix refcount leak in nmk_pinctrl_dt_subnode_to_map
Trond Myklebust <trond.myklebust(a)hammerspace.com>
SUNRPC: Reinitialise the backchannel request buffers before reuse
Zhang Xianwei <zhang.xianwei8(a)zte.com.cn>
NFSv4.1: RECLAIM_COMPLETE must handle EACCES
Marc Kleine-Budde <mkl(a)pengutronix.de>
can: ems_usb: fix clang's -Wunaligned-access warning
Filipe Manana <fdmanana(a)suse.com>
btrfs: fix lost error handling when looking up extended ref on log replay
Damien Le Moal <damien.lemoal(a)opensource.wdc.com>
ata: libata-eh: Add missing command name
Mikulas Patocka <mpatocka(a)redhat.com>
rds: add missing barrier to release_refill
Amadeusz Sławiński <amadeuszx.slawinski(a)linux.intel.com>
ALSA: info: Fix llseek return value when using callback
Jamal Hadi Salim <jhs(a)mojatatu.com>
net_sched: cls_route: disallow handle of 0
Tyler Hicks <tyhicks(a)linux.microsoft.com>
net/9p: Initialize the iounit field during fid creation
Guenter Roeck <linux(a)roeck-us.net>
nios2: time: Read timer in get_cycles only if initialized
Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm regression
Jose Alonso <joalonsof(a)gmail.com>
Revert "net: usb: ax88179_178a needs FLAG_SEND_ZLP"
Tony Battersby <tonyb(a)cybernetics.com>
scsi: sg: Allow waiting for commands to complete on removed device
Eric Dumazet <edumazet(a)google.com>
tcp: fix over estimation in sk_forced_mem_schedule()
Qu Wenruo <wqu(a)suse.com>
btrfs: reject log replay if there is unsupported RO compat flag
Thadeu Lima de Souza Cascardo <cascardo(a)canonical.com>
net_sched: cls_route: remove from list when handle is 0
Mikulas Patocka <mpatocka(a)redhat.com>
dm raid: fix address sanitizer warning in raid_status
Baokun Li <libaokun1(a)huawei.com>
ext4: correct max_inline_xattr_value_size computing
Eric Whitney <enwlinux(a)gmail.com>
ext4: fix extent status tree race in writeback error recovery path
Theodore Ts'o <tytso(a)mit.edu>
ext4: update s_overhead_clusters in the superblock during an on-line resize
Baokun Li <libaokun1(a)huawei.com>
ext4: fix use-after-free in ext4_xattr_set_entry
Lukas Czerner <lczerner(a)redhat.com>
ext4: make sure ext4_append() always allocates new block
Baokun Li <libaokun1(a)huawei.com>
ext4: add EXT4_INODE_HAS_XATTR_SPACE macro in xattr.h
David Collins <quic_collinsd(a)quicinc.com>
spmi: trace: fix stack-out-of-bound access in SPMI tracing functions
Alexander Lobakin <alexandr.lobakin(a)intel.com>
x86/olpc: fix 'logical not is only applied to the left hand side'
Steffen Maier <maier(a)linux.ibm.com>
scsi: zfcp: Fix missing auto port scan and thus missing target ports
Florian Westphal <fw(a)strlen.de>
netfilter: nf_tables: fix null deref due to zeroed list head
Weitao Wang <WeitaoWang-oc(a)zhaoxin.com>
USB: HCD: Fix URB giveback issue in tasklet function
Huacai Chen <chenhuacai(a)loongson.cn>
MIPS: cpuinfo: Fix a warning for CONFIG_CPUMASK_OFFSTACK
Michael Ellerman <mpe(a)ellerman.id.au>
powerpc/powernv: Avoid crashing if rng is NULL
Pali Rohár <pali(a)kernel.org>
powerpc/fsl-pci: Fix Class Code of PCIe Root Port
Pali Rohár <pali(a)kernel.org>
PCI: Add defines for normal and subtractive PCI bridges
Alexander Lobakin <alexandr.lobakin(a)intel.com>
ia64, processor: fix -Wincompatible-pointer-types in ia64_get_irr()
Mikulas Patocka <mpatocka(a)redhat.com>
md-raid10: fix KASAN warning
Miklos Szeredi <mszeredi(a)redhat.com>
fuse: limit nsec
Daniel Borkmann <daniel(a)iogearbox.net>
bpf: fix overflow in prog accounting
Timur Tabi <ttabi(a)nvidia.com>
drm/nouveau: fix another off-by-one in nvbios_addr
Helge Deller <deller(a)gmx.de>
parisc: Fix device names in /proc/iomem
Lukas Wunner <lukas(a)wunner.de>
usbnet: Fix linkwatch use-after-free on disconnect
David Howells <dhowells(a)redhat.com>
vfs: Check the truncate maximum size in inode_newsize_ok()
Allen Ballway <ballway(a)chromium.org>
ALSA: hda/cirrus - support for iMac 12,1 model
Meng Tang <tangmeng(a)uniontech.com>
ALSA: hda/conexant: Add quirk for LENOVO 20149 Notebook model
Sean Christopherson <seanjc(a)google.com>
KVM: x86: Mark TSS busy during LTR emulation _after_ all fault checks
Maciej S. Szmigiero <maciej.szmigiero(a)oracle.com>
KVM: SVM: Don't BUG if userspace injects an interrupt with GIF=0
Mikulas Patocka <mpatocka(a)redhat.com>
add barriers to buffer_uptodate and set_buffer_uptodate
Zheyu Ma <zheyuma97(a)gmail.com>
ALSA: bcd2000: Fix a UAF bug on the error path of probing
Ning Qiang <sohu0106(a)126.com>
macintosh/adb: fix oob read in do_adb_query() function
Hans-Christian Noren Egtvedt <hegtvedt(a)cisco.com>
random: only call boot_init_stack_canary() once
Werner Sembach <wse(a)tuxedocomputers.com>
ACPI: video: Shortening quirk list by identifying Clevo by board_name only
Werner Sembach <wse(a)tuxedocomputers.com>
ACPI: video: Force backlight native for some TongFang devices
Daniel Micay <danielmicay(a)gmail.com>
init/main.c: extract early boot entropy from the passed cmdline
Laura Abbott <lauraa(a)codeaurora.org>
init: move stack canary initialization after setup_arch
Viresh Kumar <viresh.kumar(a)linaro.org>
init/main: properly align the multi-line comment
Viresh Kumar <viresh.kumar(a)linaro.org>
init/main: Fix double "the" in comment
Christian Borntraeger <borntraeger(a)de.ibm.com>
include/uapi/linux/swab.h: fix userspace breakage, use __BITS_PER_LONG for swap
Paul Moore <paul(a)paul-moore.com>
selinux: fix inode_doinit_with_dentry() LABEL_INVALID error handling
Tianyue Ren <rentianyue(a)kylinos.cn>
selinux: fix error initialization in inode_doinit_with_dentry()
Andreas Gruenbacher <agruenba(a)redhat.com>
selinux: Convert isec->lock into a spinlock
Andreas Gruenbacher <agruenba(a)redhat.com>
selinux: Clean up initialization of isec->sclass
Andreas Gruenbacher <agruenba(a)redhat.com>
proc: Pass file mode to proc_pid_make_inode
Andreas Gruenbacher <agruenba(a)redhat.com>
selinux: Minor cleanups
Nathan Chancellor <nathan(a)kernel.org>
ion: Make user_ion_handle_put_nolock() a void function
Wei Mingzhi <whistler(a)member.fsf.org>
mt7601u: add USB device ID for some versions of XiaoDu WiFi Dongle.
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
ARM: crypto: comment out gcc warning that breaks clang builds
Florian Westphal <fw(a)strlen.de>
netfilter: nf_queue: do not allow packet truncation below transport header offset
Liang He <windhl(a)126.com>
net: sungem_phy: Add of_node_put() for reference returned by of_get_parent()
Kuniyuki Iwashima <kuniyu(a)amazon.com>
net: ping6: Fix memleak in ipv6_renew_options().
Liang He <windhl(a)126.com>
scsi: ufs: host: Hold reference returned by of_parse_phandle()
ChenXiaoSong <chenxiaosong2(a)huawei.com>
ntfs: fix use-after-free in ntfs_ucsncmp()
Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
Bluetooth: L2CAP: Fix use-after-free caused by l2cap_chan_put
-------------
Diffstat:
Makefile | 4 +-
arch/arm/lib/xor-neon.c | 3 +-
arch/ia64/include/asm/processor.h | 2 +-
arch/mips/cavium-octeon/octeon-platform.c | 3 +-
arch/mips/kernel/proc.c | 2 +-
arch/mips/mm/tlbex.c | 4 +-
arch/nios2/include/asm/entry.h | 3 +-
arch/nios2/include/asm/ptrace.h | 2 +
arch/nios2/kernel/entry.S | 22 +++--
arch/nios2/kernel/signal.c | 3 +-
arch/nios2/kernel/syscall_table.c | 1 +
arch/nios2/kernel/time.c | 5 +-
arch/parisc/kernel/drivers.c | 9 +-
arch/powerpc/kernel/prom.c | 7 ++
arch/powerpc/platforms/powernv/rng.c | 2 +
arch/powerpc/sysdev/fsl_pci.c | 8 ++
arch/powerpc/sysdev/fsl_pci.h | 1 +
arch/x86/kvm/emulate.c | 19 ++--
arch/x86/kvm/svm.c | 2 -
arch/x86/platform/olpc/olpc-xo1-sci.c | 2 +-
drivers/acpi/video_detect.c | 55 +++++++----
drivers/ata/libata-eh.c | 1 +
drivers/atm/idt77252.c | 1 +
drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c | 2 +-
drivers/irqchip/irq-tegra.c | 10 +-
drivers/macintosh/adb.c | 2 +-
drivers/md/dm-raid.c | 2 +-
drivers/md/raid10.c | 5 +-
drivers/md/raid5.c | 2 +-
drivers/misc/cxl/irq.c | 1 +
drivers/net/can/usb/ems_usb.c | 2 +-
drivers/net/ethernet/freescale/fec_ptp.c | 6 +-
drivers/net/sungem_phy.c | 1 +
drivers/net/usb/ax88179_178a.c | 14 +--
drivers/net/usb/usbnet.c | 8 +-
drivers/net/wireless/mediatek/mt7601u/usb.c | 1 +
drivers/pinctrl/nomadik/pinctrl-nomadik.c | 4 +-
drivers/pinctrl/qcom/pinctrl-msm8916.c | 4 +-
drivers/s390/scsi/zfcp_fc.c | 29 ++++--
drivers/s390/scsi/zfcp_fc.h | 6 +-
drivers/s390/scsi/zfcp_fsf.c | 4 +-
drivers/scsi/sg.c | 57 ++++++-----
drivers/scsi/ufs/ufshcd-pltfrm.c | 15 ++-
drivers/staging/android/ion/ion-ioctl.c | 8 +-
drivers/tty/serial/ucc_uart.c | 2 +
drivers/usb/core/hcd.c | 26 ++---
drivers/usb/gadget/legacy/inode.c | 1 +
drivers/usb/host/ohci-ppc-of.c | 1 +
drivers/vfio/vfio.c | 1 +
drivers/video/fbdev/i740fb.c | 9 +-
drivers/xen/xenbus/xenbus_dev_frontend.c | 4 +-
fs/attr.c | 2 +
fs/btrfs/disk-io.c | 14 +++
fs/btrfs/tree-log.c | 4 +-
fs/ext4/inline.c | 3 +
fs/ext4/inode.c | 7 ++
fs/ext4/namei.c | 23 ++++-
fs/ext4/resize.c | 11 +++
fs/ext4/xattr.c | 6 +-
fs/ext4/xattr.h | 13 +++
fs/fuse/inode.c | 6 ++
fs/nfs/nfs4proc.c | 3 +
fs/ntfs/attrib.c | 8 +-
fs/proc/base.c | 23 ++---
fs/proc/fd.c | 6 +-
fs/proc/internal.h | 2 +-
fs/proc/namespaces.c | 3 +-
include/linux/bpf.h | 11 +++
include/linux/buffer_head.h | 25 ++++-
include/linux/pci_ids.h | 2 +
include/linux/usb/hcd.h | 1 +
include/net/bluetooth/l2cap.h | 1 +
include/sound/core.h | 8 ++
include/trace/events/spmi.h | 12 +--
include/uapi/linux/swab.h | 4 +-
init/main.c | 14 +--
kernel/bpf/core.c | 16 ++-
kernel/bpf/syscall.c | 36 +++++--
net/9p/client.c | 4 +-
net/bluetooth/l2cap_core.c | 68 +++++++++----
net/ipv4/tcp_output.c | 7 +-
net/ipv6/ping.c | 6 ++
net/netfilter/nf_tables_api.c | 3 +-
net/netfilter/nfnetlink_queue.c | 7 +-
net/rds/ib_recv.c | 1 +
net/sched/cls_route.c | 8 +-
net/sunrpc/backchannel_rqst.c | 14 +++
net/vmw_vsock/af_vsock.c | 9 +-
security/selinux/hooks.c | 123 +++++++++++++++---------
security/selinux/include/objsec.h | 5 +-
security/selinux/selinuxfs.c | 4 +-
sound/core/info.c | 6 +-
sound/core/misc.c | 94 ++++++++++++++++++
sound/core/timer.c | 11 ++-
sound/pci/hda/patch_cirrus.c | 1 +
sound/pci/hda/patch_conexant.c | 11 ++-
sound/usb/bcd2000/bcd2000.c | 3 +-
97 files changed, 743 insertions(+), 294 deletions(-)
commit 5535be3099717646781ce1540cf725965d680e7b upstream.
Ever since the Dirty COW (CVE-2016-5195) security issue happened, we know
that FOLL_FORCE can be possibly dangerous, especially if there are races
that can be exploited by user space.
Right now, it would be sufficient to have some code that sets a PTE of a
R/O-mapped shared page dirty, in order for it to erroneously become
writable by FOLL_FORCE. The implications of setting a write-protected PTE
dirty might not be immediately obvious to everyone.
And in fact ever since commit 9ae0f87d009c ("mm/shmem: unconditionally set
pte dirty in mfill_atomic_install_pte"), we can use UFFDIO_CONTINUE to map
a shmem page R/O while marking the pte dirty. This can be used by
unprivileged user space to modify tmpfs/shmem file content even if the
user does not have write permissions to the file, and to bypass memfd
write sealing -- Dirty COW restricted to tmpfs/shmem (CVE-2022-2590).
To fix such security issues for good, the insight is that we really only
need that fancy retry logic (FOLL_COW) for COW mappings that are not
writable (!VM_WRITE). And in a COW mapping, we really only broke COW if
we have an exclusive anonymous page mapped. If we have something else
mapped, or the mapped anonymous page might be shared (!PageAnonExclusive),
we have to trigger a write fault to break COW. If we don't find an
exclusive anonymous page when we retry, we have to trigger COW breaking
once again because something intervened.
Let's move away from this mandatory-retry + dirty handling and rely on our
PageAnonExclusive() flag for making a similar decision, to use the same
COW logic as in other kernel parts here as well. In case we stumble over
a PTE in a COW mapping that does not map an exclusive anonymous page, COW
was not properly broken and we have to trigger a fake write-fault to break
COW.
Just like we do in can_change_pte_writable() added via commit 64fe24a3e05e
("mm/mprotect: try avoiding write faults for exclusive anonymous pages
when changing protection") and commit 76aefad628aa ("mm/mprotect: fix
soft-dirty check in can_change_pte_writable()"), take care of softdirty
and uffd-wp manually.
For example, a write() via /proc/self/mem to a uffd-wp-protected range has
to fail instead of silently granting write access and bypassing the
userspace fault handler. Note that FOLL_FORCE is not only used for debug
access, but also triggered by applications without debug intentions, for
example, when pinning pages via RDMA.
This fixes CVE-2022-2590. Note that only x86_64 and aarch64 are
affected, because only those support CONFIG_HAVE_ARCH_USERFAULTFD_MINOR.
Fortunately, FOLL_COW is no longer required to handle FOLL_FORCE. So
let's just get rid of it.
Thanks to Nadav Amit for pointing out that the pte_dirty() check in
FOLL_FORCE code is problematic and might be exploitable.
Note 1: We don't check for the PTE being dirty because it doesn't matter
for making a "was COWed" decision anymore, and whoever modifies the
page has to set the page dirty either way.
Note 2: Kernels before extended uffd-wp support and before
PageAnonExclusive (< 5.19) can simply revert the problematic
commit instead and be safe regarding UFFDIO_CONTINUE. A backport to
v5.19 requires minor adjustments due to lack of
vma_soft_dirty_enabled().
Link: https://lkml.kernel.org/r/20220809205640.70916-1-david@redhat.com
Fixes: 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: David Laight <David.Laight(a)ACULAB.COM>
Cc: <stable(a)vger.kernel.org> [5.16]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
---
include/linux/mm.h | 1 -
mm/gup.c | 69 +++++++++++++++++++++++++++++++---------------
mm/huge_memory.c | 65 +++++++++++++++++++++++++++++--------------
3 files changed, 91 insertions(+), 44 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7898e29bcfb5..25b8860f47cc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2939,7 +2939,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
-#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
diff --git a/mm/gup.c b/mm/gup.c
index e2a39e30756d..d2fd46b50102 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -478,14 +478,43 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
return -EEXIST;
}
-/*
- * FOLL_FORCE can write to even unwritable pte's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
+static inline bool can_follow_write_pte(pte_t pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
{
- return pte_write(pte) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+ /* If the pte is writable, we can write to the page. */
+ if (pte_write(pte))
+ return true;
+
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+
+ /* ... and a write-fault isn't required for other reasons. */
+ if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) &&
+ !(vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte))
+ return false;
+ return !userfaultfd_pte_wp(vma, pte);
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -528,12 +557,19 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
- if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
- pte_unmap_unlock(ptep, ptl);
- return NULL;
- }
page = vm_normal_page(vma, address, pte);
+
+ /*
+ * We only care about anon pages in can_follow_write_pte() and don't
+ * have to worry about pte_devmap() because they are never anon.
+ */
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pte(pte, page, vma, flags)) {
+ page = NULL;
+ goto out;
+ }
+
if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
/*
* Only return device mapping pages in the FOLL_GET or FOLL_PIN
@@ -967,17 +1003,6 @@ static int faultin_page(struct vm_area_struct *vma,
return -EBUSY;
}
- /*
- * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
- * necessary, even if maybe_mkwrite decided not to set pte_write. We
- * can thus safely do subsequent page lookups as if they were reads.
- * But only do so when looping for pte_write is futile: in some cases
- * userspace may also be wanting to write to the gotten user page,
- * which a read fault here might prevent (a readonly page might get
- * reCOWed by userspace write).
- */
- if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
- *flags |= FOLL_COW;
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 834f288b3769..164d13b62079 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -977,12 +977,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
assert_spin_locked(pmd_lockptr(mm, pmd));
- /*
- * When we COW a devmap PMD entry, we split it into PTEs, so we should
- * not be in this function with `flags & FOLL_COW` set.
- */
- WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
-
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
@@ -1348,14 +1342,43 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
return VM_FAULT_FALLBACK;
}
-/*
- * FOLL_FORCE can write to even unwritable pmd's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
+static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
{
- return pmd_write(pmd) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+ /* If the pmd is writable, we can write to the page. */
+ if (pmd_write(pmd))
+ return true;
+
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+
+ /* ... and a write-fault isn't required for other reasons. */
+ if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) &&
+ !(vma->vm_flags & VM_SOFTDIRTY) && !pmd_soft_dirty(pmd))
+ return false;
+ return !userfaultfd_huge_pmd_wp(vma, pmd);
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1364,12 +1387,16 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
+ struct page *page;
assert_spin_locked(pmd_lockptr(mm, pmd));
- if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
- goto out;
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pmd(*pmd, page, vma, flags))
+ return NULL;
/* Avoid dumping huge zero page */
if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
@@ -1377,10 +1404,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
/* Full NUMA hinting faults to serialise migration in fault paths */
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
- goto out;
-
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+ return NULL;
if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
return ERR_PTR(-EMLINK);
@@ -1397,7 +1421,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
-out:
return page;
}
--
2.37.1
When running identity mapped and depending on the kernel configuration,
it is possible that cc_platform_has() can have compiler generated code
that uses jump tables. This causes a boot failure because the jump table
uses un-mapped kernel virtual addresses, not identity mapped addresses.
This has been seen with CONFIG_RETPOLINE=n.
Similar to sme_encrypt_kernel(), use an open-coded direct check for the
status of SNP rather than trying to eliminate the jump table. This
preserves any code optimization in cc_platform_has() that can be useful
post boot. It also limits the changes to SEV-specific files so that
future compiler features won't necessarily require possible build changes
just because they are not compatible with running identity mapped.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216385
Link: https://lore.kernel.org/all/YqfabnTRxFSM+LoX@google.com/
Cc: <stable(a)vger.kernel.org> # 5.19.x
Reported-by: Sean Christopherson <seanjc(a)google.com>
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Tom Lendacky <thomas.lendacky(a)amd.com>
---
arch/x86/kernel/sev.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 63dc626627a0..4f84c3f11af5 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -701,7 +701,13 @@ static void __init early_set_pages_state(unsigned long paddr, unsigned int npage
void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned int npages)
{
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
return;
/*
@@ -717,7 +723,13 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned int npages)
{
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
return;
/* Invalidate the memory pages before they are marked shared in the RMP table. */
--
2.37.2
Do changesets that already included the "Fixes:" tag in the commit
description also need to include the "Cc: stable(a)vger.kernel.org" in
order to be included in stable?
--
Thanks,
Steve
The patch titled
Subject: mm/page_alloc: fix race condition between build_all_zonelists and page allocation
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-page_alloc-fix-race-condition-between-build_all_zonelists-and-page-allocation.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Mel Gorman <mgorman(a)techsingularity.net>
Subject: mm/page_alloc: fix race condition between build_all_zonelists and page allocation
Date: Wed, 24 Aug 2022 12:14:50 +0100
Patrick Daly reported the following problem;
NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK] - before offline operation
[0] - ZONE_MOVABLE
[1] - ZONE_NORMAL
[2] - NULL
For a GFP_KERNEL allocation, alloc_pages_slowpath() will save the
offset of ZONE_NORMAL in ac->preferred_zoneref. If a concurrent
memory_offline operation removes the last page from ZONE_MOVABLE,
build_all_zonelists() & build_zonerefs_node() will update
node_zonelists as shown below. Only populated zones are added.
NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK] - after offline operation
[0] - ZONE_NORMAL
[1] - NULL
[2] - NULL
The race is simple -- page allocation could be in progress when a memory
hot-remove operation triggers a zonelist rebuild that removes zones. The
allocation request will still have a valid ac->preferred_zoneref that is
now pointing to NULL and triggers an OOM kill.
This problem probably always existed but may be slightly easier to trigger
due to 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones
with pages managed by the buddy allocator") which distinguishes between
zones that are completely unpopulated versus zones that have valid pages
not managed by the buddy allocator (e.g. reserved, memblock, ballooning
etc). Memory hotplug had multiple stages with timing considerations
around managed/present page updates, the zonelist rebuild and the zone
span updates. As David Hildenbrand puts it
memory offlining adjusts managed+present pages of the zone
essentially in one go. If after the adjustments, the zone is no
longer populated (present==0), we rebuild the zone lists.
Once that's done, we try shrinking the zone (start+spanned
pages) -- which results in zone_start_pfn == 0 if there are no
more pages. That happens *after* rebuilding the zonelists via
remove_pfn_range_from_zone().
The only requirement to fix the race is that a page allocation request
identifies when a zonelist rebuild has happened since the allocation
request started and no page has yet been allocated. Use a seqlock_t to
track zonelist updates with a lockless read-side of the zonelist and
protecting the rebuild and update of the counter with a spinlock.
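For readers unfamiliar with the seqlock idiom applied here, the shape of
the fix is roughly the following (a simplified sketch, not the verbatim
patch; try_allocate() and rebuild_zonelists() are placeholders for the
allocator slow path and __build_all_zonelists()):
static DEFINE_SEQLOCK(zonelist_update_seq);
/* Allocation (reader) side: lockless, restarts if a rebuild raced. */
unsigned int seq;
struct page *page;
do {
	seq = read_seqbegin(&zonelist_update_seq);
	page = try_allocate(ac);                /* walk the zonelist */
} while (!page && read_seqretry(&zonelist_update_seq, seq));
/* Hotplug (writer) side: serialised, bumps the sequence counter. */
write_seqlock(&zonelist_update_seq);
rebuild_zonelists();
write_sequnlock(&zonelist_update_seq);
The actual patch below is slightly cheaper than this sketch: it checks
the sequence only just before it would declare OOM, rather than after
every allocation attempt.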
Link: https://lkml.kernel.org/r/20220824110900.vh674ltxmzb3proq@techsingularity.n…
Fixes: 6aa303defb74 ("mm, vmscan: only allocate and reclaim from zones with pages managed by the buddy allocator")
Signed-off-by: Mel Gorman <mgorman(a)techsingularity.net>
Reported-by: Patrick Daly <quic_pdaly(a)quicinc.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [4.9+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 53 +++++++++++++++++++++++++++++++++++++---------
1 file changed, 43 insertions(+), 10 deletions(-)
--- a/mm/page_alloc.c~mm-page_alloc-fix-race-condition-between-build_all_zonelists-and-page-allocation
+++ a/mm/page_alloc.c
@@ -4708,6 +4708,30 @@ void fs_reclaim_release(gfp_t gfp_mask)
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocation retries. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -5001,6 +5025,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
int reserve_flags;
/*
@@ -5011,11 +5036,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry_cpuset:
+restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
@@ -5187,9 +5213,13 @@ retry:
goto retry;
- /* Deal with possible cpuset update races before we start OOM killing */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * an unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -5209,9 +5239,13 @@ retry:
}
nopage:
- /* Deal with possible cpuset update races before we fail */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * an unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -6514,9 +6548,8 @@ static void __build_all_zonelists(void *
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
- static DEFINE_SPINLOCK(lock);
- spin_lock(&lock);
+ write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -6553,7 +6586,7 @@ static void __build_all_zonelists(void *
#endif
}
- spin_unlock(&lock);
+ write_sequnlock(&zonelist_update_seq);
}
static noinline void __init
_
Patches currently in -mm which might be from mgorman(a)techsingularity.net are
mm-page_alloc-fix-race-condition-between-build_all_zonelists-and-page-allocation.patch
mm-page_alloc-leave-irqs-enabled-for-per-cpu-page-allocations.patch
Hi,
We are running the bpf selftests on kernel version 5.4.210 and we see
that test case 11 of test_align failed. Please find the error below.
selftests: bpf: test_align
Test 11: pointer variable subtraction ... Failed to find match 16:
R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2;
0xfffffffc)
# func#0 @0
# 0: R1=ctx(id=0,off=0,imm=0) R10=fp0
# 0: (61) r2 = *(u32 *)(r1 +76)
# 1: R1=ctx(id=0,off=0,imm=0) R2_w=pkt(id=0,off=0,r=0,imm=0) R10=fp0
# 1: (61) r3 = *(u32 *)(r1 +80)
For the complete errors, please see the attached file. The same test
case passed on 5.4.209. Could you please let me know of any known issue
with the recent changes in 5.4.210 and how to fix these errors?
Thanks,
Rajesh Dasari.
Userspace can configure a loop device using an ioctl call, wherein
a configuration of type loop_config is passed (see lo_ioctl()'s
case on line 1550 of drivers/block/loop.c). This proceeds to call
loop_configure(), which in turn calls loop_set_status_from_info()
(see line 1050 of loop.c), passing &config->info, which is of type
loop_info64*. This function then sets the appropriate values, like
the offset.
loop_device has lo_offset of type loff_t (see line 52 of loop.c),
which is typedef-chained to long long, whereas loop_info64 has
lo_offset of type __u64 (see line 56 of include/uapi/linux/loop.h).
The function directly copies the offset from info to the device as
follows (see line 980 of loop.c):
lo->lo_offset = info->lo_offset;
This results in an overflow, which triggers a warning in iomap_iter()
due to a call to iomap_iter_done(), which has:
WARN_ON_ONCE(iter->iomap.offset > iter->pos);
Thus, check for negative values during loop_set_status_from_info().
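A standalone userspace sketch of the signedness mismatch (the offset
value is hypothetical; on the usual two's-complement targets the
conversion wraps to a negative number):

#include <stdio.h>

int main(void)
{
	/* Stand-ins for __u64 lo_offset (uapi) and loff_t lo_offset (driver). */
	unsigned long long info_offset = 0xfffffffffffffe00ULL;
	long long lo_offset = info_offset; /* wraps negative on conversion */

	if (lo_offset < 0)
		printf("overflow: lo_offset = %lld\n", lo_offset);
	return 0;
}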
Bug report: https://syzkaller.appspot.com/bug?id=c620fe14aac810396d3c3edc9ad73848bf69a2…
Reported-and-tested-by: syzbot+a8e049cd3abd342936b6(a)syzkaller.appspotmail.com
Cc: stable(a)vger.kernel.org
Reviewed-by: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Signed-off-by: Siddh Raman Pant <code(a)siddh.me>
---
Changes since v1:
- Do not break userspace API, so check loop_device for overflow.
- Use EOVERFLOW instead of EINVAL.
drivers/block/loop.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index e3c0ba93c1a3..ad92192c7d61 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -979,6 +979,11 @@ loop_set_status_from_info(struct loop_device *lo,
lo->lo_offset = info->lo_offset;
lo->lo_sizelimit = info->lo_sizelimit;
+
+ /* loff_t vars have been assigned __u64 */
+ if (lo->lo_offset < 0 || lo->lo_sizelimit < 0)
+ return -EOVERFLOW;
+
memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
lo->lo_file_name[LO_NAME_SIZE-1] = 0;
lo->lo_flags = info->lo_flags;
--
2.35.1
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8c824a869f220c6b46df724f85794349bafbf23 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Date: Mon, 13 Jun 2022 12:11:26 +0530
Subject: [PATCH] pinctrl: amd: Don't save/restore interrupt status and wake
status bits
Saving/restoring interrupt and wake status bits across suspend can
cause the suspend to fail if an IRQ is serviced across the
suspend cycle.
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Fixes: 79d2c8bede2c ("pinctrl/amd: save pin registers over suspend/resume")
Link: https://lore.kernel.org/r/20220613064127.220416-3-Basavaraj.Natikar@amd.com
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index e497df89a4a7..9ec97c6db5e9 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -918,6 +918,7 @@ static int amd_gpio_suspend(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -926,7 +927,9 @@ static int amd_gpio_suspend(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin * 4) & ~PIN_IRQ_PENDING;
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
@@ -936,6 +939,7 @@ static int amd_gpio_resume(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -944,7 +948,10 @@ static int amd_gpio_resume(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- writel(gpio_dev->saved_regs[i], gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] |= readl(gpio_dev->base + pin * 4) & PIN_IRQ_PENDING;
+ writel(gpio_dev->saved_regs[i], gpio_dev->base + pin * 4);
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
On 13-08-22, 16:23, gregkh(a)linuxfoundation.org wrote:
>
> This is a note to let you know that I've just added the patch titled
>
> soundwire: qcom: Check device status before reading devid
>
> to the 5.15-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> soundwire-qcom-check-device-status-before-reading-devid.patch
> and it can be found in the queue-5.15 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
This is causing a regression in rc1, so can this be dropped from
stable, please?
--
~Vinod
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: cdaa0a407f1acd3a44861e3aea6e3c7349e668f1
Gitweb: https://git.kernel.org/tip/cdaa0a407f1acd3a44861e3aea6e3c7349e668f1
Author: Tom Lendacky <thomas.lendacky(a)amd.com>
AuthorDate: Tue, 23 Aug 2022 16:55:51 -05:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Wed, 24 Aug 2022 09:54:32 +02:00
x86/sev: Don't use cc_platform_has() for early SEV-SNP calls
When running identity-mapped and depending on the kernel configuration,
it is possible that the compiler uses jump tables when generating code
for cc_platform_has().
This causes a boot failure because the jump table uses un-mapped kernel
virtual addresses, not identity-mapped addresses. This has been seen
with CONFIG_RETPOLINE=n.
Similar to sme_encrypt_kernel(), use an open-coded direct check for the
status of SNP rather than trying to eliminate the jump table. This
preserves any code optimization in cc_platform_has() that can be useful
post boot. It also limits the changes to SEV-specific files so that
future compiler features won't necessarily require possible build changes
just because they are not compatible with running identity-mapped.
[ bp: Massage commit message. ]
Fixes: 5e5ccff60a29 ("x86/sev: Add helper for validating pages in early enc attribute changes")
Reported-by: Sean Christopherson <seanjc(a)google.com>
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Tom Lendacky <thomas.lendacky(a)amd.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Cc: <stable(a)vger.kernel.org> # 5.19.x
Link: https://lore.kernel.org/all/YqfabnTRxFSM+LoX@google.com/
---
arch/x86/kernel/sev.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 63dc626..4f84c3f 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -701,7 +701,13 @@ e_term:
void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned int npages)
{
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
return;
/*
@@ -717,7 +723,13 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned int npages)
{
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ /*
+ * This can be invoked in early boot while running identity mapped, so
+ * use an open coded check for SNP instead of using cc_platform_has().
+ * This eliminates worries about jump tables or checking boot_cpu_data
+ * in the cc_platform_has() function.
+ */
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
return;
/* Invalidate the memory pages before they are marked shared in the RMP table. */
In some cases bootloaders will leave boot_params->cc_blob_address
uninitialized rather than zeroing it out. This field is only meant to
be set by the boot/compressed kernel to pass information to the
uncompressed kernel when SEV-SNP support is enabled, so there are no
cases where the bootloader-provided values should be treated as
anything other than garbage. Otherwise, the uncompressed kernel may
attempt to access this bogus address, leading to a crash during early
boot.
Normally sanitize_boot_params() would be used to clear out such fields,
but that happens too late: sev_enable() may have already initialized it
to a valid value that should not be zeroed out. Instead, have
sev_enable() zero it out unconditionally beforehand.
Also ensure this happens for !CONFIG_AMD_MEM_ENCRYPT as well by also
including this handling in the sev_enable() stub function.
Fixes: b190a043c49a ("x86/sev: Add SEV-SNP feature detection/setup")
Cc: stable(a)vger.kernel.org
Reported-by: Jeremi Piotrowski <jpiotrowski(a)linux.microsoft.com>
Reported-by: watnuss(a)gmx.de
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216387
Signed-off-by: Michael Roth <michael.roth(a)amd.com>
---
arch/x86/boot/compressed/misc.h | 11 ++++++++++-
arch/x86/boot/compressed/sev.c | 8 ++++++++
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 4910bf230d7b..aa7889751abc 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -132,7 +132,16 @@ void snp_set_page_private(unsigned long paddr);
void snp_set_page_shared(unsigned long paddr);
void sev_prep_identity_maps(unsigned long top_level_pgt);
#else
-static inline void sev_enable(struct boot_params *bp) { }
+static inline void sev_enable(struct boot_params *bp)
+{
+ /*
+ * bp->cc_blob_address should only be set by boot/compressed kernel.
+ * Initialize it to 0 to ensure that uninitialized values from
+ * buggy bootloaders aren't propagated.
+ */
+ if (bp)
+ bp->cc_blob_address = 0;
+}
static inline void sev_es_shutdown_ghcb(void) { }
static inline bool sev_es_check_ghcb_fault(unsigned long address)
{
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 52f989f6acc2..c93930d5ccbd 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -276,6 +276,14 @@ void sev_enable(struct boot_params *bp)
struct msr m;
bool snp;
+ /*
+ * bp->cc_blob_address should only be set by boot/compressed kernel.
+ * Initialize it to 0 to ensure that uninitialized values from
+ * buggy bootloaders aren't propagated.
+ */
+ if (bp)
+ bp->cc_blob_address = 0;
+
/*
* Setup/preliminary detection of SNP. This will be sanity-checked
* against CPUID/MSR values later.
--
2.25.1
This is the start of the stable review cycle for the 4.9.326 release.
There are 101 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Thu, 25 Aug 2022 08:00:15 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.9.326-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.9.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.9.326-rc1
Nathan Chancellor <nathan(a)kernel.org>
MIPS: tlbex: Explicitly compare _PAGE_NO_EXEC against 0
Zheyu Ma <zheyuma97(a)gmail.com>
video: fbdev: i740fb: Check the argument of i740_calc_vclk()
Zhouyi Zhou <zhouzhouyi(a)gmail.com>
powerpc/64: Init jump labels before parse_early_param()
Takashi Iwai <tiwai(a)suse.de>
ALSA: timer: Use deferred fasync helper
Takashi Iwai <tiwai(a)suse.de>
ALSA: core: Add async signal helpers
Liang He <windhl(a)126.com>
mips: cavium-octeon: Fix missing of_node_put() in octeon2_usb_clocks_start
Schspa Shi <schspa(a)gmail.com>
vfio: Clear the caps->buf to NULL after free
Liang He <windhl(a)126.com>
tty: serial: Fix refcount leak bug in ucc_uart.c
Kiselev, Oleg <okiselev(a)amazon.com>
ext4: avoid resizing to a partial cluster size
Ye Bin <yebin10(a)huawei.com>
ext4: avoid remove directory when directory is corrupted
Wentao_Liang <Wentao_Liang_g(a)163.com>
drivers:md:fix a potential use-after-free bug
Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>
cxl: Fix a memory leak in an error handling path
Jozef Martiniak <jomajm(a)gmail.com>
gadgetfs: ep_io - wait until IRQ finishes
Liang He <windhl(a)126.com>
usb: host: ohci-ppc-of: Fix refcount leak bug
Sai Prakash Ranjan <quic_saipraka(a)quicinc.com>
irqchip/tegra: Fix overflow implicit truncation warnings
Masahiro Yamada <yamada.masahiro(a)socionext.com>
kbuild: clear LDFLAGS in the top Makefile
Csókás Bence <csokas.bence(a)prolan.hu>
fec: Fix timer capture timing in `fec_ptp_enable_pps()`
Pablo Neira Ayuso <pablo(a)netfilter.org>
netfilter: nf_tables: really skip inactive sets when allocating name
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: add force_successful_syscall_return()
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: restarts apply only to the first sigframe we build...
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: fix syscall restart checks
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: traced syscall does need to check the syscall number
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: don't leave NULLs in sys_call_table[]
Al Viro <viro(a)zeniv.linux.org.uk>
nios2: page fault et.al. are *not* restartable syscalls...
Duoming Zhou <duoming(a)zju.edu.cn>
atm: idt77252: fix use-after-free bugs caused by tst_timer
Dan Carpenter <dan.carpenter(a)oracle.com>
xen/xenbus: fix return type in xenbus_file_read()
Peilin Ye <peilin.ye(a)bytedance.com>
vsock: Fix memory leak in vsock_connect()
Nikita Travkin <nikita(a)trvn.ru>
pinctrl: qcom: msm8916: Allow CAMSS GP clocks to be muxed
Miaoqian Lin <linmq006(a)gmail.com>
pinctrl: nomadik: Fix refcount leak in nmk_pinctrl_dt_subnode_to_map
Trond Myklebust <trond.myklebust(a)hammerspace.com>
SUNRPC: Reinitialise the backchannel request buffers before reuse
Zhang Xianwei <zhang.xianwei8(a)zte.com.cn>
NFSv4.1: RECLAIM_COMPLETE must handle EACCES
Marc Kleine-Budde <mkl(a)pengutronix.de>
can: ems_usb: fix clang's -Wunaligned-access warning
Filipe Manana <fdmanana(a)suse.com>
btrfs: fix lost error handling when looking up extended ref on log replay
Damien Le Moal <damien.lemoal(a)opensource.wdc.com>
ata: libata-eh: Add missing command name
Mikulas Patocka <mpatocka(a)redhat.com>
rds: add missing barrier to release_refill
Amadeusz Sławiński <amadeuszx.slawinski(a)linux.intel.com>
ALSA: info: Fix llseek return value when using callback
Jamal Hadi Salim <jhs(a)mojatatu.com>
net_sched: cls_route: disallow handle of 0
Tyler Hicks <tyhicks(a)linux.microsoft.com>
net/9p: Initialize the iounit field during fid creation
Guenter Roeck <linux(a)roeck-us.net>
nios2: time: Read timer in get_cycles only if initialized
Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm regression
Jose Alonso <joalonsof(a)gmail.com>
Revert "net: usb: ax88179_178a needs FLAG_SEND_ZLP"
Tony Battersby <tonyb(a)cybernetics.com>
scsi: sg: Allow waiting for commands to complete on removed device
Eric Dumazet <edumazet(a)google.com>
tcp: fix over estimation in sk_forced_mem_schedule()
Qu Wenruo <wqu(a)suse.com>
btrfs: reject log replay if there is unsupported RO compat flag
Thadeu Lima de Souza Cascardo <cascardo(a)canonical.com>
net_sched: cls_route: remove from list when handle is 0
Mikulas Patocka <mpatocka(a)redhat.com>
dm raid: fix address sanitizer warning in raid_status
Baokun Li <libaokun1(a)huawei.com>
ext4: correct max_inline_xattr_value_size computing
Eric Whitney <enwlinux(a)gmail.com>
ext4: fix extent status tree race in writeback error recovery path
Theodore Ts'o <tytso(a)mit.edu>
ext4: update s_overhead_clusters in the superblock during an on-line resize
Baokun Li <libaokun1(a)huawei.com>
ext4: fix use-after-free in ext4_xattr_set_entry
Lukas Czerner <lczerner(a)redhat.com>
ext4: make sure ext4_append() always allocates new block
Baokun Li <libaokun1(a)huawei.com>
ext4: add EXT4_INODE_HAS_XATTR_SPACE macro in xattr.h
David Collins <quic_collinsd(a)quicinc.com>
spmi: trace: fix stack-out-of-bound access in SPMI tracing functions
Alexander Lobakin <alexandr.lobakin(a)intel.com>
x86/olpc: fix 'logical not is only applied to the left hand side'
Steffen Maier <maier(a)linux.ibm.com>
scsi: zfcp: Fix missing auto port scan and thus missing target ports
Florian Westphal <fw(a)strlen.de>
netfilter: nf_tables: fix null deref due to zeroed list head
Weitao Wang <WeitaoWang-oc(a)zhaoxin.com>
USB: HCD: Fix URB giveback issue in tasklet function
Huacai Chen <chenhuacai(a)loongson.cn>
MIPS: cpuinfo: Fix a warning for CONFIG_CPUMASK_OFFSTACK
Michael Ellerman <mpe(a)ellerman.id.au>
powerpc/powernv: Avoid crashing if rng is NULL
Pali Rohár <pali(a)kernel.org>
powerpc/fsl-pci: Fix Class Code of PCIe Root Port
Pali Rohár <pali(a)kernel.org>
PCI: Add defines for normal and subtractive PCI bridges
Alexander Lobakin <alexandr.lobakin(a)intel.com>
ia64, processor: fix -Wincompatible-pointer-types in ia64_get_irr()
Mikulas Patocka <mpatocka(a)redhat.com>
md-raid10: fix KASAN warning
Miklos Szeredi <mszeredi(a)redhat.com>
fuse: limit nsec
Daniel Borkmann <daniel(a)iogearbox.net>
bpf: fix overflow in prog accounting
Timur Tabi <ttabi(a)nvidia.com>
drm/nouveau: fix another off-by-one in nvbios_addr
Helge Deller <deller(a)gmx.de>
parisc: Fix device names in /proc/iomem
Lukas Wunner <lukas(a)wunner.de>
usbnet: Fix linkwatch use-after-free on disconnect
David Howells <dhowells(a)redhat.com>
vfs: Check the truncate maximum size in inode_newsize_ok()
Allen Ballway <ballway(a)chromium.org>
ALSA: hda/cirrus - support for iMac 12,1 model
Meng Tang <tangmeng(a)uniontech.com>
ALSA: hda/conexant: Add quirk for LENOVO 20149 Notebook model
Sean Christopherson <seanjc(a)google.com>
KVM: x86: Mark TSS busy during LTR emulation _after_ all fault checks
Maciej S. Szmigiero <maciej.szmigiero(a)oracle.com>
KVM: SVM: Don't BUG if userspace injects an interrupt with GIF=0
Mikulas Patocka <mpatocka(a)redhat.com>
add barriers to buffer_uptodate and set_buffer_uptodate
Zheyu Ma <zheyuma97(a)gmail.com>
ALSA: bcd2000: Fix a UAF bug on the error path of probing
Nick Desaulniers <ndesaulniers(a)google.com>
x86: link vdso and boot with -z noexecstack --no-warn-rwx-segments
Nick Desaulniers <ndesaulniers(a)google.com>
Makefile: link with -z noexecstack --no-warn-rwx-segments
Ning Qiang <sohu0106(a)126.com>
macintosh/adb: fix oob read in do_adb_query() function
Hans-Christian Noren Egtvedt <hegtvedt(a)cisco.com>
random: only call boot_init_stack_canary() once
Werner Sembach <wse(a)tuxedocomputers.com>
ACPI: video: Shortening quirk list by identifying Clevo by board_name only
Werner Sembach <wse(a)tuxedocomputers.com>
ACPI: video: Force backlight native for some TongFang devices
Daniel Micay <danielmicay(a)gmail.com>
init/main.c: extract early boot entropy from the passed cmdline
Laura Abbott <lauraa(a)codeaurora.org>
init: move stack canary initialization after setup_arch
Viresh Kumar <viresh.kumar(a)linaro.org>
init/main: properly align the multi-line comment
Viresh Kumar <viresh.kumar(a)linaro.org>
init/main: Fix double "the" in comment
Christian Borntraeger <borntraeger(a)de.ibm.com>
include/uapi/linux/swab.h: fix userspace breakage, use __BITS_PER_LONG for swap
Paul Moore <paul(a)paul-moore.com>
selinux: fix inode_doinit_with_dentry() LABEL_INVALID error handling
Tianyue Ren <rentianyue(a)kylinos.cn>
selinux: fix error initialization in inode_doinit_with_dentry()
Andreas Gruenbacher <agruenba(a)redhat.com>
selinux: Convert isec->lock into a spinlock
Andreas Gruenbacher <agruenba(a)redhat.com>
selinux: Clean up initialization of isec->sclass
Andreas Gruenbacher <agruenba(a)redhat.com>
proc: Pass file mode to proc_pid_make_inode
Andreas Gruenbacher <agruenba(a)redhat.com>
selinux: Minor cleanups
Nathan Chancellor <nathan(a)kernel.org>
ion: Make user_ion_handle_put_nolock() a void function
Wei Mingzhi <whistler(a)member.fsf.org>
mt7601u: add USB device ID for some versions of XiaoDu WiFi Dongle.
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
ARM: crypto: comment out gcc warning that breaks clang builds
Florian Westphal <fw(a)strlen.de>
netfilter: nf_queue: do not allow packet truncation below transport header offset
Liang He <windhl(a)126.com>
net: sungem_phy: Add of_node_put() for reference returned by of_get_parent()
Kuniyuki Iwashima <kuniyu(a)amazon.com>
net: ping6: Fix memleak in ipv6_renew_options().
Liang He <windhl(a)126.com>
scsi: ufs: host: Hold reference returned by of_parse_phandle()
ChenXiaoSong <chenxiaosong2(a)huawei.com>
ntfs: fix use-after-free in ntfs_ucsncmp()
Luiz Augusto von Dentz <luiz.von.dentz(a)intel.com>
Bluetooth: L2CAP: Fix use-after-free caused by l2cap_chan_put
-------------
Diffstat:
Makefile | 8 +-
arch/arm/lib/xor-neon.c | 3 +-
arch/ia64/include/asm/processor.h | 2 +-
arch/mips/cavium-octeon/octeon-platform.c | 3 +-
arch/mips/kernel/proc.c | 2 +-
arch/mips/mm/tlbex.c | 4 +-
arch/nios2/include/asm/entry.h | 3 +-
arch/nios2/include/asm/ptrace.h | 2 +
arch/nios2/kernel/entry.S | 22 +++--
arch/nios2/kernel/signal.c | 3 +-
arch/nios2/kernel/syscall_table.c | 1 +
arch/nios2/kernel/time.c | 5 +-
arch/parisc/kernel/drivers.c | 9 +-
arch/powerpc/kernel/prom.c | 7 ++
arch/powerpc/platforms/powernv/rng.c | 2 +
arch/powerpc/sysdev/fsl_pci.c | 8 ++
arch/powerpc/sysdev/fsl_pci.h | 1 +
arch/x86/boot/Makefile | 2 +-
arch/x86/boot/compressed/Makefile | 4 +
arch/x86/entry/vdso/Makefile | 2 +-
arch/x86/kvm/emulate.c | 19 ++--
arch/x86/kvm/svm.c | 2 -
arch/x86/platform/olpc/olpc-xo1-sci.c | 2 +-
drivers/acpi/video_detect.c | 55 +++++++----
drivers/ata/libata-eh.c | 1 +
drivers/atm/idt77252.c | 1 +
drivers/gpu/drm/nouveau/nvkm/subdev/bios/base.c | 2 +-
drivers/irqchip/irq-tegra.c | 10 +-
drivers/macintosh/adb.c | 2 +-
drivers/md/dm-raid.c | 2 +-
drivers/md/raid10.c | 5 +-
drivers/md/raid5.c | 2 +-
drivers/misc/cxl/irq.c | 1 +
drivers/net/can/usb/ems_usb.c | 2 +-
drivers/net/ethernet/freescale/fec_ptp.c | 6 +-
drivers/net/sungem_phy.c | 1 +
drivers/net/usb/ax88179_178a.c | 14 +--
drivers/net/usb/usbnet.c | 8 +-
drivers/net/wireless/mediatek/mt7601u/usb.c | 1 +
drivers/pinctrl/nomadik/pinctrl-nomadik.c | 4 +-
drivers/pinctrl/qcom/pinctrl-msm8916.c | 4 +-
drivers/s390/scsi/zfcp_fc.c | 29 ++++--
drivers/s390/scsi/zfcp_fc.h | 6 +-
drivers/s390/scsi/zfcp_fsf.c | 4 +-
drivers/scsi/sg.c | 57 ++++++-----
drivers/scsi/ufs/ufshcd-pltfrm.c | 15 ++-
drivers/staging/android/ion/ion-ioctl.c | 8 +-
drivers/tty/serial/ucc_uart.c | 2 +
drivers/usb/core/hcd.c | 26 ++---
drivers/usb/gadget/legacy/inode.c | 1 +
drivers/usb/host/ohci-ppc-of.c | 1 +
drivers/vfio/vfio.c | 1 +
drivers/video/fbdev/i740fb.c | 9 +-
drivers/xen/xenbus/xenbus_dev_frontend.c | 4 +-
fs/attr.c | 2 +
fs/btrfs/disk-io.c | 14 +++
fs/btrfs/tree-log.c | 4 +-
fs/ext4/inline.c | 3 +
fs/ext4/inode.c | 7 ++
fs/ext4/namei.c | 23 ++++-
fs/ext4/resize.c | 11 +++
fs/ext4/xattr.c | 6 +-
fs/ext4/xattr.h | 13 +++
fs/fuse/inode.c | 6 ++
fs/nfs/nfs4proc.c | 3 +
fs/ntfs/attrib.c | 8 +-
fs/proc/base.c | 23 ++---
fs/proc/fd.c | 6 +-
fs/proc/internal.h | 2 +-
fs/proc/namespaces.c | 3 +-
include/linux/bpf.h | 11 +++
include/linux/buffer_head.h | 25 ++++-
include/linux/pci_ids.h | 2 +
include/linux/usb/hcd.h | 1 +
include/net/bluetooth/l2cap.h | 1 +
include/sound/core.h | 8 ++
include/trace/events/spmi.h | 12 +--
include/uapi/linux/swab.h | 4 +-
init/main.c | 14 +--
kernel/bpf/core.c | 16 ++-
kernel/bpf/syscall.c | 36 +++++--
net/9p/client.c | 4 +-
net/bluetooth/l2cap_core.c | 68 +++++++++----
net/ipv4/tcp_output.c | 7 +-
net/ipv6/ping.c | 6 ++
net/netfilter/nf_tables_api.c | 3 +-
net/netfilter/nfnetlink_queue.c | 7 +-
net/rds/ib_recv.c | 1 +
net/sched/cls_route.c | 8 +-
net/sunrpc/backchannel_rqst.c | 14 +++
net/vmw_vsock/af_vsock.c | 9 +-
security/selinux/hooks.c | 123 +++++++++++++++---------
security/selinux/include/objsec.h | 5 +-
security/selinux/selinuxfs.c | 4 +-
sound/core/info.c | 6 +-
sound/core/misc.c | 94 ++++++++++++++++++
sound/core/timer.c | 11 ++-
sound/pci/hda/patch_cirrus.c | 1 +
sound/pci/hda/patch_conexant.c | 11 ++-
sound/usb/bcd2000/bcd2000.c | 3 +-
100 files changed, 753 insertions(+), 296 deletions(-)
The patch titled
Subject: mm/mprotect: Only reference swap pfn page if type match
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-mprotect-only-reference-swap-pfn-page-if-type-match.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Peter Xu <peterx(a)redhat.com>
Subject: mm/mprotect: Only reference swap pfn page if type match
Date: Tue, 23 Aug 2022 18:11:38 -0400
Yu Zhao reported a bug after the commit "mm/swap: Add swp_offset_pfn() to
fetch PFN from swap entry" added a check in swp_offset_pfn() for swap type [1]:
kernel BUG at include/linux/swapops.h:117!
CPU: 46 PID: 5245 Comm: EventManager_De Tainted: G S O L 6.0.0-dbg-DEV #2
RIP: 0010:pfn_swap_entry_to_page+0x72/0xf0
Code: c6 48 8b 36 48 83 fe ff 74 53 48 01 d1 48 83 c1 08 48 8b 09 f6
c1 01 75 7b 66 90 48 89 c1 48 8b 09 f6 c1 01 74 74 5d c3 eb 9e <0f> 0b
48 ba ff ff ff ff 03 00 00 00 eb ae a9 ff 0f 00 00 75 13 48
RSP: 0018:ffffa59e73fabb80 EFLAGS: 00010282
RAX: 00000000ffffffe8 RBX: 0c00000000000000 RCX: ffffcd5440000000
RDX: 1ffffffffff7a80a RSI: 0000000000000000 RDI: 0c0000000000042b
RBP: ffffa59e73fabb80 R08: ffff9965ca6e8bb8 R09: 0000000000000000
R10: ffffffffa5a2f62d R11: 0000030b372e9fff R12: ffff997b79db5738
R13: 000000000000042b R14: 0c0000000000042b R15: 1ffffffffff7a80a
FS: 00007f549d1bb700(0000) GS:ffff99d3cf680000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000440d035b3180 CR3: 0000002243176004 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
change_pte_range+0x36e/0x880
change_p4d_range+0x2e8/0x670
change_protection_range+0x14e/0x2c0
mprotect_fixup+0x1ee/0x330
do_mprotect_pkey+0x34c/0x440
__x64_sys_mprotect+0x1d/0x30
It triggers because pfn_swap_entry_to_page() could be called upon e.g. a
genuine swap entry.
Fix it by only calling it when it's a write migration entry where the page*
is used.
[1] https://lore.kernel.org/lkml/CAOUHufaVC2Za-p8m0aiHw6YkheDcrO-C3wRGixwDS32VT…
Link: https://lkml.kernel.org/r/20220823221138.45602-1-peterx@redhat.com
Fixes: 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive")
Signed-off-by: Peter Xu <peterx(a)redhat.com>
Reported-by: Yu Zhao <yuzhao(a)google.com>
Tested-by: Yu Zhao <yuzhao(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mprotect.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/mm/mprotect.c~mm-mprotect-only-reference-swap-pfn-page-if-type-match
+++ a/mm/mprotect.c
@@ -196,10 +196,11 @@ static unsigned long change_pte_range(st
pages++;
} else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
- struct page *page = pfn_swap_entry_to_page(entry);
pte_t newpte;
if (is_writable_migration_entry(entry)) {
+ struct page *page = pfn_swap_entry_to_page(entry);
+
/*
* A protection check is difficult so
* just be safe and disable write
_
Patches currently in -mm which might be from peterx(a)redhat.com are
mm-mprotect-only-reference-swap-pfn-page-if-type-match.patch
mm-x86-use-swp_type_bits-in-3-level-swap-macros.patch
mm-swap-comment-all-the-ifdef-in-swapopsh.patch
mm-swap-add-swp_offset_pfn-to-fetch-pfn-from-swap-entry.patch
mm-thp-carry-over-dirty-bit-when-thp-splits-on-pmd.patch
mm-remember-young-dirty-bit-for-page-migrations.patch
mm-swap-cache-maximum-swapfile-size-when-init-swap.patch
mm-swap-cache-swap-migration-a-d-bits-support.patch
In the function report_idle_softirq(), ratelimit is a static variable
that is checked for being < 10 in order to return false. Since this
variable is never assigned any other value before the check, the
condition is always true.
report_idle_softirq(), introduced with commit 0345691b24c0 ("tick/rcu:
Stop allowing RCU_SOFTIRQ in idle"), checks for pending softirqs while
stopping the tick and returns true if the tick can't be stopped.
The purpose of ratelimit is to limit the printing of warning messages,
so move the check so that it only gates the warning message. Also don't
return false, as the tick can't be stopped here due to the pending
softirq.
Fixes: 0345691b24c0 ("tick/rcu: Stop allowing RCU_SOFTIRQ in idle")
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Cc: stable(a)vger.kernel.org # 5.18+
---
One more change from the prior kernel version:
In the prior version, the !local_bh_blocked() check was only used for
printing the warning message. Its result was not used to decide whether
to allow the tick to stop or not. But with the change introduced in
5.18, the result also decides whether to stop the tick, even if
local_softirq_pending() returns true.
Not sure if this is an intentional change or not.
kernel/time/tick-sched.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b0e3c9205946..4db634525bf4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1025,16 +1025,15 @@ static bool report_idle_softirq(void)
return false;
}
- if (ratelimit < 10)
- return false;
-
/* On RT, softirqs handling may be waiting on some lock */
if (!local_bh_blocked())
return false;
- pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
- pending);
- ratelimit++;
+ if (ratelimit < 10) {
+ pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+ pending);
+ ratelimit++;
+ }
return true;
}
--
2.35.3
One of the side effects of mb_optimize_scan was that the optimized
functions to select the next group to try were called even before we
tried the goal group. As a result we no longer allocate files close to
their corresponding inodes, and we don't try to expand the currently
allocated extent in the same group. This results in a reaim regression
with the workfile.disk workload of up to 8% with many clients on my
test machine:
baseline mb_optimize_scan
Hmean disk-1 2114.16 ( 0.00%) 2099.37 ( -0.70%)
Hmean disk-41 87794.43 ( 0.00%) 83787.47 * -4.56%*
Hmean disk-81 148170.73 ( 0.00%) 135527.05 * -8.53%*
Hmean disk-121 177506.11 ( 0.00%) 166284.93 * -6.32%*
Hmean disk-161 220951.51 ( 0.00%) 207563.39 * -6.06%*
Hmean disk-201 208722.74 ( 0.00%) 203235.59 ( -2.63%)
Hmean disk-241 222051.60 ( 0.00%) 217705.51 ( -1.96%)
Hmean disk-281 252244.17 ( 0.00%) 241132.72 * -4.41%*
Hmean disk-321 255844.84 ( 0.00%) 245412.84 * -4.08%*
This is also causing a huge regression (time increased by a factor of
5 or so) when untarring an archive with lots of small files on some
eMMC storage cards.
Fix the problem by making sure we try goal group first.
Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning")
CC: stable(a)vger.kernel.org
Reported-by: Stefan Wahren <stefan.wahren(a)i2se.com>
Link: https://lore.kernel.org/all/20220727105123.ckwrhbilzrxqpt24@quack3/
Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/ext4/mballoc.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bd8f8b5c3d30..41e1cfecac3b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1049,8 +1049,10 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
{
*new_cr = ac->ac_criteria;
- if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
+ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
+ *group = next_linear_group(ac, *group, ngroups);
return;
+ }
if (*new_cr == 0) {
ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
@@ -2636,7 +2638,7 @@ static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
ext4_group_t prefetch_grp = 0, ngroups, group, i;
- int cr = -1;
+ int cr = -1, new_cr;
int err = 0, first_err = 0;
unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
@@ -2711,13 +2713,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
prefetch_grp = group;
- for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
- i++) {
- int ret = 0, new_cr;
+ for (i = 0, new_cr = cr; i < ngroups; i++,
+ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
+ int ret = 0;
cond_resched();
-
- ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
if (new_cr != cr) {
cr = new_cr;
goto repeat;
--
2.35.3
From: Guenter Roeck <linux(a)roeck-us.net>
[ Upstream commit 0cc011c576aaa4de505046f7a6c90933d7c749a9 ]
In some circumstances, attempts are made to add entries to or to remove
entries from an uninitialized list. A prime example is
amdgpu_bo_vm_destroy(): It is indirectly called from
ttm_bo_init_reserved() if that function fails, and tries to remove an
entry from a list. However, that list is only initialized in
amdgpu_bo_create_vm() after the call to ttm_bo_init_reserved() returned
success. This results in crashes such as
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 1 PID: 1479 Comm: chrome Not tainted 5.10.110-15768-g29a72e65dae5
Hardware name: Google Grunt/Grunt, BIOS Google_Grunt.11031.149.0 07/15/2020
RIP: 0010:__list_del_entry_valid+0x26/0x7d
...
Call Trace:
amdgpu_bo_vm_destroy+0x48/0x8b
ttm_bo_init_reserved+0x1d7/0x1e0
amdgpu_bo_create+0x212/0x476
? amdgpu_bo_user_destroy+0x23/0x23
? kmem_cache_alloc+0x60/0x271
amdgpu_bo_create_vm+0x40/0x7d
amdgpu_vm_pt_create+0xe8/0x24b
...
Check if the list's prev and next pointers are NULL to catch such problems.
Link: https://lkml.kernel.org/r/20220531222951.92073-1-linux@roeck-us.net
Signed-off-by: Guenter Roeck <linux(a)roeck-us.net>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
lib/list_debug.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 5d5424b51b74..413daa72a3d8 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -20,7 +20,11 @@
bool __list_add_valid(struct list_head *new, struct list_head *prev,
struct list_head *next)
{
- if (CHECK_DATA_CORRUPTION(next->prev != prev,
+ if (CHECK_DATA_CORRUPTION(prev == NULL,
+ "list_add corruption. prev is NULL.\n") ||
+ CHECK_DATA_CORRUPTION(next == NULL,
+ "list_add corruption. next is NULL.\n") ||
+ CHECK_DATA_CORRUPTION(next->prev != prev,
"list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n",
prev, next->prev, next) ||
CHECK_DATA_CORRUPTION(prev->next != next,
@@ -42,7 +46,11 @@ bool __list_del_entry_valid(struct list_head *entry)
prev = entry->prev;
next = entry->next;
- if (CHECK_DATA_CORRUPTION(next == LIST_POISON1,
+ if (CHECK_DATA_CORRUPTION(next == NULL,
+ "list_del corruption, %px->next is NULL\n", entry) ||
+ CHECK_DATA_CORRUPTION(prev == NULL,
+ "list_del corruption, %px->prev is NULL\n", entry) ||
+ CHECK_DATA_CORRUPTION(next == LIST_POISON1,
"list_del corruption, %px->next is LIST_POISON1 (%px)\n",
entry, LIST_POISON1) ||
CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
--
2.35.1
This is a note to let you know that I've just added the patch titled
iio: ad7292: Prevent regulator double disable
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 22b4277641c6823ec03d5b1cd82628e5e53e75b7 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount(a)gmail.com>
Date: Fri, 19 Aug 2022 11:51:07 +0300
Subject: iio: ad7292: Prevent regulator double disable
The ad7292 tries to add a devm_action for disabling a regulator at
device detach using devm_add_action_or_reset(). The
devm_add_action_or_reset() call itself invokes the release function
should adding the action fail. The driver inspects the value returned
by devm_add_action_or_reset() and manually calls regulator_disable()
if adding the action has failed. This leads to a double disable and
messes up the enable count for the regulator.
Do not manually call disable if devm_add_action_or_reset() fails.
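For reference, a minimal sketch of the intended pattern (illustrative
probe-time code; the example_* names are placeholders, not the driver's
actual symbols):

#include <linux/device.h>
#include <linux/regulator/consumer.h>

static void example_regulator_disable(void *data)
{
	regulator_disable(data);
}

static int example_setup(struct device *dev, struct regulator *reg)
{
	int ret;

	ret = regulator_enable(reg);
	if (ret)
		return ret;

	/*
	 * On failure, devm_add_action_or_reset() has already invoked
	 * example_regulator_disable(reg) itself; just propagate the
	 * error and never call regulator_disable() again here.
	 */
	return devm_add_action_or_reset(dev, example_regulator_disable, reg);
}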
Fixes: 506d2e317a0a ("iio: adc: Add driver support for AD7292")
Signed-off-by: Matti Vaittinen <mazziesaccount(a)gmail.com>
Tested-by: Marcelo Schmitt <marcelo.schmitt1(a)gmail.com>
Link: https://lore.kernel.org/r/Yv9O+9sxU7gAv3vM@fedora
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/ad7292.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/iio/adc/ad7292.c b/drivers/iio/adc/ad7292.c
index 92c68d467c50..a2f9fda25ff3 100644
--- a/drivers/iio/adc/ad7292.c
+++ b/drivers/iio/adc/ad7292.c
@@ -287,10 +287,8 @@ static int ad7292_probe(struct spi_device *spi)
ret = devm_add_action_or_reset(&spi->dev,
ad7292_regulator_disable, st);
- if (ret) {
- regulator_disable(st->reg);
+ if (ret)
return ret;
- }
ret = regulator_get_voltage(st->reg);
if (ret < 0)
--
2.37.2
This is a note to let you know that I've just added the patch titled
iio: adc: mcp3911: use correct formula for AD conversion
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 9e2238e3ae40d371a1130226e0e740aa1601efa6 Mon Sep 17 00:00:00 2001
From: Marcus Folkesson <marcus.folkesson(a)gmail.com>
Date: Fri, 22 Jul 2022 15:07:20 +0200
Subject: iio: adc: mcp3911: use correct formula for AD conversion
The ADC conversion is actually not rail-to-rail but involves a factor
of 1.5. Make use of this factor when calculating the actual voltage.
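As a worked check of the scale used below (assuming Gain = 1):
Voltage = Raw * Vref / (2^23 * Gain * 1.5), and
2^23 * 1.5 = 8388608 * 1.5 = 12582912, which matches the val2 divisor
introduced in the diff.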
Fixes: 3a89b289df5d ("iio: adc: add support for mcp3911")
Signed-off-by: Marcus Folkesson <marcus.folkesson(a)gmail.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
Link: https://lore.kernel.org/r/20220722130726.7627-4-marcus.folkesson@gmail.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/mcp3911.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/drivers/iio/adc/mcp3911.c b/drivers/iio/adc/mcp3911.c
index f8875076ae80..890af7dca62d 100644
--- a/drivers/iio/adc/mcp3911.c
+++ b/drivers/iio/adc/mcp3911.c
@@ -40,8 +40,8 @@
#define MCP3911_CHANNEL(x) (MCP3911_REG_CHANNEL0 + x * 3)
#define MCP3911_OFFCAL(x) (MCP3911_REG_OFFCAL_CH0 + x * 6)
-/* Internal voltage reference in uV */
-#define MCP3911_INT_VREF_UV 1200000
+/* Internal voltage reference in mV */
+#define MCP3911_INT_VREF_MV 1200
#define MCP3911_REG_READ(reg, id) ((((reg) << 1) | ((id) << 5) | (1 << 0)) & 0xff)
#define MCP3911_REG_WRITE(reg, id) ((((reg) << 1) | ((id) << 5) | (0 << 0)) & 0xff)
@@ -139,11 +139,18 @@ static int mcp3911_read_raw(struct iio_dev *indio_dev,
*val = ret / 1000;
} else {
- *val = MCP3911_INT_VREF_UV;
+ *val = MCP3911_INT_VREF_MV;
}
- *val2 = 24;
- ret = IIO_VAL_FRACTIONAL_LOG2;
+ /*
+ * For 24-bit conversion
+ * Raw = (Voltage / Vref) * 2^23 * Gain * 1.5
+ * Voltage = Raw * Vref / (2^23 * Gain * 1.5)
+ */
+
+ /* val2 = (2^23 * 1.5) */
+ *val2 = 12582912;
+ ret = IIO_VAL_FRACTIONAL;
break;
}
--
2.37.2
This is a note to let you know that I've just added the patch titled
iio: light: cm3605: Fix an error handling path in cm3605_probe()
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 160905549e663019e26395ed9d66c24ee2cf5187 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>
Date: Sun, 7 Aug 2022 08:37:43 +0200
Subject: iio: light: cm3605: Fix an error handling path in cm3605_probe()
The commit in Fixes also introduced a new error handling path, which
should branch to the existing error handling path. Otherwise, some
resources leak.
Fixes: 0d31d91e6145 ("iio: light: cm3605: Make use of the helper function dev_err_probe()")
Signed-off-by: Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>
Link: https://lore.kernel.org/r/0e186de2c125b3e17476ebf9c54eae4a5d66f994.16598542…
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/light/cm3605.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/iio/light/cm3605.c b/drivers/iio/light/cm3605.c
index c721b69d5095..0b30db77f78b 100644
--- a/drivers/iio/light/cm3605.c
+++ b/drivers/iio/light/cm3605.c
@@ -226,8 +226,10 @@ static int cm3605_probe(struct platform_device *pdev)
}
irq = platform_get_irq(pdev, 0);
- if (irq < 0)
- return dev_err_probe(dev, irq, "failed to get irq\n");
+ if (irq < 0) {
+ ret = dev_err_probe(dev, irq, "failed to get irq\n");
+ goto out_disable_aset;
+ }
ret = devm_request_threaded_irq(dev, irq, cm3605_prox_irq,
NULL, 0, "cm3605", indio_dev);
--
2.37.2
This is a note to let you know that I've just added the patch titled
iio: adc: mcp3911: correct "microchip,device-addr" property
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From cfbd76d5c9c449739bb74288d982bccf9ff822f4 Mon Sep 17 00:00:00 2001
From: Marcus Folkesson <marcus.folkesson(a)gmail.com>
Date: Fri, 22 Jul 2022 15:07:19 +0200
Subject: iio: adc: mcp3911: correct "microchip,device-addr" property
Go for the right property name that is documented in the bindings.
Fixes: 3a89b289df5d ("iio: adc: add support for mcp3911")
Signed-off-by: Marcus Folkesson <marcus.folkesson(a)gmail.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
Link: https://lore.kernel.org/r/20220722130726.7627-3-marcus.folkesson@gmail.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/mcp3911.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/drivers/iio/adc/mcp3911.c b/drivers/iio/adc/mcp3911.c
index f581cefb6719..f8875076ae80 100644
--- a/drivers/iio/adc/mcp3911.c
+++ b/drivers/iio/adc/mcp3911.c
@@ -210,7 +210,14 @@ static int mcp3911_config(struct mcp3911 *adc)
u32 configreg;
int ret;
- device_property_read_u32(dev, "device-addr", &adc->dev_addr);
+ ret = device_property_read_u32(dev, "microchip,device-addr", &adc->dev_addr);
+
+ /*
+ * Fallback to "device-addr" due to historical mismatch between
+ * dt-bindings and implementation
+ */
+ if (ret)
+ device_property_read_u32(dev, "device-addr", &adc->dev_addr);
if (adc->dev_addr > 3) {
dev_err(&adc->spi->dev,
"invalid device address (%i). Must be in range 0-3.\n",
--
2.37.2
Move the access_ok() check, which validates a memory range supplied
from user space, from tee_shm_register() to tee_ioctl_shm_register().
With this we error out early, before anything is done that must be
undone on error.
Fixes: 578c349570d2 ("tee: add overflow check in register_shm_helper()")
Cc: stable(a)vger.kernel.org # 5.10
Reported-by: Pavel Machek <pavel(a)denx.de>
Signed-off-by: Jens Wiklander <jens.wiklander(a)linaro.org>
---
Hi,
This patch targets the 5.10.y release to take care of a recently introduced
issue there.
Thanks,
Jens
drivers/tee/tee_core.c | 3 +++
drivers/tee/tee_shm.c | 3 ---
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
index e07f997cf8dd..9cc4a7b63b0d 100644
--- a/drivers/tee/tee_core.c
+++ b/drivers/tee/tee_core.c
@@ -334,6 +334,9 @@ tee_ioctl_shm_register(struct tee_context *ctx,
if (data.flags)
return -EINVAL;
+ if (!access_ok((void __user *)(unsigned long)data.addr, data.length))
+ return -EFAULT;
+
shm = tee_shm_register(ctx, data.addr, data.length,
TEE_SHM_DMA_BUF | TEE_SHM_USER_MAPPED);
if (IS_ERR(shm))
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 6e662fb131d5..499fccba3d74 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -222,9 +222,6 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
goto err;
}
- if (!access_ok((void __user *)addr, length))
- return ERR_PTR(-EFAULT);
-
mutex_lock(&teedev->mutex);
shm->id = idr_alloc(&teedev->idr, shm, 1, 0, GFP_KERNEL);
mutex_unlock(&teedev->mutex);
--
2.31.1
The kernel test robot reported a kernel-doc warning:
arch/mips/pci/pci-ar2315.c:6: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
* Both AR2315 and AR2316 chips have PCI interface unit, which supports DMA
The warning above is caused by an extraneous asterisk in the top-level
(description) comment of pci-ar2315.c, which causes the comment to be
mistaken for a kernel-doc comment.
Remove the asterisk.
Link: https://lore.kernel.org/linux-doc/202208221854.8ASrzjKa-lkp@intel.com/
Fixes: 3ed7a2a702dc0f ("MIPS: ath25: add AR2315 PCI host controller driver")
Fixes: 3e58e839150db0 ("scripts: kernel-doc: add warning for comment not following kernel-doc syntax")
Reported-by: kernel test robot <lkp(a)intel.com>
Cc: stable(a)vger.kernel.org # v5.15, v5.19
Signed-off-by: Bagas Sanjaya <bagasdotme(a)gmail.com>
---
The second Fixes: commit exposes the kernel-doc warning, while the
actual culprit is the first Fixes: commit, so it makes sense to apply
this patch to supported stable branches that already contain the second
commit.
arch/mips/pci/pci-ar2315.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/mips/pci/pci-ar2315.c b/arch/mips/pci/pci-ar2315.c
index 30e0922f4ceaec..e17d862cfa4c6a 100644
--- a/arch/mips/pci/pci-ar2315.c
+++ b/arch/mips/pci/pci-ar2315.c
@@ -2,7 +2,7 @@
/*
*/
-/**
+/*
* Both AR2315 and AR2316 chips have PCI interface unit, which supports DMA
* and interrupt. PCI interface supports MMIO access method, but does not
* seem to support I/O ports.
base-commit: 072e51356cd5a4a1c12c1020bc054c99b98333df
This is a note to let you know that I've just added the patch titled
Revert "usb: typec: ucsi: add a common function
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 5f73aa2cf8bef4a39baa1591c3144ede4788826e Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai(a)suse.de>
Date: Tue, 23 Aug 2022 08:54:55 +0200
Subject: Revert "usb: typec: ucsi: add a common function
ucsi_unregister_connectors()"
The recent commit 87d0e2f41b8c ("usb: typec: ucsi: add a common
function ucsi_unregister_connectors()") introduced a regression that
caused a NULL dereference when reading the power supply sysfs. It's a
stale sysfs entry that should have been removed but remains with NULL
ops. The commit changed the error handling to skip the entries after
a NULL con->wq, and this leaves the power device unreleased.
To address the regression, a straight revert is applied here. Further
code improvements can be done from scratch.
Link: https://bugzilla.suse.com/show_bug.cgi?id=1202386
Link: https://lore.kernel.org/r/87r11cmbx0.wl-tiwai@suse.de
Fixes: 87d0e2f41b8c ("usb: typec: ucsi: add a common function ucsi_unregister_connectors()")
Cc: <stable(a)vger.kernel.org>
Acked-by: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
Link: https://lore.kernel.org/r/20220823065455.32579-1-tiwai@suse.de
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/typec/ucsi/ucsi.c | 53 ++++++++++++++++-------------------
1 file changed, 24 insertions(+), 29 deletions(-)
diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c
index 1aea46493b85..7f2624f42724 100644
--- a/drivers/usb/typec/ucsi/ucsi.c
+++ b/drivers/usb/typec/ucsi/ucsi.c
@@ -1200,32 +1200,6 @@ static int ucsi_register_port(struct ucsi *ucsi, int index)
return ret;
}
-static void ucsi_unregister_connectors(struct ucsi *ucsi)
-{
- struct ucsi_connector *con;
- int i;
-
- if (!ucsi->connector)
- return;
-
- for (i = 0; i < ucsi->cap.num_connectors; i++) {
- con = &ucsi->connector[i];
-
- if (!con->wq)
- break;
-
- cancel_work_sync(&con->work);
- ucsi_unregister_partner(con);
- ucsi_unregister_altmodes(con, UCSI_RECIPIENT_CON);
- ucsi_unregister_port_psy(con);
- destroy_workqueue(con->wq);
- typec_unregister_port(con->port);
- }
-
- kfree(ucsi->connector);
- ucsi->connector = NULL;
-}
-
/**
* ucsi_init - Initialize UCSI interface
* @ucsi: UCSI to be initialized
@@ -1234,6 +1208,7 @@ static void ucsi_unregister_connectors(struct ucsi *ucsi)
*/
static int ucsi_init(struct ucsi *ucsi)
{
+ struct ucsi_connector *con;
u64 command;
int ret;
int i;
@@ -1264,7 +1239,7 @@ static int ucsi_init(struct ucsi *ucsi)
}
/* Allocate the connectors. Released in ucsi_unregister() */
- ucsi->connector = kcalloc(ucsi->cap.num_connectors,
+ ucsi->connector = kcalloc(ucsi->cap.num_connectors + 1,
sizeof(*ucsi->connector), GFP_KERNEL);
if (!ucsi->connector) {
ret = -ENOMEM;
@@ -1288,7 +1263,15 @@ static int ucsi_init(struct ucsi *ucsi)
return 0;
err_unregister:
- ucsi_unregister_connectors(ucsi);
+ for (con = ucsi->connector; con->port; con++) {
+ ucsi_unregister_partner(con);
+ ucsi_unregister_altmodes(con, UCSI_RECIPIENT_CON);
+ ucsi_unregister_port_psy(con);
+ if (con->wq)
+ destroy_workqueue(con->wq);
+ typec_unregister_port(con->port);
+ con->port = NULL;
+ }
err_reset:
memset(&ucsi->cap, 0, sizeof(ucsi->cap));
@@ -1402,6 +1385,7 @@ EXPORT_SYMBOL_GPL(ucsi_register);
void ucsi_unregister(struct ucsi *ucsi)
{
u64 cmd = UCSI_SET_NOTIFICATION_ENABLE;
+ int i;
/* Make sure that we are not in the middle of driver initialization */
cancel_delayed_work_sync(&ucsi->work);
@@ -1409,7 +1393,18 @@ void ucsi_unregister(struct ucsi *ucsi)
/* Disable notifications */
ucsi->ops->async_write(ucsi, UCSI_CONTROL, &cmd, sizeof(cmd));
- ucsi_unregister_connectors(ucsi);
+ for (i = 0; i < ucsi->cap.num_connectors; i++) {
+ cancel_work_sync(&ucsi->connector[i].work);
+ ucsi_unregister_partner(&ucsi->connector[i]);
+ ucsi_unregister_altmodes(&ucsi->connector[i],
+ UCSI_RECIPIENT_CON);
+ ucsi_unregister_port_psy(&ucsi->connector[i]);
+ if (ucsi->connector[i].wq)
+ destroy_workqueue(ucsi->connector[i].wq);
+ typec_unregister_port(ucsi->connector[i].port);
+ }
+
+ kfree(ucsi->connector);
}
EXPORT_SYMBOL_GPL(ucsi_unregister);
--
2.37.2
From: Tadeusz Struk <tadeusz.struk(a)linaro.org>
commit 4c46091ee985ae84c60c5e95055d779fcd291d87 upstream.
Syzbot found a use-after-free bug in compute_effective_progs().
The reproducer creates a number of BPF links, and causes a
fault-injected allocation to fail while calling bpf_link_detach on
them.
Link detach triggers the link to be freed by bpf_link_free(),
which calls __cgroup_bpf_detach() and update_effective_progs().
If the memory allocation in this function fails, the function restores
the pointer to the bpf_cgroup_link on the cgroup list, but the memory
gets freed just after it returns. After this, every subsequent call to
update_effective_progs() causes this already deallocated pointer to be
dereferenced in prog_list_length(), which triggers a KASAN UAF error.
To fix this issue, don't preserve the pointer to the prog or link in
the list; instead, remove it and replace it with a dummy prog without
shrinking the table. The subsequent call to __cgroup_bpf_detach() or
__cgroup_bpf_attach() will correct it.
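As a standalone illustration of the recovery strategy (invented names
modeled loosely on prog_list_length(), not the kernel code itself):
walkers skip dummy (NULL) entries, so leaving a dummy slot in place is
safe, while restoring a pointer the caller is about to free is not.

#include <stdio.h>

static int prog_list_length(void *progs[], int n)
{
	int cnt = 0;

	for (int i = 0; i < n; i++)
		if (progs[i])	/* dummy (NULL) entries are skipped */
			cnt++;
	return cnt;
}

int main(void)
{
	int a, b, c;
	void *progs[] = { &a, &b, &c };

	/* detach failed mid-way: keep a dummy slot instead of a
	 * pointer that is about to be freed */
	progs[1] = NULL;
	printf("effective progs: %d\n", prog_list_length(progs, 3));
	return 0;
}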
Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment")
Reported-by: <syzbot+f264bffdfbd5614f3bb2(a)syzkaller.appspotmail.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk(a)linaro.org>
Signed-off-by: Andrii Nakryiko <andrii(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Link: https://syzkaller.appspot.com/bug?id=8ebf179a95c2a2670f7cf1ba62429ec044369d…
Link: https://lore.kernel.org/bpf/20220517180420.87954-1-tadeusz.struk@linaro.org
Signed-off-by: Pu Lehui <pulehui(a)huawei.com>
---
kernel/bpf/cgroup.c | 70 ++++++++++++++++++++++++++++++++++++++-------
1 file changed, 60 insertions(+), 10 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6aa9e10c6335..d154e52dd7ae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -653,6 +653,60 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
return ERR_PTR(-ENOENT);
}
+/**
+ * purge_effective_progs() - After compute_effective_progs fails to alloc new
+ * cgrp->bpf.inactive table we can recover by
+ * recomputing the array in place.
+ *
+ * @cgrp: The cgroup whose descendants to traverse
+ * @prog: A program to detach or NULL
+ * @link: A link to detach or NULL
+ * @type: Type of detach operation
+ */
+static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type)
+{
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct list_head *head;
+ struct cgroup *cg;
+ int pos;
+
+ /* recompute effective prog array in place */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ /* find position of link or prog in effective progs array */
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+ if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ head = &cg->bpf.progs[type];
+ list_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (pl->prog == prog && pl->link == link)
+ goto found;
+ pos++;
+ }
+ }
+found:
+ BUG_ON(!cg);
+ progs = rcu_dereference_protected(
+ desc->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+
+ /* Remove the program from the array */
+ WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
+ "Failed to purge a prog from array at index %d", pos);
+ }
+}
+
/**
* __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
* propagate the change to descendants
@@ -671,7 +725,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
u32 flags = cgrp->bpf.flags[type];
struct bpf_prog_list *pl;
struct bpf_prog *old_prog;
- int err;
if (prog && link)
/* only one of prog or link can be specified */
@@ -686,9 +739,12 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
pl->prog = NULL;
pl->link = NULL;
- err = update_effective_progs(cgrp, type);
- if (err)
- goto cleanup;
+ if (update_effective_progs(cgrp, type)) {
+ /* if update effective array failed replace the prog with a dummy prog */
+ pl->prog = old_prog;
+ pl->link = link;
+ purge_effective_progs(cgrp, old_prog, link, type);
+ }
/* now can actually delete it from this cgroup list */
list_del(&pl->node);
@@ -700,12 +756,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
bpf_prog_put(old_prog);
static_branch_dec(&cgroup_bpf_enabled_key);
return 0;
-
-cleanup:
- /* restore back prog or link */
- pl->prog = old_prog;
- pl->link = link;
- return err;
}
/* Must be called with cgroup_mutex held to avoid races. */
--
2.25.1
This is the backport for v4.19.x stable branch.
The full explanation can be found here:
https://lore.kernel.org/linux-btrfs/cover.1660891713.git.wqu@suse.com/
There is no code change between the v4.14.x and v4.19.x backports, at
least nothing git cannot auto-resolve.
Testing-wise, this is beyond my testing environment.
Although the latest GCC compiles it without problems, the resulting
kernel cannot boot properly at all; there is not even any early kernel
boot message.
I'm not sure whether this is related to the latest edk2 UEFI or
something else, but I can no longer do proper testing for any older
branch, including this 4.19 one.
Thus I cannot give any guarantee on these backports; unfortunately,
they can only go to the v5.x branches for now, unless anyone has
better ideas on how to build and run older kernels with the latest
edk2 UEFI environment.
Qu Wenruo (2):
btrfs: only write the sectors in the vertical stripe which has data
stripes
btrfs: raid56: don't trust any cached sector in
__raid56_parity_recover()
fs/btrfs/raid56.c | 74 ++++++++++++++++++++++++++++++++++++-----------
1 file changed, 57 insertions(+), 17 deletions(-)
--
2.37.1
Syzkaller reports a "using smp_processor_id() in preemptible code"
warning at radix_tree_node_alloc() in the 5.10 stable releases. The
problem has been fixed by the following patch, which can be cleanly
applied to the 5.10 branch.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Hello,
Here's another round of xfs backports for 5.15.y that have been
through testing and ACK'd.
Thanks,
Leah
Brian Foster (2):
xfs: flush inodegc workqueue tasks before cancel
xfs: fix soft lockup via spinning in filestream ag selection loop
Darrick J. Wong (6):
xfs: reserve quota for dir expansion when linking/unlinking files
xfs: reserve quota for target dir expansion when renaming files
xfs: remove infinite loop when reserving free block pool
xfs: always succeed at setting the reserve pool size
xfs: fix overfilling of reserve pool
xfs: reject crazy array sizes being fed to XFS_IOC_GETBMAP*
Eric Sandeen (1):
xfs: revert "xfs: actually bump warning counts when we send warnings"
fs/xfs/xfs_filestream.c | 7 ++--
fs/xfs/xfs_fsops.c | 50 ++++++++++-------------
fs/xfs/xfs_icache.c | 22 ++--------
fs/xfs/xfs_inode.c | 79 ++++++++++++++++++++++--------------
fs/xfs/xfs_ioctl.c | 2 +-
fs/xfs/xfs_trans.c | 86 ++++++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_trans.h | 3 ++
fs/xfs/xfs_trans_dquot.c | 1 -
8 files changed, 167 insertions(+), 83 deletions(-)
--
2.37.1.595.g718a3a8f04-goog
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ab8384442ee512fc0fc72deeb036110843d0e7ff Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
Date: Sat, 20 Aug 2022 09:43:21 -0400
Subject: [PATCH] tracing/probes: Have kprobes and uprobes use $COMM too
Both $comm and $COMM can be used to get current->comm in eprobes and the
filtering and histogram logic. Make kprobes and uprobes consistent in this
regard and allow both $comm and $COMM as well. Currently kprobes and
uprobes only handle $comm, which is inconsistent with the other utilities,
and can be confusing to users.
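As a hedged usage sketch (the probed symbol do_sys_openat2 and the
tracefs mount point are assumptions, not taken from this patch), after
this change a kprobe definition can use $COMM exactly like $comm:

#include <stdio.h>

int main(void)
{
	/* assumed tracefs mount point; adjust as needed */
	FILE *f = fopen("/sys/kernel/tracing/kprobe_events", "a");

	if (!f) {
		perror("kprobe_events");
		return 1;
	}
	/* $COMM now behaves like $comm and records current->comm */
	fprintf(f, "p:myopen do_sys_openat2 comm=$COMM\n");
	return fclose(f) ? 1 : 0;
}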
Link: https://lkml.kernel.org/r/20220820134401.317014913@goodmis.org
Link: https://lore.kernel.org/all/20220820220442.776e1ddaf8836e82edb34d01@kernel.…
Cc: stable(a)vger.kernel.org
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov(a)gmail.com>
Cc: Tom Zanussi <zanussi(a)kernel.org>
Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code")
Suggested-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 4daabbb8b772..36dff277de46 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -314,7 +314,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
}
} else
goto inval_var;
- } else if (strcmp(arg, "comm") == 0) {
+ } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
code->op = FETCH_OP_COMM;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
} else if (((flags & TPARG_FL_MASK) ==
@@ -625,7 +625,8 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
* we can find those by strcmp. But ignore for eprobes.
*/
if (!(flags & TPARG_FL_TPOINT) &&
- (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0)) {
+ (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 ||
+ strncmp(arg, "\\\"", 2) == 0)) {
/* The type of $comm must be "string", and not an array. */
if (parg->count || (t && strcmp(t, "string")))
goto out;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f04dec93466a0481763f3b56cdadf8076e28bfbf Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
Date: Sat, 20 Aug 2022 09:43:19 -0400
Subject: [PATCH] tracing/eprobes: Fix reading of string fields
Currently when an event probe (eprobe) hooks to a string field, it does
not display it as a string, but instead as a number. This makes the field
rather useless. Handle the different kinds of strings, dynamic, static,
relational/dynamic etc.
Now when a string field is used, the ":string" type can be used to display
it:
echo "e:sw sched/sched_switch comm=$next_comm:string" > dynamic_events
Link: https://lkml.kernel.org/r/20220820134400.959640191@goodmis.org
Cc: stable(a)vger.kernel.org
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov(a)gmail.com>
Cc: Tom Zanussi <zanussi(a)kernel.org>
Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events")
Acked-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 550671985fd1..a1d3423ab74f 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -311,6 +311,27 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec)
addr = rec + field->offset;
+ if (is_string_field(field)) {
+ switch (field->filter_type) {
+ case FILTER_DYN_STRING:
+ val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff));
+ break;
+ case FILTER_RDYN_STRING:
+ val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff));
+ break;
+ case FILTER_STATIC_STRING:
+ val = (unsigned long)addr;
+ break;
+ case FILTER_PTR_STRING:
+ val = (unsigned long)(*(char *)addr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+ return val;
+ }
+
switch (field->size) {
case 1:
if (field->is_signed)
From: Badari Pulavarty <badari.pulavarty(a)intel.com>
When a user tries to create a DAMON context via the DAMON debugfs
interface with the name of an already existing context, the context
directory creation fails, but a new context is still created and added
to the internal data structure because the directory creation success
check is missing. As a result, memory can leak and DAMON cannot be
turned on. An example test case is as below:
# cd /sys/kernel/debug/damon/
# echo "off" > monitor_on
# echo paddr > target_ids
# echo "abc" > mk_context
# echo "abc" > mk_context
# echo $$ > abc/target_ids
# echo "on" > monitor_on <<< fails
The return value of 'debugfs_create_dir()' is expected to be ignored
in general, but this is an exceptional case, as the DAMON feature
depends on the debugfs functionality and has the potential duplicate
name issue. This commit therefore fixes the issue by checking for
directory creation failure and immediately returning the error in that
case.
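A standalone model of the fix (invented helpers; mk_dir() stands in
for debugfs_create_dir(), which hands back an error pointer on
failure): a new context is recorded only after the directory creation
actually succeeded.

#include <stdio.h>
#include <string.h>

#define MAX_CTXS 8

static const char *ctxs[MAX_CTXS];
static int nr_ctxs;

static const char *mk_dir(const char *name)
{
	for (int i = 0; i < nr_ctxs; i++)
		if (!strcmp(ctxs[i], name))
			return NULL;	/* duplicate name: creation fails */
	return name;
}

static int mk_context(const char *name)
{
	const char *dir = mk_dir(name);

	/* the check the old code was missing */
	if (!dir || nr_ctxs >= MAX_CTXS)
		return -1;
	ctxs[nr_ctxs++] = dir;
	return 0;
}

int main(void)
{
	printf("%d\n", mk_context("abc"));	/* 0  */
	printf("%d\n", mk_context("abc"));	/* -1 */
	return 0;
}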
Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts")
Cc: <stable(a)vger.kernel.org> # 5.15.x
Signed-off-by: Badari Pulavarty <badari.pulavarty(a)intel.com>
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
Changes from v2
(https://lore.kernel.org/damon/20220819171930.16166-1-sj@kernel.org/)
- Simply check the debugfs_create_dir() return value (Andrew Morton)
- Put a comment for justifying check of the return value (Greg KH)
Changes from v1
(https://lore.kernel.org/damon/DM6PR11MB3978994F75A4104D714437379C679@DM6PR1…)
- Manually check duplicate entry instead of checking
'debugfs_create_dir()' return value
- Reword commit message and the test case
Seems Badari have some email client issue, so I (SJ) am making this
third version of the patch based on Badari's final proposal and
reposting on behalf of Badari.
mm/damon/dbgfs.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 51d67c8050dd..3b55a1b219b5 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -818,6 +818,9 @@ static int dbgfs_mk_context(char *name)
return -ENOENT;
new_dir = debugfs_create_dir(name, root);
+ /* Below check is required for a potential duplicated name case */
+ if (IS_ERR(new_dir))
+ return PTR_ERR(new_dir);
dbgfs_dirs[dbgfs_nr_ctxs] = new_dir;
new_ctx = dbgfs_new_ctx();
--
2.25.1
The Intel IOMMU driver may select between the first-level and the
second-level translation tables for DMA address translation. However,
the levels of page-table walks for the 4KB base page size are calculated
from the SAGAW field of the capability register, which is only valid for
the second-level page table. This causes the IOMMU driver to stop working
if the hardware (or the emulated IOMMU) advertises only first-level
translation capability and reports the SAGAW field as 0.
This solves the above problem by considering both the first level and the
second level when calculating the supported page table levels.
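As a standalone illustration (the SAGAW bit meanings follow the
encoding in section 11.4.2 of the VT-d spec that the new comment
references; the example capability values are invented): when both
table formats may be used, only the widths supported by both remain
usable.

#include <stdio.h>

int main(void)
{
	/* bit 1 = 39-bit/3-level, bit 2 = 48-bit/4-level,
	 * bit 3 = 57-bit/5-level */
	unsigned long fl_sagaw = 0x6;	/* first level: 39- and 48-bit */
	unsigned long sl_sagaw = 0x4;	/* second level: 48-bit only */
	unsigned long sagaw = fl_sagaw & sl_sagaw;

	for (int agaw = 4; agaw >= 0; agaw--) {
		if (sagaw & (1UL << agaw)) {
			printf("highest usable agaw: %d (%d-bit)\n",
			       agaw, 30 + 9 * agaw);
			break;
		}
	}
	return 0;
}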
Fixes: b802d070a52a1 ("iommu/vt-d: Use iova over first level")
Cc: stable(a)vger.kernel.org
Signed-off-by: Lu Baolu <baolu.lu(a)linux.intel.com>
---
drivers/iommu/intel/iommu.c | 28 +++++++++++++++++++++++++---
1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 889ad2c9a7b9..2d0d2ef820d2 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -370,14 +370,36 @@ static inline int domain_pfn_supported(struct dmar_domain *domain,
return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
+/*
+ * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
+ * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
+ * the returned SAGAW.
+ */
+static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
+{
+ unsigned long fl_sagaw, sl_sagaw;
+
+ fl_sagaw = 0x6 | (cap_fl1gp_support(iommu->cap) ? BIT(3) : 0);
+ sl_sagaw = cap_sagaw(iommu->cap);
+
+ /* Second level only. */
+ if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
+ return sl_sagaw;
+
+ /* First level only. */
+ if (!ecap_slts(iommu->ecap))
+ return fl_sagaw;
+
+ return fl_sagaw & sl_sagaw;
+}
+
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
unsigned long sagaw;
int agaw;
- sagaw = cap_sagaw(iommu->cap);
- for (agaw = width_to_agaw(max_gaw);
- agaw >= 0; agaw--) {
+ sagaw = __iommu_calculate_sagaw(iommu);
+ for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
if (test_bit(agaw, &sagaw))
break;
}
--
2.25.1
The Amiga partition parser module uses signed int for partition sector
address and count, which will overflow for disks larger than 1 TB.
Use u64 as type for sector address and size to allow using disks up to
2 TB without LBD support, and disks larger than 2 TB with LBD. The RDB
format allows specifying disk sizes up to 2^128 bytes (though native
OS limitations reduce this somewhat, to a maximum of 2^68 bytes), so
check carefully for u64 overflow to protect against overflowing
sector_t.
Bail out if sector addresses overflow 32 bits on kernels without LBD
support.
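As a standalone sketch of the arithmetic hazard (invented geometry
values; check_mul_overflow() in the kernel wraps the same compiler
builtin used here): two u32 factors can already overflow u32, and
chaining in the cylinder count and block-size multiplier is why each
step has to be checked.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* invented geometry: 2^16 heads * 2^16 sectors per track */
	uint32_t nr_hd = 0x10000, nr_sect = 0x10000;
	uint32_t cylblk;

	if (__builtin_mul_overflow(nr_hd, nr_sect, &cylblk))
		printf("heads*sects overflows u32, skipping partition\n");
	else
		printf("blocks per cylinder: %u\n", cylblk);
	return 0;
}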
This bug was reported originally in 2012, and the fix was created by
the RDB author, Joanne Dow <jdow(a)earthlink.net>. A patch had been
discussed and reviewed on linux-m68k at that time but never officially
submitted (now resubmitted as separate patch).
This patch adds additional error checking and warning messages.
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=43511
Fixes: 1da177e4c3f41524 ("Linux-2.6.12-rc2")
Cc: <stable(a)vger.kernel.org> # 5.2
Reported-by: Martin Steigerwald <Martin(a)lichtvoll.de>
Message-ID: <201206192146.09327.Martin(a)lichtvoll.de>
Signed-off-by: Michael Schmitz <schmitzmic(a)gmail.com>
Reviewed-by: Geert Uytterhoeven <geert(a)linux-m68k.org>
Reviewed-by: Christoph Hellwig <hch(a)infradead.org>
---
Changes from RFC:
- use u64 instead of sector_t, since that may be u32 without LBD support
- check multiplication overflows each step - 3 u32 values may exceed u64!
- warn against use on AmigaDOS if partition data overflow u32 sector count.
- warn if partition CylBlocks larger than what's stored in the RDSK header.
- bail out if we were to overflow sector_t (32 or 64 bit).
Changes from v1:
Kars de Jong:
- use defines for magic offsets in DosEnvec struct
Geert Uytterhoeven:
- use u64 cast for multiplications of u32 numbers
- use array3_size for overflow checks
- change pr_err to pr_warn
- discontinue use of pr_cont
- reword log messages
- drop redundant nr_sects overflow test
- warn against 32 bit overflow for each affected partition
- skip partitions that overflow sector_t size instead of aborting scan
Changes from v2:
- further trim 32 bit overflow test
- correct duplicate types.h inclusion introduced in v2
Changes from v3:
- split off sector address type fix for independent review
- change blksize to unsigned
- use check_mul_overflow() instead of array3_size()
- rewrite checks to avoid 64 bit divisions in check_mul_overflow()
Changes from v5:
Geert Uytterhoeven:
- correct ineffective u64 cast to avoid 32 bit mult. overflow
- fix mult. overflow in partition block address calculation
Changes from v6:
Geert Uytterhoeven:
- don't fail hard on partition block address overflow
Changes from v7:
- replace bdevname(state->bdev, b) by state->disk->disk_name
- drop warn_no_part conditionals
- remove remaining warn_no_part
Changes from v8:
Christoph Hellwig:
- whitespace fix, drop unnecessary u64 casts
kbuild warning:
- sparse warning fix
---
block/partitions/amiga.c | 103 ++++++++++++++++++++++++++++++++-------
1 file changed, 85 insertions(+), 18 deletions(-)
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
index 85c5c79aae48..e28a4b917822 100644
--- a/block/partitions/amiga.c
+++ b/block/partitions/amiga.c
@@ -11,10 +11,18 @@
#define pr_fmt(fmt) fmt
#include <linux/types.h>
+#include <linux/mm_types.h>
+#include <linux/overflow.h>
#include <linux/affs_hardblocks.h>
#include "check.h"
+/* magic offsets in partition DosEnvVec */
+#define NR_HD 3
+#define NR_SECT 5
+#define LO_CYL 9
+#define HI_CYL 10
+
static __inline__ u32
checksum_block(__be32 *m, int size)
{
@@ -31,9 +39,12 @@ int amiga_partition(struct parsed_partitions *state)
unsigned char *data;
struct RigidDiskBlock *rdb;
struct PartitionBlock *pb;
- sector_t start_sect, nr_sects;
- int blk, part, res = 0;
- int blksize = 1; /* Multiplier for disk block size */
+ u64 start_sect, nr_sects;
+ sector_t blk, end_sect;
+ u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */
+ u32 nr_hd, nr_sect, lo_cyl, hi_cyl;
+ int part, res = 0;
+ unsigned int blksize = 1; /* Multiplier for disk block size */
int slot = 1;
for (blk = 0; ; blk++, put_dev_sector(sect)) {
@@ -41,7 +52,7 @@ int amiga_partition(struct parsed_partitions *state)
goto rdb_done;
data = read_part_sector(state, blk, &sect);
if (!data) {
- pr_err("Dev %s: unable to read RDB block %d\n",
+ pr_err("Dev %s: unable to read RDB block %llu\n",
state->disk->disk_name, blk);
res = -1;
goto rdb_done;
@@ -58,12 +69,12 @@ int amiga_partition(struct parsed_partitions *state)
*(__be32 *)(data+0xdc) = 0;
if (checksum_block((__be32 *)data,
be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
- pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
+ pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n",
blk);
break;
}
- pr_err("Dev %s: RDB in block %d has bad checksum\n",
+ pr_err("Dev %s: RDB in block %llu has bad checksum\n",
state->disk->disk_name, blk);
}
@@ -80,10 +91,15 @@ int amiga_partition(struct parsed_partitions *state)
blk = be32_to_cpu(rdb->rdb_PartitionList);
put_dev_sector(sect);
for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
- blk *= blksize; /* Read in terms partition table understands */
+ /* Read in terms partition table understands */
+ if (check_mul_overflow(blk, (sector_t) blksize, &blk)) {
+ pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n",
+ state->disk->disk_name, blk, part);
+ break;
+ }
data = read_part_sector(state, blk, &sect);
if (!data) {
- pr_err("Dev %s: unable to read partition block %d\n",
+ pr_err("Dev %s: unable to read partition block %llu\n",
state->disk->disk_name, blk);
res = -1;
goto rdb_done;
@@ -95,19 +111,70 @@ int amiga_partition(struct parsed_partitions *state)
if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
continue;
- /* Tell Kernel about it */
+ /* RDB gives us more than enough rope to hang ourselves with,
+ * many times over (2^128 bytes if all fields max out).
+ * Some careful checks are in order, so check for potential
+ * overflows.
+ * We are multiplying four 32 bit numbers to one sector_t!
+ */
+
+ nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]);
+ nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]);
+
+ /* CylBlocks is total number of blocks per cylinder */
+ if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) {
+ pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n",
+ state->disk->disk_name, cylblk);
+ continue;
+ }
+
+ /* check for consistency with RDB defined CylBlocks */
+ if (cylblk > be32_to_cpu((__be32)rdb->rdb_CylBlocks)) {
+ pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n",
+ state->disk->disk_name, cylblk,
+ be32_to_cpu(rdb->rdb_CylBlocks));
+ }
+
+ /* RDB allows for variable logical block size -
+ * normalize to 512 byte blocks and check result.
+ */
+
+ if (check_mul_overflow(cylblk, blksize, &cylblk)) {
+ pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n",
+ state->disk->disk_name, part);
+ continue;
+ }
+
+ /* Calculate partition start and end. Limit of 32 bit on cylblk
+ * guarantees no overflow occurs if LBD support is enabled.
+ */
+
+ lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]);
+ start_sect = ((u64) lo_cyl * cylblk);
+
+ hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]);
+ nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk);
- nr_sects = ((sector_t)be32_to_cpu(pb->pb_Environment[10]) + 1 -
- be32_to_cpu(pb->pb_Environment[9])) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
if (!nr_sects)
continue;
- start_sect = (sector_t)be32_to_cpu(pb->pb_Environment[9]) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
+
+ /* Warn user if partition end overflows u32 (AmigaDOS limit) */
+
+ if ((start_sect + nr_sects) > UINT_MAX) {
+ pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n",
+ state->disk->disk_name, part,
+ start_sect, start_sect + nr_sects);
+ }
+
+ if (check_add_overflow(start_sect, nr_sects, &end_sect)) {
+ pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n",
+ state->disk->disk_name, part,
+ start_sect, end_sect);
+ continue;
+ }
+
+ /* Tell Kernel about it */
+
put_partition(state,slot++,start_sect,nr_sects);
{
/* Be even more informative to aid mounting */
--
2.17.1