From: Eric Biggers <ebiggers(a)google.com>
commit f5e55e777cc93eae1416f0fa4908e8846b6d7825 upstream.
[Please apply to 4.9-stable. This is an important fix to have,
and it will be needed for xfstest generic/398 to pass if
https://lkml.kernel.org/r/20201031054141.695517-1-ebiggers@kernel.org
is applied. Note, this commit had to be reworked to apply to 4.9.]
Currently, trying to rename or link a regular file, directory, or
symlink into an encrypted directory fails with EPERM when the source
file is unencrypted or is encrypted with a different encryption policy,
and is on the same mountpoint. It is correct for the operation to fail,
but the choice of EPERM breaks tools like 'mv' that know to copy rather
than rename if they see EXDEV, but don't know what to do with EPERM.
Our original motivation for EPERM was to encourage users to securely
handle their data. Encrypting files by "moving" them into an encrypted
directory can be insecure because the unencrypted data may remain in
free space on disk, where it can later be recovered by an attacker.
It's much better to encrypt the data from the start, or at least try to
securely delete the source data e.g. using the 'shred' program.
However, the current behavior hasn't been effective at achieving its
goal because users tend to be confused, hack around it, and complain;
see e.g. https://github.com/google/fscrypt/issues/76. And in some cases
it's actually inconsistent or unnecessary. For example, 'mv'-ing files
between differently encrypted directories doesn't work even in cases
where it can be secure, such as when in userspace the same passphrase
protects both directories. Yet, you *can* already 'mv' unencrypted
files into an encrypted directory if the source files are on a different
mountpoint, even though doing so is often insecure.
There are probably better ways to teach users to securely handle their
files. For example, the 'fscrypt' userspace tool could provide a
command that migrates unencrypted files into an encrypted directory,
acting like 'shred' on the source files and providing appropriate
warnings depending on the type of the source filesystem and disk.
Receiving errors on unimportant files might also force some users to
disable encryption, thus making the behavior counterproductive. It's
desirable to make encryption as unobtrusive as possible.
Therefore, change the error code from EPERM to EXDEV so that tools
looking for EXDEV will fall back to a copy.
This, of course, doesn't prevent users from still doing the right things
to securely manage their files. Note that this also matches the
behavior when a file is renamed between two project quota hierarchies;
so there's precedent for using EXDEV for things other than mountpoints.
xfstests generic/398 will require an update with this change.
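For illustration only (not part of this patch), the EXDEV fallback described
above looks roughly like this in userspace; copy_file() is a hypothetical
stand-in for mv's copy path:
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
/* Hypothetical helper standing in for mv's copy path. */
extern int copy_file(const char *src, const char *dst);
static int move_file(const char *src, const char *dst)
{
	if (rename(src, dst) == 0)
		return 0;
	if (errno != EXDEV)
		return -1;	/* e.g. EPERM: no sane fallback exists */
	/* EXDEV: fall back to copy + unlink, like 'mv' does across mountpoints. */
	if (copy_file(src, dst) != 0)
		return -1;
	return unlink(src);
}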
[Rewritten from an earlier patch series by Michael Halcrow.]
Cc: Michael Halcrow <mhalcrow(a)google.com>
Cc: Joe Richey <joerichey(a)google.com>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
fs/crypto/policy.c | 3 +--
fs/ext4/namei.c | 6 +++---
fs/f2fs/namei.c | 6 +++---
3 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 57a97b38a2fa2..51f4463718e84 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -180,8 +180,7 @@ EXPORT_SYMBOL(fscrypt_get_policy);
* malicious offline violations of this constraint, while the link and rename
* checks are needed to prevent online violations of this constraint.
*
- * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail
- * the filesystem operation with EPERM.
+ * Return: 1 if permitted, 0 if forbidden.
*/
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 157dbbe235f90..8ded38ac4cdef 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3259,7 +3259,7 @@ static int ext4_link(struct dentry *old_dentry,
return -EMLINK;
if (ext4_encrypted_inode(dir) &&
!fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ return -EXDEV;
if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
(!projid_eq(EXT4_I(dir)->i_projid,
@@ -3597,7 +3597,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if ((old.dir != new.dir) &&
ext4_encrypted_inode(new.dir) &&
!fscrypt_has_permitted_context(new.dir, old.inode)) {
- retval = -EPERM;
+ retval = -EXDEV;
goto end_rename;
}
@@ -3776,7 +3776,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
(old_dir != new_dir) &&
(!fscrypt_has_permitted_context(new_dir, old.inode) ||
!fscrypt_has_permitted_context(old_dir, new.inode)))
- return -EPERM;
+ return -EXDEV;
if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
!projid_eq(EXT4_I(new_dir)->i_projid,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index ccb99d5cfd8b0..ce0957318771c 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -177,7 +177,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
if (f2fs_encrypted_inode(dir) &&
!fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ return -EXDEV;
f2fs_balance_fs(sbi, true);
@@ -667,7 +667,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
!fscrypt_has_permitted_context(new_dir, old_inode)) {
- err = -EPERM;
+ err = -EXDEV;
goto out;
}
@@ -855,7 +855,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
(old_dir != new_dir) &&
(!fscrypt_has_permitted_context(new_dir, old_inode) ||
!fscrypt_has_permitted_context(old_dir, new_inode)))
- return -EPERM;
+ return -EXDEV;
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry) {
--
2.29.1
From: Eric Biggers <ebiggers(a)google.com>
commit f5e55e777cc93eae1416f0fa4908e8846b6d7825 upstream.
[Please apply to 4.14-stable. This is an important fix to have,
and it will be needed for xfstest generic/398 to pass if
https://lkml.kernel.org/r/20201031054141.695517-1-ebiggers@kernel.org
is applied. Note, this commit had to be reworked to apply to 4.14.]
Currently, trying to rename or link a regular file, directory, or
symlink into an encrypted directory fails with EPERM when the source
file is unencrypted or is encrypted with a different encryption policy,
and is on the same mountpoint. It is correct for the operation to fail,
but the choice of EPERM breaks tools like 'mv' that know to copy rather
than rename if they see EXDEV, but don't know what to do with EPERM.
Our original motivation for EPERM was to encourage users to securely
handle their data. Encrypting files by "moving" them into an encrypted
directory can be insecure because the unencrypted data may remain in
free space on disk, where it can later be recovered by an attacker.
It's much better to encrypt the data from the start, or at least try to
securely delete the source data e.g. using the 'shred' program.
However, the current behavior hasn't been effective at achieving its
goal because users tend to be confused, hack around it, and complain;
see e.g. https://github.com/google/fscrypt/issues/76. And in some cases
it's actually inconsistent or unnecessary. For example, 'mv'-ing files
between differently encrypted directories doesn't work even in cases
where it can be secure, such as when in userspace the same passphrase
protects both directories. Yet, you *can* already 'mv' unencrypted
files into an encrypted directory if the source files are on a different
mountpoint, even though doing so is often insecure.
There are probably better ways to teach users to securely handle their
files. For example, the 'fscrypt' userspace tool could provide a
command that migrates unencrypted files into an encrypted directory,
acting like 'shred' on the source files and providing appropriate
warnings depending on the type of the source filesystem and disk.
Receiving errors on unimportant files might also force some users to
disable encryption, thus making the behavior counterproductive. It's
desirable to make encryption as unobtrusive as possible.
Therefore, change the error code from EPERM to EXDEV so that tools
looking for EXDEV will fall back to a copy.
This, of course, doesn't prevent users from still doing the right things
to securely manage their files. Note that this also matches the
behavior when a file is renamed between two project quota hierarchies;
so there's precedent for using EXDEV for things other than mountpoints.
xfstests generic/398 will require an update with this change.
[Rewritten from an earlier patch series by Michael Halcrow.]
Cc: Michael Halcrow <mhalcrow(a)google.com>
Cc: Joe Richey <joerichey(a)google.com>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
fs/crypto/policy.c | 3 +--
fs/ext4/namei.c | 6 +++---
fs/f2fs/namei.c | 6 +++---
fs/ubifs/dir.c | 6 +++---
4 files changed, 10 insertions(+), 11 deletions(-)
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index d13a154c84240..4cda0e960bc26 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -153,8 +153,7 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
* malicious offline violations of this constraint, while the link and rename
* checks are needed to prevent online violations of this constraint.
*
- * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail
- * the filesystem operation with EPERM.
+ * Return: 1 if permitted, 0 if forbidden.
*/
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3f999053457b6..6936de30fcf0d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3280,7 +3280,7 @@ static int ext4_link(struct dentry *old_dentry,
return -EMLINK;
if (ext4_encrypted_inode(dir) &&
!fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ return -EXDEV;
if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
(!projid_eq(EXT4_I(dir)->i_projid,
@@ -3618,7 +3618,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if ((old.dir != new.dir) &&
ext4_encrypted_inode(new.dir) &&
!fscrypt_has_permitted_context(new.dir, old.inode)) {
- retval = -EPERM;
+ retval = -EXDEV;
goto end_rename;
}
@@ -3798,7 +3798,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
(old_dir != new_dir) &&
(!fscrypt_has_permitted_context(new_dir, old.inode) ||
!fscrypt_has_permitted_context(old_dir, new.inode)))
- return -EPERM;
+ return -EXDEV;
if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
!projid_eq(EXT4_I(new_dir)->i_projid,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b13383948fca3..9fb98fce70965 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -222,7 +222,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
if (f2fs_encrypted_inode(dir) &&
!fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ return -EXDEV;
if (is_inode_flag_set(dir, FI_PROJ_INHERIT) &&
(!projid_eq(F2FS_I(dir)->i_projid,
@@ -746,7 +746,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
!fscrypt_has_permitted_context(new_dir, old_inode)) {
- err = -EPERM;
+ err = -EXDEV;
goto out;
}
@@ -942,7 +942,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
(old_dir != new_dir) &&
(!fscrypt_has_permitted_context(new_dir, old_inode) ||
!fscrypt_has_permitted_context(old_dir, new_inode)))
- return -EPERM;
+ return -EXDEV;
if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
!projid_eq(F2FS_I(new_dir)->i_projid,
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 358abc26dbc0b..9d5face7fdc01 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -747,7 +747,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
if (ubifs_crypt_is_encrypted(dir) &&
!fscrypt_has_permitted_context(dir, inode))
- return -EPERM;
+ return -EXDEV;
err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
if (err)
@@ -1357,7 +1357,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dir != new_dir) {
if (ubifs_crypt_is_encrypted(new_dir) &&
!fscrypt_has_permitted_context(new_dir, old_inode))
- return -EPERM;
+ return -EXDEV;
}
if (unlink && is_dir) {
@@ -1579,7 +1579,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
(old_dir != new_dir) &&
(!fscrypt_has_permitted_context(new_dir, fst_inode) ||
!fscrypt_has_permitted_context(old_dir, snd_inode)))
- return -EPERM;
+ return -EXDEV;
err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm);
if (err)
--
2.29.1
Building powerpc:defconfig ... failed
--------------
Error log:
arch/powerpc/platforms/powernv/opal-dump.c: In function ‘process_dump’:
arch/powerpc/platforms/powernv/opal-dump.c:409:7: error: void value not ignored as it ought to be
dump = create_dump_obj(dump_id, dump_size, dump_type);
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
When an interrupt or NMI comes in and switches the context, there's a delay
before preempt_count() reflects the update. The preempt_count() is used to
detect recursion by having each context set its own bit when tracing starts;
if that bit is already set, the call is considered a recursion and the
function exits. But if this happens in the window where the context has
changed but preempt_count() has not yet been updated, the call will be
incorrectly flagged as a recursion.
To handle this case, create another bit called TRANSITION and test it if the
current context bit is already set. Flag the call as a recursion if the
TRANSITION bit is already set, and if not, set it and continue. The
TRANSITION bit will be cleared normally on the return of the function that
set it; or, if the current context bit is clear, set the context bit and
clear the TRANSITION bit to allow for another transition between the
current context and an even higher one.
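A simplified sketch of the scheme (illustrative only, not the actual
trace.h code):
/* One recursion bit per context, plus a TRANSITION bit for the update window. */
enum { CTX_NORMAL_BIT, CTX_SOFTIRQ_BIT, CTX_IRQ_BIT, CTX_NMI_BIT, TRANSITION_BIT };
static inline int test_and_set_recursion(unsigned long *rec, int ctx_bit)
{
	if (*rec & (1UL << ctx_bit)) {
		/*
		 * The context bit is already set, but preempt_count() may just
		 * not have caught up with a context switch yet. Allow exactly
		 * one extra level via the TRANSITION bit.
		 */
		if (*rec & (1UL << TRANSITION_BIT))
			return -1;		/* real recursion */
		*rec |= 1UL << TRANSITION_BIT;
		return TRANSITION_BIT;
	}
	/* Normal check passed: clear TRANSITION so it can be used again. */
	*rec &= ~(1UL << TRANSITION_BIT);
	*rec |= 1UL << ctx_bit;
	return ctx_bit;
}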
Cc: stable(a)vger.kernel.org
Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks")
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
kernel/trace/trace.h | 23 +++++++++++++++++++++--
kernel/trace/trace_selftest.c | 9 +++++++--
2 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fee535a89560..1dadef445cd1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -637,6 +637,12 @@ enum {
* function is called to clear it.
*/
TRACE_GRAPH_NOTRACE_BIT,
+
+ /*
+ * When transitioning between context, the preempt_count() may
+ * not be correct. Allow for a single recursion to cover this case.
+ */
+ TRACE_TRANSITION_BIT,
};
#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
@@ -691,8 +697,21 @@ static __always_inline int trace_test_and_set_recursion(int start, int max)
return 0;
bit = trace_get_context_bit() + start;
- if (unlikely(val & (1 << bit)))
- return -1;
+ if (unlikely(val & (1 << bit))) {
+ /*
+ * It could be that preempt_count has not been updated during
+ * a switch between contexts. Allow for a single recursion.
+ */
+ bit = TRACE_TRANSITION_BIT;
+ if (trace_recursion_test(bit))
+ return -1;
+ trace_recursion_set(bit);
+ barrier();
+ return bit + 1;
+ }
+
+ /* Normal check passed, clear the transition to allow it again */
+ trace_recursion_clear(TRACE_TRANSITION_BIT);
val |= 1 << bit;
current->trace_recursion = val;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b5e3496cf803..4738ad48a667 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -492,8 +492,13 @@ trace_selftest_function_recursion(void)
unregister_ftrace_function(&test_rec_probe);
ret = -1;
- if (trace_selftest_recursion_cnt != 1) {
- pr_cont("*callback not called once (%d)* ",
+ /*
+ * Recursion allows for transitions between context,
+ * and may call the callback twice.
+ */
+ if (trace_selftest_recursion_cnt != 1 &&
+ trace_selftest_recursion_cnt != 2) {
+ pr_cont("*callback not called once (or twice) (%d)* ",
trace_selftest_recursion_cnt);
goto out;
}
--
2.28.0
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
The code that checks recursion only does the recursion check once when the
checks are nested. The top one does the check, and the nested checks see
that recursion was already checked and return zero for their "bit". On the
return side, nothing is done if the "bit" is zero.
The problem is that zero is also returned for the "good" bit when in NMI
context. This leaves the NMI bit set, making it look like *all* NMI tracing
is recursing, and prevents tracing of anything in NMI context!
The simple fix is to return "bit + 1" and subtract one at the end to get
the real bit.
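A sketch of the clear side with this offset applied (illustrative, not the
actual code):
/* The set side returns bit + 1, so 0 stays a pure "nothing to clear" value. */
static inline void clear_recursion(unsigned long *rec, int bit)
{
	if (!bit)
		return;			/* nested check: no bit was set */
	bit--;				/* undo the +1 from the set side */
	*rec &= ~(1UL << bit);
}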
Cc: stable(a)vger.kernel.org
Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks")
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
kernel/trace/trace.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f3f5e77123ad..fee535a89560 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -698,7 +698,7 @@ static __always_inline int trace_test_and_set_recursion(int start, int max)
current->trace_recursion = val;
barrier();
- return bit;
+ return bit + 1;
}
static __always_inline void trace_clear_recursion(int bit)
@@ -708,6 +708,7 @@ static __always_inline void trace_clear_recursion(int bit)
if (!bit)
return;
+ bit--;
bit = 1 << bit;
val &= ~bit;
--
2.28.0
From: Qiujun Huang <hqjagain(a)gmail.com>
The nesting count of trace_printk allows for 4 levels of nesting. The
nesting counter starts at zero and is incremented before being used to
retrieve the current context's buffer. But the index into the buffer uses
the nesting counter after it was incremented, rather than its original
value, which is what it should use.
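A minimal sketch of the intended indexing (illustrative; names and sizes
are simplified):
#define NESTING_LEVELS 4
struct printk_buf {
	char buffer[NESTING_LEVELS][256];
	int nesting;
};
static char *get_buf(struct printk_buf *b)
{
	if (b->nesting >= NESTING_LEVELS)
		return NULL;
	b->nesting++;				/* reserve a nesting level first ... */
	return b->buffer[b->nesting - 1];	/* ... then index the level just reserved */
}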
Link: https://lkml.kernel.org/r/20201029161905.4269-1-hqjagain@gmail.com
Cc: stable(a)vger.kernel.org
Fixes: 3d9622c12c887 ("tracing: Add barrier to trace_printk() buffer nesting modification")
Signed-off-by: Qiujun Huang <hqjagain(a)gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
kernel/trace/trace.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 528971714fc6..daa96215e294 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3132,7 +3132,7 @@ static char *get_trace_buf(void)
/* Interrupts must see nesting incremented before we use the buffer */
barrier();
- return &buffer->buffer[buffer->nesting][0];
+ return &buffer->buffer[buffer->nesting - 1][0];
}
static void put_trace_buf(void)
--
2.28.0
This is a backported version of the v5.9 patch that fixes issues with
get_user for 64-bit values on OpenRISC. It also fixes compilation failures
when compiling binder.c.
Discussion: https://lkml.org/lkml/2020/10/16/198
Stafford Horne (1):
openrisc: Fix issue with get_user for 64-bit values
arch/openrisc/include/asm/uaccess.h | 35 ++++++++++++++++++-----------
1 file changed, 22 insertions(+), 13 deletions(-)
--
2.26.2
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 779055842da5b2e508f3ccf9a8153cb1f704f566 Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux(a)gmail.com>
Date: Sun, 6 Sep 2020 12:21:53 +0530
Subject: [PATCH] xen/gntdev.c: Mark pages as dirty
There seems to be a bug in the original code: when gntdev_get_page() is
called with writeable=true, the page needs to be marked dirty before being
put.
To address this, add a bool writeable in gntdev_copy_batch, set it in
gntdev_grant_copy_seg() (and drop the `writeable` argument to
gntdev_get_page()), and then, based on batch->writeable, use
set_page_dirty_lock().
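For reference, a sketch of the general pattern assumed here when the kernel
writes into a page obtained with get_user_pages_fast(); the helper name is
made up and the copy is assumed to stay within one page:
static int write_into_user_page(unsigned long addr, const void *src, size_t len)
{
	struct page *page;
	void *dst;
	int ret;

	ret = get_user_pages_fast(addr, 1, FOLL_WRITE, &page);
	if (ret < 0)
		return ret;

	dst = kmap(page);
	memcpy(dst + (addr & ~PAGE_MASK), src, len);
	kunmap(page);

	if (!PageDirty(page))
		set_page_dirty_lock(page);	/* record that the page content changed */
	put_page(page);
	return 0;
}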
Fixes: a4cdb556cae0 (xen/gntdev: add ioctl for grant copy)
Suggested-by: Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
Signed-off-by: Souptick Joarder <jrdr.linux(a)gmail.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: David Vrabel <david.vrabel(a)citrix.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/1599375114-32360-1-git-send-email-jrdr.linux@gmai…
Reviewed-by: Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 64a9025a87be..1f32db7b72b2 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -720,17 +720,18 @@ struct gntdev_copy_batch {
s16 __user *status[GNTDEV_COPY_BATCH];
unsigned int nr_ops;
unsigned int nr_pages;
+ bool writeable;
};
static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
- bool writeable, unsigned long *gfn)
+ unsigned long *gfn)
{
unsigned long addr = (unsigned long)virt;
struct page *page;
unsigned long xen_pfn;
int ret;
- ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page);
+ ret = get_user_pages_fast(addr, 1, batch->writeable ? FOLL_WRITE : 0, &page);
if (ret < 0)
return ret;
@@ -746,9 +747,13 @@ static void gntdev_put_pages(struct gntdev_copy_batch *batch)
{
unsigned int i;
- for (i = 0; i < batch->nr_pages; i++)
+ for (i = 0; i < batch->nr_pages; i++) {
+ if (batch->writeable && !PageDirty(batch->pages[i]))
+ set_page_dirty_lock(batch->pages[i]);
put_page(batch->pages[i]);
+ }
batch->nr_pages = 0;
+ batch->writeable = false;
}
static int gntdev_copy(struct gntdev_copy_batch *batch)
@@ -837,8 +842,9 @@ static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch,
virt = seg->source.virt + copied;
off = (unsigned long)virt & ~XEN_PAGE_MASK;
len = min(len, (size_t)XEN_PAGE_SIZE - off);
+ batch->writeable = false;
- ret = gntdev_get_page(batch, virt, false, &gfn);
+ ret = gntdev_get_page(batch, virt, &gfn);
if (ret < 0)
return ret;
@@ -856,8 +862,9 @@ static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch,
virt = seg->dest.virt + copied;
off = (unsigned long)virt & ~XEN_PAGE_MASK;
len = min(len, (size_t)XEN_PAGE_SIZE - off);
+ batch->writeable = true;
- ret = gntdev_get_page(batch, virt, true, &gfn);
+ ret = gntdev_get_page(batch, virt, &gfn);
if (ret < 0)
return ret;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 779055842da5b2e508f3ccf9a8153cb1f704f566 Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux(a)gmail.com>
Date: Sun, 6 Sep 2020 12:21:53 +0530
Subject: [PATCH] xen/gntdev.c: Mark pages as dirty
There seems to be a bug in the original code: when gntdev_get_page() is
called with writeable=true, the page needs to be marked dirty before being
put.
To address this, add a bool writeable in gntdev_copy_batch, set it in
gntdev_grant_copy_seg() (and drop the `writeable` argument to
gntdev_get_page()), and then, based on batch->writeable, use
set_page_dirty_lock().
Fixes: a4cdb556cae0 (xen/gntdev: add ioctl for grant copy)
Suggested-by: Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
Signed-off-by: Souptick Joarder <jrdr.linux(a)gmail.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
Cc: Juergen Gross <jgross(a)suse.com>
Cc: David Vrabel <david.vrabel(a)citrix.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/1599375114-32360-1-git-send-email-jrdr.linux@gmai…
Reviewed-by: Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 64a9025a87be..1f32db7b72b2 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -720,17 +720,18 @@ struct gntdev_copy_batch {
s16 __user *status[GNTDEV_COPY_BATCH];
unsigned int nr_ops;
unsigned int nr_pages;
+ bool writeable;
};
static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
- bool writeable, unsigned long *gfn)
+ unsigned long *gfn)
{
unsigned long addr = (unsigned long)virt;
struct page *page;
unsigned long xen_pfn;
int ret;
- ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page);
+ ret = get_user_pages_fast(addr, 1, batch->writeable ? FOLL_WRITE : 0, &page);
if (ret < 0)
return ret;
@@ -746,9 +747,13 @@ static void gntdev_put_pages(struct gntdev_copy_batch *batch)
{
unsigned int i;
- for (i = 0; i < batch->nr_pages; i++)
+ for (i = 0; i < batch->nr_pages; i++) {
+ if (batch->writeable && !PageDirty(batch->pages[i]))
+ set_page_dirty_lock(batch->pages[i]);
put_page(batch->pages[i]);
+ }
batch->nr_pages = 0;
+ batch->writeable = false;
}
static int gntdev_copy(struct gntdev_copy_batch *batch)
@@ -837,8 +842,9 @@ static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch,
virt = seg->source.virt + copied;
off = (unsigned long)virt & ~XEN_PAGE_MASK;
len = min(len, (size_t)XEN_PAGE_SIZE - off);
+ batch->writeable = false;
- ret = gntdev_get_page(batch, virt, false, &gfn);
+ ret = gntdev_get_page(batch, virt, &gfn);
if (ret < 0)
return ret;
@@ -856,8 +862,9 @@ static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch,
virt = seg->dest.virt + copied;
off = (unsigned long)virt & ~XEN_PAGE_MASK;
len = min(len, (size_t)XEN_PAGE_SIZE - off);
+ batch->writeable = true;
- ret = gntdev_get_page(batch, virt, true, &gfn);
+ ret = gntdev_get_page(batch, virt, &gfn);
if (ret < 0)
return ret;
The patch below was submitted to be applied to the 5.9-stable tree.
I fail to see how this patch meets the stable kernel rules as found at
Documentation/process/stable-kernel-rules.rst.
I could be totally wrong, and if so, please respond to
<stable(a)vger.kernel.org> and let me know why this patch should be
applied. Otherwise, it is now dropped from my patch queues, never to be
seen again.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4be92db3b5667f3a5c874a04635b037dc5e3f373 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu(a)huawei.com>
Date: Fri, 4 Sep 2020 11:23:29 +0200
Subject: [PATCH] ima: Remove semicolon at the end of
ima_get_binary_runtime_size()
This patch removes the unnecessary semicolon at the end of
ima_get_binary_runtime_size().
Cc: stable(a)vger.kernel.org
Fixes: d158847ae89a2 ("ima: maintain memory size needed for serializing the measurement list")
Signed-off-by: Roberto Sassu <roberto.sassu(a)huawei.com>
Signed-off-by: Mimi Zohar <zohar(a)linux.ibm.com>
diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c
index fb4ec270f620..c096ef8945c7 100644
--- a/security/integrity/ima/ima_queue.c
+++ b/security/integrity/ima/ima_queue.c
@@ -133,7 +133,7 @@ unsigned long ima_get_binary_runtime_size(void)
return ULONG_MAX;
else
return binary_runtime_size + sizeof(struct ima_kexec_hdr);
-};
+}
static int ima_pcr_extend(struct tpm_digest *digests_arg, int pcr)
{
From: Gao Xiang <hsiangkao(a)redhat.com>
commit d578b46db69d125a654f509bdc9091d84e924dc8 upstream.
Don't recheck it since xattr_permission() already
checks CAP_SYS_ADMIN capability.
Just follow 5d3ce4f70172 ("f2fs: avoid duplicated permission check for "trusted." xattrs")
Reported-by: Hongyu Jin <hongyu.jin(a)unisoc.com>
[ Gao Xiang: since it could cause some complex Android overlay
permission issue as well on android-5.4+, it'd be better to
backport to 5.4+ rather than pure cleanup on mainline. ]
Cc: <stable(a)vger.kernel.org> # 5.4+
Link: https://lore.kernel.org/r/20200811070020.6339-1-hsiangkao@redhat.com
Reviewed-by: Chao Yu <yuchao0(a)huawei.com>
Signed-off-by: Gao Xiang <hsiangkao(a)redhat.com>
---
fs/erofs/xattr.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index c8c381eadcd6..5bde77d70852 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
break;
case EROFS_XATTR_INDEX_SECURITY:
break;
--
2.24.0
From: Gao Xiang <hsiangkao(a)redhat.com>
commit d578b46db69d125a654f509bdc9091d84e924dc8 upstream.
Don't recheck it since xattr_permission() already
checks CAP_SYS_ADMIN capability.
Just follow 5d3ce4f70172 ("f2fs: avoid duplicated permission check for "trusted." xattrs")
Reported-by: Hongyu Jin <hongyu.jin(a)unisoc.com>
[ Gao Xiang: since it could cause some complex Android overlay
permission issue as well on android-5.4+, it'd be better to
backport to 5.4+ rather than pure cleanup on mainline. ]
Cc: <stable(a)vger.kernel.org> # 5.4+
Link: https://lore.kernel.org/r/20200811070020.6339-1-hsiangkao@redhat.com
Reviewed-by: Chao Yu <yuchao0(a)huawei.com>
Signed-off-by: Gao Xiang <hsiangkao(a)redhat.com>
---
fs/erofs/xattr.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 87e437e7b34f..f86e3247febc 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
break;
case EROFS_XATTR_INDEX_SECURITY:
break;
--
2.24.0
From: Gao Xiang <hsiangkao(a)redhat.com>
commit d578b46db69d125a654f509bdc9091d84e924dc8 upstream.
Don't recheck it since xattr_permission() already
checks CAP_SYS_ADMIN capability.
Just follow 5d3ce4f70172 ("f2fs: avoid duplicated permission check for "trusted." xattrs")
Reported-by: Hongyu Jin <hongyu.jin(a)unisoc.com>
[ Gao Xiang: since it could cause some complex Android overlay
permission issue as well on android-5.4+, it'd be better to
backport to 5.4+ rather than pure cleanup on mainline. ]
Cc: <stable(a)vger.kernel.org> # 5.4+
Link: https://lore.kernel.org/r/20200811070020.6339-1-hsiangkao@redhat.com
Reviewed-by: Chao Yu <yuchao0(a)huawei.com>
Signed-off-by: Gao Xiang <hsiangkao(a)redhat.com>
---
fs/erofs/xattr.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index b766c3ee5fa8..503bea20cde2 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
break;
case EROFS_XATTR_INDEX_SECURITY:
break;
--
2.24.0
free_highpages() iterates over the free memblock regions in high
memory, and marks each page as available for the memory management
system. However, as it rounds the end of each region downwards, we
may end up freeing a page that is memblock_reserve()d, resulting
in memory corruption. So align the end of the range to the next
page instead.
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
---
arch/arm/mm/init.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index a391804c7ce3..d41781cb5496 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -354,7 +354,7 @@ static void __init free_highpages(void)
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
&range_start, &range_end, NULL) {
unsigned long start = PHYS_PFN(range_start);
- unsigned long end = PHYS_PFN(range_end);
+ unsigned long end = PHYS_PFN(PAGE_ALIGN(range_end));
/* Ignore complete lowmem entries */
if (end <= max_low)
--
2.17.1
On Fri, Oct 30, 2020 at 12:24 PM Jian Cai <jiancai(a)google.com> wrote:
>
> Hi Nathan,
>
> Thanks for all the tips! I have fixed the issues mentioned in your comments and used git send-email to resend the patch as recommended. FYI I used the Message ID of this thread but it created a new thread anyway.
No, I'll bet you're using gmail which has issues showing threads when
the subject is changed or is not `Re: <old subject>`. If you look at
lore, it's correct:
https://lore.kernel.org/stable/20201030014930.GB2519055@ubuntu-m3-large-x86…
Just that you forgot to cc stable. :^P Don't worry about it; I still
forget to do that myself.
--
Thanks,
~Nick Desaulniers
From: Zi Yan <ziy(a)nvidia.com>
In isolate_migratepages_block(), when cc->alloc_contig is true, we are able
to isolate compound pages, but nr_migratepages and nr_isolated did not
count compound pages correctly, causing us to isolate more pages than
intended. Use thp_nr_pages to count pages. Otherwise, we might be trapped
in the too_many_isolated() while loop, since the actual number of isolated
pages can go up to COMPACT_CLUSTER_MAX*512=16384 (where COMPACT_CLUSTER_MAX
is 32), because we only stop isolation after cc->nr_migratepages reaches
COMPACT_CLUSTER_MAX.
In addition, after the fix above, cc->nr_migratepages may never be exactly
equal to COMPACT_CLUSTER_MAX when compound pages are isolated, so page
isolation would not stop as intended. Change the isolation stop condition
to >=.
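Illustrative arithmetic for the stop condition (a sketch using the numbers
from the description above):
#define COMPACT_CLUSTER_MAX 32
/*
 * Once compound pages are counted in bulk, a single 2MB THP adds 512 at
 * once, so the counter can jump from e.g. 31 straight to 543 and is never
 * exactly COMPACT_CLUSTER_MAX; only a '>=' comparison stops isolation.
 */
static int should_stop_isolation(unsigned int nr_migratepages)
{
	return nr_migratepages >= COMPACT_CLUSTER_MAX;
}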
The issue can be triggered as follows:
In a system with 16GB memory and an 8GB CMA region reserved by
hugetlb_cma, if we first allocate 10GB THPs and mlock them
(so some THPs are allocated in the CMA region and mlocked), reserving
6 1GB hugetlb pages via
/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages will get stuck
(looping in too_many_isolated function) until we kill either task.
With the patch applied, oom will kill the application with 10GB THPs and
let hugetlb page reservation finish.
Fixes: 1da2f328fa64 ("mm,thp,compaction,cma: allow THP migration for CMA allocations")
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
---
mm/compaction.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index ee1f8439369e..3e834ac402f1 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1012,8 +1012,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
isolate_success:
list_add(&page->lru, &cc->migratepages);
- cc->nr_migratepages++;
- nr_isolated++;
+ cc->nr_migratepages += compound_nr(page);
+ nr_isolated += compound_nr(page);
/*
* Avoid isolating too much unless this block is being
@@ -1021,7 +1021,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* or a lock is contended. For contention, isolate quickly to
* potentially remove one source of contention.
*/
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
!cc->rescan && !cc->contended) {
++low_pfn;
break;
@@ -1132,7 +1132,7 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
if (!pfn)
break;
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
break;
}
--
2.28.0
After each codeword, NAND_FLASH_STATUS is read to check for operational
failures. But there is no DMA sync for the CPU before reading it, which can
lead to reading an incorrect or stale copy of the DMA buffer in
reg_read_buf. This patch adds a DMA sync for the CPU on reg_read_buf before
reading it.
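The underlying DMA API pattern assumed here is roughly the following (a
sketch; in the driver this is wrapped by nandc_read_buffer_sync() rather
than open-coded like this):
/* Hand a device-written buffer back to the CPU before reading it. */
static u32 read_first_status_word(struct device *dev, dma_addr_t dma_handle,
				  const __le32 *cpu_buf, size_t size)
{
	dma_sync_single_for_cpu(dev, dma_handle, size, DMA_FROM_DEVICE);
	return le32_to_cpu(cpu_buf[0]);
}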
Fixes: 5bc36b2bf6e2 ("mtd: rawnand: qcom: check for operation errors in case of raw read")
Signed-off-by: Praveenkumar I <ipkumar(a)codeaurora.org>
---
drivers/mtd/nand/raw/qcom_nandc.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c
index bd7a7251429b..5bb85f1ba84c 100644
--- a/drivers/mtd/nand/raw/qcom_nandc.c
+++ b/drivers/mtd/nand/raw/qcom_nandc.c
@@ -1570,6 +1570,8 @@ static int check_flash_errors(struct qcom_nand_host *host, int cw_cnt)
struct qcom_nand_controller *nandc = get_qcom_nand_controller(chip);
int i;
+ nandc_read_buffer_sync(nandc, true);
+
for (i = 0; i < cw_cnt; i++) {
u32 flash = le32_to_cpu(nandc->reg_read_buf[i]);
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project
From: Santosh Shukla <sashukla(a)nvidia.com>
VFIO allows a device driver to resolve a fault by mapping a MMIO
range. This can subsequently result in user_mem_abort() trying to compute
a huge mapping based on the MMIO pfn, which is a sure recipe for things to
go wrong.
Instead, force a PTE mapping when the pfn faulted in has a device
mapping.
Fixes: 6d674e28f642 ("KVM: arm/arm64: Properly handle faulting of device mappings")
Suggested-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Santosh Shukla <sashukla(a)nvidia.com>
[maz: rewritten commit message]
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Gavin Shan <gshan(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/1603711447-11998-2-git-send-email-sashukla@nvidia…
---
arch/arm64/kvm/mmu.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index e431d2d8e368..c7c6df6309d5 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -851,6 +851,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (kvm_is_device_pfn(pfn)) {
device = true;
+ force_pte = true;
} else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
--
2.28.0
From: Santosh Shukla <sashukla(a)nvidia.com>
VFIO allows a device driver to resolve a fault by mapping a MMIO
range. This can subsequently result in user_mem_abort() trying to compute
a huge mapping based on the MMIO pfn, which is a sure recipe for things to
go wrong.
Instead, force a PTE mapping when the pfn faulted in has a device
mapping.
Fixes: 6d674e28f642 ("KVM: arm/arm64: Properly handle faulting of device mappings")
Suggested-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Santosh Shukla <sashukla(a)nvidia.com>
[maz: rewritten commit message]
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Reviewed-by: Gavin Shan <gshan(a)redhat.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/1603711447-11998-2-git-send-email-sashukla@nvidia…
---
arch/arm64/kvm/mmu.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index e431d2d8e368..c7c6df6309d5 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -851,6 +851,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (kvm_is_device_pfn(pfn)) {
device = true;
+ force_pte = true;
} else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
--
2.28.0
From: Arnd Bergmann <arnd(a)arndb.de>
The epoll_wait() syscall has a special version for OABI compat
mode to convert the arguments to the EABI structure layout
of the kernel. However, the later epoll_pwait() syscall was
added in arch/arm in linux-2.6.32 without this conversion.
Use the same kind of handler for both.
Fixes: 369842658a36 ("ARM: 5677/1: ARM support for TIF_RESTORE_SIGMASK/pselect6/ppoll/epoll_pwait")
Cc: stable(a)vger.kernel.org
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
arch/arm/kernel/sys_oabi-compat.c | 37 ++++++++++++++++++++++++++++---
arch/arm/tools/syscall.tbl | 2 +-
2 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index 0203e545bbc8..a2b1ae01e5bf 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -264,9 +264,8 @@ asmlinkage long sys_oabi_epoll_ctl(int epfd, int op, int fd,
return do_epoll_ctl(epfd, op, fd, &kernel, false);
}
-asmlinkage long sys_oabi_epoll_wait(int epfd,
- struct oabi_epoll_event __user *events,
- int maxevents, int timeout)
+static long do_oabi_epoll_wait(int epfd, struct oabi_epoll_event __user *events,
+ int maxevents, int timeout)
{
struct epoll_event *kbuf;
struct oabi_epoll_event e;
@@ -299,6 +298,38 @@ asmlinkage long sys_oabi_epoll_wait(int epfd,
return err ? -EFAULT : ret;
}
+SYSCALL_DEFINE4(oabi_epoll_wait, int, epfd,
+ struct oabi_epoll_event __user *, events,
+ int, maxevents, int, timeout)
+{
+ return do_oabi_epoll_wait(epfd, events, maxevents, timeout);
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+SYSCALL_DEFINE6(oabi_epoll_pwait, int, epfd,
+ struct oabi_epoll_event __user *, events, int, maxevents,
+ int, timeout, const sigset_t __user *, sigmask,
+ size_t, sigsetsize)
+{
+ int error;
+
+ /*
+ * If the caller wants a certain signal mask to be set during the wait,
+ * we apply it here.
+ */
+ error = set_user_sigmask(sigmask, sigsetsize);
+ if (error)
+ return error;
+
+ error = do_oabi_epoll_wait(epfd, events, maxevents, timeout);
+ restore_saved_sigmask_unless(error == -EINTR);
+
+ return error;
+}
+
struct oabi_sembuf {
unsigned short sem_num;
short sem_op;
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index d056a548358e..330cf0aa04c4 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -360,7 +360,7 @@
343 common vmsplice sys_vmsplice
344 common move_pages sys_move_pages
345 common getcpu sys_getcpu
-346 common epoll_pwait sys_epoll_pwait
+346 common epoll_pwait sys_epoll_pwait sys_oabi_epoll_pwait
347 common kexec_load sys_kexec_load
348 common utimensat sys_utimensat_time32
349 common signalfd sys_signalfd
--
2.27.0
From: Arnd Bergmann <arnd(a)arndb.de>
The epoll_wait() syscall has a special version for OABI compat
mode to convert the arguments to the EABI structure layout
of the kernel. However, the later epoll_pwait() syscall was
added in arch/arm in linux-2.6.32 without this conversion.
Use the same kind of handler for both.
Fixes: 369842658a36 ("ARM: 5677/1: ARM support for TIF_RESTORE_SIGMASK/pselect6/ppoll/epoll_pwait")
Cc: stable(a)vger.kernel.org
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
arch/arm/kernel/sys_oabi-compat.c | 37 ++++++++++++++++++++++++++++---
arch/arm/tools/syscall.tbl | 2 +-
2 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index 0203e545bbc8..a2b1ae01e5bf 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -264,9 +264,8 @@ asmlinkage long sys_oabi_epoll_ctl(int epfd, int op, int fd,
return do_epoll_ctl(epfd, op, fd, &kernel, false);
}
-asmlinkage long sys_oabi_epoll_wait(int epfd,
- struct oabi_epoll_event __user *events,
- int maxevents, int timeout)
+static long do_oabi_epoll_wait(int epfd, struct oabi_epoll_event __user *events,
+ int maxevents, int timeout)
{
struct epoll_event *kbuf;
struct oabi_epoll_event e;
@@ -299,6 +298,38 @@ asmlinkage long sys_oabi_epoll_wait(int epfd,
return err ? -EFAULT : ret;
}
+SYSCALL_DEFINE4(oabi_epoll_wait, int, epfd,
+ struct oabi_epoll_event __user *, events,
+ int, maxevents, int, timeout)
+{
+ return do_oabi_epoll_wait(epfd, events, maxevents, timeout);
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+SYSCALL_DEFINE6(oabi_epoll_pwait, int, epfd,
+ struct oabi_epoll_event __user *, events, int, maxevents,
+ int, timeout, const sigset_t __user *, sigmask,
+ size_t, sigsetsize)
+{
+ int error;
+
+ /*
+ * If the caller wants a certain signal mask to be set during the wait,
+ * we apply it here.
+ */
+ error = set_user_sigmask(sigmask, sigsetsize);
+ if (error)
+ return error;
+
+ error = do_oabi_epoll_wait(epfd, events, maxevents, timeout);
+ restore_saved_sigmask_unless(error == -EINTR);
+
+ return error;
+}
+
struct oabi_sembuf {
unsigned short sem_num;
short sem_op;
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index d056a548358e..330cf0aa04c4 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -360,7 +360,7 @@
343 common vmsplice sys_vmsplice
344 common move_pages sys_move_pages
345 common getcpu sys_getcpu
-346 common epoll_pwait sys_epoll_pwait
+346 common epoll_pwait sys_epoll_pwait sys_oabi_epoll_pwait
347 common kexec_load sys_kexec_load
348 common utimensat sys_utimensat_time32
349 common signalfd sys_signalfd
--
2.27.0
From: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
The new >8k CEA modes have dotclocks reaching 5.94 GHz, which
means our clock*1000 will now overflow a 32-bit unsigned
integer. Switch to 64-bit maths to avoid it.
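Illustrative numbers (a standalone sketch; mode->clock is in kHz, so
5.94 GHz is 5,940,000):
#include <stdint.h>
#include <stdio.h>
int main(void)
{
	uint32_t clock_khz = 5940000;			/* 5.94 GHz expressed in kHz */
	uint32_t wrong = clock_khz * 1000;		/* wraps: 5,940,000,000 > UINT32_MAX */
	uint64_t right = (uint64_t)clock_khz * 1000;	/* 64-bit maths, as done in the fix */

	printf("32-bit: %u, 64-bit: %llu\n", wrong, (unsigned long long)right);
	return 0;
}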
Cc: stable(a)vger.kernel.org
Reported-by: Randy Dunlap <rdunlap(a)infradead.org>
Signed-off-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
---
An interesting question how many other place might suffer from similar
overflows. I think i915 should be mostly OK. The one place I know we use
Hz instead kHz is the hsw DPLL code, which I would prefer we also change
to use kHz. The other concern is whether we have any potential overflows
before we check this against the platform's max dotclock.
I do have this unreviewed igt series
https://patchwork.freedesktop.org/series/69531/ which extends the
current testing with some other forms of invalid modes. Could probably
extend that with a mode.clock=INT_MAX test to see if anything else might
trip up.
No idea about other drivers.
drivers/gpu/drm/drm_modes.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index 501b4fe55a3d..511cde5c7fa6 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -762,7 +762,7 @@ int drm_mode_vrefresh(const struct drm_display_mode *mode)
if (mode->htotal == 0 || mode->vtotal == 0)
return 0;
- num = mode->clock * 1000;
+ num = mode->clock;
den = mode->htotal * mode->vtotal;
if (mode->flags & DRM_MODE_FLAG_INTERLACE)
@@ -772,7 +772,7 @@ int drm_mode_vrefresh(const struct drm_display_mode *mode)
if (mode->vscan > 1)
den *= mode->vscan;
- return DIV_ROUND_CLOSEST(num, den);
+ return DIV_ROUND_CLOSEST_ULL(mul_u32_u32(num, 1000), den);
}
EXPORT_SYMBOL(drm_mode_vrefresh);
--
2.26.2
From: Matteo Croce <mcroce(a)microsoft.com>
The parsing of the reboot= cmdline has two major errors:
- a missing bounds check can crash the system on reboot
- parsing of the cpu number only works if specified last
Fix both, along with a small code refactor.
v1->v2:
As Petr suggested, don't force base 10 in simple_strtoul(),
so hex values are accepted as well.
Matteo Croce (2):
reboot: fix overflow parsing reboot cpu number
reboot: fix parsing of reboot cpu number
kernel/reboot.c | 24 +++++++++++++-----------
1 file changed, 13 insertions(+), 11 deletions(-)
--
2.28.0
Hello
this rc1 runs here (pure Intel-box) without errors.
Thanks !
An RFC (something I've been thinking about for some months)
======
Wouldn't it be better (and not much additional work) to sort the
Pseudo-Shortlog by subsystem/driver?
something like this:
...
usb: gadget: f_ncm: allow using NCM in SuperSpeed Plus gadgets.
usb: cdns3: gadget: free interrupt after gadget has deleted
Lorenzo Colitti <lorenzo(a)google.com>
Peter Chen <peter.chen(a)nxp.com>
...
Think of searching for a bugfix in the shortlog.
With the current layout I need to read/"visually grep" the whole log.
With the new layout I can jump to the "buggy" subsystem/driver and only
need to read that part of the log to find out whether the bug has been
fixed yet.
Well, surely there are other ways to get the info I need, e.g. the
search function of my favorite browser, but ...
--
regards
Ronald
On Wed, 28 Oct 2020 at 11:03, Sakari Ailus <sakari.ailus(a)linux.intel.com> wrote:
>
> On Wed, Oct 28, 2020 at 10:56:55AM +0100, Krzysztof Kozlowski wrote:
> > On Wed, 28 Oct 2020 at 10:45, Krzysztof Kozlowski <krzk(a)kernel.org> wrote:
> > >
> > > On Wed, 28 Oct 2020 at 10:43, Yeh, Andy <andy.yeh(a)intel.com> wrote:
> > > >
> > > > But the sensor settings for the original submission is to output GRBG Bayer RAW.
> > > >
> > > > Regards, Andy
> > >
> > > No, not to my knowledge. There are no settings for color output
> > > because it is fixed to GBGB/RGRG. I was looking a lot into this driver
> > > (I have few other problems with it, already few other patches posted)
> > > and I could not find a setting for this in datasheet. If you know the
> > > setting for the other color - can you point me to it?
> >
> > And except the datasheet which mentions the specific format, the
> > testing confirms it. With original color the pictures are pink/purple.
> > With proper color, the pictures are correct (with more green color as
> > expected for bayer).
>
> Quoting the driver's start_streaming function:
>
> /* Set Orientation be 180 degree */
> ret = imx258_write_reg(imx258, REG_MIRROR_FLIP_CONTROL,
> IMX258_REG_VALUE_08BIT, REG_CONFIG_MIRROR_FLIP);
I understand that you think it will replace the lines and columns and
the first line will be RG, instead of GB.... or actually BG because it
flips horizontal and vertical? So why does it not work?
BTW, this nicely points out that the comment around
device_property_read_u32() for rotation is a little bit misleading :)
> if (ret) {
> dev_err(&client->dev, "%s failed to set orientation\n",
> __func__);
> return ret;
> }
>
> Could it be you're taking pictures of pink objects? ;-)
I can send a few sample pictures taken with GStreamer (RAW8, not
original RAW10)...
Best regards,
Krzysztof
On Wed, Oct 28, 2020 at 10:14:52AM +0000, Yeh, Andy wrote:
> >-----Original Message-----
> >From: Sakari Ailus <sakari.ailus(a)linux.intel.com>
> >Sent: Wednesday, October 28, 2020 6:03 PM
> >To: Krzysztof Kozlowski <krzk(a)kernel.org>
> >Cc: Yeh, Andy <andy.yeh(a)intel.com>; Mauro Carvalho Chehab
> ><mchehab(a)kernel.org>; Tomasz Figa <tfiga(a)chromium.org>; Jason Chen
> ><jasonx.z.chen(a)intel.com>; Alan Chiang <alanx.chiang(a)intel.com>; linux-
> >media(a)vger.kernel.org; linux-kernel(a)vger.kernel.org;
> >stable(a)vger.kernel.org
> >Subject: Re: [PATCH] media: i2c: imx258: correct mode to GBGB/RGRG
> >
> >On Wed, Oct 28, 2020 at 10:56:55AM +0100, Krzysztof Kozlowski wrote:
> >> On Wed, 28 Oct 2020 at 10:45, Krzysztof Kozlowski <krzk(a)kernel.org>
> >wrote:
> >> >
> >> > On Wed, 28 Oct 2020 at 10:43, Yeh, Andy <andy.yeh(a)intel.com> wrote:
> >> > >
> >> > > But the sensor settings for the original submission is to output GRBG
> >Bayer RAW.
> >> > >
> >> > > Regards, Andy
> >> >
> >> > No, not to my knowledge. There are no settings for color output
> >> > because it is fixed to GBGB/RGRG. I was looking a lot into this
> >> > driver (I have few other problems with it, already few other patches
> >> > posted) and I could not find a setting for this in datasheet. If you
> >> > know the setting for the other color - can you point me to it?
> >>
> >> And except the datasheet which mentions the specific format, the
> >> testing confirms it. With original color the pictures are pink/purple.
> >> With proper color, the pictures are correct (with more green color as
> >> expected for bayer).
> >
> >Quoting the driver's start_streaming function:
> >
> > /* Set Orientation be 180 degree */
> > ret = imx258_write_reg(imx258, REG_MIRROR_FLIP_CONTROL,
> > IMX258_REG_VALUE_08BIT, REG_CONFIG_MIRROR_FLIP);
> > if (ret) {
> > dev_err(&client->dev, "%s failed to set orientation\n",
> > __func__);
> > return ret;
> > }
> >
> >Could it be you're taking pictures of pink objects? ;-)
> >
> >--
> >Sakari Ailus
>
> Sakari is right. IIRC the default sensor settings output GBRG, and after mirror/flip (which was the original application when the driver was submitted) the bayer order will be GRBG.
Yes, you are all correct. It seems I was using the wrong color mode for
GStreamer, or I messed up something else.
Thanks for help everyone!
The patch can be dropped.
Best regards,
Krzysztof
We've fixed many races in panfrost_job_timedout(), but some remain.
Instead of trying to fix them one by one, let's simplify the logic and move
the reset bits to a separate work item scheduled when one of the queues
reports a timeout.
Fixes: 1a11a88cfd9a ("drm/panfrost: Fix job timeout handling")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Boris Brezillon <boris.brezillon(a)collabora.com>
---
drivers/gpu/drm/panfrost/panfrost_device.c | 1 -
drivers/gpu/drm/panfrost/panfrost_device.h | 6 +-
drivers/gpu/drm/panfrost/panfrost_job.c | 130 ++++++++++++---------
3 files changed, 82 insertions(+), 55 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index ea8d31863c50..a83b2ff5837a 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -200,7 +200,6 @@ int panfrost_device_init(struct panfrost_device *pfdev)
struct resource *res;
mutex_init(&pfdev->sched_lock);
- mutex_init(&pfdev->reset_lock);
INIT_LIST_HEAD(&pfdev->scheduled_jobs);
INIT_LIST_HEAD(&pfdev->as_lru_list);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index 2e9cbd1c4a58..67f9f66904be 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -105,7 +105,11 @@ struct panfrost_device {
struct panfrost_perfcnt *perfcnt;
struct mutex sched_lock;
- struct mutex reset_lock;
+
+ struct {
+ struct work_struct work;
+ atomic_t pending;
+ } reset;
struct mutex shrinker_lock;
struct list_head shrinker_list;
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index d0469e944143..745ee9563a54 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -20,6 +20,8 @@
#include "panfrost_gpu.h"
#include "panfrost_mmu.h"
+#define JOB_TIMEOUT_MS 500
+
#define job_write(dev, reg, data) writel(data, dev->iomem + (reg))
#define job_read(dev, reg) readl(dev->iomem + (reg))
@@ -382,19 +384,37 @@ static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
drm_sched_increase_karma(bad);
queue->stopped = true;
stopped = true;
+
+ /*
+ * Set the timeout to max so the timer doesn't get started
+ * when we return from the timeout handler (restored in
+ * panfrost_scheduler_start()).
+ */
+ queue->sched.timeout = MAX_SCHEDULE_TIMEOUT;
}
mutex_unlock(&queue->lock);
return stopped;
}
+static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
+{
+ if (WARN_ON(!queue->stopped))
+ return;
+
+ mutex_lock(&queue->lock);
+ /* Restore the original timeout before starting the scheduler. */
+ queue->sched.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS);
+ drm_sched_start(&queue->sched, true);
+ queue->stopped = false;
+ mutex_unlock(&queue->lock);
+}
+
static void panfrost_job_timedout(struct drm_sched_job *sched_job)
{
struct panfrost_job *job = to_panfrost_job(sched_job);
struct panfrost_device *pfdev = job->pfdev;
int js = panfrost_job_get_slot(job);
- unsigned long flags;
- int i;
/*
* If the GPU managed to complete this jobs fence, the timeout is
@@ -415,56 +435,9 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
return;
- if (!mutex_trylock(&pfdev->reset_lock))
- return;
-
- for (i = 0; i < NUM_JOB_SLOTS; i++) {
- struct drm_gpu_scheduler *sched = &pfdev->js->queue[i].sched;
-
- /*
- * If the queue is still active, make sure we wait for any
- * pending timeouts.
- */
- if (!pfdev->js->queue[i].stopped)
- cancel_delayed_work_sync(&sched->work_tdr);
-
- /*
- * If the scheduler was not already stopped, there's a tiny
- * chance a timeout has expired just before we stopped it, and
- * drm_sched_stop() does not flush pending works. Let's flush
- * them now so the timeout handler doesn't get called in the
- * middle of a reset.
- */
- if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
- cancel_delayed_work_sync(&sched->work_tdr);
-
- /*
- * Now that we cancelled the pending timeouts, we can safely
- * reset the stopped state.
- */
- pfdev->js->queue[i].stopped = false;
- }
-
- spin_lock_irqsave(&pfdev->js->job_lock, flags);
- for (i = 0; i < NUM_JOB_SLOTS; i++) {
- if (pfdev->jobs[i]) {
- pm_runtime_put_noidle(pfdev->dev);
- panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
- pfdev->jobs[i] = NULL;
- }
- }
- spin_unlock_irqrestore(&pfdev->js->job_lock, flags);
-
- panfrost_device_reset(pfdev);
-
- for (i = 0; i < NUM_JOB_SLOTS; i++)
- drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
-
- mutex_unlock(&pfdev->reset_lock);
-
- /* restart scheduler after GPU is usable again */
- for (i = 0; i < NUM_JOB_SLOTS; i++)
- drm_sched_start(&pfdev->js->queue[i].sched, true);
+ /* Schedule a reset. */
+ atomic_set(&pfdev->reset.pending, 1);
+ schedule_work(&pfdev->reset.work);
}
static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -531,11 +504,62 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
return IRQ_HANDLED;
}
+static void panfrost_reset(struct work_struct *work)
+{
+ struct panfrost_device *pfdev = container_of(work,
+ struct panfrost_device,
+ reset.work);
+ unsigned long flags;
+ unsigned int i;
+
+ if (!atomic_read(&pfdev->reset.pending))
+ return;
+
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ /*
+ * We want pending timeouts to be handled before we attempt
+ * to stop the scheduler. If we don't do that and the timeout
+ * handler is in flight, it might have removed the bad job
+ * from the list, and we'll lose this job if the reset handler
+ * enters the critical section in panfrost_scheduler_stop()
+ * before the timeout handler.
+ *
+ * Timeout is set to max to make sure the timer is not
+ * restarted after the cancellation.
+ */
+ pfdev->js->queue[i].sched.timeout = MAX_SCHEDULE_TIMEOUT;
+ cancel_delayed_work_sync(&pfdev->js->queue[i].sched.work_tdr);
+ panfrost_scheduler_stop(&pfdev->js->queue[i], NULL);
+ }
+
+ /* All timers have been stopped, we can safely reset the pending state. */
+ atomic_set(&pfdev->reset.pending, 0);
+
+ spin_lock_irqsave(&pfdev->js->job_lock, flags);
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ if (pfdev->jobs[i]) {
+ pm_runtime_put_noidle(pfdev->dev);
+ panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
+ pfdev->jobs[i] = NULL;
+ }
+ }
+ spin_unlock_irqrestore(&pfdev->js->job_lock, flags);
+
+ panfrost_device_reset(pfdev);
+
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
+ panfrost_scheduler_start(&pfdev->js->queue[i]);
+ }
+}
+
int panfrost_job_init(struct panfrost_device *pfdev)
{
struct panfrost_job_slot *js;
int ret, j, irq;
+ INIT_WORK(&pfdev->reset.work, panfrost_reset);
+
pfdev->js = js = devm_kzalloc(pfdev->dev, sizeof(*js), GFP_KERNEL);
if (!js)
return -ENOMEM;
@@ -558,7 +582,7 @@ int panfrost_job_init(struct panfrost_device *pfdev)
ret = drm_sched_init(&js->queue[j].sched,
&panfrost_sched_ops,
- 1, 0, msecs_to_jiffies(500),
+ 1, 0, msecs_to_jiffies(JOB_TIMEOUT_MS),
"pan_js");
if (ret) {
dev_err(pfdev->dev, "Failed to create scheduler: %d.", ret);
--
2.26.2
The work on improving gpio chip-select in the spi core, and the fixes
that followed, have caused the bcm2835 spi driver to use the wrong
levels. Fix this by simply removing level handling in the bcm2835
driver and letting the core do its work.
Fixes: 3e5ec1db8bfe ("spi: Fix SPI_CS_HIGH setting when using native and GPIO CS")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Martin Hundebøll <martin(a)geanix.com>
---
drivers/spi/spi-bcm2835.c | 12 ------------
1 file changed, 12 deletions(-)
diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
index b87116e9b413..9b6ba94fe878 100644
--- a/drivers/spi/spi-bcm2835.c
+++ b/drivers/spi/spi-bcm2835.c
@@ -1259,18 +1259,6 @@ static int bcm2835_spi_setup(struct spi_device *spi)
if (!chip)
return 0;
- /*
- * Retrieve the corresponding GPIO line used for CS.
- * The inversion semantics will be handled by the GPIO core
- * code, so we pass GPIOD_OUT_LOW for "unasserted" and
- * the correct flag for inversion semantics. The SPI_CS_HIGH
- * on spi->mode cannot be checked for polarity in this case
- * as the flag use_gpio_descriptors enforces SPI_CS_HIGH.
- */
- if (of_property_read_bool(spi->dev.of_node, "spi-cs-high"))
- lflags = GPIO_ACTIVE_HIGH;
- else
- lflags = GPIO_ACTIVE_LOW;
spi->cs_gpiod = gpiochip_request_own_desc(chip, 8 - spi->chip_select,
DRV_NAME,
lflags,
--
2.28.0
When tpm_get_random() was introduced, it defined the following API for the
return value:
1. A positive value tells how many bytes of random data was generated.
2. A negative value on error.
However, the call sites used the API incorrectly, i.e. as if it would
only return negative values or zero. Returning the positive read counts
to user space does not make any sense.
Fix this by returning -EIO when tpm_get_random() returns a positive value.
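For illustration, a minimal sketch of the calling convention this change
enforces (a hypothetical helper, not taken from the diff below):

	/* Hypothetical helper; TPM_NONCE_SIZE (20 bytes) comes from the
	 * trusted-keys headers. */
	static int get_nonce(struct tpm_chip *chip, u8 *nonce)
	{
		int ret;

		ret = tpm_get_random(chip, nonce, TPM_NONCE_SIZE);
		if (ret < 0)
			return ret;	/* driver/transport error */
		if (ret != TPM_NONCE_SIZE)
			return -EIO;	/* short read: never propagate the count */
		return 0;
	}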
Fixes: 41ab999c80f1 ("tpm: Move tpm_get_random api into the TPM device driver")
Cc: stable(a)vger.kernel.org
Cc: Mimi Zohar <zohar(a)linux.ibm.com>
Cc: "James E.J. Bottomley" <James.Bottomley(a)HansenPartnership.com>
Cc: David Howells <dhowells(a)redhat.com>
Cc: Kent Yoder <key(a)linux.vnet.ibm.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen(a)linux.intel.com>
---
security/keys/trusted-keys/trusted_tpm1.c | 20 +++++++++++++++++---
1 file changed, 17 insertions(+), 3 deletions(-)
diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c
index b9fe02e5f84f..c7b1701cdac5 100644
--- a/security/keys/trusted-keys/trusted_tpm1.c
+++ b/security/keys/trusted-keys/trusted_tpm1.c
@@ -403,9 +403,12 @@ static int osap(struct tpm_buf *tb, struct osapsess *s,
int ret;
ret = tpm_get_random(chip, ononce, TPM_NONCE_SIZE);
- if (ret != TPM_NONCE_SIZE)
+ if (ret < 0)
return ret;
+ if (ret != TPM_NONCE_SIZE)
+ return -EIO;
+
tpm_buf_reset(tb, TPM_TAG_RQU_COMMAND, TPM_ORD_OSAP);
tpm_buf_append_u16(tb, type);
tpm_buf_append_u32(tb, handle);
@@ -496,8 +499,12 @@ static int tpm_seal(struct tpm_buf *tb, uint16_t keytype,
goto out;
ret = tpm_get_random(chip, td->nonceodd, TPM_NONCE_SIZE);
+ if (ret < 0)
+ return ret;
+
if (ret != TPM_NONCE_SIZE)
- goto out;
+ return -EIO;
+
ordinal = htonl(TPM_ORD_SEAL);
datsize = htonl(datalen);
pcrsize = htonl(pcrinfosize);
@@ -601,9 +608,12 @@ static int tpm_unseal(struct tpm_buf *tb,
ordinal = htonl(TPM_ORD_UNSEAL);
ret = tpm_get_random(chip, nonceodd, TPM_NONCE_SIZE);
+ if (ret < 0)
+ return ret;
+
if (ret != TPM_NONCE_SIZE) {
pr_info("trusted_key: tpm_get_random failed (%d)\n", ret);
- return ret;
+ return -EIO;
}
ret = TSS_authhmac(authdata1, keyauth, TPM_NONCE_SIZE,
enonce1, nonceodd, cont, sizeof(uint32_t),
@@ -1013,8 +1023,12 @@ static int trusted_instantiate(struct key *key,
case Opt_new:
key_len = payload->key_len;
ret = tpm_get_random(chip, payload->key, key_len);
+ if (ret < 0)
+ goto out;
+
if (ret != key_len) {
pr_info("trusted_key: key_create failed (%d)\n", ret);
+ ret = -EIO;
goto out;
}
if (tpm2)
--
2.25.1
The patch titled
Subject: mm/vmscan: fix NR_ISOLATED_FILE corruption on 64-bit
has been added to the -mm tree. Its filename is
mm-vmscan-fix-nr_isolated_file-corruption-on-64-bit.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-vmscan-fix-nr_isolated_file-co…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-vmscan-fix-nr_isolated_file-co…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Nicholas Piggin <npiggin(a)gmail.com>
Subject: mm/vmscan: fix NR_ISOLATED_FILE corruption on 64-bit
Previously the negated unsigned long would be cast back to signed long
which would have the correct negative value. After commit 730ec8c01a2b
("mm/vmscan.c: change prototype for shrink_page_list"), the large unsigned
int converts to a large positive signed long.
Symptoms include CMA allocations hanging forever holding the cma_mutex due
to alloc_contig_range->...->isolate_migratepages_block waiting forever in
"while (unlikely(too_many_isolated(pgdat)))".
Link: https://lkml.kernel.org/r/20201029032320.1448441-1-npiggin@gmail.com
Fixes: 730ec8c01a2b ("mm/vmscan.c: change prototype for shrink_page_list")
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Maninder Singh <maninder1.s(a)samsung.com>
Cc: Vaneet Narang <v.narang(a)samsung.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Amit Sahrawat <a.sahrawat(a)samsung.com>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmscan.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/vmscan.c~mm-vmscan-fix-nr_isolated_file-corruption-on-64-bit
+++ a/mm/vmscan.c
@@ -1516,7 +1516,7 @@ unsigned int reclaim_clean_pages_from_li
nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
TTU_IGNORE_ACCESS, &stat, true);
list_splice(&clean_pages, page_list);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed);
/*
* Since lazyfree pages are isolated from file LRU from the beginning,
* they will rotate back to anonymous LRU in the end if it failed to
_
Patches currently in -mm which might be from npiggin(a)gmail.com are
mm-vmscan-fix-nr_isolated_file-corruption-on-64-bit.patch
Hi Jian,
On Thu, Oct 29, 2020 at 06:13:07PM -0700, 'Jian Cai' via Clang Built Linux wrote:
> Thanks @Nick Desaulniers <ndesaulniers(a)google.com> and @Sasha Levin
> <sashal(a)kernel.org> for the tips. For this particular change, it seems we
> do not need to backport all the dependencies (if they have not been merged
> into 5.4 stable). @Greg KH <gregkh(a)linuxfoundation.org>, please find the
> custom backport as below. It has passed all the tests on ChromeOS (
> http://crrev.com/c/2504570).
>
> Thanks,
> Jian
The patch below won't apply because it appears to have been copy-pasted into
this message:
Applying: Backport 44623b2818f4a442726639572f44fd9b6d0ef68c to kernel 5.4
error: git diff header lacks filename information when removing 1 leading pathname component (line 6)
Patch failed at 0001 Backport 44623b2818f4a442726639572f44fd9b6d0ef68c to kernel 5.4
hint: Use 'git am --show-current-patch=diff' to see the failed patch
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".
I would recommend resending the patch with git send-email or attaching
the patch file created by 'git format-patch -1' to a future email for
proper application.
> From 60891062750a39d8bba9710d500e381a26c11f75 Mon Sep 17 00:00:00 2001
> From: Jian Cai <jiancai(a)google.com>
> Date: Thu, 29 Oct 2020 17:49:39 -0700
Authorship and date should be fixed to retain the information of the
original commit.
It is trivial to just redo the cherry-pick to fix that information in
this instance but this is the command I usually run for more non-trivial
backports that I have already done:
$ git commit -s --amend -C <sha> --date "$(git show -s --format=%ai <sha>)"
This should allow you to retain the commit message of the original
commit along with the author's date.
> Subject: [PATCH] crypto: x86/crc32c - fix building with clang ias
>
> commit 44623b2818f4a442726639572f44fd9b6d0ef68c upstream
>
> The clang integrated assembler complains about movzxw:
>
> arch/x86/crypto/crc32c-pcl-intel-asm_64.S:173:2: error: invalid instruction
> mnemonic 'movzxw'
>
> It seems that movzwq is the mnemonic that it expects instead,
> and this is what objdump prints when disassembling the file.
>
> NOTE: this is a custom backport as the surrounding code has been
> changed upstream.
A note of this nature is traditionally placed after the signoffs of the
original patch like my example below:
> Fixes: 6a8ce1ef3940 ("crypto: crc32c - Optimize CRC32C calculation with
> PCLMULQDQ instruction")
> Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
> Reviewed-by: Nathan Chancellor <natechancellor(a)gmail.com>
> Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
[jc: Backport to 5.4]
> Signed-off-by: Jian Cai <caij2003(a)gmail.com>
I usually like to notate why the patch did not apply cleanly so that it
is easier for others to audit, such as:
[jc: Fixed conflicts due to lack of 34fdce6981b969]
That is merely a suggestion, not required by any means.
Otherwise, the backport seems obviously fine to me.
Cheers,
Nathan
> ---
> arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
> b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
> index d9b734d0c8cc..3c6e01520a97 100644
> --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
> +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
> @@ -170,7 +170,7 @@ continue_block:
>
> ## branch into array
> lea jump_table(%rip), bufp
> - movzxw (bufp, %rax, 2), len
> + movzwq (bufp, %rax, 2), len
> lea crc_array(%rip), bufp
> lea (bufp, len, 1), bufp
> JMP_NOSPEC bufp
> --
> 2.29.1.341.ge80a0c044ae-goog
On 29 Oct 2020, at 17:14, Yang Shi wrote:
> On Thu, Oct 29, 2020 at 1:04 PM Zi Yan <zi.yan(a)sent.com> wrote:
>>
>> From: Zi Yan <ziy(a)nvidia.com>
>>
>> In isolate_migratepages_block, when cc->alloc_contig is true, we are
>> able to isolate compound pages, nr_migratepages and nr_isolated did not
>> count compound pages correctly, causing us to isolate more pages than we
>> thought. Use thp_nr_pages to count pages. Otherwise, we might be trapped
>> in too_many_isolated while loop, since the actual isolated pages can go
>> up to COMPACT_CLUSTER_MAX*512=16384, where COMPACT_CLUSTER_MAX is 32,
>
> Is it that easy to run into? 16384 doesn't seem like too many pages, just 64MB.
I hit this when I was running oom01 from ltp to test my PUD THP patchset, which
allocates PUD THPs from CMA regions and splits them into PMD THPs due to memory
pressure. I am not sure whether it is common in the upstream kernel for PMD THPs
to be allocated in CMA regions due to allocation fallback.
>
>> since we stop isolation after cc->nr_migratepages reaches to
>> COMPACT_CLUSTER_MAX.
>>
>> In addition, after we fix the issue above, cc->nr_migratepages could
>> never be equal to COMPACT_CLUSTER_MAX if compound pages are isolated,
>> thus page isolation could not stop as we intended. Change the isolation
>> stop condition to >=.
>
> The fix looks sane to me. Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Thanks.
>
> Shall you add Fixes tag to commit
> 1da2f328fa643bd72197dfed0c655148af31e4eb? And may cc stable.
Sure.
Fixes: 1da2f328fa64 ("mm,thp,compaction,cma: allow THP migration for CMA allocations")
stable cc'ed.
>
>>
>> Signed-off-by: Zi Yan <ziy(a)nvidia.com>
>> ---
>> mm/compaction.c | 8 ++++----
>> 1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index ee1f8439369e..0683a4999581 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -1012,8 +1012,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
>>
>> isolate_success:
>> list_add(&page->lru, &cc->migratepages);
>> - cc->nr_migratepages++;
>> - nr_isolated++;
>> + cc->nr_migratepages += thp_nr_pages(page);
>> + nr_isolated += thp_nr_pages(page);
>>
>> /*
>> * Avoid isolating too much unless this block is being
>> @@ -1021,7 +1021,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
>> * or a lock is contended. For contention, isolate quickly to
>> * potentially remove one source of contention.
>> */
>> - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
>> + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
>> !cc->rescan && !cc->contended) {
>> ++low_pfn;
>> break;
>> @@ -1132,7 +1132,7 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
>> if (!pfn)
>> break;
>>
>> - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
>> + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
>> break;
>> }
>>
>> --
>> 2.28.0
>>
>>
—
Best Regards,
Yan Zi
On Mon, Oct 26, 2020 at 06:17:00PM -0700, Jian Cai wrote:
> Hello,
>
> I am working on assembling kernel 5.4 with LLVM's integrated assembler on
> ChromeOS, and the following patch is required to make it work. Would you
> please consider backporting it to 5.4?
>
>
> commit 44623b2818f4a442726639572f44fd9b6d0ef68c
> Author: Arnd Bergmann <arnd(a)arndb.de>
> Date: Wed May 27 16:17:40 2020 +0200
>
> crypto: x86/crc32c - fix building with clang ias
>
> Link:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
>
It does not apply cleanly; can you please provide a properly backported
and tested version?
thanks,
greg k-h
From: Michael Schaller <misch(a)google.com>
commit 336af6a4686d885a067ecea8c3c3dd129ba4fc75 upstream
Without this patch efivarfs_alloc_dentry creates dentries with slashes in
their name if the respective EFI variable has slashes in its name. This in
turn causes EIO on getdents64, which prevents a complete directory listing
of /sys/firmware/efi/efivars/.
This patch replaces the invalid slashes with exclamation marks like
kobject_set_name_vargs does for /sys/firmware/efi/vars/ to have consistently
named dentries under /sys/firmware/efi/vars/ and /sys/firmware/efi/efivars/.
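For illustration, a sketch with a hypothetical variable name (strreplace() is
from <linux/string.h>; the GUID here is just an example):

	char name[] = "FOO/BAR-8be4df61-93ca-11d2-aa0d-00e098032b8c";

	strreplace(name, '/', '!');
	/* name is now "FOO!BAR-8be4df61-93ca-11d2-aa0d-00e098032b8c",
	 * which is a valid dentry name. */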
Signed-off-by: Michael Schaller <misch(a)google.com>
Link: https://lore.kernel.org/r/20200925074502.150448-1-misch@google.com
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
---
This addresses an issue that breaks Ubuntu installs on platforms that have
variable names as described above. One of our installers needs to sort
the BootOrder to keep BootCurrent at the front, but this fails when the
variable that BootCurrent points at appears not to exist due to this issue.
Ref: https://bugs.launchpad.net/bugs/1899993
fs/efivarfs/super.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 28bb5689333a..15880a68faad 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -141,6 +141,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
+ /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */
+ strreplace(name, '/', '!');
+
inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
is_removable);
if (!inode)
--
2.29.1
Hi-
This commit from v4.4.241 breaks the v4.4.y build for powerpc:
217f139551c0 powerpc/powernv/dump: Fix race while processing OPAL dump
Like this:
.../arch/powerpc/platforms/powernv/opal-dump.c:409:7: error: void value not ignored as it ought to be
dump = create_dump_obj(dump_id, dump_size, dump_type);
^
The commit description says:
"... the return value of create_dump_obj() function isn't being used today ..."
But that's only true for kernels >= v4.19, because they carry this commit:
b29336c0e178 powerpc/powernv/opal-dump : Use IRQ_HANDLED instead of numbers in interrupt handler
In v4.4 process_dump(), the only caller of create_dump_obj(), still tries to
use the return value (see the error above).
Applying "b29336c0e178 powerpc/powernv/opal-dump : Use IRQ_HANDLED ..." to
v4.4.y fixes the problem.
-Kamal
This is a note to let you know that I've just added the patch titled
coresight: cti: Initialize dynamic sysfs attributes
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 80624263fa289b3416f7ca309491f1b75e579477 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose(a)arm.com>
Date: Thu, 29 Oct 2020 10:45:58 -0600
Subject: coresight: cti: Initialize dynamic sysfs attributes
With LOCKDEP enabled, CTI driver triggers the following splat due
to uninitialized lock class for dynamically allocated attribute
objects.
[ 5.372901] coresight etm0: CPU0: ETM v4.0 initialized
[ 5.376694] coresight etm1: CPU1: ETM v4.0 initialized
[ 5.380785] coresight etm2: CPU2: ETM v4.0 initialized
[ 5.385851] coresight etm3: CPU3: ETM v4.0 initialized
[ 5.389808] BUG: key ffff00000564a798 has not been registered!
[ 5.392456] ------------[ cut here ]------------
[ 5.398195] DEBUG_LOCKS_WARN_ON(1)
[ 5.398233] WARNING: CPU: 1 PID: 32 at kernel/locking/lockdep.c:4623 lockdep_init_map_waits+0x14c/0x260
[ 5.406149] Modules linked in:
[ 5.415411] CPU: 1 PID: 32 Comm: kworker/1:1 Not tainted 5.9.0-12034-gbbe85027ce80 #51
[ 5.418553] Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
[ 5.426453] Workqueue: events amba_deferred_retry_func
[ 5.433299] pstate: 40000005 (nZcv daif -PAN -UAO -TCO BTYPE=--)
[ 5.438252] pc : lockdep_init_map_waits+0x14c/0x260
[ 5.444410] lr : lockdep_init_map_waits+0x14c/0x260
[ 5.449007] sp : ffff800012bbb720
...
[ 5.531561] Call trace:
[ 5.536847] lockdep_init_map_waits+0x14c/0x260
[ 5.539027] __kernfs_create_file+0xa8/0x1c8
[ 5.543539] sysfs_add_file_mode_ns+0xd0/0x208
[ 5.548054] internal_create_group+0x118/0x3c8
[ 5.552307] internal_create_groups+0x58/0xb8
[ 5.556733] sysfs_create_groups+0x2c/0x38
[ 5.561160] device_add+0x2d8/0x768
[ 5.565148] device_register+0x28/0x38
[ 5.568537] coresight_register+0xf8/0x320
[ 5.572358] cti_probe+0x1b0/0x3f0
...
Fix this by initializing the attributes when they are allocated.
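For illustration, the general pattern for dynamically allocated sysfs
attributes (hypothetical names, not the CTI code itself):

	/* With CONFIG_DEBUG_LOCK_ALLOC, every sysfs attribute needs a
	 * registered lockdep key; sysfs_attr_init() provides one for
	 * attributes allocated at runtime rather than defined statically. */
	struct dev_ext_attribute *eattr;

	eattr = devm_kzalloc(dev, sizeof(*eattr), GFP_KERNEL);
	if (!eattr)
		return -ENOMEM;
	eattr->attr.attr.name = devm_kstrdup(dev, name, GFP_KERNEL);
	eattr->attr.attr.mode = 0444;
	sysfs_attr_init(&eattr->attr.attr);	/* register the lockdep key */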
Fixes: 3c5597e39812 ("coresight: cti: Add connection information to sysfs")
Reported-by: Leo Yan <leo.yan(a)linaro.org>
Tested-by: Leo Yan <leo.yan(a)linaro.org>
Cc: Mike Leach <mike.leach(a)linaro.org>
Cc: Mathieu Poirier <mathieu.poirier(a)linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose(a)arm.com>
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Mathieu Poirier <mathieu.poirier(a)linaro.org>
Link: https://lore.kernel.org/r/20201029164559.1268531-2-mathieu.poirier@linaro.o…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/hwtracing/coresight/coresight-cti-sysfs.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/hwtracing/coresight/coresight-cti-sysfs.c b/drivers/hwtracing/coresight/coresight-cti-sysfs.c
index 392757f3a019..7ff7e7780bbf 100644
--- a/drivers/hwtracing/coresight/coresight-cti-sysfs.c
+++ b/drivers/hwtracing/coresight/coresight-cti-sysfs.c
@@ -1065,6 +1065,13 @@ static int cti_create_con_sysfs_attr(struct device *dev,
}
eattr->var = con;
con->con_attrs[attr_idx] = &eattr->attr.attr;
+ /*
+ * Initialize the dynamically allocated attribute
+ * to avoid LOCKDEP splat. See include/linux/sysfs.h
+ * for more details.
+ */
+ sysfs_attr_init(con->con_attrs[attr_idx]);
+
return 0;
}
--
2.29.1
Hi.
On Mon, Oct 26, 2020 at 04:13:26PM -0700, Roman Gushchin <guro(a)fb.com> wrote:
> Please note, that in the non-hierarchical mode all objcgs are always
> reparented to the root memory cgroup, even if the hierarchy has more
> than 1 level. This patch doesn't change it.
>
> The patch also doesn't affect how the hierarchical mode is working,
> which is the only sane and truly supported mode now.
I agree with the patch and you can add
Reviewed-by: Michal Koutný <mkoutny(a)suse.com>
However, it effectively switches any users of root.use_hierarchy=0 (if there
are any, watching the counters of root memcg) into root.use_hierarchy=1.
So I'd show them the warning even with a single level of cgroups, i.e.
add this hunk
@@ -5356,12 +5356,11 @@
page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
/*
- * Deeper hierachy with use_hierarchy == false doesn't make
+ * Hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
* unfortunate state in our controller.
*/
- if (parent != root_mem_cgroup)
- memory_cgrp_subsys.broken_hierarchy = true;
+ memory_cgrp_subsys.broken_hierarchy = true;
}
/* The following stuff does not apply to the root */
What do you think?
Michal
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5da8e4a658109e3b7e1f45ae672b7c06ac3e7158 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Mon, 5 Oct 2020 20:40:25 -0700
Subject: [PATCH] x86/copy_mc: Introduce copy_mc_enhanced_fast_string()
The motivations to go rework memcpy_mcsafe() are that the benefit of
doing slow and careful copies is obviated on newer CPUs, and that the
current opt-in list of CPUs to instrument recovery is broken relative to
those CPUs. There is no need to keep an opt-in list up to date on an
ongoing basis if pmem/dax operations are instrumented for recovery by
default. With recovery enabled by default the old "mcsafe_key" opt-in to
careful copying can be made a "fragile" opt-out. Where the "fragile"
list takes steps to not consume poison across cachelines.
The discussion with Linus made clear that the current "_mcsafe" suffix
was imprecise to a fault. The operations that are needed by pmem/dax are
to copy from a source address that might throw #MC to a destination that
may write-fault, if it is a user page.
So copy_to_user_mcsafe() becomes copy_mc_to_user() to indicate
the separate precautions taken on source and destination.
copy_mc_to_kernel() is introduced as a non-SMAP version that does not
expect write-faults on the destination, but is still prepared to abort
with an error code upon taking #MC.
The original copy_mc_fragile() implementation had negative performance
implications since it did not use the fast-string instruction sequence
to perform copies. For this reason copy_mc_to_kernel() fell back to
plain memcpy() to preserve performance on platforms that did not indicate
the capability to recover from machine check exceptions. However, that
capability detection was not architectural and now that some platforms
can recover from fast-string consumption of memory errors the memcpy()
fallback now causes these more capable platforms to fail.
Introduce copy_mc_enhanced_fast_string() as the fast default
implementation of copy_mc_to_kernel() and finalize the transition of
copy_mc_fragile() to be a platform quirk to indicate 'copy-carefully'.
With this in place, copy_mc_to_kernel() is fast and recovery-ready by
default regardless of hardware capability.
Thanks to Vivek for identifying that copy_user_generic() is not suitable
as the copy_mc_to_user() backend since the #MC handler explicitly checks
ex_has_fault_handler(). Thanks to the 0day robot for catching a
performance bug in the x86/copy_mc_to_user implementation.
[ bp: Add the "why" for this change from the 0/2th message, massage. ]
Fixes: 92b0729c34ca ("x86/mm, x86/mce: Add memcpy_mcsafe()")
Reported-by: Erwin Tsaur <erwin.tsaur(a)intel.com>
Reported-by: 0day robot <lkp(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Tony Luck <tony.luck(a)intel.com>
Tested-by: Erwin Tsaur <erwin.tsaur(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/160195562556.2163339.18063423034951948973.stgit@d…
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
index 2633635530b7..c13e8c9ee926 100644
--- a/arch/x86/lib/copy_mc.c
+++ b/arch/x86/lib/copy_mc.c
@@ -45,6 +45,8 @@ void enable_copy_mc_fragile(void)
#define copy_mc_fragile_enabled (0)
#endif
+unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len);
+
/**
* copy_mc_to_kernel - memory copy that handles source exceptions
*
@@ -52,9 +54,11 @@ void enable_copy_mc_fragile(void)
* @src: source address
* @len: number of bytes to copy
*
- * Call into the 'fragile' version on systems that have trouble
- * actually do machine check recovery. Everyone else can just
- * use memcpy().
+ * Call into the 'fragile' version on systems that benefit from avoiding
+ * corner case poison consumption scenarios, For example, accessing
+ * poison across 2 cachelines with a single instruction. Almost all
+ * other uses case can use copy_mc_enhanced_fast_string() for a fast
+ * recoverable copy, or fallback to plain memcpy.
*
* Return 0 for success, or number of bytes not copied if there was an
* exception.
@@ -63,6 +67,8 @@ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigne
{
if (copy_mc_fragile_enabled)
return copy_mc_fragile(dst, src, len);
+ if (static_cpu_has(X86_FEATURE_ERMS))
+ return copy_mc_enhanced_fast_string(dst, src, len);
memcpy(dst, src, len);
return 0;
}
@@ -72,11 +78,19 @@ unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned
{
unsigned long ret;
- if (!copy_mc_fragile_enabled)
- return copy_user_generic(dst, src, len);
+ if (copy_mc_fragile_enabled) {
+ __uaccess_begin();
+ ret = copy_mc_fragile(dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
+
+ if (static_cpu_has(X86_FEATURE_ERMS)) {
+ __uaccess_begin();
+ ret = copy_mc_enhanced_fast_string(dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
- __uaccess_begin();
- ret = copy_mc_fragile(dst, src, len);
- __uaccess_end();
- return ret;
+ return copy_user_generic(dst, src, len);
}
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
index c3b613c4544a..892d8915f609 100644
--- a/arch/x86/lib/copy_mc_64.S
+++ b/arch/x86/lib/copy_mc_64.S
@@ -124,4 +124,40 @@ EXPORT_SYMBOL_GPL(copy_mc_fragile)
_ASM_EXTABLE(.L_write_words, .E_write_words)
_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif /* CONFIG_X86_MCE */
+
+/*
+ * copy_mc_enhanced_fast_string - memory copy with exception handling
+ *
+ * Fast string copy + fault / exception handling. If the CPU does
+ * support machine check exception recovery, but does not support
+ * recovering from fast-string exceptions then this CPU needs to be
+ * added to the copy_mc_fragile_key set of quirks. Otherwise, absent any
+ * machine check recovery support this version should be no slower than
+ * standard memcpy.
+ */
+SYM_FUNC_START(copy_mc_enhanced_fast_string)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+.L_copy:
+ rep movsb
+ /* Copy successful. Return zero */
+ xorl %eax, %eax
+ ret
+SYM_FUNC_END(copy_mc_enhanced_fast_string)
+
+ .section .fixup, "ax"
+.E_copy:
+ /*
+ * On fault %rcx is updated such that the copy instruction could
+ * optionally be restarted at the fault position, i.e. it
+ * contains 'bytes remaining'. A non-zero return indicates error
+ * to copy_mc_generic() users, or indicate short transfers to
+ * user-copy routines.
+ */
+ movq %rcx, %rax
+ ret
+
+ .previous
+
+ _ASM_EXTABLE_FAULT(.L_copy, .E_copy)
#endif /* !CONFIG_UML */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 893f021fec63..b3e4efcf7ca6 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -550,6 +550,7 @@ static const char *uaccess_safe_builtin[] = {
"csum_partial_copy_generic",
"copy_mc_fragile",
"copy_mc_fragile_handle_tail",
+ "copy_mc_enhanced_fast_string",
"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
NULL
};
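For illustration, a minimal sketch of how a pmem/dax-style caller uses the
reworked API described in the patch above (dst, src and len are hypothetical):

	unsigned long rem;

	rem = copy_mc_to_kernel(dst, src, len);	/* 0 on success */
	if (rem)
		return -EIO;	/* 'rem' trailing bytes were not copied due to #MC */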
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5da8e4a658109e3b7e1f45ae672b7c06ac3e7158 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Mon, 5 Oct 2020 20:40:25 -0700
Subject: [PATCH] x86/copy_mc: Introduce copy_mc_enhanced_fast_string()
The motivations to go rework memcpy_mcsafe() are that the benefit of
doing slow and careful copies is obviated on newer CPUs, and that the
current opt-in list of CPUs to instrument recovery is broken relative to
those CPUs. There is no need to keep an opt-in list up to date on an
ongoing basis if pmem/dax operations are instrumented for recovery by
default. With recovery enabled by default the old "mcsafe_key" opt-in to
careful copying can be made a "fragile" opt-out. Where the "fragile"
list takes steps to not consume poison across cachelines.
The discussion with Linus made clear that the current "_mcsafe" suffix
was imprecise to a fault. The operations that are needed by pmem/dax are
to copy from a source address that might throw #MC to a destination that
may write-fault, if it is a user page.
So copy_to_user_mcsafe() becomes copy_mc_to_user() to indicate
the separate precautions taken on source and destination.
copy_mc_to_kernel() is introduced as a non-SMAP version that does not
expect write-faults on the destination, but is still prepared to abort
with an error code upon taking #MC.
The original copy_mc_fragile() implementation had negative performance
implications since it did not use the fast-string instruction sequence
to perform copies. For this reason copy_mc_to_kernel() fell back to
plain memcpy() to preserve performance on platforms that did not indicate
the capability to recover from machine check exceptions. However, that
capability detection was not architectural and now that some platforms
can recover from fast-string consumption of memory errors the memcpy()
fallback now causes these more capable platforms to fail.
Introduce copy_mc_enhanced_fast_string() as the fast default
implementation of copy_mc_to_kernel() and finalize the transition of
copy_mc_fragile() to be a platform quirk to indicate 'copy-carefully'.
With this in place, copy_mc_to_kernel() is fast and recovery-ready by
default regardless of hardware capability.
Thanks to Vivek for identifying that copy_user_generic() is not suitable
as the copy_mc_to_user() backend since the #MC handler explicitly checks
ex_has_fault_handler(). Thanks to the 0day robot for catching a
performance bug in the x86/copy_mc_to_user implementation.
[ bp: Add the "why" for this change from the 0/2th message, massage. ]
Fixes: 92b0729c34ca ("x86/mm, x86/mce: Add memcpy_mcsafe()")
Reported-by: Erwin Tsaur <erwin.tsaur(a)intel.com>
Reported-by: 0day robot <lkp(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Tony Luck <tony.luck(a)intel.com>
Tested-by: Erwin Tsaur <erwin.tsaur(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/160195562556.2163339.18063423034951948973.stgit@d…
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
index 2633635530b7..c13e8c9ee926 100644
--- a/arch/x86/lib/copy_mc.c
+++ b/arch/x86/lib/copy_mc.c
@@ -45,6 +45,8 @@ void enable_copy_mc_fragile(void)
#define copy_mc_fragile_enabled (0)
#endif
+unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len);
+
/**
* copy_mc_to_kernel - memory copy that handles source exceptions
*
@@ -52,9 +54,11 @@ void enable_copy_mc_fragile(void)
* @src: source address
* @len: number of bytes to copy
*
- * Call into the 'fragile' version on systems that have trouble
- * actually do machine check recovery. Everyone else can just
- * use memcpy().
+ * Call into the 'fragile' version on systems that benefit from avoiding
+ * corner case poison consumption scenarios, For example, accessing
+ * poison across 2 cachelines with a single instruction. Almost all
+ * other uses case can use copy_mc_enhanced_fast_string() for a fast
+ * recoverable copy, or fallback to plain memcpy.
*
* Return 0 for success, or number of bytes not copied if there was an
* exception.
@@ -63,6 +67,8 @@ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigne
{
if (copy_mc_fragile_enabled)
return copy_mc_fragile(dst, src, len);
+ if (static_cpu_has(X86_FEATURE_ERMS))
+ return copy_mc_enhanced_fast_string(dst, src, len);
memcpy(dst, src, len);
return 0;
}
@@ -72,11 +78,19 @@ unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned
{
unsigned long ret;
- if (!copy_mc_fragile_enabled)
- return copy_user_generic(dst, src, len);
+ if (copy_mc_fragile_enabled) {
+ __uaccess_begin();
+ ret = copy_mc_fragile(dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
+
+ if (static_cpu_has(X86_FEATURE_ERMS)) {
+ __uaccess_begin();
+ ret = copy_mc_enhanced_fast_string(dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
- __uaccess_begin();
- ret = copy_mc_fragile(dst, src, len);
- __uaccess_end();
- return ret;
+ return copy_user_generic(dst, src, len);
}
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
index c3b613c4544a..892d8915f609 100644
--- a/arch/x86/lib/copy_mc_64.S
+++ b/arch/x86/lib/copy_mc_64.S
@@ -124,4 +124,40 @@ EXPORT_SYMBOL_GPL(copy_mc_fragile)
_ASM_EXTABLE(.L_write_words, .E_write_words)
_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif /* CONFIG_X86_MCE */
+
+/*
+ * copy_mc_enhanced_fast_string - memory copy with exception handling
+ *
+ * Fast string copy + fault / exception handling. If the CPU does
+ * support machine check exception recovery, but does not support
+ * recovering from fast-string exceptions then this CPU needs to be
+ * added to the copy_mc_fragile_key set of quirks. Otherwise, absent any
+ * machine check recovery support this version should be no slower than
+ * standard memcpy.
+ */
+SYM_FUNC_START(copy_mc_enhanced_fast_string)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+.L_copy:
+ rep movsb
+ /* Copy successful. Return zero */
+ xorl %eax, %eax
+ ret
+SYM_FUNC_END(copy_mc_enhanced_fast_string)
+
+ .section .fixup, "ax"
+.E_copy:
+ /*
+ * On fault %rcx is updated such that the copy instruction could
+ * optionally be restarted at the fault position, i.e. it
+ * contains 'bytes remaining'. A non-zero return indicates error
+ * to copy_mc_generic() users, or indicate short transfers to
+ * user-copy routines.
+ */
+ movq %rcx, %rax
+ ret
+
+ .previous
+
+ _ASM_EXTABLE_FAULT(.L_copy, .E_copy)
#endif /* !CONFIG_UML */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 893f021fec63..b3e4efcf7ca6 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -550,6 +550,7 @@ static const char *uaccess_safe_builtin[] = {
"csum_partial_copy_generic",
"copy_mc_fragile",
"copy_mc_fragile_handle_tail",
+ "copy_mc_enhanced_fast_string",
"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
NULL
};
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5da8e4a658109e3b7e1f45ae672b7c06ac3e7158 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Mon, 5 Oct 2020 20:40:25 -0700
Subject: [PATCH] x86/copy_mc: Introduce copy_mc_enhanced_fast_string()
The motivations to go rework memcpy_mcsafe() are that the benefit of
doing slow and careful copies is obviated on newer CPUs, and that the
current opt-in list of CPUs to instrument recovery is broken relative to
those CPUs. There is no need to keep an opt-in list up to date on an
ongoing basis if pmem/dax operations are instrumented for recovery by
default. With recovery enabled by default the old "mcsafe_key" opt-in to
careful copying can be made a "fragile" opt-out. Where the "fragile"
list takes steps to not consume poison across cachelines.
The discussion with Linus made clear that the current "_mcsafe" suffix
was imprecise to a fault. The operations that are needed by pmem/dax are
to copy from a source address that might throw #MC to a destination that
may write-fault, if it is a user page.
So copy_to_user_mcsafe() becomes copy_mc_to_user() to indicate
the separate precautions taken on source and destination.
copy_mc_to_kernel() is introduced as a non-SMAP version that does not
expect write-faults on the destination, but is still prepared to abort
with an error code upon taking #MC.
The original copy_mc_fragile() implementation had negative performance
implications since it did not use the fast-string instruction sequence
to perform copies. For this reason copy_mc_to_kernel() fell back to
plain memcpy() to preserve performance on platforms that did not indicate
the capability to recover from machine check exceptions. However, that
capability detection was not architectural and now that some platforms
can recover from fast-string consumption of memory errors the memcpy()
fallback now causes these more capable platforms to fail.
Introduce copy_mc_enhanced_fast_string() as the fast default
implementation of copy_mc_to_kernel() and finalize the transition of
copy_mc_fragile() to be a platform quirk to indicate 'copy-carefully'.
With this in place, copy_mc_to_kernel() is fast and recovery-ready by
default regardless of hardware capability.
Thanks to Vivek for identifying that copy_user_generic() is not suitable
as the copy_mc_to_user() backend since the #MC handler explicitly checks
ex_has_fault_handler(). Thanks to the 0day robot for catching a
performance bug in the x86/copy_mc_to_user implementation.
[ bp: Add the "why" for this change from the 0/2th message, massage. ]
Fixes: 92b0729c34ca ("x86/mm, x86/mce: Add memcpy_mcsafe()")
Reported-by: Erwin Tsaur <erwin.tsaur(a)intel.com>
Reported-by: 0day robot <lkp(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Tony Luck <tony.luck(a)intel.com>
Tested-by: Erwin Tsaur <erwin.tsaur(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/160195562556.2163339.18063423034951948973.stgit@d…
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
index 2633635530b7..c13e8c9ee926 100644
--- a/arch/x86/lib/copy_mc.c
+++ b/arch/x86/lib/copy_mc.c
@@ -45,6 +45,8 @@ void enable_copy_mc_fragile(void)
#define copy_mc_fragile_enabled (0)
#endif
+unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len);
+
/**
* copy_mc_to_kernel - memory copy that handles source exceptions
*
@@ -52,9 +54,11 @@ void enable_copy_mc_fragile(void)
* @src: source address
* @len: number of bytes to copy
*
- * Call into the 'fragile' version on systems that have trouble
- * actually do machine check recovery. Everyone else can just
- * use memcpy().
+ * Call into the 'fragile' version on systems that benefit from avoiding
+ * corner case poison consumption scenarios, For example, accessing
+ * poison across 2 cachelines with a single instruction. Almost all
+ * other uses case can use copy_mc_enhanced_fast_string() for a fast
+ * recoverable copy, or fallback to plain memcpy.
*
* Return 0 for success, or number of bytes not copied if there was an
* exception.
@@ -63,6 +67,8 @@ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigne
{
if (copy_mc_fragile_enabled)
return copy_mc_fragile(dst, src, len);
+ if (static_cpu_has(X86_FEATURE_ERMS))
+ return copy_mc_enhanced_fast_string(dst, src, len);
memcpy(dst, src, len);
return 0;
}
@@ -72,11 +78,19 @@ unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned
{
unsigned long ret;
- if (!copy_mc_fragile_enabled)
- return copy_user_generic(dst, src, len);
+ if (copy_mc_fragile_enabled) {
+ __uaccess_begin();
+ ret = copy_mc_fragile(dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
+
+ if (static_cpu_has(X86_FEATURE_ERMS)) {
+ __uaccess_begin();
+ ret = copy_mc_enhanced_fast_string(dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
- __uaccess_begin();
- ret = copy_mc_fragile(dst, src, len);
- __uaccess_end();
- return ret;
+ return copy_user_generic(dst, src, len);
}
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
index c3b613c4544a..892d8915f609 100644
--- a/arch/x86/lib/copy_mc_64.S
+++ b/arch/x86/lib/copy_mc_64.S
@@ -124,4 +124,40 @@ EXPORT_SYMBOL_GPL(copy_mc_fragile)
_ASM_EXTABLE(.L_write_words, .E_write_words)
_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif /* CONFIG_X86_MCE */
+
+/*
+ * copy_mc_enhanced_fast_string - memory copy with exception handling
+ *
+ * Fast string copy + fault / exception handling. If the CPU does
+ * support machine check exception recovery, but does not support
+ * recovering from fast-string exceptions then this CPU needs to be
+ * added to the copy_mc_fragile_key set of quirks. Otherwise, absent any
+ * machine check recovery support this version should be no slower than
+ * standard memcpy.
+ */
+SYM_FUNC_START(copy_mc_enhanced_fast_string)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+.L_copy:
+ rep movsb
+ /* Copy successful. Return zero */
+ xorl %eax, %eax
+ ret
+SYM_FUNC_END(copy_mc_enhanced_fast_string)
+
+ .section .fixup, "ax"
+.E_copy:
+ /*
+ * On fault %rcx is updated such that the copy instruction could
+ * optionally be restarted at the fault position, i.e. it
+ * contains 'bytes remaining'. A non-zero return indicates error
+ * to copy_mc_generic() users, or indicate short transfers to
+ * user-copy routines.
+ */
+ movq %rcx, %rax
+ ret
+
+ .previous
+
+ _ASM_EXTABLE_FAULT(.L_copy, .E_copy)
#endif /* !CONFIG_UML */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 893f021fec63..b3e4efcf7ca6 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -550,6 +550,7 @@ static const char *uaccess_safe_builtin[] = {
"csum_partial_copy_generic",
"copy_mc_fragile",
"copy_mc_fragile_handle_tail",
+ "copy_mc_enhanced_fast_string",
"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
NULL
};
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5da8e4a658109e3b7e1f45ae672b7c06ac3e7158 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Mon, 5 Oct 2020 20:40:25 -0700
Subject: [PATCH] x86/copy_mc: Introduce copy_mc_enhanced_fast_string()
The motivations to go rework memcpy_mcsafe() are that the benefit of
doing slow and careful copies is obviated on newer CPUs, and that the
current opt-in list of CPUs to instrument recovery is broken relative to
those CPUs. There is no need to keep an opt-in list up to date on an
ongoing basis if pmem/dax operations are instrumented for recovery by
default. With recovery enabled by default the old "mcsafe_key" opt-in to
careful copying can be made a "fragile" opt-out. Where the "fragile"
list takes steps to not consume poison across cachelines.
The discussion with Linus made clear that the current "_mcsafe" suffix
was imprecise to a fault. The operations that are needed by pmem/dax are
to copy from a source address that might throw #MC to a destination that
may write-fault, if it is a user page.
So copy_to_user_mcsafe() becomes copy_mc_to_user() to indicate
the separate precautions taken on source and destination.
copy_mc_to_kernel() is introduced as a non-SMAP version that does not
expect write-faults on the destination, but is still prepared to abort
with an error code upon taking #MC.
The original copy_mc_fragile() implementation had negative performance
implications since it did not use the fast-string instruction sequence
to perform copies. For this reason copy_mc_to_kernel() fell back to
plain memcpy() to preserve performance on platforms that did not indicate
the capability to recover from machine check exceptions. However, that
capability detection was not architectural and now that some platforms
can recover from fast-string consumption of memory errors the memcpy()
fallback now causes these more capable platforms to fail.
Introduce copy_mc_enhanced_fast_string() as the fast default
implementation of copy_mc_to_kernel() and finalize the transition of
copy_mc_fragile() to be a platform quirk to indicate 'copy-carefully'.
With this in place, copy_mc_to_kernel() is fast and recovery-ready by
default regardless of hardware capability.
Thanks to Vivek for identifying that copy_user_generic() is not suitable
as the copy_mc_to_user() backend since the #MC handler explicitly checks
ex_has_fault_handler(). Thanks to the 0day robot for catching a
performance bug in the x86/copy_mc_to_user implementation.
[ bp: Add the "why" for this change from the 0/2th message, massage. ]
Fixes: 92b0729c34ca ("x86/mm, x86/mce: Add memcpy_mcsafe()")
Reported-by: Erwin Tsaur <erwin.tsaur(a)intel.com>
Reported-by: 0day robot <lkp(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Reviewed-by: Tony Luck <tony.luck(a)intel.com>
Tested-by: Erwin Tsaur <erwin.tsaur(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/160195562556.2163339.18063423034951948973.stgit@d…
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
index 2633635530b7..c13e8c9ee926 100644
--- a/arch/x86/lib/copy_mc.c
+++ b/arch/x86/lib/copy_mc.c
@@ -45,6 +45,8 @@ void enable_copy_mc_fragile(void)
#define copy_mc_fragile_enabled (0)
#endif
+unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned len);
+
/**
* copy_mc_to_kernel - memory copy that handles source exceptions
*
@@ -52,9 +54,11 @@ void enable_copy_mc_fragile(void)
* @src: source address
* @len: number of bytes to copy
*
- * Call into the 'fragile' version on systems that have trouble
- * actually do machine check recovery. Everyone else can just
- * use memcpy().
+ * Call into the 'fragile' version on systems that benefit from avoiding
+ * corner case poison consumption scenarios, For example, accessing
+ * poison across 2 cachelines with a single instruction. Almost all
+ * other uses case can use copy_mc_enhanced_fast_string() for a fast
+ * recoverable copy, or fallback to plain memcpy.
*
* Return 0 for success, or number of bytes not copied if there was an
* exception.
@@ -63,6 +67,8 @@ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigne
{
if (copy_mc_fragile_enabled)
return copy_mc_fragile(dst, src, len);
+ if (static_cpu_has(X86_FEATURE_ERMS))
+ return copy_mc_enhanced_fast_string(dst, src, len);
memcpy(dst, src, len);
return 0;
}
@@ -72,11 +78,19 @@ unsigned long __must_check copy_mc_to_user(void *dst, const void *src, unsigned
{
unsigned long ret;
- if (!copy_mc_fragile_enabled)
- return copy_user_generic(dst, src, len);
+ if (copy_mc_fragile_enabled) {
+ __uaccess_begin();
+ ret = copy_mc_fragile(dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
+
+ if (static_cpu_has(X86_FEATURE_ERMS)) {
+ __uaccess_begin();
+ ret = copy_mc_enhanced_fast_string(dst, src, len);
+ __uaccess_end();
+ return ret;
+ }
- __uaccess_begin();
- ret = copy_mc_fragile(dst, src, len);
- __uaccess_end();
- return ret;
+ return copy_user_generic(dst, src, len);
}
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
index c3b613c4544a..892d8915f609 100644
--- a/arch/x86/lib/copy_mc_64.S
+++ b/arch/x86/lib/copy_mc_64.S
@@ -124,4 +124,40 @@ EXPORT_SYMBOL_GPL(copy_mc_fragile)
_ASM_EXTABLE(.L_write_words, .E_write_words)
_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif /* CONFIG_X86_MCE */
+
+/*
+ * copy_mc_enhanced_fast_string - memory copy with exception handling
+ *
+ * Fast string copy + fault / exception handling. If the CPU does
+ * support machine check exception recovery, but does not support
+ * recovering from fast-string exceptions then this CPU needs to be
+ * added to the copy_mc_fragile_key set of quirks. Otherwise, absent any
+ * machine check recovery support this version should be no slower than
+ * standard memcpy.
+ */
+SYM_FUNC_START(copy_mc_enhanced_fast_string)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+.L_copy:
+ rep movsb
+ /* Copy successful. Return zero */
+ xorl %eax, %eax
+ ret
+SYM_FUNC_END(copy_mc_enhanced_fast_string)
+
+ .section .fixup, "ax"
+.E_copy:
+ /*
+ * On fault %rcx is updated such that the copy instruction could
+ * optionally be restarted at the fault position, i.e. it
+ * contains 'bytes remaining'. A non-zero return indicates error
+ * to copy_mc_generic() users, or indicate short transfers to
+ * user-copy routines.
+ */
+ movq %rcx, %rax
+ ret
+
+ .previous
+
+ _ASM_EXTABLE_FAULT(.L_copy, .E_copy)
#endif /* !CONFIG_UML */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 893f021fec63..b3e4efcf7ca6 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -550,6 +550,7 @@ static const char *uaccess_safe_builtin[] = {
"csum_partial_copy_generic",
"copy_mc_fragile",
"copy_mc_fragile_handle_tail",
+ "copy_mc_enhanced_fast_string",
"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
NULL
};
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 18fce56134c987e5b4eceddafdbe4b00c07e2ae1 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz(a)kernel.org>
Date: Thu, 16 Jul 2020 17:11:09 +0100
Subject: [PATCH] arm64: Run ARCH_WORKAROUND_1 enabling code on all CPUs
Commit 73f381660959 ("arm64: Advertise mitigation of Spectre-v2, or lack
thereof") changed the way we deal with ARCH_WORKAROUND_1, by moving most
of the enabling code to the .matches() callback.
This has the unfortunate effect that the workaround gets only enabled on
the first affected CPU, and no other.
In order to address this, forcefully call the .matches() callback from a
.cpu_enable() callback, which brings us back to the original behaviour.
Fixes: 73f381660959 ("arm64: Advertise mitigation of Spectre-v2, or lack thereof")
Cc: <stable(a)vger.kernel.org>
Reviewed-by: Suzuki K Poulose <suzuki.poulose(a)arm.com>
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 88966496806a..3fe64bf5a58d 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -599,6 +599,12 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
return (need_wa > 0);
}
+static void
+cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap)
+{
+ cap->matches(cap, SCOPE_LOCAL_CPU);
+}
+
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
@@ -890,9 +896,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
},
#endif
{
+ .desc = "Branch predictor hardening",
.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
.matches = check_branch_predictor,
+ .cpu_enable = cpu_enable_branch_predictor_hardening,
},
#ifdef CONFIG_RANDOMIZE_BASE
{
On Tue, Oct 27, 2020 at 10:12:49AM -0700, Saeed Mirzamohammadi wrote:
> Hi Greg,
>
> Sorry for the confusion. I’m requesting stable maintainers to cherry-pick this patch into stable 5.4 and 5.8.
> commit cc07057c7c88fb8eff3b1991131ded0f0bcfa7e3
> Author: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
> Date: Wed Oct 21 16:57:58 2020 -0700
>
> video: fbdev: fix divide error in fbcon_switch
I do not see that commit in Linus's tree, do you?
confused,
greg k-h
[ Upstream commit 4e3bbb33e6f36e4b05be1b1b9b02e3dd5aaa3e69 ]
SOCK_TSTAMP_NEW (timespec64 instead of timespec) is also used for
hardware time stamps (configured via SO_TIMESTAMPING_NEW).
User space (ptp4l) first configures hardware time stamping via
SO_TIMESTAMPING_NEW which sets SOCK_TSTAMP_NEW. In the next step, ptp4l
disables SO_TIMESTAMPNS(_NEW) (software time stamps), but this must not
switch hardware time stamps back to "32 bit mode".
This problem happens on 32 bit platforms where the libc has already
switched to struct timespec64 (from SO_TIMExxx_OLD to SO_TIMExxx_NEW
socket options). ptp4l complains with "missing timestamp on transmitted
peer delay request" because the wrong format is received (and
discarded).
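For illustration, the user-space sequence in question (simplified; the _NEW
socket options and SOF_TIMESTAMPING_* flags require recent kernel/libc
headers):

	int flags = SOF_TIMESTAMPING_TX_HARDWARE | SOF_TIMESTAMPING_RX_HARDWARE |
		    SOF_TIMESTAMPING_RAW_HARDWARE;
	int off = 0;

	/* Enables hardware time stamps and sets SOCK_TSTAMP_NEW. */
	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING_NEW, &flags, sizeof(flags));
	/* Disables software time stamps; must not clear SOCK_TSTAMP_NEW. */
	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW, &off, sizeof(off));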
Fixes: 887feae36aee ("socket: Add SO_TIMESTAMP[NS]_NEW")
Signed-off-by: Christian Eggers <ceggers(a)arri.de>
Acked-by: Willem de Bruijn <willemb(a)google.com>
Acked-by: Deepa Dinamani <deepa.kernel(a)gmail.com>
---
Hi Greg,
I just got your E-mail(s) that this patch has been applied to 5.8 and 5.9.
This is a backport for the same problem on 5.4. It does the same as the
upstream patch, only the affected code is in a different place here. Please
decide for yourself whether the Acked-by: tags (from the upstream patch) should
be kept or removed.
This backport is only required for 5.4; older kernels like 4.19 are not
affected.
regards
Christian
net/core/sock.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/net/core/sock.c b/net/core/sock.c
index 9a186d2ad36d..1eda7337b881 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -923,7 +923,6 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
} else {
sock_reset_flag(sk, SOCK_RCVTSTAMP);
sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
}
break;
--
Christian Eggers
Embedded software developer
Arnold & Richter Cine Technik GmbH & Co. Betriebs KG
Attached is the syzkaller C repro.
Tested-by: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
> On Oct 20, 2020, at 9:45 AM, Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com> wrote:
>
> Thanks! Yes, that looks good to me.
>
> Saeed
>
>> On Oct 20, 2020, at 4:50 AM, Pablo Neira Ayuso <pablo(a)netfilter.org> wrote:
>>
>> On Mon, Oct 19, 2020 at 10:25:32AM -0700, saeed.mirzamohammadi(a)oracle.com wrote:
>>> From: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
>>>
>>> This patch fixes the following KASAN splat:
>>>
>>> BUG: KASAN: slab-out-of-bounds in nft_flow_rule_create+0x622/0x6a2
>>> net/netfilter/nf_tables_offload.c:40
>>> Read of size 8 at addr ffff888103910b58 by task syz-executor227/16244
>>>
>>> The error happens when expr->ops is accessed before the boundary check is performed, i.e. after nft_expr_next() has already moved expr out of bounds.
>>>
>>> This patch performs the boundary check before accessing expr->ops, which fixes the slab-out-of-bounds read.
>>
>> Thanks. I made a slight variant of your patch.
>>
>> I'm attaching it; it also fixes the problem, but it introduces
>> nft_expr_more() and uses it everywhere.
>>
>> Let me know if this looks fine to you.
>> <0001-netfilter-fix-KASAN-slab-out-of-bounds-Read-in-nft_f.patch>
>
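For reference, the nft_expr_more() helper Pablo mentions does the boundary
check before dereferencing expr->ops. From memory of the upstream change (so
treat this as a sketch rather than the exact patch), it looks roughly like:

	static inline bool nft_expr_more(const struct nft_rule *rule,
					 const struct nft_expr *expr)
	{
		/* stop once we step past the last expression; only then
		 * is it safe to look at expr->ops */
		return expr != nft_expr_last(rule) && expr->ops;
	}

and the iteration sites are converted from testing expr->ops first to a loop
of the form while (nft_expr_more(rule, expr)) { ...; expr = nft_expr_next(expr); }.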
This commit:
fd01b2597941 SUNRPC: ECONNREFUSED should cause a rebind.
(originally applied to v4.14-rc1) didn't appear to get a stable cc,
perhaps because it wasn't considered a common problem at the time.
A patch I'm shortly about to post, cc stable, depends on the above, so
could it please be cherry-picked for stable?
It applies cleanly to both v4.4.240 & v4.9.240
thank you,
calum.
Dear Stable kernel maintainers,
Please consider cherry picking
commit e81e07244325 ("objtool: Support Clang non-section symbols in
ORC generation")
to linux-5.4.y and linux-4.19.y. This allows us to use LLVM_IAS=1 for
x86_64 Android kernel builds without warning.
Its partner patch (8782e7cab51b6b) was already backported to
linux-5.4.y as 8c627d4b15de9, and linux-4.19.y as 6e575122cd956.
https://github.com/ClangBuiltLinux/linux/issues/669
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--
Thanks,
~Nick Desaulniers
From: Kan Liang <kan.liang(a)linux.intel.com>
The event CYCLE_ACTIVITY.STALLS_MEM_ANY (0x14a3) should be available on
all 8 GP counters on ICL, but it's only scheduled on the first four
counters due to the current ICL constraint table.
Add a line for the CYCLE_ACTIVITY.STALLS_MEM_ANY event in the ICL
constraint table.
Correct the comments for the CYCLE_ACTIVITY.CYCLES_MEM_ANY event.
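As an aside, the new line decodes as follows, assuming the usual
INTEL_UEVENT_CONSTRAINT(event code + umask, counter bitmap) convention (an
annotated sketch, not part of the patch):

	INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff);	/* event 0xa3, umask 0x14:
						 * CYCLE_ACTIVITY.STALLS_MEM_ANY,
						 * allowed on GP counters 0-7 */

Without it, 0x14a3 falls through to the generic 0xa3 entry, whose 0xf mask
restricts it to counters 0-3.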
Fixes: 6017608936c1 ("perf/x86/intel: Add Icelake support")
Reported-by: Andi Kleen <ak(a)linux.intel.com>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/events/intel/core.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c72e4904e056..b31ebb5f7fc4 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -257,7 +257,8 @@ static struct event_constraint intel_icl_event_constraints[] = {
INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf),
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */
- INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
+ INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */
+ INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */
INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */
INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf),
INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf),
--
2.17.1
In our last attempt to fix races in the panfrost_job_timedout() path we
overlooked the case where a re-submitted job immediately triggers a
fault. This leads to a situation where we try to stop a scheduler that's
not resumed yet and lose the 'timedout' event without restarting the
timeout, thus blocking the whole queue.
Let's fix that by tracking timeouts occurring between the
drm_sched_resubmit_jobs() and drm_sched_start() calls.
v2:
- Fix another race (reported by Steven)
Fixes: 1a11a88cfd9a ("drm/panfrost: Fix job timeout handling")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Boris Brezillon <boris.brezillon(a)collabora.com>
---
drivers/gpu/drm/panfrost/panfrost_job.c | 61 +++++++++++++++++--------
1 file changed, 43 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index d0469e944143..0f9a34f5c6d0 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -26,6 +26,7 @@
struct panfrost_queue_state {
struct drm_gpu_scheduler sched;
bool stopped;
+ bool timedout;
struct mutex lock;
u64 fence_context;
u64 emit_seqno;
@@ -383,11 +384,33 @@ static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
queue->stopped = true;
stopped = true;
}
+ queue->timedout = true;
mutex_unlock(&queue->lock);
return stopped;
}
+static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
+{
+ if (WARN_ON(!queue->stopped))
+ return;
+
+ mutex_lock(&queue->lock);
+ drm_sched_start(&queue->sched, true);
+
+ /*
+ * We might have missed fault-timeouts (AKA immediate timeouts) while
+ * the scheduler was stopped. Let's fake a new fault to trigger an
+ * immediate reset.
+ */
+ if (queue->timedout)
+ drm_sched_fault(&queue->sched);
+
+ queue->timedout = false;
+ queue->stopped = false;
+ mutex_unlock(&queue->lock);
+}
+
static void panfrost_job_timedout(struct drm_sched_job *sched_job)
{
struct panfrost_job *job = to_panfrost_job(sched_job);
@@ -422,27 +445,20 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
struct drm_gpu_scheduler *sched = &pfdev->js->queue[i].sched;
/*
- * If the queue is still active, make sure we wait for any
- * pending timeouts.
+ * Stop the scheduler and wait for any pending timeout handler
+ * to return.
*/
- if (!pfdev->js->queue[i].stopped)
+ panfrost_scheduler_stop(&pfdev->js->queue[i], NULL);
+ if (i != js)
cancel_delayed_work_sync(&sched->work_tdr);
/*
- * If the scheduler was not already stopped, there's a tiny
- * chance a timeout has expired just before we stopped it, and
- * drm_sched_stop() does not flush pending works. Let's flush
- * them now so the timeout handler doesn't get called in the
- * middle of a reset.
+ * We do another stop after cancel_delayed_work_sync() to make
+ * sure we don't race against another thread finishing its
+ * reset (the restart queue steps are not protected by the
+ * reset lock).
*/
- if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
- cancel_delayed_work_sync(&sched->work_tdr);
-
- /*
- * Now that we cancelled the pending timeouts, we can safely
- * reset the stopped state.
- */
- pfdev->js->queue[i].stopped = false;
+ panfrost_scheduler_stop(&pfdev->js->queue[i], NULL);
}
spin_lock_irqsave(&pfdev->js->job_lock, flags);
@@ -457,14 +473,23 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
panfrost_device_reset(pfdev);
- for (i = 0; i < NUM_JOB_SLOTS; i++)
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ /*
+ * The GPU is idle, and the scheduler is stopped, we can safely
+ * reset the ->timedout state without taking any lock. We need
+ * to do that before calling drm_sched_resubmit_jobs() though,
+ * because the resubmission might trigger immediate faults
+ * which we want to catch.
+ */
+ pfdev->js->queue[i].timedout = false;
drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
+ }
mutex_unlock(&pfdev->reset_lock);
/* restart scheduler after GPU is usable again */
for (i = 0; i < NUM_JOB_SLOTS; i++)
- drm_sched_start(&pfdev->js->queue[i].sched, true);
+ panfrost_scheduler_start(&pfdev->js->queue[i]);
}
static const struct drm_sched_backend_ops panfrost_sched_ops = {
--
2.26.2
From: Alexander Usyskin <alexander.usyskin(a)intel.com>
A receive callback may be queued while the client is still connected
but only run after the client has been disconnected. Upon
disconnect, cl->me_cl is set to NULL, hence we need to check
that the ME client is not NULL in mei_cl_mtu() to avoid a
NULL pointer dereference.
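With this change, a caller on the receive path can treat a zero MTU as "the
ME client is already gone". Purely as an illustrative sketch (the error code
and call site are made up, not taken from the driver):

	size_t mtu = mei_cl_mtu(cl);

	if (!mtu) {
		/* disconnected: cl->me_cl has been dropped */
		return -ENODEV;
	}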
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Alexander Usyskin <alexander.usyskin(a)intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler(a)intel.com>
---
drivers/misc/mei/client.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/misc/mei/client.h b/drivers/misc/mei/client.h
index 64143d4ec758..9e08a9843bba 100644
--- a/drivers/misc/mei/client.h
+++ b/drivers/misc/mei/client.h
@@ -182,11 +182,11 @@ static inline u8 mei_cl_me_id(const struct mei_cl *cl)
*
* @cl: host client
*
- * Return: mtu
+ * Return: mtu or 0 if client is not connected
*/
static inline size_t mei_cl_mtu(const struct mei_cl *cl)
{
- return cl->me_cl->props.max_msg_length;
+ return cl->me_cl ? cl->me_cl->props.max_msg_length : 0;
}
/**
--
2.25.4