The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f699594d436960160f6d5ba84ed4a222f20d11cd Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Thu, 18 Apr 2019 14:43:02 -0700
Subject: [PATCH] crypto: gcm - fix incompatibility between "gcm" and
"gcm_base"
GCM instances can be created by either the "gcm" template, which only
allows choosing the block cipher, e.g. "gcm(aes)"; or by "gcm_base",
which allows choosing the ctr and ghash implementations, e.g.
"gcm_base(ctr(aes-generic),ghash-generic)".
However, a "gcm_base" instance prevents a "gcm" instance from being
registered using the same implementations. Nor will the instance be
found by lookups of "gcm". This can be used as a denial of service.
Moreover, "gcm_base" instances are never tested by the crypto
self-tests, even if there are compatible "gcm" tests.
The root cause of these problems is that instances of the two templates
use different cra_names. Therefore, fix these problems by making
"gcm_base" instances set the same cra_name as "gcm" instances, e.g.
"gcm(aes)" instead of "gcm_base(ctr(aes-generic),ghash-generic)".
This requires extracting the block cipher name from the name of the ctr
algorithm. It also requires starting to verify that the algorithms are
really ctr and ghash, not something else entirely. But it would be
bizarre if anyone were actually using non-gcm-compatible algorithms with
gcm_base, so this shouldn't break anyone in practice.
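To make the name derivation concrete, a small standalone sketch; the
helper name is hypothetical, and the kernel does this inline with
snprintf(), reusing the ctr name's closing parenthesis:

#include <stdio.h>
#include <string.h>

/* Hypothetical helper mirroring the patch: "ctr(" is 4 bytes, and the
 * ctr name's trailing ')' doubles as the ')' closing the gcm name. */
static int gcm_name_from_ctr(const char *ctr_name, char *out, size_t len)
{
        if (strncmp(ctr_name, "ctr(", 4) != 0)
                return -1;      /* not CTR mode at all */
        if (snprintf(out, len, "gcm(%s", ctr_name + 4) >= (int)len)
                return -1;      /* -ENAMETOOLONG in the kernel */
        return 0;
}

int main(void)
{
        char name[128];

        if (gcm_name_from_ctr("ctr(aes)", name, sizeof(name)) == 0)
                printf("%s\n", name);   /* prints gcm(aes) */
        return 0;
}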
Fixes: d00aa19b507b ("[CRYPTO] gcm: Allow block cipher parameter")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/gcm.c b/crypto/gcm.c
index ff498411b43f..33f45a980967 100644
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -597,7 +597,6 @@ static void crypto_gcm_free(struct aead_instance *inst)
static int crypto_gcm_create_common(struct crypto_template *tmpl,
struct rtattr **tb,
- const char *full_name,
const char *ctr_name,
const char *ghash_name)
{
@@ -638,7 +637,8 @@ static int crypto_gcm_create_common(struct crypto_template *tmpl,
goto err_free_inst;
err = -EINVAL;
- if (ghash->digestsize != 16)
+ if (strcmp(ghash->base.cra_name, "ghash") != 0 ||
+ ghash->digestsize != 16)
goto err_drop_ghash;
crypto_set_skcipher_spawn(&ctx->ctr, aead_crypto_instance(inst));
@@ -650,24 +650,24 @@ static int crypto_gcm_create_common(struct crypto_template *tmpl,
ctr = crypto_spawn_skcipher_alg(&ctx->ctr);
- /* We only support 16-byte blocks. */
+ /* The skcipher algorithm must be CTR mode, using 16-byte blocks. */
err = -EINVAL;
- if (crypto_skcipher_alg_ivsize(ctr) != 16)
+ if (strncmp(ctr->base.cra_name, "ctr(", 4) != 0 ||
+ crypto_skcipher_alg_ivsize(ctr) != 16 ||
+ ctr->base.cra_blocksize != 1)
goto out_put_ctr;
- /* Not a stream cipher? */
- if (ctr->base.cra_blocksize != 1)
+ err = -ENAMETOOLONG;
+ if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
+ "gcm(%s", ctr->base.cra_name + 4) >= CRYPTO_MAX_ALG_NAME)
goto out_put_ctr;
- err = -ENAMETOOLONG;
if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
"gcm_base(%s,%s)", ctr->base.cra_driver_name,
ghash_alg->cra_driver_name) >=
CRYPTO_MAX_ALG_NAME)
goto out_put_ctr;
- memcpy(inst->alg.base.cra_name, full_name, CRYPTO_MAX_ALG_NAME);
-
inst->alg.base.cra_flags = (ghash->base.cra_flags |
ctr->base.cra_flags) & CRYPTO_ALG_ASYNC;
inst->alg.base.cra_priority = (ghash->base.cra_priority +
@@ -709,7 +709,6 @@ static int crypto_gcm_create(struct crypto_template *tmpl, struct rtattr **tb)
{
const char *cipher_name;
char ctr_name[CRYPTO_MAX_ALG_NAME];
- char full_name[CRYPTO_MAX_ALG_NAME];
cipher_name = crypto_attr_alg_name(tb[1]);
if (IS_ERR(cipher_name))
@@ -719,12 +718,7 @@ static int crypto_gcm_create(struct crypto_template *tmpl, struct rtattr **tb)
CRYPTO_MAX_ALG_NAME)
return -ENAMETOOLONG;
- if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "gcm(%s)", cipher_name) >=
- CRYPTO_MAX_ALG_NAME)
- return -ENAMETOOLONG;
-
- return crypto_gcm_create_common(tmpl, tb, full_name,
- ctr_name, "ghash");
+ return crypto_gcm_create_common(tmpl, tb, ctr_name, "ghash");
}
static int crypto_gcm_base_create(struct crypto_template *tmpl,
@@ -732,7 +726,6 @@ static int crypto_gcm_base_create(struct crypto_template *tmpl,
{
const char *ctr_name;
const char *ghash_name;
- char full_name[CRYPTO_MAX_ALG_NAME];
ctr_name = crypto_attr_alg_name(tb[1]);
if (IS_ERR(ctr_name))
@@ -742,12 +735,7 @@ static int crypto_gcm_base_create(struct crypto_template *tmpl,
if (IS_ERR(ghash_name))
return PTR_ERR(ghash_name);
- if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "gcm_base(%s,%s)",
- ctr_name, ghash_name) >= CRYPTO_MAX_ALG_NAME)
- return -ENAMETOOLONG;
-
- return crypto_gcm_create_common(tmpl, tb, full_name,
- ctr_name, ghash_name);
+ return crypto_gcm_create_common(tmpl, tb, ctr_name, ghash_name);
}
static int crypto_rfc4106_setkey(struct crypto_aead *parent, const u8 *key,
The patch titled
Subject: coda: pass the host file in vma->vm_file on mmap
has been added to the -mm tree. Its filename is
coda-pass-the-host-file-in-vma-vm_file-on-mmap.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/coda-pass-the-host-file-in-vma-vm_…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/coda-pass-the-host-file-in-vma-vm_…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Jan Harkes <jaharkes(a)cs.cmu.edu>
Subject: coda: pass the host file in vma->vm_file on mmap
Patch series "Coda updates".
The following patch series is a collection of various fixes for Coda, most
of which were collected from linux-fsdevel or linux-kernel but which have
not yet found their way upstream.
This patch (of 22):
Various file systems expect that vma->vm_file points at their own file
handle; several use file_inode(vma->vm_file) to get at their inode, or use
vma->vm_file->private_data. However, the way Coda wrapped mmap on a host
file broke this assumption: vm_file still pointed at the Coda file, and
the host file system would scribble over Coda's inode and private file
data.
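As a hedged illustration of the broken assumption, a typical
host-filesystem pattern (the handler below is hypothetical, not taken
from any real filesystem):

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical host-filesystem fault handler.  With the old Coda
 * behaviour, vmf->vma->vm_file still pointed at the Coda file, so
 * file_inode() returned Coda's inode and private_data was Coda's
 * private file data, and the host fs scribbled over both. */
static vm_fault_t example_host_fault(struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        void *priv = vmf->vma->vm_file->private_data;

        /* ... host fs uses inode and priv as if they were its own ... */
        (void)inode;
        (void)priv;
        return VM_FAULT_SIGBUS;
}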
This patch fixes the incorrect expectation and wraps vm_ops->open and
vm_ops->close so that Coda can track when the vm_area_struct is destroyed
and still release the reference on the Coda file handle at the right
time.
Link: http://lkml.kernel.org/r/0e850c6e59c0b147dc2dcd51a3af004c948c3697.155811738…
Signed-off-by: Jan Harkes <jaharkes(a)cs.cmu.edu>
Cc: Arnd Bergmann <arnd(a)arndb.de>
Cc: Colin Ian King <colin.king(a)canonical.com>
Cc: Dan Carpenter <dan.carpenter(a)oracle.com>
Cc: David Howells <dhowells(a)redhat.com>
Cc: Fabian Frederick <fabf(a)skynet.be>
Cc: Mikko Rapeli <mikko.rapeli(a)iki.fi>
Cc: Sam Protsenko <semen.protsenko(a)linaro.org>
Cc: Yann Droneaud <ydroneaud(a)opteya.com>
Cc: Zhouyang Jia <jiazhouyang09(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/coda/file.c | 70 +++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 68 insertions(+), 2 deletions(-)
--- a/fs/coda/file.c~coda-pass-the-host-file-in-vma-vm_file-on-mmap
+++ a/fs/coda/file.c
@@ -27,6 +27,13 @@
#include "coda_linux.h"
#include "coda_int.h"
+struct coda_vm_ops {
+ atomic_t refcnt;
+ struct file *coda_file;
+ const struct vm_operations_struct *host_vm_ops;
+ struct vm_operations_struct vm_ops;
+};
+
static ssize_t
coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -61,6 +68,34 @@ coda_file_write_iter(struct kiocb *iocb,
return ret;
}
+static void
+coda_vm_open(struct vm_area_struct *vma)
+{
+ struct coda_vm_ops *cvm_ops =
+ container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
+
+ atomic_inc(&cvm_ops->refcnt);
+
+ if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->open)
+ cvm_ops->host_vm_ops->open(vma);
+}
+
+static void
+coda_vm_close(struct vm_area_struct *vma)
+{
+ struct coda_vm_ops *cvm_ops =
+ container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
+
+ if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->close)
+ cvm_ops->host_vm_ops->close(vma);
+
+ if (atomic_dec_and_test(&cvm_ops->refcnt)) {
+ vma->vm_ops = cvm_ops->host_vm_ops;
+ fput(cvm_ops->coda_file);
+ kfree(cvm_ops);
+ }
+}
+
static int
coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
{
@@ -68,6 +103,8 @@ coda_file_mmap(struct file *coda_file, s
struct coda_inode_info *cii;
struct file *host_file;
struct inode *coda_inode, *host_inode;
+ struct coda_vm_ops *cvm_ops;
+ int ret;
cfi = CODA_FTOC(coda_file);
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -76,6 +113,13 @@ coda_file_mmap(struct file *coda_file, s
if (!host_file->f_op->mmap)
return -ENODEV;
+ if (WARN_ON(coda_file != vma->vm_file))
+ return -EIO;
+
+ cvm_ops = kmalloc(sizeof(struct coda_vm_ops), GFP_KERNEL);
+ if (!cvm_ops)
+ return -ENOMEM;
+
coda_inode = file_inode(coda_file);
host_inode = file_inode(host_file);
@@ -89,6 +133,7 @@ coda_file_mmap(struct file *coda_file, s
* the container file on us! */
else if (coda_inode->i_mapping != host_inode->i_mapping) {
spin_unlock(&cii->c_lock);
+ kfree(cvm_ops);
return -EBUSY;
}
@@ -97,7 +142,29 @@ coda_file_mmap(struct file *coda_file, s
cfi->cfi_mapcount++;
spin_unlock(&cii->c_lock);
- return call_mmap(host_file, vma);
+ vma->vm_file = get_file(host_file);
+ ret = call_mmap(vma->vm_file, vma);
+
+ if (ret) {
+ /* if call_mmap fails, our caller will put coda_file so we
+ * should drop the reference to the host_file that we got.
+ */
+ fput(host_file);
+ kfree(cvm_ops);
+ } else {
+ /* here we add redirects for the open/close vm_operations */
+ cvm_ops->host_vm_ops = vma->vm_ops;
+ if (vma->vm_ops)
+ cvm_ops->vm_ops = *vma->vm_ops;
+
+ cvm_ops->vm_ops.open = coda_vm_open;
+ cvm_ops->vm_ops.close = coda_vm_close;
+ cvm_ops->coda_file = coda_file;
+ atomic_set(&cvm_ops->refcnt, 1);
+
+ vma->vm_ops = &cvm_ops->vm_ops;
+ }
+ return ret;
}
int coda_open(struct inode *coda_inode, struct file *coda_file)
@@ -207,4 +274,3 @@ const struct file_operations coda_file_o
.fsync = coda_fsync,
.splice_read = generic_file_splice_read,
};
-
_
Patches currently in -mm which might be from jaharkes(a)cs.cmu.edu are
coda-pass-the-host-file-in-vma-vm_file-on-mmap.patch
coda-potential-buffer-overflow-in-coda_psdev_write.patch
coda-dont-try-to-print-names-that-were-considered-too-long.patch
uapi-linux-coda_psdevh-move-coda_req_-from-uapi-to-kernel-side-headers.patch
coda-change-codas-user-api-to-use-64-bit-time_t-in-timespec.patch
coda-bump-module-version.patch
coda-remove-uapi-linux-coda_psdevh.patch
The patch titled
Subject: mm/mmap.c: fix the adjusted length error
has been added to the -mm tree. Its filename is
mm-mmap-fix-the-adjusted-length-error.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/mm-mmap-fix-the-adjusted-length-er…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/mm-mmap-fix-the-adjusted-length-er…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: jianhong chen <chenjianhong2(a)huawei.com>
Subject: mm/mmap.c: fix the adjusted length error
In Linux 4.4, a 32-bit process may fail to allocate 64M of hugepage
memory with shmat() even though there is a 64M memory gap in the process
address space.
The adjusted search length causes the problem; it was introduced by
db4fbfb9523c ("mm: vm_unmapped_area() lookup function"). To account for
the worst-case alignment overhead, unmapped_area() and
unmapped_area_topdown() inflate the search length before searching for an
available vma gap. This inflated length, the sum of the desired length
and the longest alignment offset, can cause misjudgement when the system
has very little virtual memory left. For example, if the largest
available memory gap is exactly 64M, we can't get it from the system by
allocating 64M of hugepage memory via shmat(), because the search demands
a longer length: the desired length (64M) plus the longest alignment
offset.
To fix this, calculate the alignment offset of gap_start or gap_end to
arrive at the desired gap_start or gap_end value before checking whether
the gap is large enough. This way the search length never needs to be
adjusted.
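A standalone sketch, with hypothetical alignment values and the 64M gap
from the maps below, shows the difference between inflating the search
length and aligning the gap address:

#include <stdio.h>

#define MB (1024UL * 1024)

int main(void)
{
        /* The 64M gap from the maps below: 0xf7a00000-0xfba00000. */
        unsigned long gap_start = 0xf7a00000, gap_end = 0xfba00000;
        unsigned long length = 64 * MB;
        unsigned long align_mask = 2 * MB - 1, align_offset = 0; /* hypothetical */

        /* Old approach: inflate the length by worst-case alignment slack. */
        unsigned long old_length = length + align_mask;
        printf("old: need %lu, gap has %lu -> %s\n", old_length,
               gap_end - gap_start,
               gap_end - gap_start >= old_length ? "fits" : "fails");

        /* New approach: align gap_start first, then check the real length. */
        gap_start += (align_offset - gap_start) & align_mask;
        printf("new: aligned start %#lx -> %s\n", gap_start,
               gap_end - gap_start >= length ? "fits" : "fails");
        return 0;
}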
Problem reproduces procedure:
1. allocate a lot of virtual memory segments via shmat and malloc
2. release one of the biggest memory segment via shmdt
3. attach the biggest memory segment via shmat
e.g.
process maps:
00008000-00009000 r-xp 00000000 00:12 3385 /tmp/memory_mmap
00011000-00012000 rw-p 00001000 00:12 3385 /tmp/memory_mmap
27536000-f756a000 rw-p 00000000 00:00 0
f756a000-f7691000 r-xp 00000000 01:00 560 /lib/libc-2.11.1.so
f7691000-f7699000 ---p 00127000 01:00 560 /lib/libc-2.11.1.so
f7699000-f769b000 r--p 00127000 01:00 560 /lib/libc-2.11.1.so
f769b000-f769c000 rw-p 00129000 01:00 560 /lib/libc-2.11.1.so
f769c000-f769f000 rw-p 00000000 00:00 0
f769f000-f76c0000 r-xp 00000000 01:00 583 /lib/libgcc_s.so.1
f76c0000-f76c7000 ---p 00021000 01:00 583 /lib/libgcc_s.so.1
f76c7000-f76c8000 rw-p 00020000 01:00 583 /lib/libgcc_s.so.1
f76c8000-f76e5000 r-xp 00000000 01:00 543 /lib/ld-2.11.1.so
f76e9000-f76ea000 rw-p 00000000 00:00 0
f76ea000-f76ec000 rw-p 00000000 00:00 0
f76ec000-f76ed000 r--p 0001c000 01:00 543 /lib/ld-2.11.1.so
f76ed000-f76ee000 rw-p 0001d000 01:00 543 /lib/ld-2.11.1.so
f7800000-f7a00000 rw-s 00000000 00:0e 0 /SYSV000000ea (deleted)
fba00000-fca00000 rw-s 00000000 00:0e 65538 /SYSV000000ec (deleted)
fca00000-fce00000 rw-s 00000000 00:0e 98307 /SYSV000000ed (deleted)
fce00000-fd800000 rw-s 00000000 00:0e 131076 /SYSV000000ee (deleted)
ff913000-ff934000 rw-p 00000000 00:00 0 [stack]
ffff0000-ffff1000 r-xp 00000000 00:00 0 [vectors]
From 0xf7a00000 to 0xfba00000 there is a 64M memory gap, but we can't
get it from the kernel.
Link: http://lkml.kernel.org/r/1558073209-79549-1-git-send-email-chenjianhong2@hu…
Fixes: db4fbfb9523c ("mm: vm_unmapped_area() lookup function")
Signed-off-by: jianhong chen <chenjianhong2(a)huawei.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Yang Shi <yang.shi(a)linux.alibaba.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Steve Capper <steve.capper(a)arm.com>
Cc: Yangtao Li <tiny.windzz(a)gmail.com>
Cc: Michel Lespinasse <walken(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mmap.c | 43 +++++++++++++++++++++++++++++--------------
1 file changed, 29 insertions(+), 14 deletions(-)
--- a/mm/mmap.c~mm-mmap-fix-the-adjusted-length-error
+++ a/mm/mmap.c
@@ -1865,6 +1865,22 @@ unacct_error:
return error;
}
+static inline unsigned long gap_start_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_start offset to adjust gap address to the
+ * desired alignment
+ */
+ return (info->align_offset - addr) & info->align_mask;
+}
+
+static inline unsigned long gap_end_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_end offset to adjust gap address to the desired alignment */
+ return (addr - info->align_offset) & info->align_mask;
+}
+
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
@@ -1879,10 +1895,7 @@ unsigned long unmapped_area(struct vm_un
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
@@ -1914,6 +1927,7 @@ unsigned long unmapped_area(struct vm_un
}
gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
+ gap_start += gap_start_offset(info, gap_start);
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
@@ -1942,6 +1956,7 @@ check_current:
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
+ gap_start += gap_start_offset(info, gap_start);
gap_end = vm_start_gap(vma);
goto check_current;
}
@@ -1951,17 +1966,17 @@ check_current:
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
+ gap_start += gap_start_offset(info, gap_start);
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
+ if (gap_start < info->low_limit) {
gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
+ gap_start += gap_start_offset(info, gap_start);
+ }
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
@@ -1974,16 +1989,14 @@ unsigned long unmapped_area_topdown(stru
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/*
* Adjust search limits by the desired length.
* See implementation comment at top of unmapped_area().
*/
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < length)
return -ENOMEM;
high_limit = gap_end - length;
@@ -2020,6 +2033,7 @@ unsigned long unmapped_area_topdown(stru
check_current:
/* Check if current node has a suitable gap */
gap_end = vm_start_gap(vma);
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < low_limit)
return -ENOMEM;
if (gap_start <= high_limit &&
@@ -2054,13 +2068,14 @@ check_current:
found:
/* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
+ if (gap_end > info->high_limit) {
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
+ }
found_highest:
/* Compute highest gap address at the desired alignment */
gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
VM_BUG_ON(gap_end < info->low_limit);
VM_BUG_ON(gap_end < gap_start);
_
Patches currently in -mm which might be from chenjianhong2(a)huawei.com are
mm-mmap-fix-the-adjusted-length-error.patch
From: Jiufei Xue <jiufei.xue(a)linux.alibaba.com>
Subject: fs/writeback.c: use rcu_barrier() to wait for inflight wb switches going into workqueue when umount
synchronize_rcu() doesn't wait for call_rcu() callbacks, so an inode wb
switch may not have reached the workqueue after synchronize_rcu()
returns. Thus previously scheduled switches may not be finished even
after flushing the workqueue, which causes the NULL pointer dereference
shown below.
VFS: Busy inodes after unmount of vdd. Self-destruct in 5 seconds. Have a nice day...
BUG: unable to handle kernel NULL pointer dereference at 0000000000000278
[<ffffffff8126a303>] evict+0xb3/0x180
[<ffffffff8126a760>] iput+0x1b0/0x230
[<ffffffff8127c690>] inode_switch_wbs_work_fn+0x3c0/0x6a0
[<ffffffff810a5b2e>] worker_thread+0x4e/0x490
[<ffffffff810a5ae0>] ? process_one_work+0x410/0x410
[<ffffffff810ac056>] kthread+0xe6/0x100
[<ffffffff8173c199>] ret_from_fork+0x39/0x50
Replace the synchronize_rcu() call with rcu_barrier() to wait for all
pending callbacks to finish. Also increment isw_nr_in_flight only after
call_rcu() in inode_switch_wbs(), which makes more sense.
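For context, a minimal kernel-style sketch (names taken from the patch
context; not a complete function) of the ordering guarantee being relied
on:

/* call_rcu() only queues a callback; synchronize_rcu() does not wait
 * for queued callbacks to run. */
static void example_umount_wait(void)
{
        /*
         * After call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn), a
         * synchronize_rcu() only waits for a grace period; the callback
         * may still be pending, so no work item is on isw_wq yet and
         * flush_workqueue() would wait for nothing.
         *
         * rcu_barrier() waits until every previously queued call_rcu()
         * callback has finished running; only then does the flush below
         * cover all in-flight wb switches.
         */
        rcu_barrier();
        flush_workqueue(isw_wq);
}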
Link: http://lkml.kernel.org/r/20190429024108.54150-1-jiufei.xue@linux.alibaba.com
Signed-off-by: Jiufei Xue <jiufei.xue(a)linux.alibaba.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Suggested-by: Tejun Heo <tj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/fs-writeback.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
--- a/fs/fs-writeback.c~fs-writeback-use-rcu_barrier-to-wait-for-inflight-wb-switches-going-into-workqueue-when-umount
+++ a/fs/fs-writeback.c
@@ -523,8 +523,6 @@ static void inode_switch_wbs(struct inod
isw->inode = inode;
- atomic_inc(&isw_nr_in_flight);
-
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the i_page
@@ -532,6 +530,9 @@ static void inode_switch_wbs(struct inod
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+
+ atomic_inc(&isw_nr_in_flight);
+
goto out_unlock;
out_free:
@@ -901,7 +902,11 @@ restart:
void cgroup_writeback_umount(void)
{
if (atomic_read(&isw_nr_in_flight)) {
- synchronize_rcu();
+ /*
+ * Use rcu_barrier() to wait for all pending callbacks to
+ * ensure that all in-flight wb switches are in the workqueue.
+ */
+ rcu_barrier();
flush_workqueue(isw_wq);
}
}
_
From: Mel Gorman <mgorman(a)techsingularity.net>
Subject: mm/compaction.c: correct zone boundary handling when isolating pages from a pageblock
syzbot reported the following error from a tree with a head commit of
baf76f0c58ae ("slip: make slhc_free() silently accept an error pointer")
BUG: unable to handle kernel paging request at ffffea0003348000
#PF error: [normal kernel read fault]
PGD 12c3f9067 P4D 12c3f9067 PUD 12c3f8067 PMD 0
Oops: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 28916 Comm: syz-executor.2 Not tainted 5.1.0-rc6+ #89
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:constant_test_bit arch/x86/include/asm/bitops.h:314 [inline]
RIP: 0010:PageCompound include/linux/page-flags.h:186 [inline]
RIP: 0010:isolate_freepages_block+0x1c0/0xd40 mm/compaction.c:579
Code: 01 d8 ff 4d 85 ed 0f 84 ef 07 00 00 e8 29 00 d8 ff 4c 89 e0 83 85 38 ff
ff ff 01 48 c1 e8 03 42 80 3c 38 00 0f 85 31 0a 00 00 <4d> 8b 2c 24 31 ff 49
c1 ed 10 41 83 e5 01 44 89 ee e8 3a 01 d8 ff
RSP: 0018:ffff88802b31eab8 EFLAGS: 00010246
RAX: 1ffffd4000669000 RBX: 00000000000cd200 RCX: ffffc9000a235000
RDX: 000000000001ca5e RSI: ffffffff81988cc7 RDI: 0000000000000001
RBP: ffff88802b31ebd8 R08: ffff88805af700c0 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffffea0003348000
R13: 0000000000000000 R14: ffff88802b31f030 R15: dffffc0000000000
FS: 00007f61648dc700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffea0003348000 CR3: 0000000037c64000 CR4: 00000000001426e0
Call Trace:
fast_isolate_around mm/compaction.c:1243 [inline]
fast_isolate_freepages mm/compaction.c:1418 [inline]
isolate_freepages mm/compaction.c:1438 [inline]
compaction_alloc+0x1aee/0x22e0 mm/compaction.c:1550
There is no reproducer and it is difficult to hit -- 1 crash every few
days. The issue is very similar to the fix in commit 6b0868c820ff
("mm/compaction.c: correct zone boundary handling when resetting pageblock
skip hints"). When isolating free pages around a target pageblock, the
boundary handling is off by one and can stray into the next pageblock.
Triggering the syzbot error requires that the end of the pageblock is
section- or zone-aligned, and that the next section is unpopulated.
A more subtle consequence of the bug is that pageblocks were being
improperly used as migration targets, which potentially hurts
fragmentation avoidance in the long term, one page at a time.
A debugging patch revealed that it is indeed possible to stray outside
of a pageblock, which is not intended. While syzbot cannot be used to
verify this patch, it was confirmed that the debugging warning no longer
triggers with this patch applied. It has also been confirmed that the THP
allocation stress tests are not degraded by this patch.
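A standalone sketch with hypothetical values illustrates both halves of
the fix, the inclusive end_pfn and the "start_pfn < end_pfn" test:

#include <stdio.h>

#define NR_PAGES 512UL                 /* hypothetical pageblock_nr_pages */
#define block_start(pfn) ((pfn) & ~(NR_PAGES - 1))
#define block_end(pfn)   (block_start(pfn) + NR_PAGES)
#define MIN(a, b)        ((a) < (b) ? (a) : (b))

int main(void)
{
        unsigned long pfn = 1000, zone_end = 1024, nr_isolated = 30;
        unsigned long old_end = MIN(block_start(pfn) + NR_PAGES, zone_end); /* 1024 */
        unsigned long new_end = MIN(block_end(pfn), zone_end) - 1;          /* 1023 */
        unsigned long after = pfn + nr_isolated;   /* scan-after start: 1030 */

        /* The old exclusive end is the first pfn of the NEXT pageblock, and
         * the old "start_pfn != end_pfn" test runs the scan even when the
         * start has already moved past the block, straying outside it. */
        printf("old: end %lu, scan-after %s\n", old_end,
               after != old_end ? "runs (strays)" : "skipped");
        printf("new: end %lu, scan-after %s\n", new_end,
               after < new_end ? "runs" : "skipped");
        return 0;
}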
Link: http://lkml.kernel.org/r/20190510182124.GI18914@techsingularity.net
Fixes: e332f741a8dd ("mm, compaction: be selective about what pageblocks to clear skip hints")
Signed-off-by: Mel Gorman <mgorman(a)techsingularity.net>
Reported-by: syzbot+d84c80f9fe26a0f7a734(a)syzkaller.appspotmail.com
Cc: Dmitry Vyukov <dvyukov(a)google.com>
Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: Qian Cai <cai(a)lca.pw>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org> # v5.1+
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/compaction.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/compaction.c~mm-compactionc-correct-zone-boundary-handling-when-isolating-pages-from-a-pageblock
+++ a/mm/compaction.c
@@ -1230,7 +1230,7 @@ fast_isolate_around(struct compact_contr
/* Pageblock boundaries */
start_pfn = pageblock_start_pfn(pfn);
- end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone));
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
/* Scan before */
if (start_pfn != pfn) {
@@ -1241,7 +1241,7 @@ fast_isolate_around(struct compact_contr
/* Scan after */
start_pfn = pfn + nr_isolated;
- if (start_pfn != end_pfn)
+ if (start_pfn < end_pfn)
isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
_
I noticed 4.19.35 got a backport of mainline 4abb951b, but it appears to
be a duplicate backport that landed in the wrong function. We can see
this in the stable-queue repo:
stable-queue$ find . -name '*acpica-aml-interpreter-add-region-addr*' |grep 4.19
./releases/4.19.6/acpica-aml-interpreter-add-region-addresses-in-global-list-during-initialization.patch
./releases/4.19.3/revert-acpica-aml-interpreter-add-region-addresses-in.patch
./releases/4.19.35/acpica-aml-interpreter-add-region-addresses-in-global-list-during-initialization.patch
./releases/4.19.2/acpica-aml-interpreter-add-region-addresses-in-global-list-during-initialization.patch
So it was added to 4.19.2, reverted in .3, re-added in .6, and then
finally patched into a similar-looking but wrong function in .35.
If we diff the .6 and .35 versions, we see the function difference:
-@@ -417,6 +417,10 @@ acpi_ds_eval_region_operands(struct acpi
+@@ -523,6 +523,10 @@ acpi_ds_eval_table_region_operands(struc
I don't know what the history is/was around the 2/3/6 churn, but the
re-addition in 4.19.35 to a different function sure looks wrong.
The commit adds a call "status = acpi_ut_add_address_range(...", and if
we check mainline, there is only one such call in that file, but in
4.19.35+ there are now two. Since the two functions have similar context
and comments, it isn't hard to see how patch(1) could apply the hunk a
second time in the wrong place.
I didn't check if any of the other currently maintained linux-stable
versions also had this possible issue.
Paul.
From: Adrian Hunter <adrian.hunter(a)intel.com>
The sample timestamp is updated to ensure that the timestamp represents
the time of the sample and not a branch that the decoder is still
walking towards. The sample timestamp is updated when the decoder
returns, but the decoder does not return for non-taken branches. Update
the sample timestamp then also.
Note that commit 3f04d98e972b5 ("perf intel-pt: Improve sample
timestamp") was also a stable fix and appears, for example, in v4.4
stable tree as commit a4ebb58fd124 ("perf intel-pt: Improve sample
timestamp").
Signed-off-by: Adrian Hunter <adrian.hunter(a)intel.com>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: stable(a)vger.kernel.org # v4.4+
Fixes: 3f04d98e972b ("perf intel-pt: Improve sample timestamp")
Link: http://lkml.kernel.org/r/20190510124143.27054-4-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
---
tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 9cbd587489bf..f4c3c84b090f 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -1318,8 +1318,11 @@ static int intel_pt_walk_tnt(struct intel_pt_decoder *decoder)
return 0;
}
decoder->ip += intel_pt_insn.length;
- if (!decoder->tnt.count)
+ if (!decoder->tnt.count) {
+ decoder->sample_timestamp = decoder->timestamp;
+ decoder->sample_insn_cnt = decoder->timestamp_insn_cnt;
return -EAGAIN;
+ }
decoder->tnt.payload <<= 1;
continue;
}
--
2.20.1
From: Adrian Hunter <adrian.hunter(a)intel.com>
The decoder uses its current timestamp in samples. Usually that is a
timestamp that has already passed, but in some cases it is a timestamp
for a branch that the decoder is walking towards, and consequently
hasn't reached.
The intel_pt_sample_time() function decides which is which, but was not
handling TNT packets exactly correctly.
In the case of TNT, the timestamp applies to the first branch, so the
decoder must first walk to that branch.
That means intel_pt_sample_time() should return true for TNT, and this
patch makes that change. However, if the first branch is a non-taken
branch (i.e. a 'N'), then intel_pt_sample_time() needs to return false
for subsequent taken branches in the same TNT packet.
To handle that, introduce a new state INTEL_PT_STATE_TNT_CONT to
distinguish the cases.
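A standalone sketch (the payload value is hypothetical) of how the TNT
bits are walked, and why only the first branch in the packet carries its
timestamp:

#include <stdio.h>
#include <stdint.h>

#define BIT63 (1ULL << 63)

int main(void)
{
        uint64_t payload = 0xA000000000000000ULL; /* taken, not taken, taken */
        int count = 3;                            /* branches in the packet */

        /* Each TNT bit, MSB first, records one conditional branch.  The
         * packet's timestamp applies to the FIRST branch only, so later
         * branches continue in INTEL_PT_STATE_TNT_CONT without treating
         * the decoder's timestamp as the sample time again. */
        for (int i = 0; count; i++, count--, payload <<= 1)
                printf("branch %d: %-9s %s\n", i,
                       payload & BIT63 ? "taken" : "not taken",
                       i == 0 ? "(sample timestamp here)" : "(TNT_CONT)");
        return 0;
}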
Note that commit 3f04d98e972b5 ("perf intel-pt: Improve sample
timestamp") was also a stable fix and appears, for example, in v4.4
stable tree as commit a4ebb58fd124 ("perf intel-pt: Improve sample
timestamp").
Signed-off-by: Adrian Hunter <adrian.hunter(a)intel.com>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: stable(a)vger.kernel.org # v4.4+
Fixes: 3f04d98e972b5 ("perf intel-pt: Improve sample timestamp")
Link: http://lkml.kernel.org/r/20190510124143.27054-3-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
---
tools/perf/util/intel-pt-decoder/intel-pt-decoder.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 26dbf11e071a..9cbd587489bf 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -58,6 +58,7 @@ enum intel_pt_pkt_state {
INTEL_PT_STATE_NO_IP,
INTEL_PT_STATE_ERR_RESYNC,
INTEL_PT_STATE_IN_SYNC,
+ INTEL_PT_STATE_TNT_CONT,
INTEL_PT_STATE_TNT,
INTEL_PT_STATE_TIP,
INTEL_PT_STATE_TIP_PGD,
@@ -72,8 +73,9 @@ static inline bool intel_pt_sample_time(enum intel_pt_pkt_state pkt_state)
case INTEL_PT_STATE_NO_IP:
case INTEL_PT_STATE_ERR_RESYNC:
case INTEL_PT_STATE_IN_SYNC:
- case INTEL_PT_STATE_TNT:
+ case INTEL_PT_STATE_TNT_CONT:
return true;
+ case INTEL_PT_STATE_TNT:
case INTEL_PT_STATE_TIP:
case INTEL_PT_STATE_TIP_PGD:
case INTEL_PT_STATE_FUP:
@@ -1261,7 +1263,9 @@ static int intel_pt_walk_tnt(struct intel_pt_decoder *decoder)
return -ENOENT;
}
decoder->tnt.count -= 1;
- if (!decoder->tnt.count)
+ if (decoder->tnt.count)
+ decoder->pkt_state = INTEL_PT_STATE_TNT_CONT;
+ else
decoder->pkt_state = INTEL_PT_STATE_IN_SYNC;
decoder->tnt.payload <<= 1;
decoder->state.from_ip = decoder->ip;
@@ -1292,7 +1296,9 @@ static int intel_pt_walk_tnt(struct intel_pt_decoder *decoder)
if (intel_pt_insn.branch == INTEL_PT_BR_CONDITIONAL) {
decoder->tnt.count -= 1;
- if (!decoder->tnt.count)
+ if (decoder->tnt.count)
+ decoder->pkt_state = INTEL_PT_STATE_TNT_CONT;
+ else
decoder->pkt_state = INTEL_PT_STATE_IN_SYNC;
if (decoder->tnt.payload & BIT63) {
decoder->tnt.payload <<= 1;
@@ -2372,6 +2378,7 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder)
err = intel_pt_walk_trace(decoder);
break;
case INTEL_PT_STATE_TNT:
+ case INTEL_PT_STATE_TNT_CONT:
err = intel_pt_walk_tnt(decoder);
if (err == -EAGAIN)
err = intel_pt_walk_trace(decoder);
--
2.20.1