The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 998ac6d21cfd6efd58f5edf420bae8839dda9f2a Mon Sep 17 00:00:00 2001
From: ethanwu <ethanwu(a)synology.com>
Date: Sun, 29 Apr 2018 15:59:42 +0800
Subject: [PATCH] btrfs: Take trans lock before access running trans in
check_delayed_ref
In preivous patch:
Btrfs: kill trans in run_delalloc_nocow and btrfs_cross_ref_exist
We avoid starting btrfs transaction and get this information from
fs_info->running_transaction directly.
When accessing running_transaction in check_delayed_ref, there's a
chance that current transaction will be freed by commit transaction
after the NULL pointer check of running_transaction is passed.
After looking all the other places using fs_info->running_transaction,
they are either protected by trans_lock or holding the transactions.
Fix this by using trans_lock and increasing the use_count.
Fixes: e4c3b2dcd144 ("Btrfs: kill trans in run_delalloc_nocow and btrfs_cross_ref_exist")
CC: stable(a)vger.kernel.org # 4.14+
Signed-off-by: ethanwu <ethanwu(a)synology.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f99102063366..3871658b6ab1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3142,7 +3142,11 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
struct rb_node *node;
int ret = 0;
+ spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
+ if (cur_trans)
+ refcount_inc(&cur_trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
if (!cur_trans)
return 0;
@@ -3151,6 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (!head) {
spin_unlock(&delayed_refs->lock);
+ btrfs_put_transaction(cur_trans);
return 0;
}
@@ -3167,6 +3172,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
+ btrfs_put_transaction(cur_trans);
return -EAGAIN;
}
spin_unlock(&delayed_refs->lock);
@@ -3199,6 +3205,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
+ btrfs_put_transaction(cur_trans);
return ret;
}
With commit 8f111bc357aa ("cpufreq/schedutil: Rewrite CPUFREQ_RT support")
schedutil governor uses rq->rt.rt_nr_running to detect whether a RT task is
currently running on the CPU and to set frequency to max if necessary.
cpufreq_update_util() is called in enqueue/dequeue_top_rt_rq() but
rq->rt.rt_nr_running as not been updated yet when dequeue_top_rt_rq() is
called so schedutil still considers that a RT task is running when the
last task is dequeued. The update of rq->rt.rt_nr_running happens later
in dequeue_rt_stack()
Fixes: 8f111bc357aa ('cpufreq/schedutil: Rewrite CPUFREQ_RT support')
Cc: <stable(a)vger.kernel.org> # v4.16+
Signed-off-by: Vincent Guittot <vincent.guittot(a)linaro.org>
---
kernel/sched/rt.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7aef6b4..6e74d3d 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1001,8 +1001,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
sub_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 0;
- /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
- cpufreq_update_util(rq, 0);
}
static void
@@ -1288,6 +1286,9 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se, flags);
}
+
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_rt_rq(rt_rq_of_se(back)), 0);
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
--
2.7.4
Ben Hutchings pointed out that 29b7a6fa1ec0 ("ubi: fastmap: Don't flush
fastmap work on detach") does not really fix the problem, it just
reduces the risk to hit the race window where fastmap work races against
free()'ing ubi->volumes[].
The correct approach is making sure that no more fastmap work is in
progress before we free ubi data structures.
So we cancel fastmap work right after the ubi background thread is
stopped.
By setting ubi->thread_enabled to zero we make sure that no further work
tries to wake the thread.
Fixes: 29b7a6fa1ec0 ("ubi: fastmap: Don't flush fastmap work on detach")
Fixes: 74cdaf24004a ("UBI: Fastmap: Fix memory leaks while closing the WL sub-system")
Cc: stable(a)vger.kernel.org
Cc: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Cc: Martin Townsend <mtownsend1973(a)gmail.com>
Signed-off-by: Richard Weinberger <richard(a)nod.at>
---
drivers/mtd/ubi/build.c | 3 +++
drivers/mtd/ubi/wl.c | 4 +---
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
index 6326a02e4568..0cf3356424cd 100644
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -1100,6 +1100,9 @@ int ubi_detach_mtd_dev(int ubi_num, int anyway)
if (ubi->bgt_thread)
kthread_stop(ubi->bgt_thread);
+#ifdef CONFIG_MTD_UBI_FASTMAP
+ cancel_work_sync(&ubi->fm_work);
+#endif
ubi_debugfs_exit_dev(ubi);
uif_close(ubi);
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index 3cc302924899..6bbb968fe9da 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1505,6 +1505,7 @@ int ubi_thread(void *u)
}
dbg_wl("background thread \"%s\" is killed", ubi->bgt_name);
+ ubi->thread_enabled = 0;
return 0;
}
@@ -1514,9 +1515,6 @@ int ubi_thread(void *u)
*/
static void shutdown_work(struct ubi_device *ubi)
{
-#ifdef CONFIG_MTD_UBI_FASTMAP
- flush_work(&ubi->fm_work);
-#endif
while (!list_empty(&ubi->works)) {
struct ubi_work *wrk;
--
2.13.6
From: Josef Bacik <jbacik(a)fb.com>
I messed up changing the size of an NBD device while it was connected by
not actually updating the device or doing the uevent. Fix this by
updating everything if we're connected and we change the size.
cc: stable(a)vger.kernel.org
Fixes: 639812a ("nbd: don't set the device size until we're connected")
Signed-off-by: Josef Bacik <jbacik(a)fb.com>
---
drivers/block/nbd.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9710a0c338b0..b709abf3cb79 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -246,6 +246,8 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
struct nbd_config *config = nbd->config;
config->blksize = blocksize;
config->bytesize = blocksize * nr_blocks;
+ if (nbd->task_recv != NULL)
+ nbd_size_update(nbd);
}
static void nbd_complete_rq(struct request *req)
--
2.14.3
Removing a drive with drive_del while it is being used to run an I/O
intensive workload can cause QEMU to crash.
An AIO flush can yield at some point:
blk_aio_flush_entry()
blk_co_flush(blk)
bdrv_co_flush(blk->root->bs)
...
qemu_coroutine_yield()
and let the HMP command to run, free blk->root and give control
back to the AIO flush:
hmp_drive_del()
blk_remove_bs()
bdrv_root_unref_child(blk->root)
child_bs = blk->root->bs
bdrv_detach_child(blk->root)
bdrv_replace_child(blk->root, NULL)
blk->root->bs = NULL
g_free(blk->root) <============== blk->root becomes stale
bdrv_unref(child_bs)
bdrv_delete(child_bs)
bdrv_close()
bdrv_drained_begin()
bdrv_do_drained_begin()
bdrv_drain_recurse()
aio_poll()
...
qemu_coroutine_switch()
and the AIO flush completion ends up dereferencing blk->root:
blk_aio_complete()
scsi_aio_complete()
blk_get_aio_context(blk)
bs = blk_bs(blk)
ie, bs = blk->root ? blk->root->bs : NULL
^^^^^
stale
The solution to this user-after-free situation is is to clear
blk->root before calling bdrv_unref() in bdrv_detach_child(),
and let blk_get_aio_context() fall back to the main loop context
since the BDS has been removed.
Signed-off-by: Greg Kurz <groug(a)kaod.org>
---
The use-after-free condition is easy to reproduce with a stress-ng
run in the guest:
-device virtio-scsi-pci,id=scsi1 \
-drive file=/home/greg/images/scratch.qcow2,format=qcow2,if=none,id=drive1 \
-device scsi-hd,bus=scsi1.0,drive=drive1,id=scsi-hd1
# stress-ng --hdd 0 --aggressive
and doing drive_del from the QEMU monitor while stress-ng is still running:
(qemu) drive_del drive1
The crash is less easy to hit though, as it depends on the bs field
of the stale blk->root to have a non-NULL value that eventually breaks
something when it gets dereferenced. The following patch simulates
that, and allows to validate the fix:
--- a/block.c
+++ b/block.c
@@ -2127,6 +2127,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
static void bdrv_detach_child(BdrvChild *child)
{
+ BlockDriverState *bs = child->bs;
+
if (child->next.le_prev) {
QLIST_REMOVE(child, next);
child->next.le_prev = NULL;
@@ -2135,7 +2137,15 @@ static void bdrv_detach_child(BdrvChild *child)
bdrv_replace_child(child, NULL);
g_free(child->name);
- g_free(child);
+ /* Poison the BdrvChild instead of freeing it, in order to break blk_bs()
+ * if the blk still has a pointer to this BdrvChild in blk->root.
+ */
+ if (atomic_read(&bs->in_flight)) {
+ child->bs = (BlockDriverState *) -1;
+ fprintf(stderr, "\nPoisonned BdrvChild %p\n", child);
+ } else {
+ g_free(child);
+ }
}
void bdrv_root_unref_child(BdrvChild *child)
---
block/block-backend.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/block/block-backend.c b/block/block-backend.c
index 681b240b1268..ed9434e236b9 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -756,6 +756,7 @@ void blk_remove_bs(BlockBackend *blk)
{
ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
BlockDriverState *bs;
+ BdrvChild *root;
notifier_list_notify(&blk->remove_bs_notifiers, blk);
if (tgm->throttle_state) {
@@ -768,8 +769,9 @@ void blk_remove_bs(BlockBackend *blk)
blk_update_root_state(blk);
- bdrv_root_unref_child(blk->root);
+ root = blk->root;
blk->root = NULL;
+ bdrv_root_unref_child(root);
}
/*
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
The trigger code is picky in how it can be disabled as there may be
dependencies between different events and synthetic events. Change the order
on how triggers are reset.
1) Reset triggers of all synthetic events first
2) Remove triggers with actions attached to them
3) Remove all other triggers
If this order isn't followed, then some triggers will not be reset, and an
error may happen because a trigger is busy.
Cc: stable(a)vger.kernel.org
Fixes: cfa0963dc474f ("kselftests/ftrace : Add event trigger testcases")
Acked-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
.../testing/selftests/ftrace/test.d/functions | 21 ++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index 2a4f16fc9819..8393b1c06027 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -15,14 +15,29 @@ reset_tracer() { # reset the current tracer
echo nop > current_tracer
}
-reset_trigger() { # reset all current setting triggers
- grep -v ^# events/*/*/trigger |
+reset_trigger_file() {
+ # remove action triggers first
+ grep -H ':on[^:]*(' $@ |
+ while read line; do
+ cmd=`echo $line | cut -f2- -d: | cut -f1 -d" "`
+ file=`echo $line | cut -f1 -d:`
+ echo "!$cmd" >> $file
+ done
+ grep -Hv ^# $@ |
while read line; do
cmd=`echo $line | cut -f2- -d: | cut -f1 -d" "`
- echo "!$cmd" > `echo $line | cut -f1 -d:`
+ file=`echo $line | cut -f1 -d:`
+ echo "!$cmd" > $file
done
}
+reset_trigger() { # reset all current setting triggers
+ if [ -d events/synthetic ]; then
+ reset_trigger_file events/synthetic/*/trigger
+ fi
+ reset_trigger_file events/*/*/trigger
+}
+
reset_events_filter() { # reset all current setting filters
grep -v ^none events/*/*/filter |
while read line; do
--
2.17.0
Update support for the UV kernel to accommodate Intel BIOS changes in
NVDIMM alignment, which caused UV BIOS to align the memory boundaries
on different blocks than the previous UV standard of 2GB.
--
Currently, there is a small window where ovl_obtain_alias() can
race with ovl_instantiate() and create two different overlay inodes
with the same underlying real non-dir non-hardlink inode.
The race requires an adversary to guess the file handle of the
yet to be created upper inode and decode the guessed file handle
after ovl_creat_real(), but before ovl_instantiate().
This patch fixes the race, by using insert_inode_locked4() to add
a newly created inode to icache.
If the newly created inode apears to already exist in icache (hashed
by the same real upper inode), we export this error to user instead
of silently not hashing the new inode.
This race does not affect overlay directory inodes, because those
are decoded via ovl_lookup_real() and not with ovl_obtain_alias(),
so avoid using the new helper d_instantiate_new() to reduce backport
dependencies.
Backporting only makes sense for v4.16 where NFS export was introduced.
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: <stable(a)vger.kernel.org> #v4.16
Signed-off-by: Amir Goldstein <amir73il(a)gmail.com>
---
fs/overlayfs/dir.c | 24 ++++++++++++++++++------
fs/overlayfs/inode.c | 18 ++++++++++++++++++
fs/overlayfs/overlayfs.h | 1 +
3 files changed, 37 insertions(+), 6 deletions(-)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 47dc980e8b33..62e6733b755c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -183,14 +183,24 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
}
/* Common operations required to be done after creation of file on upper */
-static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
- struct dentry *newdentry, bool hardlink)
+static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
+ struct dentry *newdentry, bool hardlink)
{
ovl_dir_modified(dentry->d_parent, false);
- ovl_copyattr(d_inode(newdentry), inode);
ovl_dentry_set_upper_alias(dentry);
if (!hardlink) {
- ovl_inode_update(inode, newdentry);
+ int err;
+
+ ovl_inode_init(inode, newdentry, NULL);
+ /*
+ * XXX: if we ever use ovl_obtain_alias() to decode directory
+ * file handles, need to use ovl_insert_inode_locked() and
+ * d_instantiate_new() here to prevent ovl_obtain_alias()
+ * from sneaking in before d_instantiate().
+ */
+ err = ovl_insert_inode(inode, d_inode(newdentry));
+ if (err)
+ return err;
} else {
WARN_ON(ovl_inode_real(inode) != d_inode(newdentry));
dput(newdentry);
@@ -200,6 +210,8 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
/* Force lookup of new upper hardlink to find its lower */
if (hardlink)
d_drop(dentry);
+
+ return 0;
}
static bool ovl_type_merge(struct dentry *dentry)
@@ -238,7 +250,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
ovl_set_opaque(dentry, newdentry);
}
- ovl_instantiate(dentry, inode, newdentry, !!hardlink);
+ err = ovl_instantiate(dentry, inode, newdentry, !!hardlink);
newdentry = NULL;
out_dput:
dput(newdentry);
@@ -439,7 +451,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
}
- ovl_instantiate(dentry, inode, newdentry, !!hardlink);
+ err = ovl_instantiate(dentry, inode, newdentry, !!hardlink);
newdentry = NULL;
out_dput2:
dput(upper);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 7abcf96e94fc..060c534998d1 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -741,6 +741,24 @@ static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
return true;
}
+static int ovl_insert_inode_locked(struct inode *inode, struct inode *realinode)
+{
+ return insert_inode_locked4(inode, (unsigned long) realinode,
+ ovl_inode_test, realinode);
+}
+
+int ovl_insert_inode(struct inode *inode, struct inode *realinode)
+{
+ int err;
+
+ err = ovl_insert_inode_locked(inode, realinode);
+ if (err)
+ return err;
+
+ unlock_new_inode(inode);
+ return 0;
+}
+
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper)
{
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index caaa47cea2aa..642b25702092 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -343,6 +343,7 @@ int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
bool ovl_is_private_xattr(const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
+int ovl_insert_inode(struct inode *inode, struct inode *realinode);
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper);
struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
--
2.7.4