Removing a drive with drive_del while it is being used to run an I/O
intensive workload can cause QEMU to crash.
An AIO flush can yield at some point:
blk_aio_flush_entry()
blk_co_flush(blk)
bdrv_co_flush(blk->root->bs)
...
qemu_coroutine_yield()
and let the HMP command run, free blk->root and give control
back to the AIO flush:
hmp_drive_del()
blk_remove_bs()
bdrv_root_unref_child(blk->root)
child_bs = blk->root->bs
bdrv_detach_child(blk->root)
bdrv_replace_child(blk->root, NULL)
blk->root->bs = NULL
g_free(blk->root) <============== blk->root becomes stale
bdrv_unref(child_bs)
bdrv_delete(child_bs)
bdrv_close()
bdrv_drained_begin()
bdrv_do_drained_begin()
bdrv_drain_recurse()
aio_poll()
...
qemu_coroutine_switch()
and the AIO flush completion ends up dereferencing blk->root:
blk_aio_complete()
scsi_aio_complete()
blk_get_aio_context(blk)
bs = blk_bs(blk)
ie, bs = blk->root ? blk->root->bs : NULL
^^^^^
stale
The solution to this use-after-free situation is to clear
blk->root before calling bdrv_unref() in bdrv_detach_child(),
and let blk_get_aio_context() fall back to the main loop context
since the BDS has been removed.
Signed-off-by: Greg Kurz <groug(a)kaod.org>
---
The use-after-free condition is easy to reproduce with a stress-ng
run in the guest:
-device virtio-scsi-pci,id=scsi1 \
-drive file=/home/greg/images/scratch.qcow2,format=qcow2,if=none,id=drive1 \
-device scsi-hd,bus=scsi1.0,drive=drive1,id=scsi-hd1
# stress-ng --hdd 0 --aggressive
and doing drive_del from the QEMU monitor while stress-ng is still running:
(qemu) drive_del drive1
The crash is less easy to hit though, as it depends on the bs field
of the stale blk->root having a non-NULL value that eventually breaks
something when it gets dereferenced. The following patch simulates
that, and allows the fix to be validated:
--- a/block.c
+++ b/block.c
@@ -2127,6 +2127,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
static void bdrv_detach_child(BdrvChild *child)
{
+ BlockDriverState *bs = child->bs;
+
if (child->next.le_prev) {
QLIST_REMOVE(child, next);
child->next.le_prev = NULL;
@@ -2135,7 +2137,15 @@ static void bdrv_detach_child(BdrvChild *child)
bdrv_replace_child(child, NULL);
g_free(child->name);
- g_free(child);
+ /* Poison the BdrvChild instead of freeing it, in order to break blk_bs()
+ * if the blk still has a pointer to this BdrvChild in blk->root.
+ */
+ if (atomic_read(&bs->in_flight)) {
+ child->bs = (BlockDriverState *) -1;
+ fprintf(stderr, "\nPoisonned BdrvChild %p\n", child);
+ } else {
+ g_free(child);
+ }
}
void bdrv_root_unref_child(BdrvChild *child)
---
block/block-backend.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/block/block-backend.c b/block/block-backend.c
index 681b240b1268..ed9434e236b9 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -756,6 +756,7 @@ void blk_remove_bs(BlockBackend *blk)
{
ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
BlockDriverState *bs;
+ BdrvChild *root;
notifier_list_notify(&blk->remove_bs_notifiers, blk);
if (tgm->throttle_state) {
@@ -768,8 +769,9 @@ void blk_remove_bs(BlockBackend *blk)
blk_update_root_state(blk);
- bdrv_root_unref_child(blk->root);
+ root = blk->root;
blk->root = NULL;
+ bdrv_root_unref_child(root);
}
/*
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
The trigger code is picky in how it can be disabled as there may be
dependencies between different events and synthetic events. Change the order
in which triggers are reset.
1) Reset triggers of all synthetic events first
2) Remove triggers with actions attached to them
3) Remove all other triggers
If this order isn't followed, then some triggers will not be reset, and an
error may happen because a trigger is busy.
Cc: stable(a)vger.kernel.org
Fixes: cfa0963dc474f ("kselftests/ftrace : Add event trigger testcases")
Acked-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
.../testing/selftests/ftrace/test.d/functions | 21 ++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index 2a4f16fc9819..8393b1c06027 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -15,14 +15,29 @@ reset_tracer() { # reset the current tracer
echo nop > current_tracer
}
-reset_trigger() { # reset all current setting triggers
- grep -v ^# events/*/*/trigger |
+reset_trigger_file() {
+ # remove action triggers first
+ grep -H ':on[^:]*(' $@ |
+ while read line; do
+ cmd=`echo $line | cut -f2- -d: | cut -f1 -d" "`
+ file=`echo $line | cut -f1 -d:`
+ echo "!$cmd" >> $file
+ done
+ grep -Hv ^# $@ |
while read line; do
cmd=`echo $line | cut -f2- -d: | cut -f1 -d" "`
- echo "!$cmd" > `echo $line | cut -f1 -d:`
+ file=`echo $line | cut -f1 -d:`
+ echo "!$cmd" > $file
done
}
+reset_trigger() { # reset all current setting triggers
+ if [ -d events/synthetic ]; then
+ reset_trigger_file events/synthetic/*/trigger
+ fi
+ reset_trigger_file events/*/*/trigger
+}
+
reset_events_filter() { # reset all current setting filters
grep -v ^none events/*/*/filter |
while read line; do
--
2.17.0
Update support for the UV kernel to accommodate Intel BIOS changes in
NVDIMM alignment, which caused UV BIOS to align the memory boundaries
on different blocks than the previous UV standard of 2GB.
--
Currently, there is a small window where ovl_obtain_alias() can
race with ovl_instantiate() and create two different overlay inodes
with the same underlying real non-dir non-hardlink inode.
The race requires an adversary to guess the file handle of the
yet to be created upper inode and decode the guessed file handle
after ovl_create_real(), but before ovl_instantiate().
This patch fixes the race, by using insert_inode_locked4() to add
a newly created inode to icache.
If the newly created inode appears to already exist in icache (hashed
by the same real upper inode), we export this error to the user instead
of silently not hashing the new inode.
This race does not affect overlay directory inodes, because those
are decoded via ovl_lookup_real() and not with ovl_obtain_alias(),
so avoid using the new helper d_instantiate_new() to reduce backport
dependencies.
Backporting only makes sense for v4.16 where NFS export was introduced.
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: <stable(a)vger.kernel.org> #v4.16
Signed-off-by: Amir Goldstein <amir73il(a)gmail.com>
---
fs/overlayfs/dir.c | 24 ++++++++++++++++++------
fs/overlayfs/inode.c | 18 ++++++++++++++++++
fs/overlayfs/overlayfs.h | 1 +
3 files changed, 37 insertions(+), 6 deletions(-)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 47dc980e8b33..62e6733b755c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -183,14 +183,24 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
}
/* Common operations required to be done after creation of file on upper */
-static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
- struct dentry *newdentry, bool hardlink)
+static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
+ struct dentry *newdentry, bool hardlink)
{
ovl_dir_modified(dentry->d_parent, false);
- ovl_copyattr(d_inode(newdentry), inode);
ovl_dentry_set_upper_alias(dentry);
if (!hardlink) {
- ovl_inode_update(inode, newdentry);
+ int err;
+
+ ovl_inode_init(inode, newdentry, NULL);
+ /*
+ * XXX: if we ever use ovl_obtain_alias() to decode directory
+ * file handles, need to use ovl_insert_inode_locked() and
+ * d_instantiate_new() here to prevent ovl_obtain_alias()
+ * from sneaking in before d_instantiate().
+ */
+ err = ovl_insert_inode(inode, d_inode(newdentry));
+ if (err)
+ return err;
} else {
WARN_ON(ovl_inode_real(inode) != d_inode(newdentry));
dput(newdentry);
@@ -200,6 +210,8 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
/* Force lookup of new upper hardlink to find its lower */
if (hardlink)
d_drop(dentry);
+
+ return 0;
}
static bool ovl_type_merge(struct dentry *dentry)
@@ -238,7 +250,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
ovl_set_opaque(dentry, newdentry);
}
- ovl_instantiate(dentry, inode, newdentry, !!hardlink);
+ err = ovl_instantiate(dentry, inode, newdentry, !!hardlink);
newdentry = NULL;
out_dput:
dput(newdentry);
@@ -439,7 +451,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
}
- ovl_instantiate(dentry, inode, newdentry, !!hardlink);
+ err = ovl_instantiate(dentry, inode, newdentry, !!hardlink);
newdentry = NULL;
out_dput2:
dput(upper);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 7abcf96e94fc..060c534998d1 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -741,6 +741,24 @@ static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
return true;
}
+static int ovl_insert_inode_locked(struct inode *inode, struct inode *realinode)
+{
+ return insert_inode_locked4(inode, (unsigned long) realinode,
+ ovl_inode_test, realinode);
+}
+
+int ovl_insert_inode(struct inode *inode, struct inode *realinode)
+{
+ int err;
+
+ err = ovl_insert_inode_locked(inode, realinode);
+ if (err)
+ return err;
+
+ unlock_new_inode(inode);
+ return 0;
+}
+
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper)
{
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index caaa47cea2aa..642b25702092 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -343,6 +343,7 @@ int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
bool ovl_is_private_xattr(const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
+int ovl_insert_inode(struct inode *inode, struct inode *realinode);
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper);
struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
--
2.7.4
This is the start of the stable review cycle for the 4.14.41 release.
There are 62 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed May 16 06:47:52 UTC 2018.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.14.41-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.14.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.14.41-rc1
Anthoine Bourgeois <anthoine.bourgeois(a)blade-group.com>
KVM: x86: remove APIC Timer periodic/oneshot spikes
Paul Mackerras <paulus(a)ozlabs.org>
KVM: PPC: Book3S HV: Fix handling of large pages in radix page fault handler
Peter Zijlstra <peterz(a)infradead.org>
perf/x86: Fix possible Spectre-v1 indexing for x86_pmu::event_map()
Peter Zijlstra <peterz(a)infradead.org>
perf/core: Fix possible Spectre-v1 indexing for ->aux_pages[]
Peter Zijlstra <peterz(a)infradead.org>
perf/x86/msr: Fix possible Spectre-v1 indexing in the MSR driver
Peter Zijlstra <peterz(a)infradead.org>
perf/x86/cstate: Fix possible Spectre-v1 indexing for pkg_msr
Peter Zijlstra <peterz(a)infradead.org>
perf/x86: Fix possible Spectre-v1 indexing for hw_perf_event cache_*
Masami Hiramatsu <mhiramat(a)kernel.org>
tracing/uprobe_event: Fix strncpy corner case
Peter Zijlstra <peterz(a)infradead.org>
sched/autogroup: Fix possible Spectre-v1 indexing for sched_prio_to_weight[]
Steve French <smfrench(a)gmail.com>
smb3: directory sync should not return an error
Jens Axboe <axboe(a)kernel.dk>
nvme: add quirk to force medium priority for SQ creation
Marek Szyprowski <m.szyprowski(a)samsung.com>
thermal: exynos: Propagate error value from tmu_read()
Marek Szyprowski <m.szyprowski(a)samsung.com>
thermal: exynos: Reading temperature makes sense only when TMU is turned on
Hans de Goede <hdegoede(a)redhat.com>
Bluetooth: btusb: Only check needs_reset_resume DMI table for QCA rome chipsets
Hans de Goede <hdegoede(a)redhat.com>
Bluetooth: btusb: Add Dell XPS 13 9360 to btusb_needs_reset_resume_table
Hans de Goede <hdegoede(a)redhat.com>
Revert "Bluetooth: btusb: Fix quirk for Atheros 1525/QCA6174"
Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
cpufreq: schedutil: Avoid using invalid next_freq
Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
PCI / PM: Check device_may_wakeup() in pci_enable_wake()
Kai Heng Feng <kai.heng.feng(a)canonical.com>
PCI / PM: Always check PME wakeup capability for runtime wakeup support
Gustavo A. R. Silva <gustavo(a)embeddedor.com>
atm: zatm: Fix potential Spectre v1
Gustavo A. R. Silva <gustavo(a)embeddedor.com>
net: atm: Fix potential Spectre v1
Ville Syrjälä <ville.syrjala(a)linux.intel.com>
drm/atomic: Clean private obj old_state/new_state in drm_atomic_state_default_clear()
Ville Syrjälä <ville.syrjala(a)linux.intel.com>
drm/atomic: Clean old_state/new_state in drm_atomic_state_default_clear()
Lyude Paul <lyude(a)redhat.com>
drm/nouveau: Fix deadlock in nv50_mstm_register_connector()
Florent Flament <contact(a)florentflament.com>
drm/i915: Fix drm:intel_enable_lvds ERROR message in kernel log
Boris Brezillon <boris.brezillon(a)bootlin.com>
drm/vc4: Fix scaling of uni-planar formats
Lukas Wunner <lukas(a)wunner.de>
can: hi311x: Work around TX complete interrupt erratum
Lukas Wunner <lukas(a)wunner.de>
can: hi311x: Acquire SPI lock on ->do_get_berr_counter
Jimmy Assarsson <extja(a)kvaser.com>
can: kvaser_usb: Increase correct stats counter in kvaser_usb_rx_can_msg()
Ilya Dryomov <idryomov(a)gmail.com>
ceph: fix rsize/wsize capping in ceph_direct_read_write()
David Rientjes <rientjes(a)google.com>
mm, oom: fix concurrent munlock and oom reaper unmap, v3
Pavel Tatashin <pasha.tatashin(a)oracle.com>
mm: sections are not offlined during memory hotremove
Vitaly Wool <vitalywool(a)gmail.com>
z3fold: fix reclaim lock-ups
Steven Rostedt (VMware) <rostedt(a)goodmis.org>
tracing: Fix regex_match_front() to not over compare the test string
Mikulas Patocka <mpatocka(a)redhat.com>
dm integrity: use kvfree for kvmalloc'd memory
Hans de Goede <hdegoede(a)redhat.com>
libata: Apply NOLPM quirk for SanDisk SD7UB3Q*G1001 SSDs
Johan Hovold <johan(a)kernel.org>
rfkill: gpio: fix memory leak in probe error path
Uwe Kleine-König <u.kleine-koenig(a)pengutronix.de>
gpio: fix error path in lineevent_create
Govert Overgaauw <govert.overgaauw(a)prodrive-technologies.com>
gpio: fix aspeed_gpio unmask irq
Timur Tabi <timur(a)codeaurora.org>
gpioib: do not free unrequested descriptors
Jann Horn <jannh(a)google.com>
compat: fix 4-byte infoleak via uninitialized struct field
Suzuki K Poulose <suzuki.poulose(a)arm.com>
arm64: Add work around for Arm Cortex-A55 Erratum 1024718
Paul Mackerras <paulus(a)ozlabs.org>
KVM: PPC: Book3S HV: Fix VRMA initialization with 2MB or 1GB memory backing
Laurent Vivier <lvivier(a)redhat.com>
KVM: PPC: Book3S HV: Fix guest time accounting with VIRT_CPU_ACCOUNTING_GEN
Paul Mackerras <paulus(a)ozlabs.org>
KVM: PPC: Book3S HV: Fix trap number return from __kvmppc_vcore_entry
Jan Kara <jack(a)suse.cz>
bdi: Fix oops in wb_workfn()
Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
bdi: wake up concurrent wb_shutdown() callers.
Eric Dumazet <edumazet(a)google.com>
tcp: fix TCP_REPAIR_QUEUE bound checking
Jiri Olsa <jolsa(a)kernel.org>
perf: Remove superfluous allocation error check
Michal Hocko <mhocko(a)suse.com>
memcg: fix per_node_info cleanup
Eric Dumazet <edumazet(a)google.com>
inetpeer: fix uninit-value in inet_getpeer
Eric Dumazet <edumazet(a)google.com>
soreuseport: initialise timewait reuseport field
Eric Dumazet <edumazet(a)google.com>
ipv4: fix uninit-value in ip_route_output_key_hash_rcu()
Eric Dumazet <edumazet(a)google.com>
dccp: initialize ireq->ir_mark
Eric Dumazet <edumazet(a)google.com>
net: fix uninit-value in __hw_addr_add_ex()
Eric Dumazet <edumazet(a)google.com>
net: initialize skb->peeked when cloning
Eric Dumazet <edumazet(a)google.com>
net: fix rtnh_ok()
Eric Dumazet <edumazet(a)google.com>
netlink: fix uninit-value in netlink_sendmsg
Eric Dumazet <edumazet(a)google.com>
crypto: af_alg - fix possible uninit-value in alg_bind()
Tom Herbert <tom(a)quantonium.net>
kcm: Call strp_stop before strp_done in kcm_attach
Florian Westphal <fw(a)strlen.de>
netfilter: ebtables: don't attempt to allocate 0-sized compat array
Julian Anastasov <ja(a)ssi.bg>
ipvs: fix rtnl_lock lockups caused by start_sync_thread
-------------
Diffstat:
Documentation/arm64/silicon-errata.txt | 1 +
Makefile | 4 +-
arch/arm64/Kconfig | 14 +++
arch/arm64/include/asm/assembler.h | 40 +++++++++
arch/arm64/include/asm/cputype.h | 2 +
arch/arm64/mm/proc.S | 5 ++
arch/powerpc/kvm/book3s_64_mmu_radix.c | 72 +++++++++------
arch/powerpc/kvm/book3s_hv.c | 17 ++--
arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 +-
arch/x86/events/core.c | 8 +-
arch/x86/events/intel/cstate.c | 2 +
arch/x86/events/msr.c | 9 +-
arch/x86/kvm/lapic.c | 37 ++++----
crypto/af_alg.c | 8 +-
drivers/ata/libata-core.c | 3 +
drivers/atm/zatm.c | 3 +
drivers/bluetooth/btusb.c | 19 +++-
drivers/gpio/gpio-aspeed.c | 2 +-
drivers/gpio/gpiolib.c | 7 +-
drivers/gpu/drm/drm_atomic.c | 8 ++
drivers/gpu/drm/i915/intel_lvds.c | 3 +-
drivers/gpu/drm/nouveau/nv50_display.c | 7 +-
drivers/gpu/drm/vc4/vc4_plane.c | 2 +-
drivers/md/dm-integrity.c | 2 +-
drivers/net/can/spi/hi311x.c | 11 ++-
drivers/net/can/usb/kvaser_usb.c | 2 +-
drivers/nvme/host/nvme.h | 5 ++
drivers/nvme/host/pci.c | 12 ++-
drivers/pci/pci.c | 37 +++++---
drivers/thermal/samsung/exynos_tmu.c | 14 ++-
fs/ceph/file.c | 10 +--
fs/cifs/cifsfs.c | 13 +++
fs/fs-writeback.c | 2 +-
include/linux/oom.h | 2 +
include/linux/wait_bit.h | 17 ++++
include/net/inet_timewait_sock.h | 1 +
include/net/nexthop.h | 2 +-
kernel/compat.c | 1 +
kernel/events/callchain.c | 10 +--
kernel/events/ring_buffer.c | 7 +-
kernel/sched/autogroup.c | 7 +-
kernel/sched/cpufreq_schedutil.c | 3 +-
kernel/trace/trace_events_filter.c | 3 +
kernel/trace/trace_uprobe.c | 2 +
mm/backing-dev.c | 2 +-
mm/memcontrol.c | 3 +
mm/mmap.c | 44 +++++----
mm/oom_kill.c | 74 ++++++++-------
mm/sparse.c | 2 +-
mm/z3fold.c | 42 ++++++---
net/atm/lec.c | 9 +-
net/bridge/netfilter/ebtables.c | 11 +--
net/core/dev_addr_lists.c | 4 +-
net/core/skbuff.c | 1 +
net/dccp/ipv4.c | 1 +
net/dccp/ipv6.c | 1 +
net/ipv4/inet_timewait_sock.c | 1 +
net/ipv4/inetpeer.c | 1 +
net/ipv4/route.c | 11 +--
net/ipv4/tcp.c | 2 +-
net/kcm/kcmsock.c | 1 +
net/netfilter/ipvs/ip_vs_ctl.c | 8 --
net/netfilter/ipvs/ip_vs_sync.c | 155 ++++++++++++++++----------------
net/netlink/af_netlink.c | 2 +
net/rfkill/rfkill-gpio.c | 7 +-
65 files changed, 543 insertions(+), 283 deletions(-)