From: Jim Mattson <jmattson(a)google.com>
The potential performance advantages of a vmcs02 pool have never been
realized. To simplify the code, eliminate the pool. Instead, a single
vmcs02 is allocated per VCPU when the VCPU enters VMX operation.
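For illustration, the resulting per-vCPU lifecycle condenses to the
following sketch (taken directly from the hunks below; error handling
and the unrelated parts of enter_vmx_operation() are omitted):

    /* VMXON: allocate the one and only vmcs02 for this vCPU. */
    vmx->nested.vmcs02.vmcs = alloc_vmcs();
    vmx->nested.vmcs02.shadow_vmcs = NULL;
    if (!vmx->nested.vmcs02.vmcs)
        return -ENOMEM;
    loaded_vmcs_init(&vmx->nested.vmcs02);

    /* Nested VM entry: no pool lookup, no allocation, no -ENOMEM path. */
    vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);

    /* VMXOFF / vCPU teardown (free_nested): release it. */
    free_loaded_vmcs(&vmx->nested.vmcs02);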
Cc: stable(a)vger.kernel.org # prereq for Spectre mitigation
Signed-off-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Mark Kanda <mark.kanda(a)oracle.com>
Reviewed-by: Ameya More <ameya.more(a)oracle.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar(a)redhat.com>
---
arch/x86/kvm/vmx.c | 146 +++++++++--------------------------------------------
1 file changed, 23 insertions(+), 123 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c829d89e2e63..ad6a883b7a32 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -185,7 +185,6 @@ module_param(ple_window_max, int, S_IRUGO);
extern const ulong vmx_return;
#define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
struct vmcs {
u32 revision_id;
@@ -226,7 +225,7 @@ struct shared_msr_entry {
* stored in guest memory specified by VMPTRLD, but is opaque to the guest,
* which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
* More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
* underlying hardware which will be used to run L2.
* This structure is packed to ensure that its layout is identical across
* machines (necessary for live migration).
@@ -409,13 +408,6 @@ struct __packed vmcs12 {
*/
#define VMCS12_SIZE 0x1000
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
- struct list_head list;
- gpa_t vmptr;
- struct loaded_vmcs vmcs02;
-};
-
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -440,15 +432,15 @@ struct nested_vmx {
*/
bool sync_shadow_vmcs;
- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
- struct list_head vmcs02_pool;
- int vmcs02_num;
bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+
+ struct loaded_vmcs vmcs02;
+
/*
- * Guest pages referred to in vmcs02 with host-physical pointers, so
- * we must keep them pinned while L2 runs.
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
struct page *virtual_apic_page;
@@ -6973,94 +6965,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
return handle_nop(vcpu);
}
-/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmx->nested.current_vmptr) {
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
- /* Recycle the least recently used VMCS. */
- item = list_last_entry(&vmx->nested.vmcs02_pool,
- struct vmcs02_list, list);
- item->vmptr = vmx->nested.current_vmptr;
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- /* Create a new VMCS */
- item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
- if (!item)
- return NULL;
- item->vmcs02.vmcs = alloc_vmcs();
- item->vmcs02.shadow_vmcs = NULL;
- if (!item->vmcs02.vmcs) {
- kfree(item);
- return NULL;
- }
- loaded_vmcs_init(&item->vmcs02);
- item->vmptr = vmx->nested.current_vmptr;
- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num++;
- return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmptr) {
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- return;
- }
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item, *n;
-
- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- /*
- * Something will leak if the above WARN triggers. Better than
- * a use-after-free.
- */
- if (vmx->loaded_vmcs == &item->vmcs02)
- continue;
-
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- }
-}
-
/*
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
* set the success or error code of an emulated VMX instruction, as specified
@@ -7242,6 +7146,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs;
+ vmx->nested.vmcs02.vmcs = alloc_vmcs();
+ vmx->nested.vmcs02.shadow_vmcs = NULL;
+ if (!vmx->nested.vmcs02.vmcs)
+ goto out_vmcs02;
+ loaded_vmcs_init(&vmx->nested.vmcs02);
+
if (cpu_has_vmx_msr_bitmap()) {
vmx->nested.msr_bitmap =
(unsigned long *)__get_free_page(GFP_KERNEL);
@@ -7264,9 +7174,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
vmx->vmcs01.shadow_vmcs = shadow_vmcs;
}
- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num = 0;
-
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7281,6 +7188,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
free_page((unsigned long)vmx->nested.msr_bitmap);
out_msr_bitmap:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
return -ENOMEM;
}
@@ -7434,7 +7344,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->vmcs01.shadow_vmcs = NULL;
}
kfree(vmx->nested.cached_vmcs12);
- /* Unpin physical memory we referred to in current vmcs02 */
+ /* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
@@ -7450,7 +7360,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.pi_desc = NULL;
}
- nested_free_all_saved_vmcss(vmx);
+ free_loaded_vmcs(&vmx->nested.vmcs02);
}
/* Emulate the VMXOFF instruction */
@@ -7493,8 +7403,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
vmptr + offsetof(struct vmcs12, launch_state),
&zero, sizeof(zero));
- nested_free_vmcs02(vmx, vmptr);
-
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -8406,10 +8314,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
/*
* The host physical addresses of some pages of guest memory
- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
- * may write to these pages via their host physical address while
- * L2 is running, bypassing any address-translation-based dirty
- * tracking (e.g. EPT write protection).
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
*
* Mark them dirty on every exit from L2 to prevent them from
* getting out of sync with dirty tracking.
@@ -10903,20 +10812,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- struct loaded_vmcs *vmcs02;
u32 msr_entry_idx;
u32 exit_qual;
- vmcs02 = nested_get_current_vmcs02(vmx);
- if (!vmcs02)
- return -ENOMEM;
-
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
- vmx_switch_vmcs(vcpu, vmcs02);
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@@ -11534,10 +11438,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
- /* if no vmcs02 cache requested, remove the one we used */
- if (VMCS02_POOL_SIZE == 0)
- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
--
2.14.3
Initialize the request queue lock earlier such that the following
race can no longer occur:
blk_init_queue_node                blkcg_print_blkgs
  blk_alloc_queue_node (1)
    q->queue_lock = &q->__queue_lock (2)
    blkcg_init_queue(q) (3)
                                     spin_lock_irq(blkg->q->queue_lock) (4)
  q->queue_lock = lock (5)
                                     spin_unlock_irq(blkg->q->queue_lock) (6)
(1) allocate an uninitialized queue;
(2) initialize queue_lock to its default internal lock;
(3) initialize the blkcg part of the request queue, which creates a blkg
    and inserts it into blkg_list;
(4) traverse blkg_list, find the newly created blkg and take its queue
    lock; at this point that is still the default *internal lock*;
(5) *race window*: queue_lock is now overridden with the *driver-specified
    lock*;
(6) the *driver-specified lock* is unlocked instead of the *internal lock*
    that was actually taken, breaking lock balance.
The changes in this patch are as follows:
- Rename blk_alloc_queue_node() into blk_alloc_queue_node2() and add
a new queue lock argument.
- Introduce a wrapper function with the same name and behavior as the
old blk_alloc_queue_node() function.
- Move the .queue_lock initialization from blk_init_queue_node() into
blk_alloc_queue_node2().
- For all block drivers that initialize .queue_lock explicitly,
  change the blk_alloc_queue() call in the driver into a
  blk_alloc_queue_node2() call and remove the explicit .queue_lock
  initialization. Additionally, initialize the spin lock that will
  be used as the queue lock earlier where necessary (see the sketch
  after this list).
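For a legacy (non-blk-mq) driver the conversion follows this pattern
(a minimal sketch; mydev and mydev_setup_queue are made-up names, and
the umem and mmc hunks below are the real conversions):

    struct mydev {                          /* hypothetical driver state */
        spinlock_t lock;
        struct request_queue *queue;
    };

    static int mydev_setup_queue(struct mydev *dev)
    {
        /* Initialize the lock before the queue exists: blkcg/sysfs may
         * dereference q->queue_lock as soon as the queue is allocated. */
        spin_lock_init(&dev->lock);

        /* Hand the lock to the allocator instead of assigning
         * q->queue_lock afterwards; the late assignment is exactly the
         * race window (5) above. */
        dev->queue = blk_alloc_queue_node2(GFP_KERNEL, NUMA_NO_NODE,
                                           &dev->lock);
        if (!dev->queue)
            return -ENOMEM;

        dev->queue->queuedata = dev;
        return 0;
    }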
Reported-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Signed-off-by: Bart Van Assche <bart.vanassche(a)wdc.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Philipp Reisner <philipp.reisner(a)linbit.com>
Cc: Ulf Hansson <ulf.hansson(a)linaro.org>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: <stable(a)vger.kernel.org>
---
block/blk-core.c | 33 ++++++++++++++++++++++++---------
drivers/block/drbd/drbd_main.c | 4 ++--
drivers/block/umem.c | 7 +++----
drivers/mmc/core/queue.c | 3 +--
include/linux/blkdev.h | 2 ++
5 files changed, 32 insertions(+), 17 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index bd43bc50740a..cb18f57e5b13 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -946,7 +946,22 @@ static void blk_rq_timed_out_timer(struct timer_list *t)
kblockd_schedule_work(&q->timeout_work);
}
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+/**
+ * blk_alloc_queue_node2 - allocate a request queue
+ * @gfp_mask: memory allocation flags
+ * @node_id: NUMA node to allocate memory from
+ * @lock: Pointer to a spinlock that will be used to e.g. serialize calls to
+ * the legacy .request_fn(). Only set this pointer for queues that use
+ * legacy mode and not for queues that use blk-mq.
+ *
+ * Note: use this function instead of calling blk_alloc_queue_node() and
+ * setting the queue lock pointer explicitly to avoid triggering a crash in
+ * the blkcg throttling code. That code namely makes sysfs attributes visible
+ * in user space before this function returns and the show method of these
+ * attributes uses the queue lock.
+ */
+struct request_queue *blk_alloc_queue_node2(gfp_t gfp_mask, int node_id,
+ spinlock_t *lock)
{
struct request_queue *q;
@@ -997,11 +1012,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
mutex_init(&q->sysfs_lock);
spin_lock_init(&q->__queue_lock);
- /*
- * By default initialize queue_lock to internal lock and driver can
- * override it later if need be.
- */
- q->queue_lock = &q->__queue_lock;
+ q->queue_lock = lock ? : &q->__queue_lock;
/*
* A queue starts its life with bypass turned on to avoid
@@ -1042,6 +1053,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
+EXPORT_SYMBOL(blk_alloc_queue_node2);
+
+struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+{
+ return blk_alloc_queue_node2(gfp_mask, node_id, NULL);
+}
EXPORT_SYMBOL(blk_alloc_queue_node);
/**
@@ -1088,13 +1105,11 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
struct request_queue *q;
- q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+ q = blk_alloc_queue_node2(GFP_KERNEL, node_id, lock);
if (!q)
return NULL;
q->request_fn = rfn;
- if (lock)
- q->queue_lock = lock;
if (blk_init_allocated_queue(q) < 0) {
blk_cleanup_queue(q);
return NULL;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 4b4697a1f963..965e80b13443 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2822,7 +2822,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
drbd_init_set_defaults(device);
- q = blk_alloc_queue(GFP_KERNEL);
+ q = blk_alloc_queue_node2(GFP_KERNEL, NUMA_NO_NODE,
+ &resource->req_lock);
if (!q)
goto out_no_q;
device->rq_queue = q;
@@ -2854,7 +2855,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
/* Setting the max_hw_sectors to an odd value of 8kibyte here
This triggers a max_bio_size message upon first attach or connect */
blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
- q->queue_lock = &resource->req_lock;
device->md_io.page = alloc_page(GFP_KERNEL);
if (!device->md_io.page)
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 8077123678ad..f6bb78782afa 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -888,13 +888,14 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
card->Active = -1; /* no page is active */
card->bio = NULL;
card->biotail = &card->bio;
+ spin_lock_init(&card->lock);
- card->queue = blk_alloc_queue(GFP_KERNEL);
+ card->queue = blk_alloc_queue_node2(GFP_KERNEL, NUMA_NO_NODE,
+ &card->lock);
if (!card->queue)
goto failed_alloc;
blk_queue_make_request(card->queue, mm_make_request);
- card->queue->queue_lock = &card->lock;
card->queue->queuedata = card;
tasklet_init(&card->tasklet, process_page, (unsigned long)card);
@@ -968,8 +969,6 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
dev_printk(KERN_INFO, &card->dev->dev,
"Window size %d bytes, IRQ %d\n", data, dev->irq);
- spin_lock_init(&card->lock);
-
pci_set_drvdata(dev, card);
if (pci_write_cmd != 0x0F) /* If not Memory Write & Invalidate */
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 5ecd54088988..274a8c6c8d64 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -216,10 +216,9 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
int ret = -ENOMEM;
mq->card = card;
- mq->queue = blk_alloc_queue(GFP_KERNEL);
+ mq->queue = blk_alloc_queue_node2(GFP_KERNEL, NUMA_NO_NODE, lock);
if (!mq->queue)
return -ENOMEM;
- mq->queue->queue_lock = lock;
mq->queue->request_fn = mmc_request_fn;
mq->queue->init_rq_fn = mmc_init_request;
mq->queue->exit_rq_fn = mmc_exit_request;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 781992c4124e..37723a8b799c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1331,6 +1331,8 @@ extern long nr_blockdev_pages(void);
bool __must_check blk_get_queue(struct request_queue *);
struct request_queue *blk_alloc_queue(gfp_t);
struct request_queue *blk_alloc_queue_node(gfp_t, int);
+struct request_queue *blk_alloc_queue_node2(gfp_t gfp_mask, int node_id,
+ spinlock_t *lock);
extern void blk_put_queue(struct request_queue *);
extern void blk_set_queue_dying(struct request_queue *);
--
2.16.0
This is a note to let you know that I've just added the patch titled
loop: fix concurrent lo_open/lo_release
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
loop-fix-concurrent-lo_open-lo_release.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds(a)linux-foundation.org>
Date: Fri, 5 Jan 2018 16:26:00 -0800
Subject: loop: fix concurrent lo_open/lo_release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
From: Linus Torvalds <torvalds(a)linux-foundation.org>
commit ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 upstream.
范龙飞 reports that KASAN can report a use-after-free in __lock_acquire.
The reason is insufficient serialization in lo_release(), which
will continue to use the loop device even after it has decremented the
lo_refcnt to zero.
In the meantime, another process can come in and open the loop device
again as it is being shut down. Confusion ensues.
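The fix below serializes the final teardown against lo_open(), which
already runs under loop_index_mutex, by taking the same mutex around
the whole release path; schematically:

    static void lo_release(struct gendisk *disk, fmode_t mode)
    {
        /* An open racing with the last release now waits until the
         * device is fully torn down (or sees it fully open). */
        mutex_lock(&loop_index_mutex);
        __lo_release(disk->private_data);
        mutex_unlock(&loop_index_mutex);
    }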
Reported-by: 范龙飞 <long7573(a)126.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Cc: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/block/loop.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1558,9 +1558,8 @@ out:
return err;
}
-static void lo_release(struct gendisk *disk, fmode_t mode)
+static void __lo_release(struct loop_device *lo)
{
- struct loop_device *lo = disk->private_data;
int err;
if (atomic_dec_return(&lo->lo_refcnt))
@@ -1586,6 +1585,13 @@ static void lo_release(struct gendisk *d
mutex_unlock(&lo->lo_ctl_mutex);
}
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+ mutex_lock(&loop_index_mutex);
+ __lo_release(disk->private_data);
+ mutex_unlock(&loop_index_mutex);
+}
+
static const struct block_device_operations lo_fops = {
.owner = THIS_MODULE,
.open = lo_open,
Patches currently in stable-queue which might be from torvalds(a)linux-foundation.org are
queue-4.9/loop-fix-concurrent-lo_open-lo_release.patch
This is a note to let you know that I've just added the patch titled
loop: fix concurrent lo_open/lo_release
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
loop-fix-concurrent-lo_open-lo_release.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds(a)linux-foundation.org>
Date: Fri, 5 Jan 2018 16:26:00 -0800
Subject: loop: fix concurrent lo_open/lo_release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
From: Linus Torvalds <torvalds(a)linux-foundation.org>
commit ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 upstream.
范龙飞 reports that KASAN can report a use-after-free in __lock_acquire.
The reason is insufficient serialization in lo_release(), which
will continue to use the loop device even after it has decremented the
lo_refcnt to zero.
In the meantime, another process can come in and open the loop device
again as it is being shut down. Confusion ensues.
Reported-by: 范龙飞 <long7573(a)126.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Cc: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/block/loop.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1569,9 +1569,8 @@ out:
return err;
}
-static void lo_release(struct gendisk *disk, fmode_t mode)
+static void __lo_release(struct loop_device *lo)
{
- struct loop_device *lo = disk->private_data;
int err;
if (atomic_dec_return(&lo->lo_refcnt))
@@ -1597,6 +1596,13 @@ static void lo_release(struct gendisk *d
mutex_unlock(&lo->lo_ctl_mutex);
}
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+ mutex_lock(&loop_index_mutex);
+ __lo_release(disk->private_data);
+ mutex_unlock(&loop_index_mutex);
+}
+
static const struct block_device_operations lo_fops = {
.owner = THIS_MODULE,
.open = lo_open,
Patches currently in stable-queue which might be from torvalds(a)linux-foundation.org are
queue-4.4/loop-fix-concurrent-lo_open-lo_release.patch
This is a note to let you know that I've just added the patch titled
loop: fix concurrent lo_open/lo_release
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
loop-fix-concurrent-lo_open-lo_release.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds(a)linux-foundation.org>
Date: Fri, 5 Jan 2018 16:26:00 -0800
Subject: loop: fix concurrent lo_open/lo_release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
From: Linus Torvalds <torvalds(a)linux-foundation.org>
commit ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 upstream.
范龙飞 reports that KASAN can report a use-after-free in __lock_acquire.
The reason is insufficient serialization in lo_release(), which
will continue to use the loop device even after it has decremented the
lo_refcnt to zero.
In the meantime, another process can come in and open the loop device
again as it is being shut down. Confusion ensues.
Reported-by: 范龙飞 <long7573(a)126.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Cc: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/block/loop.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1576,9 +1576,8 @@ out:
return err;
}
-static void lo_release(struct gendisk *disk, fmode_t mode)
+static void __lo_release(struct loop_device *lo)
{
- struct loop_device *lo = disk->private_data;
int err;
if (atomic_dec_return(&lo->lo_refcnt))
@@ -1605,6 +1604,13 @@ static void lo_release(struct gendisk *d
mutex_unlock(&lo->lo_ctl_mutex);
}
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+ mutex_lock(&loop_index_mutex);
+ __lo_release(disk->private_data);
+ mutex_unlock(&loop_index_mutex);
+}
+
static const struct block_device_operations lo_fops = {
.owner = THIS_MODULE,
.open = lo_open,
Patches currently in stable-queue which might be from torvalds(a)linux-foundation.org are
queue-4.14/futex-fix-owner_dead-fixup.patch
queue-4.14/loop-fix-concurrent-lo_open-lo_release.patch
This is a note to let you know that I've just added the patch titled
loop: fix concurrent lo_open/lo_release
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
loop-fix-concurrent-lo_open-lo_release.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
From ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds(a)linux-foundation.org>
Date: Fri, 5 Jan 2018 16:26:00 -0800
Subject: loop: fix concurrent lo_open/lo_release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
From: Linus Torvalds <torvalds(a)linux-foundation.org>
commit ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 upstream.
范龙飞 reports that KASAN can report a use-after-free in __lock_acquire.
The reason is insufficient serialization in lo_release(), which
will continue to use the loop device even after it has decremented the
lo_refcnt to zero.
In the meantime, another process can come in and open the loop device
again as it is being shut down. Confusion ensues.
Reported-by: 范龙飞 <long7573(a)126.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Cc: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/block/loop.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1512,9 +1512,8 @@ out:
return err;
}
-static void lo_release(struct gendisk *disk, fmode_t mode)
+static void __lo_release(struct loop_device *lo)
{
- struct loop_device *lo = disk->private_data;
int err;
mutex_lock(&lo->lo_ctl_mutex);
@@ -1542,6 +1541,13 @@ out:
mutex_unlock(&lo->lo_ctl_mutex);
}
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+ mutex_lock(&loop_index_mutex);
+ __lo_release(disk->private_data);
+ mutex_unlock(&loop_index_mutex);
+}
+
static const struct block_device_operations lo_fops = {
.owner = THIS_MODULE,
.open = lo_open,
Patches currently in stable-queue which might be from torvalds(a)linux-foundation.org are
queue-3.18/loop-fix-concurrent-lo_open-lo_release.patch