On Thu, Feb 1, 2018 at 2:27 AM, Amir Goldstein <amir73il(a)gmail.com> wrote:
> On Mon, Jan 29, 2018 at 5:50 PM, Darrick J. Wong
> <darrick.wong(a)oracle.com> wrote:
>> On Mon, Jan 29, 2018 at 01:07:36PM +0200, Amir Goldstein wrote:
>>> On Fri, Jan 26, 2018 at 11:44 PM, Darrick J. Wong
>>> <darrick.wong(a)oracle.com> wrote:
>>> > On Fri, Jan 26, 2018 at 09:44:29AM +0200, Amir Goldstein wrote:
>>> >> Commit 66f364649d870 ("xfs: remove if_rdev") moved storing of rdev
>>> >> value for special inodes to VFS inodes, but forgot to preserve the
>>> >> value of i_rdev when recycling a reclaimable xfs_inode.
>>> >>
>>> >> This was detected by xfstest overlay/017 with inodex=on mount option
>>> >> and xfs base fs. The test does a lookup of overlay chardev and blockdev
>>> >> right after drop caches.
>>> >>
>>> >> Overlayfs inodes hold a reference on underlying xfs inodes when mount
>>> >> option index=on is configured. If drop caches reclaim xfs inodes, before
>>> >> it relclaims overlayfs inodes, that can sometimes leave a reclaimable xfs
>>> >> inode and that test hits that case quite often.
>>> >>
>>> >> When that happens, the xfs inode cache remains broken (zere i_rdev)
>>> >> until the next cycle mount or drop caches.
>>> >>
>>> >> Fixes: 66f364649d870 ("xfs: remove if_rdev")
>>> >> Signed-off-by: Amir Goldstein <amir73il(a)gmail.com>
>>> >
>>> > Looks ok,
>>> > Reviewed-by: Darrick J. Wong <darrick.wong(a)oracle.com>
>>> >
>>>
>>> I recon that now we should now also strap:
>>> Cc: <stable(a)vger.kernel.org> #v4.15
>>>
>>> Can I assume, you'll add it on apply?
>>
>> I'll do a proper backport of this and a couple other critical cow
>> fixes after I get the 4.16 stuff merged.
>>
>
> I am not sure what "proper backport" means in the context of
> this patch.
> This is a v4.15-rc1 regression fix that is based on v4.15-rc8.
> It applied cleanly on v4.15.
>
> CC'ing stable for attention.
>
> This patch is now in master, but due to its timing it did not
> get the CC: stable tag.
>
Now really CC stable.
Amir.
From: Michal Hocko <mhocko(a)suse.com>
Subject: mm, memory_hotplug: fix memmap initialization
Bharata has noticed that onlining a newly added memory doesn't increase
the total memory, pointing to f7f99100d8d9 ("mm: stop zeroing memory
during allocation in vmemmap") as a culprit. This commit has changed the
way how the memory for memmaps is initialized and moves it from the
allocation time to the initialization time. This works properly for the
early memmap init path.
It doesn't work for the memory hotplug though because we need to mark page
as reserved when the sparsemem section is created and later initialize it
completely during onlining. memmap_init_zone is called in the early stage
of onlining. With the current code it calls __init_single_page and as
such it clears up the whole stage and therefore online_pages_range skips
those pages.
Fix this by skipping mm_zero_struct_page in __init_single_page for memory
hotplug path. This is quite uggly but unifying both early init and memory
hotplug init paths is a large project. Make sure we plug the regression
at least.
Link: http://lkml.kernel.org/r/20180130101141.GW21609@dhcp22.suse.cz
Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap")
Signed-off-by: Michal Hocko <mhocko(a)suse.com>
Reported-by: Bharata B Rao <bharata(a)linux.vnet.ibm.com>
Tested-by: Bharata B Rao <bharata(a)linux.vnet.ibm.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin(a)oracle.com>
Cc: Steven Sistare <steven.sistare(a)oracle.com>
Cc: Daniel Jordan <daniel.m.jordan(a)oracle.com>
Cc: Bob Picco <bob.picco(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff -puN mm/page_alloc.c~mm-memory_hotplug-fix-memmap-initialization mm/page_alloc.c
--- a/mm/page_alloc.c~mm-memory_hotplug-fix-memmap-initialization
+++ a/mm/page_alloc.c
@@ -1177,9 +1177,10 @@ static void free_one_page(struct zone *z
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- unsigned long zone, int nid)
+ unsigned long zone, int nid, bool zero)
{
- mm_zero_struct_page(page);
+ if (zero)
+ mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
@@ -1194,9 +1195,9 @@ static void __meminit __init_single_page
}
static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
- int nid)
+ int nid, bool zero)
{
- return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+ return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1217,7 +1218,7 @@ static void __meminit init_reserved_page
if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
break;
}
- __init_single_pfn(pfn, zid, nid);
+ __init_single_pfn(pfn, zid, nid, true);
}
#else
static inline void init_reserved_page(unsigned long pfn)
@@ -1534,7 +1535,7 @@ static unsigned long __init deferred_in
} else {
page++;
}
- __init_single_page(page, pfn, zid, nid);
+ __init_single_page(page, pfn, zid, nid, true);
nr_pages++;
}
return (nr_pages);
@@ -5399,15 +5400,20 @@ not_early:
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
+ *
+ * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * because this is done early in sparse_add_one_section
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
struct page *page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zone, nid);
+ __init_single_page(page, pfn, zone, nid,
+ context != MEMMAP_HOTPLUG);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
} else {
- __init_single_pfn(pfn, zone, nid);
+ __init_single_pfn(pfn, zone, nid,
+ context != MEMMAP_HOTPLUG);
}
}
}
_
From: Gang He <ghe(a)suse.com>
Subject: ocfs2: try a blocking lock before return AOP_TRUNCATED_PAGE
If we can't get inode lock immediately in the function
ocfs2_inode_lock_with_page() when reading a page, we should not return
directly here, since this will lead to a softlockup problem when the
kernel is configured with CONFIG_PREEMPT is not set. The method is to get
a blocking lock and immediately unlock before returning, this can avoid
CPU resource waste due to lots of retries, and benefits fairness in
getting lock among multiple nodes, increase efficiency in case modifying
the same file frequently from multiple nodes.
The softlockup crash (when set /proc/sys/kernel/softlockup_panic to 1)
looks like:
Kernel panic - not syncing: softlockup: hung tasks
CPU: 0 PID: 885 Comm: multi_mmap Tainted: G L 4.12.14-6.1-default #1
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
Call Trace:
<IRQ>
dump_stack+0x5c/0x82
panic+0xd5/0x21e
watchdog_timer_fn+0x208/0x210
? watchdog_park_threads+0x70/0x70
__hrtimer_run_queues+0xcc/0x200
hrtimer_interrupt+0xa6/0x1f0
smp_apic_timer_interrupt+0x34/0x50
apic_timer_interrupt+0x96/0xa0
</IRQ>
RIP: 0010:unlock_page+0x17/0x30
RSP: 0000:ffffaf154080bc88 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10
RAX: dead000000000100 RBX: fffff21e009f5300 RCX: 0000000000000004
RDX: dead0000000000ff RSI: 0000000000000202 RDI: fffff21e009f5300
RBP: 0000000000000000 R08: 0000000000000000 R09: ffffaf154080bb00
R10: ffffaf154080bc30 R11: 0000000000000040 R12: ffff993749a39518
R13: 0000000000000000 R14: fffff21e009f5300 R15: fffff21e009f5300
ocfs2_inode_lock_with_page+0x25/0x30 [ocfs2]
ocfs2_readpage+0x41/0x2d0 [ocfs2]
? pagecache_get_page+0x30/0x200
filemap_fault+0x12b/0x5c0
? recalc_sigpending+0x17/0x50
? __set_task_blocked+0x28/0x70
? __set_current_blocked+0x3d/0x60
ocfs2_fault+0x29/0xb0 [ocfs2]
__do_fault+0x1a/0xa0
__handle_mm_fault+0xbe8/0x1090
handle_mm_fault+0xaa/0x1f0
__do_page_fault+0x235/0x4b0
trace_do_page_fault+0x3c/0x110
async_page_fault+0x28/0x30
RIP: 0033:0x7fa75ded638e
RSP: 002b:00007ffd6657db18 EFLAGS: 00010287
RAX: 000055c7662fb700 RBX: 0000000000000001 RCX: 000055c7662fb700
RDX: 0000000000001770 RSI: 00007fa75e909000 RDI: 000055c7662fb700
RBP: 0000000000000003 R08: 000000000000000e R09: 0000000000000000
R10: 0000000000000483 R11: 00007fa75ded61b0 R12: 00007fa75e90a770
R13: 000000000000000e R14: 0000000000001770 R15: 0000000000000000
About performance improvement, we can see the testing time is reduced, and
CPU utilization decreases, the detailed data is as follows. I ran
multi_mmap test case in ocfs2-test package in a three nodes cluster.
Before applying this patch:
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
2754 ocfs2te+ 20 0 170248 6980 4856 D 80.73 0.341 0:18.71 multi_mmap
1505 root rt 0 222236 123060 97224 S 2.658 6.015 0:01.44 corosync
5 root 20 0 0 0 0 S 1.329 0.000 0:00.19 kworker/u8:0
95 root 20 0 0 0 0 S 1.329 0.000 0:00.25 kworker/u8:1
2728 root 20 0 0 0 0 S 0.997 0.000 0:00.24 jbd2/sda1-33
2721 root 20 0 0 0 0 S 0.664 0.000 0:00.07 ocfs2dc-3C8CFD4
2750 ocfs2te+ 20 0 142976 4652 3532 S 0.664 0.227 0:00.28 mpirun
ocfs2test@tb-node2:~>multiple_run.sh -i ens3 -k ~/linux-4.4.21-69.tar.gz -o
~/ocfs2mullog -C hacluster -s pcmk -n tb-node2,tb-node1,tb-node3 -d
/dev/sda1 -b 4096 -c 32768 -t multi_mmap /mnt/shared
Tests with "-b 4096 -C 32768"
Thu Dec 28 14:44:52 CST 2017
multi_mmap..................................................Passed.
Runtime 783 seconds.
After apply this patch:
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
2508 ocfs2te+ 20 0 170248 6804 4680 R 54.00 0.333 0:55.37 multi_mmap
155 root 20 0 0 0 0 S 2.667 0.000 0:01.20 kworker/u8:3
95 root 20 0 0 0 0 S 2.000 0.000 0:01.58 kworker/u8:1
2504 ocfs2te+ 20 0 142976 4604 3480 R 1.667 0.225 0:01.65 mpirun
5 root 20 0 0 0 0 S 1.000 0.000 0:01.36 kworker/u8:0
2482 root 20 0 0 0 0 S 1.000 0.000 0:00.86 jbd2/sda1-33
299 root 0 -20 0 0 0 S 0.333 0.000 0:00.13 kworker/2:1H
335 root 0 -20 0 0 0 S 0.333 0.000 0:00.17 kworker/1:1H
535 root 20 0 12140 7268 1456 S 0.333 0.355 0:00.34 haveged
1282 root rt 0 222284 123108 97224 S 0.333 6.017 0:01.33 corosync
ocfs2test@tb-node2:~>multiple_run.sh -i ens3 -k ~/linux-4.4.21-69.tar.gz -o
~/ocfs2mullog -C hacluster -s pcmk -n tb-node2,tb-node1,tb-node3 -d
/dev/sda1 -b 4096 -c 32768 -t multi_mmap /mnt/shared
Tests with "-b 4096 -C 32768"
Thu Dec 28 15:04:12 CST 2017
multi_mmap..................................................Passed.
Runtime 487 seconds.
Link: http://lkml.kernel.org/r/1514447305-30814-1-git-send-email-ghe@suse.com
Fixes: 1cce4df04f37 ("ocfs2: do not lock/unlock() inode DLM lock")
Signed-off-by: Gang He <ghe(a)suse.com>
Reviewed-by: Eric Ren <zren(a)suse.com>
Acked-by: alex chen <alex.chen(a)huawei.com>
Acked-by: piaojun <piaojun(a)huawei.com>
Cc: Mark Fasheh <mfasheh(a)versity.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Joseph Qi <jiangqi903(a)gmail.com>
Cc: Changwei Ge <ge.changwei(a)h3c.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/dlmglue.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff -puN fs/ocfs2/dlmglue.c~ocfs2-try-a-blocking-lock-before-return-aop_truncated_page fs/ocfs2/dlmglue.c
--- a/fs/ocfs2/dlmglue.c~ocfs2-try-a-blocking-lock-before-return-aop_truncated_page
+++ a/fs/ocfs2/dlmglue.c
@@ -2486,6 +2486,15 @@ int ocfs2_inode_lock_with_page(struct in
ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
+ /*
+ * If we can't get inode lock immediately, we should not return
+ * directly here, since this will lead to a softlockup problem.
+ * The method is to get a blocking lock and immediately unlock
+ * before returning, this can avoid CPU resource waste due to
+ * lots of retries, and benefits fairness in getting lock.
+ */
+ if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
+ ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
}
_
From: Jim Mattson <jmattson(a)google.com>
The potential performance advantages of a vmcs02 pool have never been
realized. To simplify the code, eliminate the pool. Instead, a single
vmcs02 is allocated per VCPU when the VCPU enters VMX operation.
Cc: stable(a)vger.kernel.org # prereq for Spectre mitigation
Signed-off-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Mark Kanda <mark.kanda(a)oracle.com>
Reviewed-by: Ameya More <ameya.more(a)oracle.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar(a)redhat.com>
---
arch/x86/kvm/vmx.c | 146 +++++++++--------------------------------------------
1 file changed, 23 insertions(+), 123 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c829d89e2e63..ad6a883b7a32 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -185,7 +185,6 @@ module_param(ple_window_max, int, S_IRUGO);
extern const ulong vmx_return;
#define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
struct vmcs {
u32 revision_id;
@@ -226,7 +225,7 @@ struct shared_msr_entry {
* stored in guest memory specified by VMPTRLD, but is opaque to the guest,
* which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
* More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
* underlying hardware which will be used to run L2.
* This structure is packed to ensure that its layout is identical across
* machines (necessary for live migration).
@@ -409,13 +408,6 @@ struct __packed vmcs12 {
*/
#define VMCS12_SIZE 0x1000
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
- struct list_head list;
- gpa_t vmptr;
- struct loaded_vmcs vmcs02;
-};
-
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -440,15 +432,15 @@ struct nested_vmx {
*/
bool sync_shadow_vmcs;
- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
- struct list_head vmcs02_pool;
- int vmcs02_num;
bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+
+ struct loaded_vmcs vmcs02;
+
/*
- * Guest pages referred to in vmcs02 with host-physical pointers, so
- * we must keep them pinned while L2 runs.
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
struct page *virtual_apic_page;
@@ -6973,94 +6965,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
return handle_nop(vcpu);
}
-/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmx->nested.current_vmptr) {
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
- /* Recycle the least recently used VMCS. */
- item = list_last_entry(&vmx->nested.vmcs02_pool,
- struct vmcs02_list, list);
- item->vmptr = vmx->nested.current_vmptr;
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- /* Create a new VMCS */
- item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
- if (!item)
- return NULL;
- item->vmcs02.vmcs = alloc_vmcs();
- item->vmcs02.shadow_vmcs = NULL;
- if (!item->vmcs02.vmcs) {
- kfree(item);
- return NULL;
- }
- loaded_vmcs_init(&item->vmcs02);
- item->vmptr = vmx->nested.current_vmptr;
- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num++;
- return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmptr) {
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- return;
- }
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item, *n;
-
- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- /*
- * Something will leak if the above WARN triggers. Better than
- * a use-after-free.
- */
- if (vmx->loaded_vmcs == &item->vmcs02)
- continue;
-
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- }
-}
-
/*
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
* set the success or error code of an emulated VMX instruction, as specified
@@ -7242,6 +7146,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs;
+ vmx->nested.vmcs02.vmcs = alloc_vmcs();
+ vmx->nested.vmcs02.shadow_vmcs = NULL;
+ if (!vmx->nested.vmcs02.vmcs)
+ goto out_vmcs02;
+ loaded_vmcs_init(&vmx->nested.vmcs02);
+
if (cpu_has_vmx_msr_bitmap()) {
vmx->nested.msr_bitmap =
(unsigned long *)__get_free_page(GFP_KERNEL);
@@ -7264,9 +7174,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
vmx->vmcs01.shadow_vmcs = shadow_vmcs;
}
- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num = 0;
-
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7281,6 +7188,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
free_page((unsigned long)vmx->nested.msr_bitmap);
out_msr_bitmap:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
return -ENOMEM;
}
@@ -7434,7 +7344,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->vmcs01.shadow_vmcs = NULL;
}
kfree(vmx->nested.cached_vmcs12);
- /* Unpin physical memory we referred to in current vmcs02 */
+ /* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
@@ -7450,7 +7360,7 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.pi_desc = NULL;
}
- nested_free_all_saved_vmcss(vmx);
+ free_loaded_vmcs(&vmx->nested.vmcs02);
}
/* Emulate the VMXOFF instruction */
@@ -7493,8 +7403,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
vmptr + offsetof(struct vmcs12, launch_state),
&zero, sizeof(zero));
- nested_free_vmcs02(vmx, vmptr);
-
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -8406,10 +8314,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
/*
* The host physical addresses of some pages of guest memory
- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
- * may write to these pages via their host physical address while
- * L2 is running, bypassing any address-translation-based dirty
- * tracking (e.g. EPT write protection).
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
*
* Mark them dirty on every exit from L2 to prevent them from
* getting out of sync with dirty tracking.
@@ -10903,20 +10812,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- struct loaded_vmcs *vmcs02;
u32 msr_entry_idx;
u32 exit_qual;
- vmcs02 = nested_get_current_vmcs02(vmx);
- if (!vmcs02)
- return -ENOMEM;
-
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
- vmx_switch_vmcs(vcpu, vmcs02);
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@@ -11534,10 +11438,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
- /* if no vmcs02 cache requested, remove the one we used */
- if (VMCS02_POOL_SIZE == 0)
- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
--
2.14.3
Initialize the request queue lock earlier such that the following
race can no longer occur:
blk_init_queue_node blkcg_print_blkgs
blk_alloc_queue_node (1)
q->queue_lock = &q->__queue_lock (2)
blkcg_init_queue(q) (3)
spin_lock_irq(blkg->q->queue_lock) (4)
q->queue_lock = lock (5)
spin_unlock_irq(blkg->q->queue_lock) (6)
(1) allocate an uninitialized queue;
(2) initialize queue_lock to its default internal lock;
(3) initialize blkcg part of request queue, which will create blkg and
then insert it to blkg_list;
(4) traverse blkg_list and find the created blkg, and then take its
queue lock, here it is the default *internal lock*;
(5) *race window*, now queue_lock is overridden with *driver specified
lock*;
(6) now unlock *driver specified lock*, not the locked *internal lock*,
unlock balance breaks.
The changes in this patch are as follows:
- Rename blk_alloc_queue_node() into blk_alloc_queue_node2() and add
a new queue lock argument.
- Introduce a wrapper function with the same name and behavior as the
old blk_alloc_queue_node() function.
- Move the .queue_lock initialization from blk_init_queue_node() into
blk_alloc_queue_node2().
- For all all block drivers that initialize .queue_lock explicitly,
change the blk_alloc_queue() call in the driver into a
blk_alloc_queue_node2() call and remove the explicit .queue_lock
initialization. Additionally, initialize the spin lock that will
be used as queue lock earlier if necessary.
Reported-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Signed-off-by: Bart Van Assche <bart.vanassche(a)wdc.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Philipp Reisner <philipp.reisner(a)linbit.com>
Cc: Ulf Hansson <ulf.hansson(a)linaro.org>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: <stable(a)vger.kernel.org>
---
block/blk-core.c | 33 ++++++++++++++++++++++++---------
drivers/block/drbd/drbd_main.c | 4 ++--
drivers/block/umem.c | 7 +++----
drivers/mmc/core/queue.c | 3 +--
include/linux/blkdev.h | 2 ++
5 files changed, 32 insertions(+), 17 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index bd43bc50740a..cb18f57e5b13 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -946,7 +946,22 @@ static void blk_rq_timed_out_timer(struct timer_list *t)
kblockd_schedule_work(&q->timeout_work);
}
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+/**
+ * blk_alloc_queue_node2 - allocate a request queue
+ * @gfp_mask: memory allocation flags
+ * @node_id: NUMA node to allocate memory from
+ * @lock: Pointer to a spinlock that will be used to e.g. serialize calls to
+ * the legacy .request_fn(). Only set this pointer for queues that use
+ * legacy mode and not for queues that use blk-mq.
+ *
+ * Note: use this function instead of calling blk_alloc_queue_node() and
+ * setting the queue lock pointer explicitly to avoid triggering a crash in
+ * the blkcg throttling code. That code namely makes sysfs attributes visible
+ * in user space before this function returns and the show method of these
+ * attributes uses the queue lock.
+ */
+struct request_queue *blk_alloc_queue_node2(gfp_t gfp_mask, int node_id,
+ spinlock_t *lock)
{
struct request_queue *q;
@@ -997,11 +1012,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
mutex_init(&q->sysfs_lock);
spin_lock_init(&q->__queue_lock);
- /*
- * By default initialize queue_lock to internal lock and driver can
- * override it later if need be.
- */
- q->queue_lock = &q->__queue_lock;
+ q->queue_lock = lock ? : &q->__queue_lock;
/*
* A queue starts its life with bypass turned on to avoid
@@ -1042,6 +1053,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
+EXPORT_SYMBOL(blk_alloc_queue_node2);
+
+struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+{
+ return blk_alloc_queue_node2(gfp_mask, node_id, NULL);
+}
EXPORT_SYMBOL(blk_alloc_queue_node);
/**
@@ -1088,13 +1105,11 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
struct request_queue *q;
- q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+ q = blk_alloc_queue_node2(GFP_KERNEL, node_id, lock);
if (!q)
return NULL;
q->request_fn = rfn;
- if (lock)
- q->queue_lock = lock;
if (blk_init_allocated_queue(q) < 0) {
blk_cleanup_queue(q);
return NULL;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 4b4697a1f963..965e80b13443 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2822,7 +2822,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
drbd_init_set_defaults(device);
- q = blk_alloc_queue(GFP_KERNEL);
+ q = blk_alloc_queue_node2(GFP_KERNEL, NUMA_NO_NODE,
+ &resource->req_lock);
if (!q)
goto out_no_q;
device->rq_queue = q;
@@ -2854,7 +2855,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
/* Setting the max_hw_sectors to an odd value of 8kibyte here
This triggers a max_bio_size message upon first attach or connect */
blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
- q->queue_lock = &resource->req_lock;
device->md_io.page = alloc_page(GFP_KERNEL);
if (!device->md_io.page)
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 8077123678ad..f6bb78782afa 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -888,13 +888,14 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
card->Active = -1; /* no page is active */
card->bio = NULL;
card->biotail = &card->bio;
+ spin_lock_init(&card->lock);
- card->queue = blk_alloc_queue(GFP_KERNEL);
+ card->queue = blk_alloc_queue_node2(GFP_KERNEL, NUMA_NO_NODE,
+ &card->lock);
if (!card->queue)
goto failed_alloc;
blk_queue_make_request(card->queue, mm_make_request);
- card->queue->queue_lock = &card->lock;
card->queue->queuedata = card;
tasklet_init(&card->tasklet, process_page, (unsigned long)card);
@@ -968,8 +969,6 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
dev_printk(KERN_INFO, &card->dev->dev,
"Window size %d bytes, IRQ %d\n", data, dev->irq);
- spin_lock_init(&card->lock);
-
pci_set_drvdata(dev, card);
if (pci_write_cmd != 0x0F) /* If not Memory Write & Invalidate */
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 5ecd54088988..274a8c6c8d64 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -216,10 +216,9 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
int ret = -ENOMEM;
mq->card = card;
- mq->queue = blk_alloc_queue(GFP_KERNEL);
+ mq->queue = blk_alloc_queue_node2(GFP_KERNEL, NUMA_NO_NODE, lock);
if (!mq->queue)
return -ENOMEM;
- mq->queue->queue_lock = lock;
mq->queue->request_fn = mmc_request_fn;
mq->queue->init_rq_fn = mmc_init_request;
mq->queue->exit_rq_fn = mmc_exit_request;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 781992c4124e..37723a8b799c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1331,6 +1331,8 @@ extern long nr_blockdev_pages(void);
bool __must_check blk_get_queue(struct request_queue *);
struct request_queue *blk_alloc_queue(gfp_t);
struct request_queue *blk_alloc_queue_node(gfp_t, int);
+struct request_queue *blk_alloc_queue_node2(gfp_t gfp_mask, int node_id,
+ spinlock_t *lock);
extern void blk_put_queue(struct request_queue *);
extern void blk_set_queue_dying(struct request_queue *);
--
2.16.0