Detect when a directory entry is (possibly partially) beyond directory
size and return EIO in that case since it means the filesystem is
corrupted. Otherwise directory operations can further corrupt the
directory and possibly also oops the kernel.
CC: Anatoly Trosinenko <anatoly.trosinenko(a)gmail.com>
CC: stable(a)vger.kernel.org
Reported-by: Anatoly Trosinenko <anatoly.trosinenko(a)gmail.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/udf/directory.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 0a98a2369738..3835f983cc99 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -152,6 +152,9 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
sizeof(struct fileIdentDesc));
}
}
+ /* Got last entry outside of dir size - fs is corrupted! */
+ if (*nf_pos > dir->i_size)
+ return NULL;
return fi;
}
--
2.16.4
SysRq-L and RCU stall detector call arch_trigger_cpumask_backtrace() to
trigger other CPU's backtrace, but its behavior is totally broken. The
root cause is arch_trigger_cpumask_backtrace() use call-function IPI in
irq context, which trigger deadlocks in smp_call_function_single() and
smp_call_function_many().
This patch fix arch_trigger_cpumask_backtrace() by:
1, Use a dedecated IPI (SMP_CPU_BACKTRACE) to trigger backtraces;
2, If myself is in target cpumask, do backtrace and clear myself;
3, Use a spinlock to avoid parallel backtrace output;
4, Handle SMP_CPU_BACKTRACE IPI for Loongson-3.
I have attempted to implement SMP_CPU_BACKTRACE for all MIPS CPUs, but I
failed because some of their IPIs are not extensible. :(
Cc: stable(a)vger.kernel.org
Signed-off-by: Huacai Chen <chenhc(a)lemote.com>
---
arch/mips/include/asm/smp.h | 3 +++
arch/mips/kernel/process.c | 23 ++++++++++++++++++-----
arch/mips/loongson64/loongson-3/smp.c | 6 ++++++
3 files changed, 27 insertions(+), 5 deletions(-)
diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h
index 88ebd83..b0521f4 100644
--- a/arch/mips/include/asm/smp.h
+++ b/arch/mips/include/asm/smp.h
@@ -43,6 +43,7 @@ extern int __cpu_logical_map[NR_CPUS];
/* Octeon - Tell another core to flush its icache */
#define SMP_ICACHE_FLUSH 0x4
#define SMP_ASK_C0COUNT 0x8
+#define SMP_CPU_BACKTRACE 0x10
/* Mask of CPUs which are currently definitely operating coherently */
extern cpumask_t cpu_coherent_mask;
@@ -81,6 +82,8 @@ static inline void __cpu_die(unsigned int cpu)
extern void play_dead(void);
#endif
+void arch_dump_stack(void);
+
/*
* This function will set up the necessary IPIs for Linux to communicate
* with the CPUs in mask.
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 57028d4..647e15d 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -655,26 +655,39 @@ unsigned long arch_align_stack(unsigned long sp)
return sp & ALMASK;
}
-static void arch_dump_stack(void *info)
+void arch_dump_stack(void)
{
struct pt_regs *regs;
+ static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+ arch_spin_lock(&lock);
regs = get_irq_regs();
if (regs)
show_regs(regs);
dump_stack();
+ arch_spin_unlock(&lock);
}
void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
{
long this_cpu = get_cpu();
+ struct cpumask backtrace_mask;
+ extern const struct plat_smp_ops *mp_ops;
+
+ cpumask_copy(&backtrace_mask, mask);
+ if (cpumask_test_cpu(this_cpu, mask)) {
+ if (!exclude_self) {
+ struct pt_regs *regs = get_irq_regs();
+ if (regs)
+ show_regs(regs);
+ dump_stack();
+ }
+ cpumask_clear_cpu(this_cpu, &backtrace_mask);
+ }
- if (cpumask_test_cpu(this_cpu, mask) && !exclude_self)
- dump_stack();
-
- smp_call_function_many(mask, arch_dump_stack, NULL, 1);
+ mp_ops->send_ipi_mask(&backtrace_mask, SMP_CPU_BACKTRACE);
put_cpu();
}
diff --git a/arch/mips/loongson64/loongson-3/smp.c b/arch/mips/loongson64/loongson-3/smp.c
index 8501109..0655114 100644
--- a/arch/mips/loongson64/loongson-3/smp.c
+++ b/arch/mips/loongson64/loongson-3/smp.c
@@ -291,6 +291,12 @@ void loongson3_ipi_interrupt(struct pt_regs *regs)
__wbflush(); /* Let others see the result ASAP */
}
+ if (action & SMP_CPU_BACKTRACE) {
+ irq_enter();
+ arch_dump_stack();
+ irq_exit();
+ }
+
if (irqs) {
int irq;
while ((irq = ffs(irqs))) {
--
2.7.0
The patch titled
Subject: slub: fix failure when we delete and create a slab cache
has been added to the -mm tree. Its filename is
slub-fix-failure-when-we-delete-and-create-a-slab-cache.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/slub-fix-failure-when-we-delete-an…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/slub-fix-failure-when-we-delete-an…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Mikulas Patocka <mpatocka(a)redhat.com>
Subject: slub: fix failure when we delete and create a slab cache
In kernel 4.17 I removed some code from dm-bufio that did slab cache
merging (21bb13276768) - both slab and slub support merging caches with
identical attributes, so dm-bufio now just calls kmem_cache_create and
relies on implicit merging.
This uncovered a bug in the slub subsystem - if we delete a cache and
immediatelly create another cache with the same attributes, it fails
because of duplicate filename in /sys/kernel/slab/. The slub subsystem
offloads freeing the cache to a workqueue - and if we create the new cache
before the workqueue runs, it complains because of duplicate filename in
sysfs.
This patch fixes the bug by moving the call of kobject_del from
sysfs_slab_remove_workfn to shutdown_cache. kobject_del must be called
while we hold slab_mutex - so that the sysfs entry is deleted before a
cache with the same attributes could be created.
Running device-mapper-test-suite with:
dmtest run --suite thin-provisioning -n /commit_failure_causes_fallback/
triggers:
[ 119.618958] Buffer I/O error on dev dm-0, logical block 1572848, async page read
[ 119.686224] device-mapper: thin: 253:1: metadata operation 'dm_pool_alloc_data_block' failed: error = -5
[ 119.695821] device-mapper: thin: 253:1: aborting current metadata transaction
[ 119.703255] sysfs: cannot create duplicate filename '/kernel/slab/:a-0000144'
[ 119.710394] CPU: 2 PID: 1037 Comm: kworker/u48:1 Not tainted 4.17.0.snitm+ #25
[ 119.717608] Hardware name: Supermicro SYS-1029P-WTR/X11DDW-L, BIOS 2.0a 12/06/2017
[ 119.725177] Workqueue: dm-thin do_worker [dm_thin_pool]
[ 119.730401] Call Trace:
[ 119.732856] dump_stack+0x5a/0x73
[ 119.736173] sysfs_warn_dup+0x58/0x70
[ 119.739839] sysfs_create_dir_ns+0x77/0x80
[ 119.743939] kobject_add_internal+0xba/0x2e0
[ 119.748210] kobject_init_and_add+0x70/0xb0
[ 119.752399] ? sysfs_slab_add+0x101/0x250
[ 119.756409] sysfs_slab_add+0xb1/0x250
[ 119.760161] __kmem_cache_create+0x116/0x150
[ 119.764436] ? number+0x2fb/0x340
[ 119.767755] ? _cond_resched+0x15/0x30
[ 119.771508] create_cache+0xd9/0x1f0
[ 119.775085] kmem_cache_create_usercopy+0x1c1/0x250
[ 119.779965] kmem_cache_create+0x18/0x20
[ 119.783894] dm_bufio_client_create+0x1ae/0x410 [dm_bufio]
[ 119.789380] ? dm_block_manager_alloc_callback+0x20/0x20 [dm_persistent_data]
[ 119.796509] ? kmem_cache_alloc_trace+0xae/0x1d0
[ 119.801131] dm_block_manager_create+0x5e/0x90 [dm_persistent_data]
[ 119.807397] __create_persistent_data_objects+0x38/0x940 [dm_thin_pool]
[ 119.814008] dm_pool_abort_metadata+0x64/0x90 [dm_thin_pool]
[ 119.819669] metadata_operation_failed+0x59/0x100 [dm_thin_pool]
[ 119.825673] alloc_data_block.isra.53+0x86/0x180 [dm_thin_pool]
[ 119.831592] process_cell+0x2a3/0x550 [dm_thin_pool]
[ 119.836558] ? mempool_alloc+0x6f/0x180
[ 119.840400] ? u32_swap+0x10/0x10
[ 119.843717] ? sort+0x17b/0x270
[ 119.846863] ? u32_swap+0x10/0x10
[ 119.850181] do_worker+0x28d/0x8f0 [dm_thin_pool]
[ 119.854890] ? move_linked_works+0x6f/0xa0
[ 119.858989] process_one_work+0x171/0x370
[ 119.862999] worker_thread+0x49/0x3f0
[ 119.866669] kthread+0xf8/0x130
[ 119.869813] ? max_active_store+0x80/0x80
[ 119.873827] ? kthread_bind+0x10/0x10
[ 119.877493] ret_from_fork+0x35/0x40
[ 119.881076] kobject_add_internal failed for :a-0000144 with -EEXIST, don't try to register things with the same name in the same directory.
[ 119.893580] kmem_cache_create(dm_bufio_buffer-16) failed with error -17
Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1806151817130.6333@file01.intranet…
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Reported-by: Mike Snitzer <snitzer(a)redhat.com>
Tested-by: Mike Snitzer <snitzer(a)redhat.com>
Cc: Christoph Lameter <cl(a)linux.com>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/slub_def.h | 4 ++++
mm/slab_common.c | 4 ++++
mm/slub.c | 7 ++++++-
3 files changed, 14 insertions(+), 1 deletion(-)
diff -puN include/linux/slub_def.h~slub-fix-failure-when-we-delete-and-create-a-slab-cache include/linux/slub_def.h
--- a/include/linux/slub_def.h~slub-fix-failure-when-we-delete-and-create-a-slab-cache
+++ a/include/linux/slub_def.h
@@ -155,8 +155,12 @@ struct kmem_cache {
#ifdef CONFIG_SYSFS
#define SLAB_SUPPORTS_SYSFS
+void sysfs_slab_unlink(struct kmem_cache *);
void sysfs_slab_release(struct kmem_cache *);
#else
+static inline void sysfs_slab_unlink(struct kmem_cache *s)
+{
+}
static inline void sysfs_slab_release(struct kmem_cache *s)
{
}
diff -puN mm/slab_common.c~slub-fix-failure-when-we-delete-and-create-a-slab-cache mm/slab_common.c
--- a/mm/slab_common.c~slub-fix-failure-when-we-delete-and-create-a-slab-cache
+++ a/mm/slab_common.c
@@ -567,10 +567,14 @@ static int shutdown_cache(struct kmem_ca
list_del(&s->list);
if (s->flags & SLAB_TYPESAFE_BY_RCU) {
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_unlink(s);
+#endif
list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
schedule_work(&slab_caches_to_rcu_destroy_work);
} else {
#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_unlink(s);
sysfs_slab_release(s);
#else
slab_kmem_cache_release(s);
diff -puN mm/slub.c~slub-fix-failure-when-we-delete-and-create-a-slab-cache mm/slub.c
--- a/mm/slub.c~slub-fix-failure-when-we-delete-and-create-a-slab-cache
+++ a/mm/slub.c
@@ -5667,7 +5667,6 @@ static void sysfs_slab_remove_workfn(str
kset_unregister(s->memcg_kset);
#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
- kobject_del(&s->kobj);
out:
kobject_put(&s->kobj);
}
@@ -5752,6 +5751,12 @@ static void sysfs_slab_remove(struct kme
schedule_work(&s->kobj_remove_work);
}
+void sysfs_slab_unlink(struct kmem_cache *s)
+{
+ if (slab_state >= FULL)
+ kobject_del(&s->kobj);
+}
+
void sysfs_slab_release(struct kmem_cache *s)
{
if (slab_state >= FULL)
_
Patches currently in -mm which might be from mpatocka(a)redhat.com are
slub-fix-failure-when-we-delete-and-create-a-slab-cache.patch
The patch titled
Subject: mm: fix devmem_is_allowed() for sub-page System RAM intersections
has been removed from the -mm tree. Its filename was
mm-fix-devmem_is_allowed-for-sub-page-system-ram-intersections.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Dan Williams <dan.j.williams(a)intel.com>
Subject: mm: fix devmem_is_allowed() for sub-page System RAM intersections
Hussam reports:
I was poking around and for no real reason, I did cat /dev/mem and
strings /dev/mem. Then I saw the following warning in dmesg. I saved it
and rebooted immediately.
memremap attempted on mixed range 0x000000000009c000 size: 0x1000
------------[ cut here ]------------
WARNING: CPU: 0 PID: 11810 at kernel/memremap.c:98 memremap+0x104/0x170
[..]
Call Trace:
xlate_dev_mem_ptr+0x25/0x40
read_mem+0x89/0x1a0
__vfs_read+0x36/0x170
The memremap() implementation checks for attempts to remap System RAM with
MEMREMAP_WB and instead redirects those mapping attempts to the linear
map. However, that only works if the physical address range being
remapped is page aligned. In low memory we have situations like the
following:
00000000-00000fff : Reserved
00001000-0009fbff : System RAM
0009fc00-0009ffff : Reserved
...where System RAM intersects Reserved ranges on a sub-page page
granularity.
Given that devmem_is_allowed() special cases any attempt to map System RAM
in the first 1MB of memory, replace page_is_ram() with the more precise
region_intersects() to trap attempts to map disallowed ranges.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=199999
Link: http://lkml.kernel.org/r/152856436164.18127.2847888121707136898.stgit@dwill…
Fixes: 92281dee825f ("arch: introduce memremap()")
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Reported-by: Hussam Al-Tayeb <me(a)hussam.eu.org>
Tested-by: Hussam Al-Tayeb <me(a)hussam.eu.org>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/x86/mm/init.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff -puN arch/x86/mm/init.c~mm-fix-devmem_is_allowed-for-sub-page-system-ram-intersections arch/x86/mm/init.c
--- a/arch/x86/mm/init.c~mm-fix-devmem_is_allowed-for-sub-page-system-ram-intersections
+++ a/arch/x86/mm/init.c
@@ -706,7 +706,9 @@ void __init init_mem_mapping(void)
*/
int devmem_is_allowed(unsigned long pagenr)
{
- if (page_is_ram(pagenr)) {
+ if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
+ != REGION_DISJOINT) {
/*
* For disallowed memory regions in the low 1MB range,
* request that the page be shown as all zeros.
_
Patches currently in -mm which might be from dan.j.williams(a)intel.com are
mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch
mm-devm_memremap_pages-handle-errors-allocating-final-devres-action.patch
mm-hmm-use-devm-semantics-for-hmm_devmem_add-remove.patch
mm-hmm-replace-hmm_devmem_pages_create-with-devm_memremap_pages.patch
mm-hmm-mark-hmm_devmem_add-add_resource-export_symbol_gpl.patch
The patch titled
Subject: mm/ksm.c: ignore STABLE_FLAG of rmap_item->address in rmap_walk_ksm()
has been removed from the -mm tree. Its filename was
mm-ksm-ignore-stable_flag-of-rmap_item-address-in-rmap_walk_ksm.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Jia He <jia.he(a)hxt-semitech.com>
Subject: mm/ksm.c: ignore STABLE_FLAG of rmap_item->address in rmap_walk_ksm()
In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by
PAGE_SIZE unaligned for rmap_item->address under memory pressure
tests(start 20 guests and run memhog in the host).
--------------------------begin--------------------------------------
[ 410.853828] WARNING: CPU: 4 PID: 4641 at
arch/arm64/kvm/../../../virt/kvm/arm/mmu.c:1826
kvm_age_hva_handler+0xc0/0xc8
[ 410.864518] Modules linked in: vhost_net vhost tap xt_CHECKSUM
ipt_MASQUERADE nf_nat_masquerade_ipv4 ip6t_rpfilter ipt_REJECT
nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink
ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6
nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security
ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4
nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw
ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter
rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi
ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm
ib_ucm ib_umad rdma_cm ib_cm iw_cm mlx5_ib vfat fat ib_uverbs dm_mirror
dm_region_hash ib_core dm_log dm_mod crc32_ce ipmi_ssif sg nfsd
[ 410.935101] auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs
libcrc32c mlx5_core ixgbe mlxfw devlink mdio ahci_platform
libahci_platform qcom_emac libahci hdma hdma_mgmt i2c_qup
[ 410.951369] CPU: 4 PID: 4641 Comm: memhog Tainted: G W
4.17.0-rc3+ #8
[ 410.959104] Hardware name: <snip for confidential issues>
[ 410.969791] pstate: 80400005 (Nzcv daif +PAN -UAO)
[ 410.974575] pc : kvm_age_hva_handler+0xc0/0xc8
[ 410.979012] lr : handle_hva_to_gpa+0xa8/0xe0
[ 410.983274] sp : ffff801761553290
[ 410.986581] x29: ffff801761553290 x28: 0000000000000000
[ 410.991888] x27: 0000000000000002 x26: 0000000000000000
[ 410.997195] x25: ffff801765430058 x24: ffff0000080b5608
[ 411.002501] x23: 0000000000000000 x22: ffff8017ccb84000
[ 411.007807] x21: 0000000003ff0000 x20: ffff8017ccb84000
[ 411.013113] x19: 000000000000fe00 x18: ffff000008fb3c08
[ 411.018419] x17: 0000000000000000 x16: 0060001645820bd3
[ 411.023725] x15: ffff80176aacbc08 x14: 0000000000000000
[ 411.029031] x13: 0000000000000040 x12: 0000000000000228
[ 411.034337] x11: 0000000000000000 x10: 0000000000000000
[ 411.039643] x9 : 0000000000000010 x8 : 0000000000000004
[ 411.044949] x7 : 0000000000000000 x6 : 00008017f0770000
[ 411.050255] x5 : 0000fffda59f0200 x4 : 0000000000000000
[ 411.055561] x3 : 0000000000000000 x2 : 000000000000fe00
[ 411.060867] x1 : 0000000003ff0000 x0 : 0000000020000000
[ 411.066173] Call trace:
[ 411.068614] kvm_age_hva_handler+0xc0/0xc8
[ 411.072703] handle_hva_to_gpa+0xa8/0xe0
[ 411.076619] kvm_age_hva+0x4c/0xe8
[ 411.080014] kvm_mmu_notifier_clear_flush_young+0x54/0x98
[ 411.085408] __mmu_notifier_clear_flush_young+0x6c/0xa0
[ 411.090627] page_referenced_one+0x154/0x1d8
[ 411.094890] rmap_walk_ksm+0x12c/0x1d0
[ 411.098632] rmap_walk+0x94/0xa0
[ 411.101854] page_referenced+0x194/0x1b0
[ 411.105770] shrink_page_list+0x674/0xc28
[ 411.109772] shrink_inactive_list+0x26c/0x5b8
[ 411.114122] shrink_node_memcg+0x35c/0x620
[ 411.118211] shrink_node+0x100/0x430
[ 411.121778] do_try_to_free_pages+0xe0/0x3a8
[ 411.126041] try_to_free_pages+0xe4/0x230
[ 411.130045] __alloc_pages_nodemask+0x564/0xdc0
[ 411.134569] alloc_pages_vma+0x90/0x228
[ 411.138398] do_anonymous_page+0xc8/0x4d0
[ 411.142400] __handle_mm_fault+0x4a0/0x508
[ 411.146489] handle_mm_fault+0xf8/0x1b0
[ 411.150321] do_page_fault+0x218/0x4b8
[ 411.154064] do_translation_fault+0x90/0xa0
[ 411.158239] do_mem_abort+0x68/0xf0
[ 411.161721] el0_da+0x24/0x28
---------------------------end---------------------------------------
In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
then the start and end in handle_hva_to_gpa might not be PAGE_SIZE
aligned. Thus it will cause exceptions in handle_hva_to_gpa on arm64.
This patch fixes it by ignoring (not removing) the low bits of address
when doing rmap_walk_ksm.
IMO, it should be backported to stable tree. the stom of WARN_ONs is
very easy for me to reproduce. More than that, I watched a panic (not
reproducible) as follows:
[35380.805825] page:ffff7fe003742d80 count:-4871 mapcount:-2126053375
mapping: (null) index:0x0
[35380.815024] flags: 0x1fffc00000000000()
[35380.818845] raw: 1fffc00000000000 0000000000000000 0000000000000000
ffffecf981470000
[35380.826569] raw: dead000000000100 dead000000000200 ffff8017c001c000
0000000000000000
[35380.834294] page dumped because: nonzero _refcount
[35380.839069] Modules linked in: vhost_net vhost tap ebtable_filter ebtables
ip6table_filter ip6_tables iptable_filter fcoe libfcoe libfc 8021q garp mrp stp
llc scsi_transport_fc openvswitch nf_conntrack_ipv6 nf_nat_ipv6
nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_defrag_ipv6 nf_nat nf_conntrack
vfat fat rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi
ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm
ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx5_ib ib_core crc32_ce ipmi_ssif tpm_tis
tpm_tis_core sg nfsd auth_rpcgss nfs_acl lockd grace sunrpc dm_multipath
ip_tables xfs libcrc32c mlx5_core mlxfw devlink ahci_platform libahci_platform
libahci qcom_emac sdhci_acpi sdhci hdma mmc_core hdma_mgmt i2c_qup dm_mirror
dm_region_hash dm_log dm_mod
[35380.908341] CPU: 29 PID: 18323 Comm: qemu-kvm Tainted: G W
4.14.15-5.hxt.aarch64 #1
[35380.917107] Hardware name: <snip for confidential issues>
[35380.930909] Call trace:
[35380.933345] [<ffff000008088f00>] dump_backtrace+0x0/0x22c
[35380.938723] [<ffff000008089150>] show_stack+0x24/0x2c
[35380.943759] [<ffff00000893c078>] dump_stack+0x8c/0xb0
[35380.948794] [<ffff00000820ab50>] bad_page+0xf4/0x154
[35380.953740] [<ffff000008211ce8>] free_pages_check_bad+0x90/0x9c
[35380.959642] [<ffff00000820c430>] free_pcppages_bulk+0x464/0x518
[35380.965545] [<ffff00000820db98>] free_hot_cold_page+0x22c/0x300
[35380.971448] [<ffff0000082176fc>] __put_page+0x54/0x60
[35380.976484] [<ffff0000080b1164>] unmap_stage2_range+0x170/0x2b4
[35380.982385] [<ffff0000080b12d8>] kvm_unmap_hva_handler+0x30/0x40
[35380.988375] [<ffff0000080b0104>] handle_hva_to_gpa+0xb0/0xec
[35380.994016] [<ffff0000080b2644>] kvm_unmap_hva_range+0x5c/0xd0
[35380.999833] [<ffff0000080a8054>]
I even injected a fault on purpose in kvm_unmap_hva_range by seting
size=size-0x200, the call trace is similar as above. So I thought the
panic is similarly caused by the root cause of WARN_ON.
Andrea said:
: It looks a straightforward safe fix, on x86 hva_to_gfn_memslot would
: zap those bits and hide the misalignment caused by the low metadata
: bits being erroneously left set in the address, but the arm code
: notices when that's the last page in the memslot and the hva_end is
: getting aligned and the size is below one page.
:
: I think the problem triggers in the addr += PAGE_SIZE of
: unmap_stage2_ptes that never matches end because end is aligned but
: addr is not.
:
: } while (pte++, addr += PAGE_SIZE, addr != end);
:
: x86 again only works on hva_start/hva_end after converting it to
: gfn_start/end and that being in pfn units the bits are zapped before
: they risk to cause trouble.
Jia He said:
: I've tested by myself in arm64 server (QDF2400,46 cpus,96G mem) Without
: this patch, the WARN_ON is very easy for reproducing. After this patch, I
: have run the same benchmarch for a whole day without any WARN_ONs
Link: http://lkml.kernel.org/r/1525403506-6750-1-git-send-email-hejianet@gmail.com
Signed-off-by: Jia He <jia.he(a)hxt-semitech.com>
Reviewed-by: Andrea Arcangeli <aarcange(a)redhat.com>
Tested-by: Jia He <hejianet(a)gmail.com>
Cc: Suzuki K Poulose <Suzuki.Poulose(a)arm.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Claudio Imbrenda <imbrenda(a)linux.vnet.ibm.com>
Cc: Arvind Yadav <arvind.yadav.cs(a)gmail.com>
Cc: Mike Rapoport <rppt(a)linux.vnet.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/ksm.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff -puN mm/ksm.c~mm-ksm-ignore-stable_flag-of-rmap_item-address-in-rmap_walk_ksm mm/ksm.c
--- a/mm/ksm.c~mm-ksm-ignore-stable_flag-of-rmap_item-address-in-rmap_walk_ksm
+++ a/mm/ksm.c
@@ -216,6 +216,8 @@ struct rmap_item {
#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
+#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
+ /* to mask all the flags */
/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -2598,10 +2600,15 @@ again:
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
+ unsigned long addr;
+
cond_resched();
vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
+
+ /* Ignore the stable/unstable/sqnr flags */
+ addr = rmap_item->address & ~KSM_FLAG_MASK;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
continue;
/*
* Initially we examine only the vma which covers this
@@ -2615,8 +2622,7 @@ again:
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- if (!rwc->rmap_one(page, vma,
- rmap_item->address, rwc->arg)) {
+ if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
anon_vma_unlock_read(anon_vma);
return;
}
_
Patches currently in -mm which might be from jia.he(a)hxt-semitech.com are