The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 1b426bac66e6cc83c9f2d92b96e4e72acf43419a Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz(a)oracle.com>
Date: Mon, 13 May 2019 17:19:41 -0700
Subject: [PATCH] hugetlb: use same fault hash key for shared and private
mappings
hugetlb uses a fault mutex hash table to prevent page faults of the
same pages concurrently. The key for shared and private mappings is
different. Shared keys off address_space and file index. Private keys
off mm and virtual address. Consider a private mappings of a populated
hugetlbfs file. A fault will map the page from the file and if needed
do a COW to map a writable page.
Hugetlbfs hole punch uses the fault mutex to prevent mappings of file
pages. It uses the address_space file index key. However, private
mappings will use a different key and could race with this code to map
the file page. This causes problems (BUG) for the page cache remove
code as it expects the page to be unmapped. A sample stack is:
page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
kernel BUG at mm/filemap.c:169!
...
RIP: 0010:unaccount_page_cache_page+0x1b8/0x200
...
Call Trace:
__delete_from_page_cache+0x39/0x220
delete_from_page_cache+0x45/0x70
remove_inode_hugepages+0x13c/0x380
? __add_to_page_cache_locked+0x162/0x380
hugetlbfs_fallocate+0x403/0x540
? _cond_resched+0x15/0x30
? __inode_security_revalidate+0x5d/0x70
? selinux_file_permission+0x100/0x130
vfs_fallocate+0x13f/0x270
ksys_fallocate+0x3c/0x80
__x64_sys_fallocate+0x1a/0x20
do_syscall_64+0x5b/0x180
entry_SYSCALL_64_after_hwframe+0x44/0xa9
There seems to be another potential COW issue/race with this approach
of different private and shared keys as noted in commit 8382d914ebf7
("mm, hugetlb: improve page-fault scalability").
Since every hugetlb mapping (even anon and private) is actually a file
mapping, just use the address_space index key for all mappings. This
results in potentially more hash collisions. However, this should not
be the common case.
Link: http://lkml.kernel.org/r/20190328234704.27083-3-mike.kravetz@oracle.com
Link: http://lkml.kernel.org/r/20190412165235.t4sscoujczfhuiyt@linux-r8p5
Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Reviewed-by: Davidlohr Bueso <dbueso(a)suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov(a)linux.intel.com>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c74ef4426282..f23237135163 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -440,9 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash;
index = page->index;
- hash = hugetlb_fault_mutex_hash(h, current->mm,
- &pseudo_vma,
- mapping, index, 0);
+ hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -639,8 +637,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
addr = index * hpage_size;
/* mutex taken here, fault path and hole punch */
- hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
- index, addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 11943b60f208..edf476c8cfb9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -123,9 +123,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address);
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c33c5cbb67ff..98a3c7c224cb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3824,8 +3824,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3933,21 +3932,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
unsigned long key[2];
u32 hash;
- if (vma->vm_flags & VM_SHARED) {
- key[0] = (unsigned long) mapping;
- key[1] = idx;
- } else {
- key[0] = (unsigned long) mm;
- key[1] = address >> huge_page_shift(h);
- }
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
@@ -3958,9 +3950,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
return 0;
@@ -4005,7 +3995,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d59b5a73dfb3..9932d5755e4c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -271,8 +271,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
The MIPS GIC contains a block of registers used to map local interrupts
to a particular CPU interrupt pin. Since these registers are found at a
consecutive range of addresses we access them using an index, via the
(read|write)_gic_v[lo]_map accessor functions. We currently use values
from enum mips_gic_local_interrupt as those indices.
Unfortunately whilst enum mips_gic_local_interrupt provides the correct
offsets for bits in the pending & mask registers, the ordering of the
map registers is subtly different... Compared with the ordering of
pending & mask bits, the map registers move the FDC from the end of the
list to index 3 after the timer interrupt. As a result the performance
counter & software interrupts are therefore at indices 4-6 rather than
indices 3-5.
Notably this causes problems with performance counter interrupts being
incorrectly mapped on some systems, and presumably will also cause
problems for FDC interrupts.
Introduce a function to map from enum mips_gic_local_interrupt to the
index of the corresponding map register, and use it to ensure we access
the map registers for the correct interrupts.
Signed-off-by: Paul Burton <paul.burton(a)mips.com>
Fixes: a0dc5cb5e31b ("irqchip: mips-gic: Simplify gic_local_irq_domain_map()")
Fixes: da61fcf9d62a ("irqchip: mips-gic: Use irq_cpu_online to (un)mask all-VP(E) IRQs")
Reported-and-tested-by: Archer Yan <ayan(a)wavecomp.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Jason Cooper <jason(a)lakedaemon.net>
Cc: Marc Zyngier <marc.zyngier(a)arm.com>
Cc: stable(a)vger.kernel.org # v4.14+
---
arch/mips/include/asm/mips-gic.h | 30 ++++++++++++++++++++++++++++++
drivers/irqchip/irq-mips-gic.c | 4 ++--
2 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/arch/mips/include/asm/mips-gic.h b/arch/mips/include/asm/mips-gic.h
index 558059a8f218..0277b56157af 100644
--- a/arch/mips/include/asm/mips-gic.h
+++ b/arch/mips/include/asm/mips-gic.h
@@ -314,6 +314,36 @@ static inline bool mips_gic_present(void)
return IS_ENABLED(CONFIG_MIPS_GIC) && mips_gic_base;
}
+/**
+ * mips_gic_vx_map_reg() - Return GIC_Vx_<intr>_MAP register offset
+ * @intr: A GIC local interrupt
+ *
+ * Determine the index of the GIC_VL_<intr>_MAP or GIC_VO_<intr>_MAP register
+ * within the block of GIC map registers. This is almost the same as the order
+ * of interrupts in the pending & mask registers, as used by enum
+ * mips_gic_local_interrupt, but moves the FDC interrupt & thus offsets the
+ * interrupts after it...
+ *
+ * Return: The map register index corresponding to @intr.
+ *
+ * The return value is suitable for use with the (read|write)_gic_v[lo]_map
+ * accessor functions.
+ */
+static inline unsigned int
+mips_gic_vx_map_reg(enum mips_gic_local_interrupt intr)
+{
+ /* WD, Compare & Timer are 1:1 */
+ if (intr <= GIC_LOCAL_INT_TIMER)
+ return intr;
+
+ /* FDC moves to after Timer... */
+ if (intr == GIC_LOCAL_INT_FDC)
+ return GIC_LOCAL_INT_TIMER + 1;
+
+ /* As a result everything else is offset by 1 */
+ return intr + 1;
+}
+
/**
* gic_get_c0_compare_int() - Return cp0 count/compare interrupt virq
*
diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index d32268cc1174..f3985469c221 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -388,7 +388,7 @@ static void gic_all_vpes_irq_cpu_online(struct irq_data *d)
intr = GIC_HWIRQ_TO_LOCAL(d->hwirq);
cd = irq_data_get_irq_chip_data(d);
- write_gic_vl_map(intr, cd->map);
+ write_gic_vl_map(mips_gic_vx_map_reg(intr), cd->map);
if (cd->mask)
write_gic_vl_smask(BIT(intr));
}
@@ -517,7 +517,7 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq,
spin_lock_irqsave(&gic_lock, flags);
for_each_online_cpu(cpu) {
write_gic_vl_other(mips_cm_vp_id(cpu));
- write_gic_vo_map(intr, map);
+ write_gic_vo_map(mips_gic_vx_map_reg(intr), map);
}
spin_unlock_irqrestore(&gic_lock, flags);
--
2.21.0