From: Janosch Frank <frankja(a)linux.ibm.com>
Subject: userfaultfd: hugetlbfs: fix userfaultfd_huge_must_wait() pte access
Use huge_ptep_get() to translate huge ptes to normal ptes so we can check
them with the huge_pte_* functions. Otherwise some architectures will
check the wrong values and will not wait for userspace to bring in the
memory.
Link: http://lkml.kernel.org/r/20180626132421.78084-1-frankja@linux.ibm.com
Fixes: 369cd2121be4 ("userfaultfd: hugetlbfs: userfaultfd_huge_must_wait for hugepmd ranges")
Signed-off-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/userfaultfd.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff -puN fs/userfaultfd.c~userfaultfd-hugetlbfs-fix-userfaultfd_huge_must_wait-pte-access fs/userfaultfd.c
--- a/fs/userfaultfd.c~userfaultfd-hugetlbfs-fix-userfaultfd_huge_must_wait-pte-access
+++ a/fs/userfaultfd.c
@@ -222,24 +222,26 @@ static inline bool userfaultfd_huge_must
unsigned long reason)
{
struct mm_struct *mm = ctx->mm;
- pte_t *pte;
+ pte_t *ptep, pte;
bool ret = true;
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
- pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
- if (!pte)
+ ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
+
+ if (!ptep)
goto out;
ret = false;
+ pte = huge_ptep_get(ptep);
/*
* Lockless access: we're in a wait_event so it's ok if it
* changes under us.
*/
- if (huge_pte_none(*pte))
+ if (huge_pte_none(pte))
ret = true;
- if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP))
+ if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
ret = true;
out:
return ret;
_
The patch titled
Subject: fs/proc/task_mmu.c: fix Locked field in /proc/pid/smaps*
has been added to the -mm tree. Its filename is
mm-fix-locked-field-in-proc-pid-smaps.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/mm-fix-locked-field-in-proc-pid-smaps.patch
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/mm-fix-locked-field-in-proc-pid-smaps.patch
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: fs/proc/task_mmu.c: fix Locked field in /proc/pid/smaps*
Thomas reports:
: While looking around in /proc on my v4.14.52 system I noticed that
: all processes got a lot of "Locked" memory in /proc/*/smaps. A lot
: more memory than a regular user can usually lock with mlock().
:
: commit 493b0e9d945fa9dfe96be93ae41b4ca4b6fdb317 (v4.14-rc1) seems
: to have changed the behavior of "Locked".
:
: Before that commit the code was like this. Notice the VM_LOCKED
: check.
:
: (vma->vm_flags & VM_LOCKED) ?
: (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
:
: After that commit Locked is now the same as Pss. This looks like a
: mistake.
:
: (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
Indeed, the commit has added mss->pss_locked with the correct value that
depends on VM_LOCKED, but forgot to actually use it. Fix it.
Link: http://lkml.kernel.org/r/ebf6c7fb-fec3-6a26-544f-710ed193c154@suse.cz
Fixes: 493b0e9d945f ("mm: add /proc/pid/smaps_rollup")
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Reported-by: Thomas Lindroth <thomas.lindroth(a)gmail.com>
Cc: Alexey Dobriyan <adobriyan(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/proc/task_mmu.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff -puN fs/proc/task_mmu.c~mm-fix-locked-field-in-proc-pid-smaps fs/proc/task_mmu.c
--- a/fs/proc/task_mmu.c~mm-fix-locked-field-in-proc-pid-smaps
+++ a/fs/proc/task_mmu.c
@@ -831,7 +831,8 @@ static int show_smap(struct seq_file *m,
SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
SEQ_PUT_DEC(" kB\nSwapPss: ",
mss->swap_pss >> PSS_SHIFT);
- SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nLocked: ",
+ mss->pss_locked >> PSS_SHIFT);
seq_puts(m, " kB\n");
}
if (!rollup_mode) {
_
Patches currently in -mm which might be from vbabka(a)suse.cz are
mm-page_alloc-actually-ignore-mempolicies-for-high-priority-allocations.patch
mm-fix-locked-field-in-proc-pid-smaps.patch
Noticed this while skimming through: if we fail to allocate memory
for cli, we'll end up returning without dropping the runtime PM ref we
got. Additionally, we'll even return the wrong return code! (ret will
most likely be 0 here; we want -ENOMEM.)
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Cc: stable(a)vger.kernel.org
---
drivers/gpu/drm/nouveau/nouveau_drm.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 0452b18d36b9..0f668e275ee1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -919,8 +919,10 @@ nouveau_drm_open(struct drm_device *dev, struct drm_file *fpriv)
get_task_comm(tmpname, current);
snprintf(name, sizeof(name), "%s[%d]", tmpname, pid_nr(fpriv->pid));
- if (!(cli = kzalloc(sizeof(*cli), GFP_KERNEL)))
- return ret;
+ if (!(cli = kzalloc(sizeof(*cli), GFP_KERNEL))) {
+ ret = -ENOMEM;
+ goto done;
+ }
ret = nouveau_cli_init(drm, name, cli);
if (ret)
--
2.17.1
The patch below does not apply to the 4.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From dc7a10ddee0c56c6d891dd18de5c4ee9869545e0 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk(a)kernel.org>
Date: Fri, 30 Mar 2018 17:58:13 -0700
Subject: [PATCH] f2fs: truncate preallocated blocks in error case
If the write fails, we must deallocate the blocks that we couldn't write.
Cc: stable(a)vger.kernel.org
Reviewed-by: Chao Yu <yuchao0(a)huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk(a)kernel.org>
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8068b015ece5..6b94f19b3fa8 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2911,6 +2911,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = generic_write_checks(iocb, from);
if (ret > 0) {
+ bool preallocated = false;
+ size_t target_size = 0;
int err;
if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
@@ -2927,6 +2929,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
} else {
+ preallocated = true;
+ target_size = iocb->ki_pos + iov_iter_count(from);
+
err = f2fs_preallocate_blocks(iocb, from);
if (err) {
clear_inode_flag(inode, FI_NO_PREALLOC);
@@ -2939,6 +2944,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
blk_finish_plug(&plug);
clear_inode_flag(inode, FI_NO_PREALLOC);
+ /* if we couldn't write data, we should deallocate blocks. */
+ if (preallocated && i_size_read(inode) < target_size)
+ f2fs_truncate(inode);
+
if (ret > 0)
f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
}
When cpu_stop_queue_two_works() begins to wake the stopper
threads, it does so without preemption disabled, which leads
to the following race condition:
The source CPU calls cpu_stop_queue_two_works(), with cpu1
as the source CPU, and cpu2 as the destination CPU. When
adding the stopper threads to the wake queue used in this
function, the source CPU stopper thread is added first,
and the destination CPU stopper thread is added last.
When wake_up_q() is invoked to wake the stopper threads, the
threads are woken up in the order that they are queued in,
so the source CPU's stopper thread is woken up first, and
it preempts the thread running on the source CPU.
The stopper thread will then execute on the source CPU,
disable preemption, and begin executing multi_cpu_stop(),
and wait for an ack from the destination CPU's stopper thread,
with preemption still disabled. Since the worker thread that
woke up the stopper thread on the source CPU is affine to the
source CPU, and preemption is disabled on the source CPU, that
thread will never run to dequeue the destination CPU's stopper
thread from the wake queue, and thus, the destination CPU's
stopper thread will never run, causing the source CPU's stopper
thread to wait forever, and stall.
Disable preemption when waking the stopper threads in
cpu_stop_queue_two_works().
Fixes: 0b26351b910f ("stop_machine, sched: Fix migrate_swap() vs. active_balance() deadlock")
Co-Developed-by: Prasad Sodagudi <psodagud(a)codeaurora.org>
Co-Developed-by: Pavankumar Kondeti <pkondeti(a)codeaurora.org>
Signed-off-by: Isaac J. Manjarres <isaacm(a)codeaurora.org>
Signed-off-by: Prasad Sodagudi <psodagud(a)codeaurora.org>
Signed-off-by: Pavankumar Kondeti <pkondeti(a)codeaurora.org>
Cc: stable(a)vger.kernel.org
---
kernel/stop_machine.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index f89014a..1ff523d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -270,7 +270,11 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
goto retry;
}
- wake_up_q(&wakeq);
+ if (!err) {
+ preempt_disable();
+ wake_up_q(&wakeq);
+ preempt_enable();
+ }
return err;
}
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project
This is a v4.17-stable backport of my "Fix DM DAX handling" series which
had backport collisions.
Greg, please let me know if I need to adjust anything in my backport
formatting.
Darrick J. Wong (1):
fs: allow per-device dax status checking for filesystems
Dave Jiang (1):
dax: change bdev_dax_supported() to support boolean returns
Ross Zwisler (3):
pmem: only set QUEUE_FLAG_DAX for fsdax mode
dax: bdev_dax_supported() check for QUEUE_FLAG_DAX
dm: prevent DAX mounts if not supported
drivers/dax/super.c | 48 ++++++++++++++++++++++++++++--------------------
drivers/md/dm-table.c | 7 ++++---
drivers/md/dm.c | 3 +--
drivers/nvdimm/pmem.c | 3 ++-
fs/ext2/super.c | 3 +--
fs/ext4/super.c | 3 +--
fs/xfs/xfs_ioctl.c | 3 ++-
fs/xfs/xfs_iops.c | 30 +++++++++++++++++++++++++-----
fs/xfs/xfs_super.c | 10 ++++++++--
include/linux/dax.h | 11 ++++++-----
10 files changed, 78 insertions(+), 43 deletions(-)
--
2.14.4
commit cd4a4ae4683dc2e09380118e205e057896dcda2b upstream.
If we end up splitting a bio and the queue goes away between
the initial submission and the later split submission, then we
can block forever in blk_queue_enter() waiting for the reference
to drop to zero. This will never happen, since we already hold
a reference.
Mark a split bio as already having entered the queue, so we can
just use the live non-blocking queue enter variant.
Thanks to Tetsuo Handa for the analysis.
We're running fio tests and the tasks get stuck in a D state forever
when systemd-udevd tries to read the partition table. This patch solves
it. Please apply to 4.17 stable.
Reported-by: syzbot+c4f9cebf9d651f6e54de(a)syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Signed-off-by: Alexandru Moise <00moses.alexander00(a)gmail.com>
---
block/blk-core.c | 4 +++-
block/blk-merge.c | 10 ++++++++++
include/linux/blk_types.h | 2 ++
3 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index b559b9d4f1a2..47ab2d9d02d9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2392,7 +2392,9 @@ blk_qc_t generic_make_request(struct bio *bio)
if (bio->bi_opf & REQ_NOWAIT)
flags = BLK_MQ_REQ_NOWAIT;
- if (blk_queue_enter(q, flags) < 0) {
+ if (bio_flagged(bio, BIO_QUEUE_ENTERED))
+ blk_queue_enter_live(q);
+ else if (blk_queue_enter(q, flags) < 0) {
if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
bio_wouldblock_error(bio);
else
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 782940c65d8a..481dc02668f9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -210,6 +210,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;
+ /*
+ * Since we're recursing into make_request here, ensure
+ * that we mark this bio as already having entered the queue.
+ * If not, and the queue is going away, we can get stuck
+ * forever on waiting for the queue reference to drop. But
+ * that will never happen, as we're already holding a
+ * reference to it.
+ */
+ bio_set_flag(*bio, BIO_QUEUE_ENTERED);
+
bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
generic_make_request(*bio);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 17b18b91ebac..1602bf4ab4cd 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -186,6 +186,8 @@ struct bio {
* throttling rules. Don't do it again. */
#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion
* of this bio. */
+#define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */
+
/* See BVEC_POOL_OFFSET below before adding new flags */
/*
--
2.18.0
Currently nouveau doesn't actually expose the state debugfs file that's
usually provided for any modesetting driver that supports atomic, even
if nouveau is loaded with atomic=1. This is because the standard
debugfs files that DRM provides for atomic drivers are created
when drm_get_pci_dev() is called from nouveau_drm.c. This happens well
before we've initialized the display core, which is currently
responsible for setting the DRIVER_ATOMIC cap.
So, move the atomic option into nouveau_drm.c and just add the
DRIVER_ATOMIC cap whenever it's enabled on the kernel commandline. This
shouldn't cause any actual issues, as the atomic ioctl will still fail
as expected even if the display core doesn't disable it until later in
the init sequence. This also provides the added benefit of being able to
use the state debugfs file to check the current display state even if
clients aren't allowed to modify it through anything other than the
legacy ioctls.
Additionally, disable the DRIVER_ATOMIC cap in nv04's display core, as
this was already disabled there previously.
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Cc: stable(a)vger.kernel.org
---
drivers/gpu/drm/nouveau/dispnv04/disp.c | 3 +++
drivers/gpu/drm/nouveau/dispnv50/disp.c | 6 ------
drivers/gpu/drm/nouveau/nouveau_drm.c | 7 +++++++
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/dispnv04/disp.c b/drivers/gpu/drm/nouveau/dispnv04/disp.c
index 501d2d290e9c..70dce544984e 100644
--- a/drivers/gpu/drm/nouveau/dispnv04/disp.c
+++ b/drivers/gpu/drm/nouveau/dispnv04/disp.c
@@ -55,6 +55,9 @@ nv04_display_create(struct drm_device *dev)
nouveau_display(dev)->init = nv04_display_init;
nouveau_display(dev)->fini = nv04_display_fini;
+ /* Pre-nv50 doesn't support atomic, so don't expose the ioctls */
+ dev->driver->driver_features &= ~DRIVER_ATOMIC;
+
nouveau_hw_save_vga_fonts(dev, 1);
nv04_crtc_create(dev, 0);
diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c
index 9382e99a0bc7..d9da69c83ae7 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/disp.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c
@@ -2126,10 +2126,6 @@ nv50_display_destroy(struct drm_device *dev)
kfree(disp);
}
-MODULE_PARM_DESC(atomic, "Expose atomic ioctl (default: disabled)");
-static int nouveau_atomic = 0;
-module_param_named(atomic, nouveau_atomic, int, 0400);
-
int
nv50_display_create(struct drm_device *dev)
{
@@ -2154,8 +2150,6 @@ nv50_display_create(struct drm_device *dev)
disp->disp = &nouveau_display(dev)->disp;
dev->mode_config.funcs = &nv50_disp_func;
dev->driver->driver_features |= DRIVER_PREFER_XBGR_30BPP;
- if (nouveau_atomic)
- dev->driver->driver_features |= DRIVER_ATOMIC;
/* small shared memory area we use for notifiers and semaphores */
ret = nouveau_bo_new(&drm->client, 4096, 0x1000, TTM_PL_FLAG_VRAM,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 775443c9af94..0452b18d36b9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -81,6 +81,10 @@ MODULE_PARM_DESC(modeset, "enable driver (default: auto, "
int nouveau_modeset = -1;
module_param_named(modeset, nouveau_modeset, int, 0400);
+MODULE_PARM_DESC(atomic, "Expose atomic ioctl (default: disabled)");
+static int nouveau_atomic = 0;
+module_param_named(atomic, nouveau_atomic, int, 0400);
+
MODULE_PARM_DESC(runpm, "disable (0), force enable (1), optimus only default (-1)");
static int nouveau_runtime_pm = -1;
module_param_named(runpm, nouveau_runtime_pm, int, 0400);
@@ -509,6 +513,9 @@ static int nouveau_drm_probe(struct pci_dev *pdev,
pci_set_master(pdev);
+ if (nouveau_atomic)
+ driver_pci.driver_features |= DRIVER_ATOMIC;
+
ret = drm_get_pci_dev(pdev, pent, &driver_pci);
if (ret) {
nvkm_device_del(&device);
--
2.17.1