The driver must not call tty_wakeup() while holding its private lock,
as line disciplines are allowed to call back into write() from
write_wakeup(), which can lead to a deadlock.
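For illustration, the problematic recursion would look roughly like this
(a sketch of the call chain implied above, not a verbatim trace; it assumes
a line discipline whose write_wakeup() pushes more data):

  digi_write_bulk_callback() / digi_read_oob_callback()
    spin_lock_irqsave(&priv->dp_port_lock, ...)
    tty_port_tty_wakeup()
      tty_wakeup()
        ld->ops->write_wakeup()
          tty->ops->write()
            digi_write()
              spin_lock_irqsave(&priv->dp_port_lock, ...)   <-- deadlock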
Also remove the unneeded work struct that was used to defer wakeup in
order to work around a possible race in ancient times (see comment about
n_tty write_chan() in commit 14b54e39b412 ("USB: serial: remove
changelogs and old todo entries")).
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Johan Hovold <johan@kernel.org>
---
drivers/usb/serial/digi_acceleport.c | 45 ++++++++--------------------
1 file changed, 13 insertions(+), 32 deletions(-)
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 91055a191995..0d606fa9fdca 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -19,7 +19,6 @@
#include <linux/tty_flip.h>
#include <linux/module.h>
#include <linux/spinlock.h>
-#include <linux/workqueue.h>
#include <linux/uaccess.h>
#include <linux/usb.h>
#include <linux/wait.h>
@@ -198,14 +197,12 @@ struct digi_port {
int dp_throttle_restart;
wait_queue_head_t dp_flush_wait;
wait_queue_head_t dp_close_wait; /* wait queue for close */
- struct work_struct dp_wakeup_work;
struct usb_serial_port *dp_port;
};
/* Local Function Declarations */
-static void digi_wakeup_write_lock(struct work_struct *work);
static int digi_write_oob_command(struct usb_serial_port *port,
unsigned char *buf, int count, int interruptible);
static int digi_write_inb_command(struct usb_serial_port *port,
@@ -356,26 +353,6 @@ __releases(lock)
return timeout;
}
-
-/*
- * Digi Wakeup Write
- *
- * Wake up port, line discipline, and tty processes sleeping
- * on writes.
- */
-
-static void digi_wakeup_write_lock(struct work_struct *work)
-{
- struct digi_port *priv =
- container_of(work, struct digi_port, dp_wakeup_work);
- struct usb_serial_port *port = priv->dp_port;
- unsigned long flags;
-
- spin_lock_irqsave(&priv->dp_port_lock, flags);
- tty_port_tty_wakeup(&port->port);
- spin_unlock_irqrestore(&priv->dp_port_lock, flags);
-}
-
/*
* Digi Write OOB Command
*
@@ -986,6 +963,7 @@ static void digi_write_bulk_callback(struct urb *urb)
unsigned long flags;
int ret = 0;
int status = urb->status;
+ bool wakeup;
/* port and serial sanity check */
if (port == NULL || (priv = usb_get_serial_port_data(port)) == NULL) {
@@ -1012,6 +990,7 @@ static void digi_write_bulk_callback(struct urb *urb)
}
/* try to send any buffered data on this port */
+ wakeup = true;
spin_lock_irqsave(&priv->dp_port_lock, flags);
priv->dp_write_urb_in_use = 0;
if (priv->dp_out_buf_len > 0) {
@@ -1027,19 +1006,18 @@ static void digi_write_bulk_callback(struct urb *urb)
if (ret == 0) {
priv->dp_write_urb_in_use = 1;
priv->dp_out_buf_len = 0;
+ wakeup = false;
}
}
- /* wake up processes sleeping on writes immediately */
- tty_port_tty_wakeup(&port->port);
- /* also queue up a wakeup at scheduler time, in case we */
- /* lost the race in write_chan(). */
- schedule_work(&priv->dp_wakeup_work);
-
spin_unlock_irqrestore(&priv->dp_port_lock, flags);
+
if (ret && ret != -EPERM)
dev_err_console(port,
"%s: usb_submit_urb failed, ret=%d, port=%d\n",
__func__, ret, priv->dp_port_num);
+
+ if (wakeup)
+ tty_port_tty_wakeup(&port->port);
}
static int digi_write_room(struct tty_struct *tty)
@@ -1239,7 +1217,6 @@ static int digi_port_init(struct usb_serial_port *port, unsigned port_num)
init_waitqueue_head(&priv->dp_transmit_idle_wait);
init_waitqueue_head(&priv->dp_flush_wait);
init_waitqueue_head(&priv->dp_close_wait);
- INIT_WORK(&priv->dp_wakeup_work, digi_wakeup_write_lock);
priv->dp_port = port;
init_waitqueue_head(&port->write_wait);
@@ -1508,13 +1485,14 @@ static int digi_read_oob_callback(struct urb *urb)
rts = C_CRTSCTS(tty);
if (tty && opcode == DIGI_CMD_READ_INPUT_SIGNALS) {
+ bool wakeup = false;
+
spin_lock_irqsave(&priv->dp_port_lock, flags);
/* convert from digi flags to termiox flags */
if (val & DIGI_READ_INPUT_SIGNALS_CTS) {
priv->dp_modem_signals |= TIOCM_CTS;
- /* port must be open to use tty struct */
if (rts)
- tty_port_tty_wakeup(&port->port);
+ wakeup = true;
} else {
priv->dp_modem_signals &= ~TIOCM_CTS;
/* port must be open to use tty struct */
@@ -1533,6 +1511,9 @@ static int digi_read_oob_callback(struct urb *urb)
priv->dp_modem_signals &= ~TIOCM_CD;
spin_unlock_irqrestore(&priv->dp_port_lock, flags);
+
+ if (wakeup)
+ tty_port_tty_wakeup(&port->port);
} else if (opcode == DIGI_CMD_TRANSMIT_IDLE) {
spin_lock_irqsave(&priv->dp_port_lock, flags);
priv->dp_transmit_idle = 1;
--
2.26.2
This is a bit of a mess, to put it mildly. But, it's a bug
that seems to have gone unnoticed up to now, probably because
nobody uses MPX. The other alternative to this fix is to just
deprecate MPX, even in -stable kernels.
MPX has the arch_unmap() hook inside of munmap() because MPX
uses bounds tables that protect other areas of memory. When
memory is unmapped, there is also a need to unmap the MPX
bounds tables. Barring this, unused bounds tables can eat 80%
of the address space.
But, the recursive do_munmap() that gets called via arch_unmap()
wreaks havoc with __do_munmap()'s state. It can result in
freeing populated page tables, accessing bogus VMA state,
double-freed VMAs and more.
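Roughly, the call chain in question (a sketch of the ordering before this
patch, not an exact trace):

  munmap()
    __do_munmap()
      find_vma(), split, ...
      detach_vmas_to_be_unmapped()
      arch_unmap()
        mpx_notify_unmap()
          ... recursive do_munmap() to free the bounds tables ...
      unmap_region()      <-- now runs against stale VMA/page-table state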
To fix this, call arch_unmap() before __do_munmap() has a chance
to do anything meaningful. Also, remove the 'vma' argument
and force the MPX code to do its own, independent VMA lookup.
For the common success case this is functionally identical to
what was there before. For the munmap() failure case, it's
possible that some MPX tables will be zapped for memory that
continues to be in use. But, this is an extraordinarily
unlikely scenario and the harm would be that MPX provides no
protection since the bounds table got reset (zeroed).
I can't imagine anyone doing this:
    ptr = mmap();
    // use ptr
    ret = munmap(ptr);
    if (ret)
        // oh, there was an error, I'll
        // keep using ptr.
Because if you're doing munmap(), you are *done* with the
memory. There's probably no good data in there _anyway_.
This passes the original reproducer from Richard Biener as
well as the existing mpx selftests/.
====
The long story:
munmap() has a couple of pieces:
1. Find the affected VMA(s)
2. Split the start/end one(s) if necessary
3. Pull the VMAs out of the rbtree
4. Actually zap the memory via unmap_region(), including
freeing page tables (or queueing them to be freed).
5. Fixup some of the accounting (like fput()) and actually
free the VMA itself.
I decided to put the arch_unmap() call right after #3. This
was *just* before mmap_sem looked like it might get downgraded
(it won't in this context), but it looked right. It wasn't.
Richard Biener reported a test that shows this in dmesg:
[1216548.787498] BUG: Bad rss-counter state mm:0000000017ce560b idx:1 val:551
[1216548.787500] BUG: non-zero pgtables_bytes on freeing mm: 24576
What triggered this was the recursive do_munmap() called via
arch_unmap(). It was freeing page tables that had not been
properly zapped.
But, the problem was bigger than this. For one, arch_unmap()
can free VMAs. But, the calling __do_munmap() has variables
that *point* to VMAs and obviously can't handle them just
getting freed while the pointers are still in use.
I tried a couple of things here. First, I tried to fix the page
table freeing problem in isolation, but I then found the VMA
issue. I also tried having the MPX code return a flag if it
modified the rbtree, which would force __do_munmap() to re-walk
the tree and restart. That spiralled out of control in
complexity pretty fast.
Just moving arch_unmap() and accepting that the bonkers failure
case might eat some bounds tables seems like the simplest viable
fix.
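The resulting order of operations (a sketch only; the authoritative change
is the mm/mmap.c hunk below):

  __do_munmap()
    len = PAGE_ALIGN(len);
    end = start + len;
    arch_unmap(mm, start, end);     /* may do its own unmaps first */
    vma = find_vma(mm, start);      /* VMA state is only read after that */
    ...
    detach_vmas_to_be_unmapped(mm, vma, prev, end);
    unmap_region(...);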
Reported-by: Richard Biener <rguenther@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: x86@kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org
---
b/arch/x86/include/asm/mmu_context.h | 6 +++---
b/arch/x86/include/asm/mpx.h | 5 ++---
b/arch/x86/mm/mpx.c | 10 ++++++----
b/include/asm-generic/mm_hooks.h | 1 -
b/mm/mmap.c | 15 ++++++++-------
5 files changed, 19 insertions(+), 18 deletions(-)
diff -puN mm/mmap.c~mpx-rss-pass-no-vma mm/mmap.c
--- a/mm/mmap.c~mpx-rss-pass-no-vma 2019-04-01 06:56:53.409411123 -0700
+++ b/mm/mmap.c 2019-04-01 06:56:53.423411123 -0700
@@ -2731,9 +2731,17 @@ int __do_munmap(struct mm_struct *mm, un
return -EINVAL;
len = PAGE_ALIGN(len);
+ end = start + len;
if (len == 0)
return -EINVAL;
+ /*
+ * arch_unmap() might do unmaps itself. It must be called
+ * and finish any rbtree manipulation before this code
+ * runs and also starts to manipulate the rbtree.
+ */
+ arch_unmap(mm, start, end);
+
/* Find the first overlapping VMA */
vma = find_vma(mm, start);
if (!vma)
@@ -2742,7 +2750,6 @@ int __do_munmap(struct mm_struct *mm, un
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
- end = start + len;
if (vma->vm_start >= end)
return 0;
@@ -2812,12 +2819,6 @@ int __do_munmap(struct mm_struct *mm, un
/* Detach vmas from rbtree */
detach_vmas_to_be_unmapped(mm, vma, prev, end);
- /*
- * mpx unmap needs to be called with mmap_sem held for write.
- * It is safe to call it before unmap_region().
- */
- arch_unmap(mm, vma, start, end);
-
if (downgrade)
downgrade_write(&mm->mmap_sem);
diff -puN arch/x86/include/asm/mmu_context.h~mpx-rss-pass-no-vma arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~mpx-rss-pass-no-vma 2019-04-01 06:56:53.412411123 -0700
+++ b/arch/x86/include/asm/mmu_context.h 2019-04-01 06:56:53.423411123 -0700
@@ -277,8 +277,8 @@ static inline void arch_bprm_mm_init(str
mpx_mm_init(mm);
}
-static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
{
/*
* mpx_notify_unmap() goes and reads a rarely-hot
@@ -298,7 +298,7 @@ static inline void arch_unmap(struct mm_
* consistently wrong.
*/
if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
- mpx_notify_unmap(mm, vma, start, end);
+ mpx_notify_unmap(mm, start, end);
}
/*
diff -puN include/asm-generic/mm_hooks.h~mpx-rss-pass-no-vma include/asm-generic/mm_hooks.h
--- a/include/asm-generic/mm_hooks.h~mpx-rss-pass-no-vma 2019-04-01 06:56:53.414411123 -0700
+++ b/include/asm-generic/mm_hooks.h 2019-04-01 06:56:53.423411123 -0700
@@ -18,7 +18,6 @@ static inline void arch_exit_mmap(struct
}
static inline void arch_unmap(struct mm_struct *mm,
- struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
}
diff -puN arch/x86/mm/mpx.c~mpx-rss-pass-no-vma arch/x86/mm/mpx.c
--- a/arch/x86/mm/mpx.c~mpx-rss-pass-no-vma 2019-04-01 06:56:53.416411123 -0700
+++ b/arch/x86/mm/mpx.c 2019-04-01 06:56:53.423411123 -0700
@@ -881,9 +881,10 @@ static int mpx_unmap_tables(struct mm_st
* the virtual address region start...end have already been split if
* necessary, and the 'vma' is the first vma in this range (start -> end).
*/
-void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+void mpx_notify_unmap(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
{
+ struct vm_area_struct *vma;
int ret;
/*
@@ -902,11 +903,12 @@ void mpx_notify_unmap(struct mm_struct *
* which should not occur normally. Being strict about it here
* helps ensure that we do not have an exploitable stack overflow.
*/
- do {
+ vma = find_vma(mm, start);
+ while (vma && vma->vm_start < end) {
if (vma->vm_flags & VM_MPX)
return;
vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
+ }
ret = mpx_unmap_tables(mm, start, end);
if (ret)
diff -puN arch/x86/include/asm/mpx.h~mpx-rss-pass-no-vma arch/x86/include/asm/mpx.h
--- a/arch/x86/include/asm/mpx.h~mpx-rss-pass-no-vma 2019-04-01 06:56:53.418411123 -0700
+++ b/arch/x86/include/asm/mpx.h 2019-04-01 06:56:53.424411123 -0700
@@ -78,8 +78,8 @@ static inline void mpx_mm_init(struct mm
*/
mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
}
-void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+void mpx_notify_unmap(struct mm_struct *mm, unsigned long start,
+ unsigned long end);
unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
unsigned long flags);
@@ -100,7 +100,6 @@ static inline void mpx_mm_init(struct mm
{
}
static inline void mpx_notify_unmap(struct mm_struct *mm,
- struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
}
_
From: Nicholas Piggin <npiggin@gmail.com>
commit d53c3dfb23c45f7d4f910c3a3ca84bf0a99c6143 upstream.
Reading and modifying current->mm and current->active_mm and switching
mm should be done with irqs off, to prevent races seeing an intermediate
state.
This is similar to commit 38cf307c1f20 ("mm: fix kthread_use_mm() vs TLB
invalidate"). At exec-time when the new mm is activated, the old one
should usually be single-threaded and no longer used, unless something
else is holding an mm_users reference (which may be possible).
Absent other mm_users, there is also a race with preemption and lazy tlb
switching. Consider the kernel_execve case where the current thread is
using a lazy tlb active mm:
  call_usermodehelper()
    kernel_execve()
      old_mm = current->mm;
      active_mm = current->active_mm;
      *** preempt *** -------------------->  schedule()
                                               prev->active_mm = NULL;
                                               mmdrop(prev active_mm);
                                             ...
                      <--------------------  schedule()
      current->mm = mm;
      current->active_mm = mm;
      if (!old_mm)
              mmdrop(active_mm);
If we switch back to the kernel thread from a different mm, there is a
double free of the old active_mm, and a missing free of the new one.
Closing this race only requires interrupts to be disabled while ->mm
and ->active_mm are being switched, but the TLB problem requires also
holding interrupts off over activate_mm. Unfortunately not all archs
can do that yet, e.g., arm defers the switch if irqs are disabled and
expects finish_arch_post_lock_switch() to be called to complete the
flush; um takes a blocking lock in activate_mm().
So as a first step, disable interrupts across the mm/active_mm updates
to close the lazy tlb preempt race, and provide an arch option to
extend that to activate_mm which allows architectures doing IPI based
TLB shootdowns to close the second race.
This is a bit ugly, but in the interest of fixing the bug and backporting
before all architectures are converted this is a compromise.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[mpe: Manual backport to 4.19 due to membarrier_exec_mmap(mm) changes]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200914045219.3736466-2-npiggin@gmail.com
---
arch/Kconfig | 7 +++++++
fs/exec.c | 15 ++++++++++++++-
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index a336548487e6..e3a030f7a722 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -366,6 +366,13 @@ config HAVE_RCU_TABLE_FREE
config HAVE_RCU_TABLE_INVALIDATE
bool
+config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+ bool
+ help
+ Temporary select until all architectures can be converted to have
+ irqs disabled over activate_mm. Architectures that do IPI based TLB
+ shootdowns should enable this.
+
config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
diff --git a/fs/exec.c b/fs/exec.c
index cece8c14f377..52788644c4af 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1028,10 +1028,23 @@ static int exec_mmap(struct mm_struct *mm)
}
}
task_lock(tsk);
+
+ local_irq_disable();
active_mm = tsk->active_mm;
- tsk->mm = mm;
tsk->active_mm = mm;
+ tsk->mm = mm;
+ /*
+ * This prevents preemption while active_mm is being loaded and
+ * it and mm are being updated, which could cause problems for
+ * lazy tlb mm refcounting when these are updated by context
+ * switches. Not all architectures can handle irqs off over
+ * activate_mm yet.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
activate_mm(active_mm, mm);
+ if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);
--
2.25.1