The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 839769c35477d4acc2369e45000ca7b0b6af39a7 Mon Sep 17 00:00:00 2001
From: Max Filippov <jcmvbkbc(a)gmail.com>
Date: Wed, 13 Apr 2022 22:44:36 -0700
Subject: [PATCH] xtensa: fix a7 clobbering in coprocessor context load/store
Fast coprocessor exception handler saves a3..a6, but coprocessor context
load/store code uses a4..a7 as temporaries, potentially clobbering a7.
'Potentially' because coprocessor state load/store macros may not use
all four temporary registers (and neither FPU nor HiFi macros do).

Use a3..a6 as intended.
Cc: stable(a)vger.kernel.org
Fixes: c658eac628aa ("[XTENSA] Add support for configurable registers and coprocessors")
Signed-off-by: Max Filippov <jcmvbkbc(a)gmail.com>
diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S
index 45cc0ae0af6f..c7b9f12896f2 100644
--- a/arch/xtensa/kernel/coprocessor.S
+++ b/arch/xtensa/kernel/coprocessor.S
@@ -29,7 +29,7 @@
.if XTENSA_HAVE_COPROCESSOR(x); \
.align 4; \
.Lsave_cp_regs_cp##x: \
- xchal_cp##x##_store a2 a4 a5 a6 a7; \
+ xchal_cp##x##_store a2 a3 a4 a5 a6; \
jx a0; \
.endif
@@ -46,7 +46,7 @@
.if XTENSA_HAVE_COPROCESSOR(x); \
.align 4; \
.Lload_cp_regs_cp##x: \
- xchal_cp##x##_load a2 a4 a5 a6 a7; \
+ xchal_cp##x##_load a2 a3 a4 a5 a6; \
jx a0; \
.endif
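The effect of the one-register shift is easier to see next to the exception
entry code. A rough sketch of the interaction (illustrative xtensa assembly,
not the literal handler code; the PT_AREG* spill offsets are placeholders):

	/* fast coprocessor exception entry saves only a3..a6 */
	s32i	a3, a2, PT_AREG3
	s32i	a4, a2, PT_AREG4
	s32i	a5, a2, PT_AREG5
	s32i	a6, a2, PT_AREG6

	/* before the fix: the fourth temporary handed to the store
	 * macro is a7, which was never saved, so any configuration
	 * whose generated store sequence uses all four temporaries
	 * silently corrupts the interrupted task's a7 */
	xchal_cp0_store	a2 a4 a5 a6 a7

	/* after the fix: temporaries come only from the saved set */
	xchal_cp0_store	a2 a3 a4 a5 a6

As the commit message notes, neither the FPU nor the HiFi macros use the
fourth temporary, which is why the clobber could go unnoticed on those
configurations.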
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 839769c35477d4acc2369e45000ca7b0b6af39a7 Mon Sep 17 00:00:00 2001
From: Max Filippov <jcmvbkbc(a)gmail.com>
Date: Wed, 13 Apr 2022 22:44:36 -0700
Subject: [PATCH] xtensa: fix a7 clobbering in coprocessor context load/store
Fast coprocessor exception handler saves a3..a6, but coprocessor context
load/store code uses a4..a7 as temporaries, potentially clobbering a7.
'Potentially' because coprocessor state load/store macros may not use
all four temporary registers (and neither FPU nor HiFi macros do).

Use a3..a6 as intended.
Cc: stable(a)vger.kernel.org
Fixes: c658eac628aa ("[XTENSA] Add support for configurable registers and coprocessors")
Signed-off-by: Max Filippov <jcmvbkbc(a)gmail.com>
diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S
index 45cc0ae0af6f..c7b9f12896f2 100644
--- a/arch/xtensa/kernel/coprocessor.S
+++ b/arch/xtensa/kernel/coprocessor.S
@@ -29,7 +29,7 @@
.if XTENSA_HAVE_COPROCESSOR(x); \
.align 4; \
.Lsave_cp_regs_cp##x: \
- xchal_cp##x##_store a2 a4 a5 a6 a7; \
+ xchal_cp##x##_store a2 a3 a4 a5 a6; \
jx a0; \
.endif
@@ -46,7 +46,7 @@
.if XTENSA_HAVE_COPROCESSOR(x); \
.align 4; \
.Lload_cp_regs_cp##x: \
- xchal_cp##x##_load a2 a4 a5 a6 a7; \
+ xchal_cp##x##_load a2 a3 a4 a5 a6; \
jx a0; \
.endif
On 4/22/22 08:47, Damien Le Moal wrote:
> On 4/21/22 10:39, Zheyu Ma wrote:
>> Before detecting the cable type on the dma bar, the driver should check
>> whether the 'bmdma_addr' is zero, which means the adapter does not
>> support DMA, otherwise we will get the following error:
>>
>> [ 5.146634] Bad IO access at port 0x1 (return inb(port))
>> [ 5.147206] WARNING: CPU: 2 PID: 303 at lib/iomap.c:44 ioread8+0x4a/0x60
>> [ 5.150856] RIP: 0010:ioread8+0x4a/0x60
>> [ 5.160238] Call Trace:
>> [ 5.160470] <TASK>
>> [ 5.160674] marvell_cable_detect+0x6e/0xc0 [pata_marvell]
>> [ 5.161728] ata_eh_recover+0x3520/0x6cc0
>> [ 5.168075] ata_do_eh+0x49/0x3c0
>>
>> Signed-off-by: Zheyu Ma <zheyuma97(a)gmail.com>
>> ---
>> Changes in v2:
>> - Delete the useless 'else'
>
> Note for future contributions: The change log should be placed *after*
> the "---" that comes before the "diff" line below. Otherwise, the change
> log pollutes the commit message.
>
> I fixed that and applied to for-5.18-fixes. Thanks.
I completely overlooked that this needs a CC stable...
Greg,
Could you please pick up this commit for stable?
In Linus' tree/rc4, it is:
aafa9f958342 ("ata: pata_marvell: Check the 'bmdma_addr' beforing reading")
Thanks !
>
>> ---
>> drivers/ata/pata_marvell.c | 2 ++
>> 1 file changed, 2 insertions(+)
>>
>> diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c
>> index 0c5a51970fbf..014ccb0f45dc 100644
>> --- a/drivers/ata/pata_marvell.c
>> +++ b/drivers/ata/pata_marvell.c
>> @@ -77,6 +77,8 @@ static int marvell_cable_detect(struct ata_port *ap)
>> switch(ap->port_no)
>> {
>> case 0:
>> + if (!ap->ioaddr.bmdma_addr)
>> + return ATA_CBL_PATA_UNK;
>> if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1)
>> return ATA_CBL_PATA40;
>> return ATA_CBL_PATA80;
>
>
--
Damien Le Moal
Western Digital Research
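With the fix applied, the port-0 branch of marvell_cable_detect() reads
roughly as below (a sketch reconstructed from the hunk above; the other port
cases and the rest of the driver are omitted, and the default branch is
assumed for illustration):

	/* sketch -- not the full driver source */
	static int marvell_cable_detect(struct ata_port *ap)
	{
		switch (ap->port_no) {
		case 0:
			/*
			 * Adapters that do not support DMA have no BMDMA
			 * region, so ap->ioaddr.bmdma_addr is 0 and the
			 * ioread8() below would trip the "Bad IO access"
			 * warning shown in the log above.
			 */
			if (!ap->ioaddr.bmdma_addr)
				return ATA_CBL_PATA_UNK;
			if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1)
				return ATA_CBL_PATA40;
			return ATA_CBL_PATA80;
		default:
			/* remaining ports elided in this sketch */
			return ATA_CBL_PATA_UNK;
		}
	}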
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e4a38402c36e42df28eb1a5394be87e6571fb48a Mon Sep 17 00:00:00 2001
From: Nico Pache <npache(a)redhat.com>
Date: Thu, 21 Apr 2022 16:36:01 -0700
Subject: [PATCH] oom_kill.c: futex: delay the OOM reaper to allow time for
proper futex cleanup
The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which
can be targeted by the oom reaper. This mapping is used to store the
futex robust list head; the kernel does not keep a copy of the robust
list and instead references a userspace address to maintain the
robustness during a process death.

A race can occur between exit_mm and the oom reaper that allows the oom
reaper to free the memory of the futex robust list before the exit path
has handled the futex death:

    CPU1                                CPU2
    --------------------------------------------------------------------
    page_fault
    do_exit "signal"
    wake_oom_reaper
                                        oom_reaper
                                        oom_reap_task_mm (invalidates mm)
    exit_mm
    exit_mm_release
      futex_exit_release
        futex_cleanup
          exit_robust_list
            get_user (EFAULT- can't access memory)

If the get_user EFAULT's, the kernel will be unable to recover the
waiters on the robust_list, leaving userspace mutexes hung indefinitely.

Delay the OOM reaper, allowing more time for the exit path to perform
the futex cleanup.
Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer
Based on a patch by Michal Hocko.
Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1]
Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com
Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: Joel Savitz <jsavitz(a)redhat.com>
Signed-off-by: Nico Pache <npache(a)redhat.com>
Co-developed-by: Joel Savitz <jsavitz(a)redhat.com>
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Acked-by: Thomas Gleixner <tglx(a)linutronix.de>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Rafael Aquini <aquini(a)redhat.com>
Cc: Waiman Long <longman(a)redhat.com>
Cc: Herton R. Krzesinski <herton(a)redhat.com>
Cc: Juri Lelli <juri.lelli(a)redhat.com>
Cc: Vincent Guittot <vincent.guittot(a)linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann(a)arm.com>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Cc: Ben Segall <bsegall(a)google.com>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Daniel Bristot de Oliveira <bristot(a)redhat.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Joel Savitz <jsavitz(a)redhat.com>
Cc: Darren Hart <dvhart(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d5e3c00b74e1..a8911b1f35aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,7 @@ struct task_struct {
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
+ struct timer_list oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7ec38194f8e1..49d7df39b02d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -632,7 +632,7 @@ done:
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
- /* Drop a reference taken by wake_oom_reaper */
+ /* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
}
@@ -644,12 +644,12 @@ static int oom_reaper(void *unused)
struct task_struct *tsk = NULL;
wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
- spin_lock(&oom_reaper_lock);
+ spin_lock_irq(&oom_reaper_lock);
if (oom_reaper_list != NULL) {
tsk = oom_reaper_list;
oom_reaper_list = tsk->oom_reaper_list;
}
- spin_unlock(&oom_reaper_lock);
+ spin_unlock_irq(&oom_reaper_lock);
if (tsk)
oom_reap_task(tsk);
@@ -658,22 +658,48 @@ static int oom_reaper(void *unused)
return 0;
}
-static void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct timer_list *timer)
{
- /* mm is already queued? */
- if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
- return;
+ struct task_struct *tsk = container_of(timer, struct task_struct,
+ oom_reaper_timer);
+ struct mm_struct *mm = tsk->signal->oom_mm;
+ unsigned long flags;
- get_task_struct(tsk);
+ /* The victim managed to terminate on its own - see exit_mmap */
+ if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ put_task_struct(tsk);
+ return;
+ }
- spin_lock(&oom_reaper_lock);
+ spin_lock_irqsave(&oom_reaper_lock, flags);
tsk->oom_reaper_list = oom_reaper_list;
oom_reaper_list = tsk;
- spin_unlock(&oom_reaper_lock);
+ spin_unlock_irqrestore(&oom_reaper_lock, flags);
trace_wake_reaper(tsk->pid);
wake_up(&oom_reaper_wait);
}
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
+ * The timers timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too small, the oom_reaper can
+ * get in the way and release resources needed by the process exit path.
+ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
+{
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+ return;
+
+ get_task_struct(tsk);
+ timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+ tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+ add_timer(&tsk->oom_reaper_timer);
+}
+
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -681,7 +707,7 @@ static int __init oom_init(void)
}
subsys_initcall(oom_init)
#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */
@@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
rcu_read_unlock();
if (can_oom_reap)
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
mmdrop(mm);
put_task_struct(victim);
@@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
task_lock(victim);
if (task_will_free_mem(victim)) {
mark_oom_victim(victim);
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
task_unlock(victim);
put_task_struct(victim);
return;
@@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc)
*/
if (task_will_free_mem(current)) {
mark_oom_victim(current);
- wake_oom_reaper(current);
+ queue_oom_reaper(current);
return true;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e4a38402c36e42df28eb1a5394be87e6571fb48a Mon Sep 17 00:00:00 2001
From: Nico Pache <npache(a)redhat.com>
Date: Thu, 21 Apr 2022 16:36:01 -0700
Subject: [PATCH] oom_kill.c: futex: delay the OOM reaper to allow time for
proper futex cleanup
The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which
can be targeted by the oom reaper. This mapping is used to store the
futex robust list head; the kernel does not keep a copy of the robust
list and instead references a userspace address to maintain the
robustness during a process death.

A race can occur between exit_mm and the oom reaper that allows the oom
reaper to free the memory of the futex robust list before the exit path
has handled the futex death:

    CPU1                                CPU2
    --------------------------------------------------------------------
    page_fault
    do_exit "signal"
    wake_oom_reaper
                                        oom_reaper
                                        oom_reap_task_mm (invalidates mm)
    exit_mm
    exit_mm_release
      futex_exit_release
        futex_cleanup
          exit_robust_list
            get_user (EFAULT- can't access memory)

If the get_user EFAULT's, the kernel will be unable to recover the
waiters on the robust_list, leaving userspace mutexes hung indefinitely.

Delay the OOM reaper, allowing more time for the exit path to perform
the futex cleanup.
Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer
Based on a patch by Michal Hocko.
Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1]
Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com
Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: Joel Savitz <jsavitz(a)redhat.com>
Signed-off-by: Nico Pache <npache(a)redhat.com>
Co-developed-by: Joel Savitz <jsavitz(a)redhat.com>
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Acked-by: Thomas Gleixner <tglx(a)linutronix.de>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Rafael Aquini <aquini(a)redhat.com>
Cc: Waiman Long <longman(a)redhat.com>
Cc: Herton R. Krzesinski <herton(a)redhat.com>
Cc: Juri Lelli <juri.lelli(a)redhat.com>
Cc: Vincent Guittot <vincent.guittot(a)linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann(a)arm.com>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Cc: Ben Segall <bsegall(a)google.com>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Daniel Bristot de Oliveira <bristot(a)redhat.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Joel Savitz <jsavitz(a)redhat.com>
Cc: Darren Hart <dvhart(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d5e3c00b74e1..a8911b1f35aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,7 @@ struct task_struct {
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
+ struct timer_list oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7ec38194f8e1..49d7df39b02d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -632,7 +632,7 @@ done:
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
- /* Drop a reference taken by wake_oom_reaper */
+ /* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
}
@@ -644,12 +644,12 @@ static int oom_reaper(void *unused)
struct task_struct *tsk = NULL;
wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
- spin_lock(&oom_reaper_lock);
+ spin_lock_irq(&oom_reaper_lock);
if (oom_reaper_list != NULL) {
tsk = oom_reaper_list;
oom_reaper_list = tsk->oom_reaper_list;
}
- spin_unlock(&oom_reaper_lock);
+ spin_unlock_irq(&oom_reaper_lock);
if (tsk)
oom_reap_task(tsk);
@@ -658,22 +658,48 @@ static int oom_reaper(void *unused)
return 0;
}
-static void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct timer_list *timer)
{
- /* mm is already queued? */
- if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
- return;
+ struct task_struct *tsk = container_of(timer, struct task_struct,
+ oom_reaper_timer);
+ struct mm_struct *mm = tsk->signal->oom_mm;
+ unsigned long flags;
- get_task_struct(tsk);
+ /* The victim managed to terminate on its own - see exit_mmap */
+ if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ put_task_struct(tsk);
+ return;
+ }
- spin_lock(&oom_reaper_lock);
+ spin_lock_irqsave(&oom_reaper_lock, flags);
tsk->oom_reaper_list = oom_reaper_list;
oom_reaper_list = tsk;
- spin_unlock(&oom_reaper_lock);
+ spin_unlock_irqrestore(&oom_reaper_lock, flags);
trace_wake_reaper(tsk->pid);
wake_up(&oom_reaper_wait);
}
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
+ * The timers timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too small, the oom_reaper can
+ * get in the way and release resources needed by the process exit path.
+ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
+{
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+ return;
+
+ get_task_struct(tsk);
+ timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+ tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+ add_timer(&tsk->oom_reaper_timer);
+}
+
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -681,7 +707,7 @@ static int __init oom_init(void)
}
subsys_initcall(oom_init)
#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */
@@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
rcu_read_unlock();
if (can_oom_reap)
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
mmdrop(mm);
put_task_struct(victim);
@@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
task_lock(victim);
if (task_will_free_mem(victim)) {
mark_oom_victim(victim);
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
task_unlock(victim);
put_task_struct(victim);
return;
@@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc)
*/
if (task_will_free_mem(current)) {
mark_oom_victim(current);
- wake_oom_reaper(current);
+ queue_oom_reaper(current);
return true;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5f24d5a579d1eace79d505b148808a850b417d4c Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Date: Thu, 21 Apr 2022 16:35:46 -0700
Subject: [PATCH] mm, hugetlb: allow for "high" userspace addresses
This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high"
userspace addresses") for hugetlb.

This patch adds support for "high" userspace addresses that are
optionally supported on the system and have to be requested via a hint
mechanism ("high" addr parameter to mmap).
Architectures such as powerpc and x86 achieve this by making changes to
their architectural versions of hugetlb_get_unmapped_area() function.
However, arm64 uses the generic version of that function.

So take into account arch_get_mmap_base() and arch_get_mmap_end() in
hugetlb_get_unmapped_area(). To allow that, move those two macros out
of mm/mmap.c into include/linux/sched/mm.h.

If these macros are not defined in architectural code then they default
to (TASK_SIZE) and (base) so should not introduce any behavioural
changes to architectures that do not define them.

For the time being, only ARM64 is affected by this change.

Catalin (ARM64) said:

 "We should have fixed hugetlb_get_unmapped_area() as well when we added
  support for 52-bit VA. The reason for commit f6795053dac8 was to
  prevent normal mmap() from returning addresses above 48-bit by default
  as some user-space had hard assumptions about this.

  It's a slight ABI change if you do this for hugetlb_get_unmapped_area()
  but I doubt anyone would notice. It's more likely that the current
  behaviour would cause issues, so I'd rather have them consistent.

  Basically when arm64 gained support for 52-bit addresses we did not
  want user-space calling mmap() to suddenly get such high addresses,
  otherwise we could have inadvertently broken some programs (similar
  behaviour to x86 here). Hence we added commit f6795053dac8. But we
  missed hugetlbfs which could still get such high mmap() addresses. So
  in theory that's a potential regression that should have been addressed
  at the same time as commit f6795053dac8 (and before arm64 enabled
  52-bit addresses)"
Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.16500337…
Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses")
Signed-off-by: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Reviewed-by: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Steve Capper <steve.capper(a)arm.com>
Cc: Will Deacon <will.deacon(a)arm.com>
Cc: <stable(a)vger.kernel.org> [5.0.x]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 99c7477cee5c..dd3a088db11d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
addr = vm_unmapped_area(&info);
}
@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (mmap_end - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index a80356e9dc69..1ad1f4bfa025 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -136,6 +136,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
#endif /* CONFIG_MEMCG */
#ifdef CONFIG_MMU
+#ifndef arch_get_mmap_end
+#define arch_get_mmap_end(addr) (TASK_SIZE)
+#endif
+
+#ifndef arch_get_mmap_base
+#define arch_get_mmap_base(addr, base) (base)
+#endif
+
extern void arch_pick_mmap_layout(struct mm_struct *mm,
struct rlimit *rlim_stack);
extern unsigned long
diff --git a/mm/mmap.c b/mm/mmap.c
index 3aa839f81e63..313b57d55a63 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2117,14 +2117,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
return addr;
}
-#ifndef arch_get_mmap_end
-#define arch_get_mmap_end(addr) (TASK_SIZE)
-#endif
-
-#ifndef arch_get_mmap_base
-#define arch_get_mmap_base(addr, base) (base)
-#endif
-
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
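The #ifndef fallbacks moved into include/linux/sched/mm.h are what keep this
change invisible to architectures other than arm64: an arch that defines
nothing gets exactly the old TASK_SIZE and mmap_base limits. For context on
the one affected architecture, arm64's overrides (approximately, per the
original commit f6795053dac8) key off the caller's hint address, raising the
ceiling to the full TASK_SIZE only when the caller explicitly asks for an
address above the default 48-bit window:

	/* arch/arm64/include/asm/processor.h, approximately */
	#define arch_get_mmap_end(addr) \
		((addr) > DEFAULT_MAP_WINDOW ? TASK_SIZE : DEFAULT_MAP_WINDOW)

	#define arch_get_mmap_base(addr, base) \
		((addr) > DEFAULT_MAP_WINDOW ? \
			(base) + TASK_SIZE - DEFAULT_MAP_WINDOW : (base))

With the hunks above, hugetlbfs mappings follow the same convention as
regular mmap(): no hint (or a low one) keeps allocations under the default
window, while a high hint opts in to the full 52-bit address space.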