The function brcmf_usb_dl_writeimage() calls the function
brcmf_usb_dl_cmd() but does not check its return value. The
'state.state' and the 'state.bytes' are uninitialized if the
function brcmf_usb_dl_cmd() fails. It is dangerous to use
uninitialized variables in the conditions.
Add error handling for brcmf_usb_dl_cmd() to jump to error
handling path if the brcmf_usb_dl_cmd() fails and the
'state.state' and the 'state.bytes' are uninitialized.
Fixes: 71bb244ba2fd ("brcm80211: fmac: add USB support for bcm43235/6/8 chipsets")
Cc: stable(a)vger.kernel.org # v3.4+
Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn>
---
drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
index 50dddac8a2ab..1c97cd777225 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
@@ -901,7 +901,9 @@ brcmf_usb_dl_writeimage(struct brcmf_usbdev_info *devinfo, u8 *fw, int fwlen)
}
/* 1) Prepare USB boot loader for runtime image */
- brcmf_usb_dl_cmd(devinfo, DL_START, &state, sizeof(state));
+ err = brcmf_usb_dl_cmd(devinfo, DL_START, &state, sizeof(state));
+ if (err)
+ goto fail;
rdlstate = le32_to_cpu(state.state);
rdlbytes = le32_to_cpu(state.bytes);
--
2.42.0.windows.2
When a CPU chooses to call push_dl_task and picks a task to push to
another CPU's runqueue then it will call find_lock_later_rq method
which would take a double lock on both CPUs' runqueues. If one of the
locks isn't readily available, it may lead to dropping the current
runqueue lock and reacquiring both the locks at once. During this window
it is possible that the task is already migrated and is running on some
other CPU. These cases are already handled. However, if the task is
migrated and has already been executed and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqueue
(on_rq is 1) and also if the task was run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
Please go through the original rt change for more details on the issue.
To fix this, after the lock is obtained inside the find_lock_later_rq,
it ensures that the task is still at the head of pushable tasks list.
Also removed some checks that are no longer needed with the addition of
this new check.
However, the new check of pushable tasks list only applies when
find_lock_later_rq is called by push_dl_task. For the other caller i.e.
dl_task_offline_migration, existing checks are used.
Signed-off-by: Harshit Agarwal <harshit(a)nutanix.com>
Cc: stable(a)vger.kernel.org
---
Changes in v3:
- Incorporated review comments from Juri around the commit message as
well as around the comment regarding checks in find_lock_later_rq.
- Link to v2:
https://lore.kernel.org/stable/20250317022325.52791-1-harshit@nutanix.com/
Changes in v2:
- As per Juri's suggestion, moved the check inside find_lock_later_rq
similar to rt change. Here we distinguish among the push_dl_task
caller vs dl_task_offline_migration by checking if the task is
throttled or not.
- Fixed the commit message to refer to the rt change by title.
- Link to v1:
https://lore.kernel.org/lkml/20250307204255.60640-1-harshit@nutanix.com/
---
kernel/sched/deadline.c | 73 +++++++++++++++++++++++++++--------------
1 file changed, 49 insertions(+), 24 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 38e4537790af..e0c95f33e1ed 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2621,6 +2621,25 @@ static int find_later_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
+
+ return p;
+}
+
/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
@@ -2648,12 +2667,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
+ /*
+ * double_lock_balance had to release rq->lock, in the
+ * meantime, task may no longer be fit to be migrated.
+ * Check the following to ensure that the task is
+ * still suitable for migration:
+ * 1. It is possible the task was scheduled,
+ * migrate_disabled was set and then got preempted,
+ * so we must check the task migration disable
+ * flag.
+ * 2. The CPU picked is in the task's affinity.
+ * 3. For throttled task (dl_task_offline_migration),
+ * check the following:
+ * - the task is not on the rq anymore (it was
+ * migrated)
+ * - the task is not on CPU anymore
+ * - the task is still a dl task
+ * - the task is not queued on the rq anymore
+ * 4. For the non-throttled task (push_dl_task), the
+ * check to ensure that this task is still at the
+ * head of the pushable tasks list is enough.
+ */
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !dl_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ (task->dl.dl_throttled &&
+ (task_rq(task) != rq ||
+ task_on_cpu(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) ||
+ (!task->dl.dl_throttled &&
+ task != pick_next_pushable_dl_task(rq)))) {
+
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -2676,25 +2720,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
return later_rq;
}
-static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_dl_tasks(rq))
- return NULL;
-
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
-
- WARN_ON_ONCE(rq->cpu != task_cpu(p));
- WARN_ON_ONCE(task_current(rq, p));
- WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
-
- WARN_ON_ONCE(!task_on_rq_queued(p));
- WARN_ON_ONCE(!dl_task(p));
-
- return p;
-}
-
/*
* See if the non running -deadline tasks on this rq
* can be sent to some other CPU where they can preempt
--
2.49.0.111.g5b97a56fa0
A recent optimization change in LLVM [1] aims to transform certain loop
idioms into calls to strlen() or wcslen(). This change transforms the
first while loop in UniStrcat() into a call to wcslen(), breaking the
build when UniStrcat() gets inlined into alloc_path_with_tree_prefix():
ld.lld: error: undefined symbol: wcslen
>>> referenced by nls_ucs2_utils.h:54 (fs/smb/client/../../nls/nls_ucs2_utils.h:54)
>>> vmlinux.o:(alloc_path_with_tree_prefix)
>>> referenced by nls_ucs2_utils.h:54 (fs/smb/client/../../nls/nls_ucs2_utils.h:54)
>>> vmlinux.o:(alloc_path_with_tree_prefix)
Disable this optimization with '-fno-builtin-wcslen', which prevents the
compiler from assuming that wcslen() is available in the kernel's C
library.
Cc: stable(a)vger.kernel.org
Link: https://github.com/llvm/llvm-project/commit/9694844d7e36fd5e01011ab56b64f27… [1]
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
Makefile | 3 +++
1 file changed, 3 insertions(+)
diff --git a/Makefile b/Makefile
index 38689a0c3605..f42418556507 100644
--- a/Makefile
+++ b/Makefile
@@ -1068,6 +1068,9 @@ ifdef CONFIG_CC_IS_GCC
KBUILD_CFLAGS += -fconserve-stack
endif
+# Ensure compilers do not transform certain loops into calls to wcslen()
+KBUILD_CFLAGS += -fno-builtin-wcslen
+
# change __FILE__ to the relative path to the source directory
ifdef building_out_of_srctree
KBUILD_CPPFLAGS += $(call cc-option,-ffile-prefix-map=$(srcroot)/=)
---
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
change-id: 20250407-fno-builtin-wcslen-90a858ae7d54
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
Recently, during a debugging session using local MPTCP connections, I
noticed MPJoinAckHMacFailure was strangely not zero on the server side.
The first patch fixes this issue -- present since v5.9 -- and the second
one validates it in the selftests.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Matthieu Baerts (NGI0) (2):
mptcp: only inc MPJoinAckHMacFailure for HMAC failures
selftests: mptcp: validate MPJoin HMacFailure counters
net/mptcp/subflow.c | 8 ++++++--
tools/testing/selftests/net/mptcp/mptcp_join.sh | 18 ++++++++++++++++++
2 files changed, 24 insertions(+), 2 deletions(-)
---
base-commit: 61f96e684edd28ca40555ec49ea1555df31ba619
change-id: 20250407-net-mptcp-hmac-failure-mib-66f599305ff3
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
The patch titled
Subject: mm: protect kernel pgtables in apply_to_pte_range()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alexander Gordeev <agordeev(a)linux.ibm.com>
Subject: mm: protect kernel pgtables in apply_to_pte_range()
Date: Tue, 8 Apr 2025 18:07:32 +0200
The lazy MMU mode can only be entered and left under the protection of the
page table locks for all page tables which may be modified. Yet, when it
comes to kernel mappings apply_to_pte_range() does not take any locks.
That does not conform to arch_enter|leave_lazy_mmu_mode() semantics and could
potentially lead to rescheduling a process while in lazy MMU mode or
racing on kernel page table updates.
Link: https://lkml.kernel.org/r/ef8f6538b83b7fc3372602f90375348f9b4f3596.17441281…
Fixes: 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates")
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Guenetr Roeck <linux(a)roeck-us.net>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jeremy Fitzhardinge <jeremy(a)goop.org>
Cc: Juegren Gross <jgross(a)suse.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kasan/shadow.c | 7 ++-----
mm/memory.c | 5 ++++-
2 files changed, 6 insertions(+), 6 deletions(-)
--- a/mm/kasan/shadow.c~mm-protect-kernel-pgtables-in-apply_to_pte_range
+++ a/mm/kasan/shadow.c
@@ -308,14 +308,14 @@ static int kasan_populate_vmalloc_pte(pt
__memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
- spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
page = 0;
}
- spin_unlock(&init_mm.page_table_lock);
+
if (page)
free_page(page);
+
return 0;
}
@@ -401,13 +401,10 @@ static int kasan_depopulate_vmalloc_pte(
page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
- spin_lock(&init_mm.page_table_lock);
-
if (likely(!pte_none(ptep_get(ptep)))) {
pte_clear(&init_mm, addr, ptep);
free_page(page);
}
- spin_unlock(&init_mm.page_table_lock);
return 0;
}
--- a/mm/memory.c~mm-protect-kernel-pgtables-in-apply_to_pte_range
+++ a/mm/memory.c
@@ -2926,6 +2926,7 @@ static int apply_to_pte_range(struct mm_
pte = pte_offset_kernel(pmd, addr);
if (!pte)
return err;
+ spin_lock(&init_mm.page_table_lock);
} else {
if (create)
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
@@ -2951,7 +2952,9 @@ static int apply_to_pte_range(struct mm_
arch_leave_lazy_mmu_mode();
- if (mm != &init_mm)
+ if (mm == &init_mm)
+ spin_unlock(&init_mm.page_table_lock);
+ else
pte_unmap_unlock(mapped_pte, ptl);
*mask |= PGTBL_PTE_MODIFIED;
_
Patches currently in -mm which might be from agordeev(a)linux.ibm.com are
kasan-avoid-sleepable-page-allocation-from-atomic-context.patch
mm-cleanup-apply_to_pte_range-routine.patch
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
The patch titled
Subject: mm: clean up apply_to_pte_range()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-cleanup-apply_to_pte_range-routine.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alexander Gordeev <agordeev(a)linux.ibm.com>
Subject: mm: clean up apply_to_pte_range()
Date: Tue, 8 Apr 2025 18:07:31 +0200
Reverse 'create' vs 'mm == &init_mm' conditions and move page table mask
modification out of the atomic context. This is a prerequisite for fixing
missing kernel page tables lock.
Link: https://lkml.kernel.org/r/0c65bc334f17ff1d7d92d31c69d7065769bbce4e.17441281…
Fixes: 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates")
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Guenetr Roeck <linux(a)roeck-us.net>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jeremy Fitzhardinge <jeremy(a)goop.org>
Cc: Juegren Gross <jgross(a)suse.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 28 +++++++++++++++++-----------
1 file changed, 17 insertions(+), 11 deletions(-)
--- a/mm/memory.c~mm-cleanup-apply_to_pte_range-routine
+++ a/mm/memory.c
@@ -2915,24 +2915,28 @@ static int apply_to_pte_range(struct mm_
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
+ int err = create ? -ENOMEM : -EINVAL;
pte_t *pte, *mapped_pte;
- int err = 0;
spinlock_t *ptl;
- if (create) {
- mapped_pte = pte = (mm == &init_mm) ?
- pte_alloc_kernel_track(pmd, addr, mask) :
- pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (mm == &init_mm) {
+ if (create)
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ else
+ pte = pte_offset_kernel(pmd, addr);
if (!pte)
- return -ENOMEM;
+ return err;
} else {
- mapped_pte = pte = (mm == &init_mm) ?
- pte_offset_kernel(pmd, addr) :
- pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (create)
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ else
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- return -EINVAL;
+ return err;
+ mapped_pte = pte;
}
+ err = 0;
arch_enter_lazy_mmu_mode();
if (fn) {
@@ -2944,12 +2948,14 @@ static int apply_to_pte_range(struct mm_
}
} while (addr += PAGE_SIZE, addr != end);
}
- *mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
if (mm != &init_mm)
pte_unmap_unlock(mapped_pte, ptl);
+
+ *mask |= PGTBL_PTE_MODIFIED;
+
return err;
}
_
Patches currently in -mm which might be from agordeev(a)linux.ibm.com are
kasan-avoid-sleepable-page-allocation-from-atomic-context.patch
mm-cleanup-apply_to_pte_range-routine.patch
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
The patch titled
Subject: kasan: avoid sleepable page allocation from atomic context
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
kasan-avoid-sleepable-page-allocation-from-atomic-context.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alexander Gordeev <agordeev(a)linux.ibm.com>
Subject: kasan: avoid sleepable page allocation from atomic context
Date: Tue, 8 Apr 2025 18:07:30 +0200
Patch series "mm: Fix apply_to_pte_range() vs lazy MMU mode", v2.
This series is an attempt to fix the violation of lazy MMU mode context
requirement as described for arch_enter_lazy_mmu_mode():
This mode can only be entered and left under the protection of
the page table locks for all page tables which may be modified.
On s390 if I make arch_enter_lazy_mmu_mode() -> preempt_enable() and
arch_leave_lazy_mmu_mode() -> preempt_disable() I am getting this:
[ 553.332108] preempt_count: 1, expected: 0
[ 553.332117] no locks held by multipathd/2116.
[ 553.332128] CPU: 24 PID: 2116 Comm: multipathd Kdump: loaded Tainted:
[ 553.332139] Hardware name: IBM 3931 A01 701 (LPAR)
[ 553.332146] Call Trace:
[ 553.332152] [<00000000158de23a>] dump_stack_lvl+0xfa/0x150
[ 553.332167] [<0000000013e10d12>] __might_resched+0x57a/0x5e8
[ 553.332178] [<00000000144eb6c2>] __alloc_pages+0x2ba/0x7c0
[ 553.332189] [<00000000144d5cdc>] __get_free_pages+0x2c/0x88
[ 553.332198] [<00000000145663f6>] kasan_populate_vmalloc_pte+0x4e/0x110
[ 553.332207] [<000000001447625c>] apply_to_pte_range+0x164/0x3c8
[ 553.332218] [<000000001448125a>] apply_to_pmd_range+0xda/0x318
[ 553.332226] [<000000001448181c>] __apply_to_page_range+0x384/0x768
[ 553.332233] [<0000000014481c28>] apply_to_page_range+0x28/0x38
[ 553.332241] [<00000000145665da>] kasan_populate_vmalloc+0x82/0x98
[ 553.332249] [<00000000144c88d0>] alloc_vmap_area+0x590/0x1c90
[ 553.332257] [<00000000144ca108>] __get_vm_area_node.constprop.0+0x138/0x260
[ 553.332265] [<00000000144d17fc>] __vmalloc_node_range+0x134/0x360
[ 553.332274] [<0000000013d5dbf2>] alloc_thread_stack_node+0x112/0x378
[ 553.332284] [<0000000013d62726>] dup_task_struct+0x66/0x430
[ 553.332293] [<0000000013d63962>] copy_process+0x432/0x4b80
[ 553.332302] [<0000000013d68300>] kernel_clone+0xf0/0x7d0
[ 553.332311] [<0000000013d68bd6>] __do_sys_clone+0xae/0xc8
[ 553.332400] [<0000000013d68dee>] __s390x_sys_clone+0xd6/0x118
[ 553.332410] [<0000000013c9d34c>] do_syscall+0x22c/0x328
[ 553.332419] [<00000000158e7366>] __do_syscall+0xce/0xf0
[ 553.332428] [<0000000015913260>] system_call+0x70/0x98
This exposes a KASAN issue fixed with patch 1 and apply_to_pte_range()
issue fixed with patch 3, while patch 2 is a prerequisite.
Commit b9ef323ea168 ("powerpc/64s: Disable preemption in hash lazy mmu
mode") looks like a powerpc-only fix, yet not entirely conforming to the
above provided requirement (page tables themselves are still not protected).
If I am not mistaken, xen and sparc are alike.
This patch (of 3):
apply_to_page_range() enters lazy MMU mode and then invokes
kasan_populate_vmalloc_pte() callback on each page table walk iteration.
The lazy MMU mode may only be entered under protection of the page
table lock. However, the callback can go into sleep when trying to
allocate a single page.
Change __get_free_page() allocation mode from GFP_KERNEL to GFP_ATOMIC to
avoid scheduling out while in atomic context.
Link: https://lkml.kernel.org/r/cover.1744128123.git.agordeev@linux.ibm.com
Link: https://lkml.kernel.org/r/2d9f4ac4528701b59d511a379a60107fa608ad30.17441281…
Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Signed-off-by: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
Cc: Guenetr Roeck <linux(a)roeck-us.net>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Jeremy Fitzhardinge <jeremy(a)goop.org>
Cc: Juegren Gross <jgross(a)suse.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kasan/shadow.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/kasan/shadow.c~kasan-avoid-sleepable-page-allocation-from-atomic-context
+++ a/mm/kasan/shadow.c
@@ -301,7 +301,7 @@ static int kasan_populate_vmalloc_pte(pt
if (likely(!pte_none(ptep_get(ptep))))
return 0;
- page = __get_free_page(GFP_KERNEL);
+ page = __get_free_page(GFP_ATOMIC);
if (!page)
return -ENOMEM;
_
Patches currently in -mm which might be from agordeev(a)linux.ibm.com are
kasan-avoid-sleepable-page-allocation-from-atomic-context.patch
mm-cleanup-apply_to_pte_range-routine.patch
mm-protect-kernel-pgtables-in-apply_to_pte_range.patch
Hi folks,
This series fixes support for correctly saving and restoring fltcon0
and fltcon1 registers on gs101 for non-alive banks where the fltcon
register offset is not at a fixed offset (unlike previous SoCs).
This is done by adding an eint_fltcon_offset and providing GS101
specific pin macros that take an additional parameter (similar to
how exynosautov920 handles its eint_con_offset).
Additionally the SoC specific suspend and resume callbacks are
re-factored so that each SoC variant has its own callback containing
the peculiarities for that SoC.
Finally support for filter selection on alive banks is added, this is
currently only enabled for gs101. The code path can be exercised using
`echo mem > /sys/power/state`
regards,
Peter
To: Krzysztof Kozlowski <krzk(a)kernel.org>
To: Sylwester Nawrocki <s.nawrocki(a)samsung.com>
To: Alim Akhtar <alim.akhtar(a)samsung.com>
To: Linus Walleij <linus.walleij(a)linaro.org>
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: linux-samsung-soc(a)vger.kernel.org
Cc: linux-gpio(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: andre.draszik(a)linaro.org
Cc: tudor.ambarus(a)linaro.org
Cc: willmcvicker(a)google.com
Cc: semen.protsenko(a)linaro.org
Cc: kernel-team(a)android.com
Cc: jaewon02.kim(a)samsung.com
Signed-off-by: Peter Griffin <peter.griffin(a)linaro.org>
---
Changes in v6:
- Make drvdata->suspend/resume symmetrically reversed (Krzysztof)
- rebase on linux-next
- Link to v5: https://lore.kernel.org/r/20250312-pinctrl-fltcon-suspend-v5-0-d98d5b271242…
Changes in v5:
- Split drvdata suspend & resume callbacks into a dedicated patch (Krzysztof)
- Add comment about stable dependency (Krzysztof)
- Add back in {} braces (Krzysztof)
- Link to v4: https://lore.kernel.org/r/20250307-pinctrl-fltcon-suspend-v4-0-2d775e486036…
Changes in v4:
- save->eint_fltcon1 is an argument to pr_debug(), not readl() change alignment accordingly (Andre)
- Link to v3: https://lore.kernel.org/r/20250306-pinctrl-fltcon-suspend-v3-0-f9ab4ff6a24e…
Changes in v3:
- Ensure EXYNOS_FLTCON_DIGITAL bit is cleared (Andre)
- Make it obvious that exynos_eint_set_filter() is conditional on bank type (Andre)
- Make it obvious exynos_set_wakeup() is conditional on bank type (Andre)
- Align style where the '+' is placed first (Andre)
- Remove unnecessary braces (Andre)
- Link to v2: https://lore.kernel.org/r/20250301-pinctrl-fltcon-suspend-v2-0-a7eef9bb443b…
Changes in v2:
- Remove eint_flt_selectable bool as it can be deduced from EINT_TYPE_WKUP (Peter)
- Move filter config register comment to header file (Andre)
- Rename EXYNOS_FLTCON_DELAY to EXYNOS_FLTCON_ANALOG (Andre)
- Remove misleading old comment (Andre)
- Refactor exynos_eint_update_flt_reg() into a loop (Andre)
- Split refactor of suspend/resume callbacks & gs101 parts into separate patches (Andre)
- Link to v1: https://lore.kernel.org/r/20250120-pinctrl-fltcon-suspend-v1-0-e77900b2a854…
---
Peter Griffin (4):
pinctrl: samsung: refactor drvdata suspend & resume callbacks
pinctrl: samsung: add dedicated SoC eint suspend/resume callbacks
pinctrl: samsung: add gs101 specific eint suspend/resume callbacks
pinctrl: samsung: Add filter selection support for alive bank on gs101
drivers/pinctrl/samsung/pinctrl-exynos-arm64.c | 52 ++---
drivers/pinctrl/samsung/pinctrl-exynos.c | 294 +++++++++++++++----------
drivers/pinctrl/samsung/pinctrl-exynos.h | 28 ++-
drivers/pinctrl/samsung/pinctrl-samsung.c | 21 +-
drivers/pinctrl/samsung/pinctrl-samsung.h | 8 +-
5 files changed, 252 insertions(+), 151 deletions(-)
---
base-commit: cd37a617b4bfb43f84dbbf8058317b487f5203ae
change-id: 20250120-pinctrl-fltcon-suspend-2333a137c4d4
Best regards,
--
Peter Griffin <peter.griffin(a)linaro.org>
Overview
========
When a CPU chooses to call push_rt_task and picks a task to push to
another CPU's runqueue then it will call find_lock_lowest_rq method
which would take a double lock on both CPUs' runqueues. If one of the
locks isn't readily available, it may lead to dropping the current
runqueue lock and reacquiring both the locks at once. During this window
it is possible that the task is already migrated and is running on some
other CPU. These cases are already handled. However, if the task is
migrated and has already been executed and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqueue
(on_rq is 1) and also if the task was run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
Crashes
=======
This bug resulted in quite a few flavors of crashes triggering kernel
panics with various crash signatures such as assert failures, page
faults, null pointer dereferences, and queue corruption errors all
coming from scheduler itself.
Some of the crashes:
-> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? pick_next_task_rt+0x6e/0x1d0
? do_error_trap+0x64/0xa0
? pick_next_task_rt+0x6e/0x1d0
? exc_invalid_op+0x4c/0x60
? pick_next_task_rt+0x6e/0x1d0
? asm_exc_invalid_op+0x12/0x20
? pick_next_task_rt+0x6e/0x1d0
__schedule+0x5cb/0x790
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: kernel NULL pointer dereference, address: 00000000000000c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? __warn+0x8a/0xe0
? exc_page_fault+0x3d6/0x520
? asm_exc_page_fault+0x1e/0x30
? pick_next_task_rt+0xb5/0x1d0
? pick_next_task_rt+0x8c/0x1d0
__schedule+0x583/0x7e0
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: unable to handle page fault for address: ffff9464daea5900
kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p))
-> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? dequeue_top_rt_rq+0xa2/0xb0
? do_error_trap+0x64/0xa0
? dequeue_top_rt_rq+0xa2/0xb0
? exc_invalid_op+0x4c/0x60
? dequeue_top_rt_rq+0xa2/0xb0
? asm_exc_invalid_op+0x12/0x20
? dequeue_top_rt_rq+0xa2/0xb0
dequeue_rt_entity+0x1f/0x70
dequeue_task_rt+0x2d/0x70
__schedule+0x1a8/0x7e0
? blk_finish_plug+0x25/0x40
schedule+0x3c/0xb0
futex_wait_queue_me+0xb6/0x120
futex_wait+0xd9/0x240
do_futex+0x344/0xa90
? get_mm_exe_file+0x30/0x60
? audit_exe_compare+0x58/0x70
? audit_filter_rules.constprop.26+0x65e/0x1220
__x64_sys_futex+0x148/0x1f0
do_syscall_64+0x30/0x80
entry_SYSCALL_64_after_hwframe+0x62/0xc7
-> BUG: unable to handle page fault for address: ffff8cf3608bc2c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? spurious_kernel_fault+0x171/0x1c0
? exc_page_fault+0x3b6/0x520
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? asm_exc_page_fault+0x1e/0x30
? _cond_resched+0x15/0x30
? futex_wait_queue_me+0xc8/0x120
? futex_wait+0xd9/0x240
? try_to_wake_up+0x1b8/0x490
? futex_wake+0x78/0x160
? do_futex+0xcd/0xa90
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? plist_del+0x6a/0xd0
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? dequeue_pushable_task+0x20/0x70
? __schedule+0x382/0x7e0
? asm_sysvec_reschedule_ipi+0xa/0x20
? schedule+0x3c/0xb0
? exit_to_user_mode_prepare+0x9e/0x150
? irqentry_exit_to_user_mode+0x5/0x30
? asm_sysvec_reschedule_ipi+0x12/0x20
Above are some of the common examples of the crashes that were observed
due to this issue.
Details
=======
Let's look at the following scenario to understand this race.
1) CPU A enters push_rt_task
a) CPU A has chosen next_task = task p.
b) CPU A calls find_lock_lowest_rq(Task p, CPU Z’s rq).
c) CPU A identifies CPU X as a destination CPU (X < Z).
d) CPU A enters double_lock_balance(CPU Z’s rq, CPU X’s rq).
e) Since X is lower than Z, CPU A unlocks CPU Z’s rq. Someone else has
locked CPU X’s rq, and thus, CPU A must wait.
2) At CPU Z
a) Previous task has completed execution and thus, CPU Z enters
schedule, locks its own rq after CPU A releases it.
b) CPU Z dequeues previous task and begins executing task p.
c) CPU Z unlocks its rq.
d) Task p yields the CPU (ex. by doing IO or waiting to acquire a
lock) which triggers the schedule function on CPU Z.
e) CPU Z enters schedule again, locks its own rq, and dequeues task p.
f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq.
3) At CPU B
a) CPU B enters try_to_wake_up with input task p.
b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates
p.state = WAKING.
c) CPU B via select_task_rq determines CPU Y as the target CPU.
4) The race
a) CPU A acquires CPU X’s lock and relocks CPU Z.
b) CPU A reads task p.cpu = Z and incorrectly concludes task p is
still on CPU Z.
c) CPU A failed to notice task p had been dequeued from CPU Z while
CPU A was waiting for locks in double_lock_balance. If CPU A knew
that task p had been dequeued, it would return NULL forcing
push_rt_task to give up the task p's migration.
d) CPU B updates task p.cpu = Y and calls ttwu_queue.
e) CPU B locks Y's rq. CPU B enqueues task p onto Y and sets task
p.on_rq = 1.
f) CPU B unlocks CPU Y, triggering memory synchronization.
g) CPU A reads task p.on_rq = 1, cementing its assumption that task p
has not migrated.
h) CPU A decides to migrate p to CPU X.
This leads to A dequeuing p from Y's queue and various crashes down the
line.
Solution
========
The solution here is fairly simple. After obtaining the lock (at 4a),
the check is enhanced to make sure that the task is still at the head of
the pushable tasks list. If not, then it is anyway not suitable for
being pushed out.
Testing
=======
The fix is tested on a cluster of 3 nodes, where the panics due to this
are hit every couple of days. A fix similar to this was deployed on such
cluster and was stable for more than 30 days.
Co-developed-by: Jon Kohler <jon(a)nutanix.com>
Signed-off-by: Jon Kohler <jon(a)nutanix.com>
Co-developed-by: Gauri Patwardhan <gauri.patwardhan(a)nutanix.com>
Signed-off-by: Gauri Patwardhan <gauri.patwardhan(a)nutanix.com>
Co-developed-by: Rahul Chunduru <rahul.chunduru(a)nutanix.com>
Signed-off-by: Rahul Chunduru <rahul.chunduru(a)nutanix.com>
Signed-off-by: Harshit Agarwal <harshit(a)nutanix.com>
Tested-by: Will Ton <william.ton(a)nutanix.com>
Reviewed-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
Cc: stable(a)vger.kernel.org
---
Changes in v2:
- As per Steve's suggestion, removed some checks that are done after
obtaining the lock that are no longer needed with the addition of new
check.
- Moved up is_migration_disabled check.
- Link to v1:
https://lore.kernel.org/lkml/20250211054646.23987-1-harshit@nutanix.com/
Changes in v3:
- Updated commit message to add stable maintainers and reviewed-by tag.
- Link to v2:
https://lore.kernel.org/lkml/20250214170844.201692-1-harshit@nutanix.com/
---
kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 28 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4b8e33c615b1..4762dd3f50c5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1885,6 +1885,27 @@ static int find_lowest_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
@@ -1915,18 +1936,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
/*
* We had to unlock the run queue. In
* the mean time, task could have
- * migrated already or had its affinity changed.
- * Also make sure that it wasn't scheduled on its rq.
+ * migrated already or had its affinity changed,
+ * therefore check if the task is still at the
+ * head of the pushable tasks list.
* It is possible the task was scheduled, set
* "migrate_disabled" and then got preempted, so we must
* check the task migration disable flag here too.
*/
- if (unlikely(task_rq(task) != rq ||
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !rt_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ task != pick_next_pushable_task(rq))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1946,27 +1965,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_tasks(rq))
- return NULL;
-
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(task_current_donor(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
-
- return p;
-}
-
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
--
2.22.3
Replace kzalloc with kvzalloc for the exit_dump buffer allocation, which
can require large contiguous memory depending on the implementation.
This change prevents allocation failures by allowing the system to fall
back to vmalloc when contiguous memory allocation fails.
Since this buffer is only used for debugging purposes, physical memory
contiguity is not required, making vmalloc a suitable alternative.
Cc: stable(a)vger.kernel.org
Fixes: 07814a9439a3b0 ("sched_ext: Print debug dump after an error exit")
Suggested-by: Rik van Riel <riel(a)surriel.com>
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Acked-by: Andrea Righi <arighi(a)nvidia.com>
---
Changes in v3:
- Rewording the patch message
- Link to v2: https://lore.kernel.org/r/20250408-scx-v2-1-1979fc040903@debian.org
Changes in v2:
- Use kvfree() on the free path as well.
- Link to v1: https://lore.kernel.org/r/20250407-scx-v1-1-774ba74a2c17@debian.org
---
kernel/sched/ext.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 66bcd40a28ca1..db9af6a3c04fd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4623,7 +4623,7 @@ static void scx_ops_bypass(bool bypass)
static void free_exit_info(struct scx_exit_info *ei)
{
- kfree(ei->dump);
+ kvfree(ei->dump);
kfree(ei->msg);
kfree(ei->bt);
kfree(ei);
@@ -4639,7 +4639,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
- ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+ ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
if (!ei->bt || !ei->msg || !ei->dump) {
free_exit_info(ei);
---
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
change-id: 20250407-scx-11dbf94803c3
Best regards,
--
Breno Leitao <leitao(a)debian.org>