This commit addresses a rarely observed endpoint command timeout
which causes a kernel panic (from the WARN) when 'panic_on_warn' is
enabled, and an unnecessary call trace print when 'panic_on_warn' is
disabled. It is seen during fast software-controlled connect/disconnect
test cases. The following call stacks show one such endpoint command
timeout that we observed:
1. Connect
=======
->dwc3_thread_interrupt
->dwc3_ep0_interrupt
->configfs_composite_setup
->composite_setup
->usb_ep_queue
->dwc3_gadget_ep0_queue
->__dwc3_gadget_ep0_queue
->__dwc3_ep0_do_control_data
->dwc3_send_gadget_ep_cmd
2. Disconnect
==========
->dwc3_thread_interrupt
->dwc3_gadget_disconnect_interrupt
->dwc3_ep0_reset_state
->dwc3_ep0_end_control_data
->dwc3_send_gadget_ep_cmd
In the issue scenario, on Exynos platforms, we observed that control
transfers for the previous connect had not yet completed, and both the
End Transfer command sent as part of the disconnect sequence and the
processing of the USB_ENDPOINT_HALT feature request from the host timed
out. This may be an expected scenario, since the controller is still
processing EP commands sent as part of the previous connect. It is
therefore better to remove the WARN_ON in all places where device
endpoint commands are sent, to avoid unnecessary kernel panics due to
warnings.
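For context on the severity: when panic_on_warn is set, the core WARN
machinery escalates every warning to a panic. A simplified sketch of
that escalation (modeled on check_panic_on_warn() in kernel/panic.c;
simplified here, not a verbatim quote):

/* Every WARN path ends up here, so a WARN_ON() around a timed-out
 * endpoint command becomes a full kernel panic when panic_on_warn
 * is set. */
void check_panic_on_warn(const char *origin)
{
	if (panic_on_warn)
		panic("%s: panic_on_warn set ...\n", origin);
}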
Cc: stable(a)vger.kernel.org
Signed-off-by: Akash M <akash.m5(a)samsung.com>
Signed-off-by: Selvarasu Ganesan <selvarasu.g(a)samsung.com>
---
Changes in v2:
- Removed the 'Fixes' tag from the commit message, as this patch does
not contain a fix.
- And Retained the 'stable' tag, as these changes are intended to be
applied across all stable kernels.
- Additionally, replaced 'dev_warn*' with 'dev_err*'.
Link to v1: https://lore.kernel.org/all/20250807005638.thhsgjn73aaov2af@synopsys.com/
---
drivers/usb/dwc3/ep0.c | 20 ++++++++++++++++----
drivers/usb/dwc3/gadget.c | 10 ++++++++--
2 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c
index 666ac432f52d..b4229aa13f37 100644
--- a/drivers/usb/dwc3/ep0.c
+++ b/drivers/usb/dwc3/ep0.c
@@ -288,7 +288,9 @@ void dwc3_ep0_out_start(struct dwc3 *dwc)
dwc3_ep0_prepare_one_trb(dep, dwc->ep0_trb_addr, 8,
DWC3_TRBCTL_CONTROL_SETUP, false);
ret = dwc3_ep0_start_trans(dep);
- WARN_ON(ret < 0);
+ if (ret < 0)
+ dev_err(dwc->dev, "ep0 out start transfer failed: %d\n", ret);
+
for (i = 2; i < DWC3_ENDPOINTS_NUM; i++) {
struct dwc3_ep *dwc3_ep;
@@ -1061,7 +1063,9 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc,
ret = dwc3_ep0_start_trans(dep);
}
- WARN_ON(ret < 0);
+ if (ret < 0)
+ dev_err(dwc->dev,
+ "ep0 data phase start transfer failed: %d\n", ret);
}
static int dwc3_ep0_start_control_status(struct dwc3_ep *dep)
@@ -1078,7 +1082,12 @@ static int dwc3_ep0_start_control_status(struct dwc3_ep *dep)
static void __dwc3_ep0_do_control_status(struct dwc3 *dwc, struct dwc3_ep *dep)
{
- WARN_ON(dwc3_ep0_start_control_status(dep));
+ int ret;
+
+ ret = dwc3_ep0_start_control_status(dep);
+ if (ret)
+ dev_err(dwc->dev,
+ "ep0 status phase start transfer failed: %d\n", ret);
}
static void dwc3_ep0_do_control_status(struct dwc3 *dwc,
@@ -1121,7 +1130,10 @@ void dwc3_ep0_end_control_data(struct dwc3 *dwc, struct dwc3_ep *dep)
cmd |= DWC3_DEPCMD_PARAM(dep->resource_index);
memset(&params, 0, sizeof(params));
ret = dwc3_send_gadget_ep_cmd(dep, cmd, &params);
- WARN_ON_ONCE(ret);
+ if (ret)
+ dev_err_ratelimited(dwc->dev,
+ "ep0 data phase end transfer failed: %d\n", ret);
+
dep->resource_index = 0;
}
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 4a3e97e606d1..4a3d076c1015 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -1772,7 +1772,11 @@ static int __dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, bool int
dep->flags |= DWC3_EP_DELAY_STOP;
return 0;
}
- WARN_ON_ONCE(ret);
+
+ if (ret)
+ dev_err_ratelimited(dep->dwc->dev,
+ "end transfer failed: %d\n", ret);
+
dep->resource_index = 0;
if (!interrupt)
@@ -4039,7 +4043,9 @@ static void dwc3_clear_stall_all_ep(struct dwc3 *dwc)
dep->flags &= ~DWC3_EP_STALL;
ret = dwc3_send_clear_stall_ep_cmd(dep);
- WARN_ON_ONCE(ret);
+ if (ret)
+ dev_err_ratelimited(dwc->dev,
+ "failed to clear STALL on %s\n", dep->name);
}
}
--
2.17.1
#regzbot introduced: v6.12.34..v6.12.35
After upgrading to kernel 6.12.35, VFIO passthrough for my GPU has stopped working within a Windows VM: it sees the device in Device Manager but reports that it did not start correctly. I compared lspci logs in the VM before and after the upgrade to 6.12.35, and here are the changes I noticed:
- the reported link speed for the passthrough GPU has changed from 2.5 to 16 GT/s
- the passthrough GPU has lost its 'BusMaster' and MSI enable flags
- a latency measurement feature appeared
These entries also began appearing in the VM's dmesg when the host kernel is 6.12.35 or above:
[ 1.963177] nouveau 0000:01:00.0: sec2(gsp): mbox 1c503000 00000001
[ 1.963296] nouveau 0000:01:00.0: sec2(gsp):booter-load: boot failed: -5
...
[ 1.964580] nouveau 0000:01:00.0: gsp: init failed, -5
[ 1.964641] nouveau 0000:01:00.0: init failed with -5
[ 1.964681] nouveau: drm:00000000:00000080: init failed with -5
[ 1.964721] nouveau 0000:01:00.0: drm: Device allocation failed: -5
[ 1.966318] nouveau 0000:01:00.0: probe with driver nouveau failed with error -5
6.12.34 worked fine, and the latest 6.12 LTS does not work either. I am using an Intel CPU and an NVIDIA GPU (for passthrough, and as my GPU on the Linux system).
Hi there,
I have explicitly disabled mptcp by default on my custom kernel, and
this seems to be causing the test case to fail. Even after enabling
mptcp via the sysctl command or adding an entry to /etc/sysctl.conf, it
still fails. I don't think this test should fail; it should account for
cases where mptcp has not been enabled by default.
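Concretely, the failure is the server socket creation with
IPPROTO_MPTCP, which fails with ENOPROTOOPT (errno 92, "Protocol not
available") when net.mptcp.enabled=0 in the test's netns. A minimal
sketch of the kind of probe the test could use to skip instead of fail
(mptcp_supported() is a hypothetical helper, not the selftest's actual
code):

#include <errno.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

/* Return 1 if this netns can create MPTCP sockets, 0 otherwise. */
static int mptcp_supported(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0)
		return 0; /* ENOPROTOOPT when net.mptcp.enabled=0 */
	close(fd);
	return 1;
}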
These are the test logs:
$ sudo tools/testing/selftests/bpf/test_progs -t mptcp
Can't find bpf_testmod.ko kernel module: -2
WARNING! Selftests relying on bpf_testmod.ko will be skipped.
run_test:PASS:bpf_prog_attach 0 nsec
run_test:PASS:connect to fd 0 nsec
verify_tsk:PASS:bpf_map_lookup_elem 0 nsec
verify_tsk:PASS:unexpected invoked count 0 nsec
verify_tsk:PASS:unexpected is_mptcp 0 nsec
test_base:PASS:run_test tcp 0 nsec
(network_helpers.c:107: errno: Protocol not available) Failed to create
server socket
test_base:FAIL:start_mptcp_server unexpected start_mptcp_server: actual
-1 < expected 0
#178/1 mptcp/base:FAIL
test_mptcpify:PASS:test__join_cgroup 0 nsec
create_netns:PASS:ip netns add mptcp_ns 0 nsec
create_netns:PASS:ip -net mptcp_ns link set dev lo up 0 nsec
test_mptcpify:PASS:create_netns 0 nsec
run_mptcpify:PASS:skel_open_load 0 nsec
run_mptcpify:PASS:skel_attach 0 nsec
(network_helpers.c:107: errno: Protocol not available) Failed to create
server socket
run_mptcpify:FAIL:start_server unexpected start_server: actual -1 <
expected 0
test_mptcpify:FAIL:run_mptcpify unexpected error: -5 (errno 92)
#178/2 mptcp/mptcpify:FAIL
#178 mptcp:FAIL
All error logs:
test_base:PASS:test__join_cgroup 0 nsec
create_netns:PASS:ip netns add mptcp_ns 0 nsec
create_netns:PASS:ip -net mptcp_ns link set dev lo up 0 nsec
test_base:PASS:create_netns 0 nsec
test_base:PASS:start_server 0 nsec
run_test:PASS:skel_open_load 0 nsec
run_test:PASS:skel_attach 0 nsec
run_test:PASS:bpf_prog_attach 0 nsec
run_test:PASS:connect to fd 0 nsec
verify_tsk:PASS:bpf_map_lookup_elem 0 nsec
verify_tsk:PASS:unexpected invoked count 0 nsec
verify_tsk:PASS:unexpected is_mptcp 0 nsec
test_base:PASS:run_test tcp 0 nsec
(network_helpers.c:107: errno: Protocol not available) Failed to create
server socket
test_base:FAIL:start_mptcp_server unexpected start_mptcp_server: actual
-1 < expected 0
#178/1 mptcp/base:FAIL
test_mptcpify:PASS:test__join_cgroup 0 nsec
create_netns:PASS:ip netns add mptcp_ns 0 nsec
create_netns:PASS:ip -net mptcp_ns link set dev lo up 0 nsec
test_mptcpify:PASS:create_netns 0 nsec
run_mptcpify:PASS:skel_open_load 0 nsec
run_mptcpify:PASS:skel_attach 0 nsec
(network_helpers.c:107: errno: Protocol not available) Failed to create
server socket
run_mptcpify:FAIL:start_server unexpected start_server: actual -1 <
expected 0
test_mptcpify:FAIL:run_mptcpify unexpected error: -5 (errno 92)
#178/2 mptcp/mptcpify:FAIL
#178 mptcp:FAIL
Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED
This is the custom patch I applied on the LTS v6.12.36 kernel when
testing:
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index dd595d9b5e50c..bdcc4136e92ef 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -89,7 +89,7 @@ const char *mptcp_get_scheduler(const struct net *net)
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
- pernet->mptcp_enabled = 1;
+ pernet->mptcp_enabled = 0;
pernet->add_addr_timeout = TCP_RTO_MAX;
pernet->blackhole_timeout = 3600;
atomic_set(&pernet->active_disable_times, 0);
--
Thanks & Regards, Harshvardhan
Since commit f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again
when busy"), the console log of some devices with hdq but no bq27000
battery (like the Pandaboard) is flooded with messages like:
[ 34.247833] power_supply bq27000-battery: driver failed to report 'status' property: -1
as soon as user space finds the /sys entry and tries to read the
"status" property.
It turns out that the offending commit changes the logic to now return the
value of cache.flags if it is <0. This is likely under the assumption that
it is an error number. In normal errors from bq27xxx_read() this is indeed
the case.
But there is special code to detect if no bq27000 is installed or accessible
through hdq/1wire and wants to report this. In that case, the cache.flags
are set (historically) to constant -1 which did make reading properties
return -ENODEV. So everything appeared to be fine before the return value was
fixed. Now the -1 is returned as -ENOPERM instead of -ENODEV, triggering the
error condition in power_supply_format_property() which then floods the
console log.
So we change the detection of a missing bq27000 battery to simply set
cache.flags = -ENODEV
instead of -1.
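To see why the raw -1 misleads the power-supply core, note that on
Linux errno 1 is EPERM, so an error value of -1 decodes as -EPERM
rather than any "device missing" code. A user-space illustration
(purely a demo, not driver code):

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* A raw -1, read as a -errno value, means EPERM. */
	printf("-1 decodes as: %s\n", strerror(1));      /* Operation not permitted */
	printf("-ENODEV is:    %s\n", strerror(ENODEV)); /* No such device */
	return 0;
}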
Fixes: f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again when busy")
Cc: Jerry Lv <Jerry.Lv(a)axis.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: H. Nikolaus Schaller <hns(a)goldelico.com>
---
drivers/power/supply/bq27xxx_battery.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 93dcebbe11417..efe02ad695a62 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1920,7 +1920,7 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag);
if ((cache.flags & 0xff) == 0xff)
- cache.flags = -1; /* read error */
+ cache.flags = -ENODEV; /* read error */
if (cache.flags >= 0) {
cache.capacity = bq27xxx_battery_read_soc(di);
--
2.50.0
Our syzkaller tester reported the lockdep WARNING [1].
kmemleak_scan_thread() invokes scan_block(), which may invoke a normal
printk() to print a warning message. This can cause a deadlock in the
scenario reported below:
       CPU0                          CPU1
       ----                          ----
  lock(kmemleak_lock);
                                lock(&port->lock);
                                lock(kmemleak_lock);
  lock(console_owner);
To solve this problem, switch to printk_safe mode before printing the
warning message. This redirects all printk()-s to a special per-CPU
buffer, which is flushed later from a safe context (irq work), so the
deadlock is avoided. The proper API to use is
printk_deferred_enter()/printk_deferred_exit() when we want to defer
the printing [2].
This patch also fixes some similar cases that need printk deferring [3].
[1]
https://lore.kernel.org/all/20250730094914.566582-1-gubowen5@huawei.com/
[2]
https://lore.kernel.org/all/5ca375cd-4a20-4807-b897-68b289626550@redhat.com/
[3]
https://lore.kernel.org/all/aJCir5Wh362XzLSx@arm.com/
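For reference, the resulting pattern (a minimal sketch; the helper
below is hypothetical and only illustrates the enter/exit placement
used in the hunks that follow):

static void example_warn_under_kmemleak_lock(unsigned long ptr)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&kmemleak_lock, flags);
	/* Deferred printk: output goes to a per-CPU buffer and is
	 * flushed later from irq_work, so no console/port lock is
	 * taken while kmemleak_lock is held. */
	printk_deferred_enter();
	pr_warn("kmemleak: example warning at 0x%08lx\n", ptr);
	printk_deferred_exit();
	raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}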
====================
Cc: stable(a)vger.kernel.org # 5.10
Signed-off-by: Gu Bowen <gubowen5(a)huawei.com>
---
mm/kmemleak.c | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 4801751cb6b6..381145dde54f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -390,9 +390,15 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
else if (object->pointer == ptr || alias)
return object;
else {
+ /*
+ * Printk deferring due to the kmemleak_lock held.
+ * This is done to avoid deadlock.
+ */
+ printk_deferred_enter();
kmemleak_warn("Found object by alias at 0x%08lx\n",
ptr);
dump_object_info(object);
+ printk_deferred_exit();
break;
}
}
@@ -433,8 +439,15 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
list_del(&object->object_list);
else if (mem_pool_free_count)
object = &mem_pool[--mem_pool_free_count];
- else
+ else {
+ /*
+ * Printk deferring due to the kmemleak_lock held.
+ * This is done to avoid deadlock.
+ */
+ printk_deferred_enter();
pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+ printk_deferred_exit();
+ }
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
@@ -632,6 +645,11 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
else if (parent->pointer + parent->size <= ptr)
link = &parent->rb_node.rb_right;
else {
+ /*
+ * Printk deferring due to the kmemleak_lock held.
+ * This is done to avoid deadlock.
+ */
+ printk_deferred_enter();
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
@@ -639,6 +657,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
* be freed while the kmemleak_lock is held.
*/
dump_object_info(parent);
+ printk_deferred_exit();
kmem_cache_free(object_cache, object);
object = NULL;
goto out;
--
2.25.1
The patch titled
Subject: proc: proc_maps_open allow proc_mem_open to return NULL
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
proc-proc_maps_open-allow-proc_mem_open-to-return-null.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Jialin Wang <wjl.linux(a)gmail.com>
Subject: proc: proc_maps_open allow proc_mem_open to return NULL
Date: Fri, 8 Aug 2025 00:54:55 +0800
commit 65c66047259f ("proc: fix the issue of proc_mem_open returning
NULL") breaks `perf record -g -p PID` when profiling a kernel thread.
The strace of `perf record -g -p $(pgrep kswapd0)` shows:
openat(AT_FDCWD, "/proc/65/task/65/maps", O_RDONLY) = -1 ESRCH (No such process)
This patch partially reverts the commit to fix it.
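My reading of why the NULL must be allowed through (a simplified
sketch of m_start() in fs/proc/task_mmu.c, stated as an assumption
rather than a verbatim quote of the source):

static void *m_start(struct seq_file *m, loff_t *ppos)
{
	struct proc_maps_private *priv = m->private;
	struct mm_struct *mm = priv->mm;

	/* Kernel threads have no user mm: proc_mem_open() returns
	 * NULL for them, and the read path produces an empty maps
	 * file. Mapping NULL to -ESRCH instead made the open() fail,
	 * hence the perf regression. */
	if (!mm || !mmget_not_zero(mm))
		return NULL;
	/* ... walk the VMAs ... */
}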
Link: https://lkml.kernel.org/r/20250807165455.73656-1-wjl.linux@gmail.com
Fixes: 65c66047259f ("proc: fix the issue of proc_mem_open returning NULL")
Signed-off-by: Jialin Wang <wjl.linux(a)gmail.com>
Cc: Penglei Jiang <superman.xpt(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/proc/task_mmu.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/fs/proc/task_mmu.c~proc-proc_maps_open-allow-proc_mem_open-to-return-null
+++ a/fs/proc/task_mmu.c
@@ -340,8 +340,8 @@ static int proc_maps_open(struct inode *
priv->inode = inode;
priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR_OR_NULL(priv->mm)) {
- int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
+ if (IS_ERR(priv->mm)) {
+ int err = PTR_ERR(priv->mm);
seq_release_private(inode, file);
return err;
_
Patches currently in -mm which might be from wjl.linux(a)gmail.com are
proc-proc_maps_open-allow-proc_mem_open-to-return-null.patch
The quilt patch titled
Subject: mm: fix accounting of memmap pages for early sections
has been removed from the -mm tree. Its filename was
mm-fix-accounting-of-memmap-pages-for-early-sections.patch
This patch was dropped because an updated version will be issued
------------------------------------------------------
From: Sumanth Korikkar <sumanthk(a)linux.ibm.com>
Subject: mm: fix accounting of memmap pages for early sections
Date: Mon, 4 Aug 2025 10:40:15 +0200
memmap pages can be allocated either from the memblock (boot) allocator
during early boot or from the buddy allocator.
When these memmap pages are removed via arch_remove_memory(), the
deallocation path depends on their source:
* For pages from the buddy allocator, depopulate_section_memmap() is
called, which also decrements the count of nr_memmap_pages.
* For pages from the boot allocator, free_map_bootmem() is called. But
it currently does not adjust the nr_memmap_boot_pages.
To fix this inconsistency, update free_map_bootmem() to also decrement the
nr_memmap_boot_pages count by invoking memmap_boot_pages_add(), mirroring
how free_vmemmap_page() handles this for boot-allocated pages.
This ensures correct tracking of memmap pages regardless of allocation
source.
Link: https://lkml.kernel.org/r/20250804084015.270570-1-sumanthk@linux.ibm.com
Fixes: 15995a352474 ("mm: report per-page metadata information")
Signed-off-by: Sumanth Korikkar <sumanthk(a)linux.ibm.com>
Cc: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Cc: Heiko Carstens <hca(a)linux.ibm.com>
Cc: Vasily Gorbik <gor(a)linux.ibm.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Cc: Sourav Panda <souravpanda(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/sparse.c | 1 +
1 file changed, 1 insertion(+)
--- a/mm/sparse.c~mm-fix-accounting-of-memmap-pages-for-early-sections
+++ a/mm/sparse.c
@@ -688,6 +688,7 @@ static void free_map_bootmem(struct page
unsigned long start = (unsigned long)memmap;
unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+ memmap_boot_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE)));
vmemmap_free(start, end, NULL);
}
_
Patches currently in -mm which might be from sumanthk(a)linux.ibm.com are
mm-fix-accounting-of-memmap-pages.patch
The patch titled
Subject: mm: fix accounting of memmap pages
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-fix-accounting-of-memmap-pages.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Sumanth Korikkar <sumanthk(a)linux.ibm.com>
Subject: mm: fix accounting of memmap pages
Date: Thu, 7 Aug 2025 20:35:45 +0200
For !CONFIG_SPARSEMEM_VMEMMAP, memmap page accounting is currently done
upfront in sparse_buffer_init(). However, sparse_buffer_alloc() may
return NULL in a failure scenario.
Also, memmap pages may be allocated either from the memblock allocator
during early boot or from the buddy allocator. When removed via
arch_remove_memory(), accounting of memmap pages must reflect the original
allocation source.
To ensure correctness:
* Account memmap pages after successful allocation in sparse_init_nid()
and section_activate().
* Account memmap pages in section_deactivate() based on allocation
source.
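As a sanity check of the arithmetic, a worked example (the section and
struct page sizes below are x86-64 defaults, assumed for illustration):

/* Assumed x86-64 defaults:
 *   section size        = 128 MiB, PAGE_SIZE = 4 KiB
 *   PAGES_PER_SECTION   = 128 MiB / 4 KiB = 32768
 *   sizeof(struct page) = 64 bytes
 * memmap per section    = 32768 * 64 = 2 MiB
 * DIV_ROUND_UP(32768 * 64, 4096) = 512 pages accounted, so
 * section_activate() and section_deactivate() must both move
 * exactly 512 pages for the counters to stay balanced.
 */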
Link: https://lkml.kernel.org/r/20250807183545.1424509-1-sumanthk@linux.ibm.com
Fixes: 15995a352474 ("mm: report per-page metadata information")
Signed-off-by: Sumanth Korikkar <sumanthk(a)linux.ibm.com>
Suggested-by: David Hildenbrand <david(a)redhat.com>
Cc: Alexander Gordeev <agordeev(a)linux.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
Cc: Heiko Carstens <hca(a)linux.ibm.com>
Cc: Vasily Gorbik <gor(a)linux.ibm.com>
Cc: Wei Yang <richard.weiyang(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/sparse-vmemmap.c | 5 -----
mm/sparse.c | 15 +++++++++------
2 files changed, 9 insertions(+), 11 deletions(-)
--- a/mm/sparse.c~mm-fix-accounting-of-memmap-pages
+++ a/mm/sparse.c
@@ -454,9 +454,6 @@ static void __init sparse_buffer_init(un
*/
sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
sparsemap_buf_end = sparsemap_buf + size;
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
- memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
-#endif
}
static void __init sparse_buffer_fini(void)
@@ -567,6 +564,8 @@ static void __init sparse_init_nid(int n
sparse_buffer_fini();
goto failed;
}
+ memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
+ PAGE_SIZE));
sparse_init_early_section(nid, map, pnum, 0);
}
}
@@ -680,7 +679,6 @@ static void depopulate_section_memmap(un
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
- memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE)));
vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
@@ -857,10 +855,14 @@ static void section_deactivate(unsigned
* The memmap of early sections is always fully populated. See
* section_activate() and pfn_valid() .
*/
- if (!section_is_early)
+ if (!section_is_early) {
+ memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
depopulate_section_memmap(pfn, nr_pages, altmap);
- else if (memmap)
+ } else if (memmap) {
+ memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
+ PAGE_SIZE)));
free_map_bootmem(memmap);
+ }
if (empty)
ms->section_mem_map = (unsigned long)NULL;
@@ -905,6 +907,7 @@ static struct page * __meminit section_a
section_deactivate(pfn, nr_pages, altmap);
return ERR_PTR(-ENOMEM);
}
+ memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
return memmap;
}
--- a/mm/sparse-vmemmap.c~mm-fix-accounting-of-memmap-pages
+++ a/mm/sparse-vmemmap.c
@@ -578,11 +578,6 @@ struct page * __meminit __populate_secti
if (r < 0)
return NULL;
- if (system_state == SYSTEM_BOOTING)
- memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
- else
- memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE));
-
return pfn_to_page(pfn);
}
_
Patches currently in -mm which might be from sumanthk(a)linux.ibm.com are
mm-fix-accounting-of-memmap-pages-for-early-sections.patch
mm-fix-accounting-of-memmap-pages.patch
When a CPU chooses to call push_dl_task() and picks a task to push to
another CPU's runqueue, it calls find_lock_later_rq(), which takes a
double lock on both CPUs' runqueues. If one of the locks isn't readily
available, this may lead to dropping the current runqueue lock and
reacquiring both locks at once. During this window it is possible that
the task has already migrated and is running on some other CPU. These
cases are already handled. However, if the task has migrated, has
already finished executing, and another CPU is now trying to wake it up
(ttwu) so that it is queued again on the runqueue (on_rq is 1), and if
the task was last run by the same CPU, then the current checks will
pass even though the task was migrated out and is no longer in the
pushable tasks list.
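A sketch of the window (my reconstruction of the sequence described
above):

CPU0 (push_dl_task)                 elsewhere
-------------------                 ---------
pick task P from rq's pushable list
find_lock_later_rq(P, rq)
  double_lock_balance() drops rq->lock
                                    P migrates away, runs, sleeps;
                                    ttwu() later enqueues P on rq
                                    again (on_rq == 1)
  locks reacquired
  old checks all pass, but P is no
  longer the pushable candidate it was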
Please go through the original rt change for more details on the issue.
To fix this, after the lock is obtained inside find_lock_later_rq(),
we ensure that the task is still at the head of the pushable tasks
list. Some checks that are no longer needed with the addition of this
new check are also removed.
However, the new pushable tasks list check only applies when
find_lock_later_rq() is called by push_dl_task(). For the other caller,
i.e. dl_task_offline_migration(), the existing checks are used.
Signed-off-by: Harshit Agarwal <harshit(a)nutanix.com>
Cc: stable(a)vger.kernel.org
---
Changes in v3:
- Incorporated review comments from Juri around the commit message as
well as around the comment regarding checks in find_lock_later_rq.
- Link to v2:
https://lore.kernel.org/stable/20250317022325.52791-1-harshit@nutanix.com/
Changes in v2:
- As per Juri's suggestion, moved the check inside find_lock_later_rq
similar to rt change. Here we distinguish among the push_dl_task
caller vs dl_task_offline_migration by checking if the task is
throttled or not.
- Fixed the commit message to refer to the rt change by title.
- Link to v1:
https://lore.kernel.org/lkml/20250307204255.60640-1-harshit@nutanix.com/
---
kernel/sched/deadline.c | 73 +++++++++++++++++++++++++++--------------
1 file changed, 49 insertions(+), 24 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 38e4537790af..e0c95f33e1ed 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2621,6 +2621,25 @@ static int find_later_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
+
+ return p;
+}
+
/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
@@ -2648,12 +2667,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
+ /*
+ * double_lock_balance had to release rq->lock, in the
+ * meantime, task may no longer be fit to be migrated.
+ * Check the following to ensure that the task is
+ * still suitable for migration:
+ * 1. It is possible the task was scheduled,
+ * migrate_disabled was set and then got preempted,
+ * so we must check the task migration disable
+ * flag.
+ * 2. The CPU picked is in the task's affinity.
+ * 3. For throttled task (dl_task_offline_migration),
+ * check the following:
+ * - the task is not on the rq anymore (it was
+ * migrated)
+ * - the task is not on CPU anymore
+ * - the task is still a dl task
+ * - the task is not queued on the rq anymore
+ * 4. For the non-throttled task (push_dl_task), the
+ * check to ensure that this task is still at the
+ * head of the pushable tasks list is enough.
+ */
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !dl_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ (task->dl.dl_throttled &&
+ (task_rq(task) != rq ||
+ task_on_cpu(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) ||
+ (!task->dl.dl_throttled &&
+ task != pick_next_pushable_dl_task(rq)))) {
+
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -2676,25 +2720,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
return later_rq;
}
-static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_dl_tasks(rq))
- return NULL;
-
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
-
- WARN_ON_ONCE(rq->cpu != task_cpu(p));
- WARN_ON_ONCE(task_current(rq, p));
- WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
-
- WARN_ON_ONCE(!task_on_rq_queued(p));
- WARN_ON_ONCE(!dl_task(p));
-
- return p;
-}
-
/*
* See if the non running -deadline tasks on this rq
* can be sent to some other CPU where they can preempt
--
2.49.0.111.g5b97a56fa0