Hi,
I noticed there appears to be a regression in DLM (fs/dlm/) when moving from
Linux 5.4.229 to 5.4.288: I get a kernel panic when using dlm_ls_lockx()
(DLM user-space interface) with a timeout > 0, and the panic occurs when the
timeout is reached (e.g., attempting to take a lock on a resource that is
already locked). The host where the timeout occurs is the one that panics:

...
[  187.976007] DLM: Assertion failed on line 1239 of file fs/dlm/lock.c
               DLM: assertion: "!lkb->lkb_status"
               DLM: time = 4294853632
[  187.976009] lkb: nodeid 2 id 1 remid 2 exflags 40000 flags 800001 sts 1 rq 5 gr -1 wait_type 4 wait_nodeid 2 seq 0
[  187.976009]
[  187.976010] Kernel panic - not syncing: DLM: Record message above and reboot.
[  187.976099] CPU: 9 PID: 7409 Comm: dlm_scand Kdump: loaded Tainted: P  OE  5.4.288-esos.prod #1
[  187.976195] Hardware name: Quantum H2012/H12SSW-NT, BIOS T20201009143356 10/09/2020
[  187.976282] Call Trace:
[  187.976356]  dump_stack+0x50/0x63
[  187.976429]  panic+0x10c/0x2e3
[  187.976501]  kill_lkb+0x51/0x52
[  187.976570]  kref_put+0x16/0x2f
[  187.976638]  __put_lkb+0x2f/0x95
[  187.976707]  dlm_scan_timeout+0x18b/0x19c
[  187.976779]  ? dlm_uevent+0x19/0x19
[  187.976848]  dlm_scand+0x94/0xd1
[  187.976920]  kthread+0xe4/0xe9
[  187.976988]  ? kthread_flush_worker+0x70/0x70
[  187.977062]  ret_from_fork+0x35/0x40
...
I examined the commits for fs/dlm/ between 5.4.229 and 5.4.288 and this is
the offender:

    dlm: replace usage of found with dedicated list iterator variable
    [ Upstream commit dc1acd5c94699389a9ed023e94dd860c846ea1f6 ]
Specifically, the change highlighted below in this hunk for
dlm_scan_timeout() in fs/dlm/lock.c:

@@ -1867,27 +1867,28 @@ void dlm_scan_timeout(struct dlm_ls *ls)
 		do_cancel = 0;
 		do_warn = 0;
 		mutex_lock(&ls->ls_timeout_mutex);
-		list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
+		list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) {

 			wait_us = ktime_to_us(ktime_sub(ktime_get(),
-						        lkb->lkb_timestamp));
+						        iter->lkb_timestamp));

-			if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
-			    wait_us >= (lkb->lkb_timeout_cs * 10000))
+			if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) &&
+			    wait_us >= (iter->lkb_timeout_cs * 10000))
 				do_cancel = 1;

-			if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
+			if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
 			    wait_us >= dlm_config.ci_timewarn_cs * 10000)
 				do_warn = 1;

 			if (!do_cancel && !do_warn)
 				continue;
-			hold_lkb(lkb);
+			hold_lkb(iter);
+			lkb = iter;
 			break;
 		}
 		mutex_unlock(&ls->ls_timeout_mutex);

-		if (!do_cancel && !do_warn)
+		if (!lkb)
		^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 			break;

 		r = lkb->lkb_resource;
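If I'm reading the backported code correctly, the problem is that lkb is
only initialized to NULL once, before the outer for (;;) loop, whereas the
old check only looked at do_cancel/do_warn, which are reset on every pass.
Roughly (abridged and reconstructed from the hunk above, so the surrounding
5.4 lines may differ slightly):

void dlm_scan_timeout(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	struct dlm_lkb *lkb = NULL, *iter;	/* lkb NULL'd only once, here */
	...
	for (;;) {
		if (dlm_locking_stopped(ls))
			break;

		do_cancel = 0;
		do_warn = 0;
		mutex_lock(&ls->ls_timeout_mutex);
		list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) {
			...
			if (!do_cancel && !do_warn)
				continue;
			hold_lkb(iter);
			lkb = iter;	/* only assigned when a lock expired */
			break;
		}
		mutex_unlock(&ls->ls_timeout_mutex);

		if (!lkb)		/* still set from the previous pass */
			break;

		r = lkb->lkb_resource;
		...
		dlm_put_lkb(lkb);	/* unmatched put on the later pass */
	}
}

On the first pass an expired lock is found, held, cancelled and then put at
the bottom of the loop. On the next pass, if nothing has timed out, lkb
still points at that same (already put) lkb, so "if (!lkb)" does not break
out of the loop and dlm_scan_timeout() reaches dlm_put_lkb() again without a
matching hold_lkb(). That unbalanced put drops the last reference on a lock
that is still queued (sts 1 above), which is what trips the
"!lkb->lkb_status" assertion in kill_lkb().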
Reverting this single line change resolves the kernel panic:

$ diff -Naur fs/dlm/lock.c{.orig,}
--- fs/dlm/lock.c.orig	2024-12-19 12:05:05.000000000 -0500
+++ fs/dlm/lock.c	2025-02-16 21:21:42.544181390 -0500
@@ -1888,7 +1888,7 @@
 		}
 		mutex_unlock(&ls->ls_timeout_mutex);

-		if (!lkb)
+		if (!do_cancel && !do_warn)
 			break;

 		r = lkb->lkb_resource;
It appears this same "dlm: replace usage of found with dedicated list iterator variable" commit was pulled into other stable branches as well, and I don't see any fix in the latest 5.4.x patch release (5.4.290).
--Marc
On Mon, Feb 17, 2025 at 02:23:43PM -0500, Marc Smith wrote:
> Hi,
>
> I noticed there appears to be a regression in DLM (fs/dlm/) when moving
> from Linux 5.4.229 to 5.4.288: I get a kernel panic when using
> dlm_ls_lockx() (DLM user-space interface) with a timeout > 0, and the
> panic occurs when the timeout is reached (e.g., attempting to take a
> lock on a resource that is already locked). The host where the timeout
> occurs is the one that panics:
>
> [...]
>
> It appears this same "dlm: replace usage of found with dedicated list
> iterator variable" commit was pulled into other stable branches as
> well, and I don't see any fix in the latest 5.4.x patch release
> (5.4.290).
What commit needs to be backported to resolve this?
thanks,
greg k-h
Hi,
On Mon, Feb 17, 2025 at 2:24 PM Marc Smith <msmith626@gmail.com> wrote:
> Hi,
>
> I noticed there appears to be a regression in DLM (fs/dlm/) when moving
> from Linux 5.4.229 to 5.4.288: I get a kernel panic when using
> dlm_ls_lockx() (DLM user-space interface) with a timeout > 0 [...]
>
> [...]
>
> Reverting this single line change resolves the kernel panic:
>
> [...]
>
> It appears this same "dlm: replace usage of found with dedicated list
> iterator variable" commit was pulled into other stable branches as
> well, and I don't see any fix in the latest 5.4.x patch release
> (5.4.290).
This works, or just init the lkb back to NULL there:
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1899bb266e2e..7e02e5b55965 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1893,6 +1893,7 @@ void dlm_scan_timeout(struct dlm_ls *ls)
 		if (dlm_locking_stopped(ls))
 			break;

+		lkb = NULL;
 		do_cancel = 0;
 		do_warn = 0;
 		mutex_lock(&ls->ls_timeout_mutex);
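With that, lkb starts out NULL again on every pass of the outer for (;;)
loop, so the "if (!lkb)" check only falls through when the scan above
actually found (and held) an expired lock, instead of reusing the pointer
left over from the previous pass.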
Can you provide more details about the use case for the timeout? Are you
using DLM from user space or in the kernel?
- Alex
On Tue, Feb 18, 2025 at 10:59 AM Alexander Aring <aahringo@redhat.com> wrote:
> Hi,
>
> On Mon, Feb 17, 2025 at 2:24 PM Marc Smith <msmith626@gmail.com> wrote:
> > I noticed there appears to be a regression in DLM (fs/dlm/) when
> > moving from Linux 5.4.229 to 5.4.288: I get a kernel panic when using
> > dlm_ls_lockx() (DLM user-space interface) with a timeout > 0 [...]
>
> [...]
>
> This works, or just init the lkb back to NULL there:
>
> [...]
>
> Can you provide more details about the use case for the timeout? Are
> you using DLM from user space or in the kernel?
Yes, we use dlm_ls_lockx() from the DLM user-space library (libdlm) in an
application; it protects a shared resource with an exclusive lock, so when
another node attempts to take that lock and times out waiting, we see the
kernel panic.
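The call looks roughly like this; it is an illustrative sketch rather than
our actual code (the resource name and helper are made up, and the exact
dlm_ls_lockx() signature, LKF_TIMEOUT handling, and timeout units should be
checked against the libdlm version in use):

#include <stdint.h>
#include <string.h>
#include <libdlm.h>

static struct dlm_lksb lksb;

/* Completion AST: picked up by the main loop via dlm_ls_get_fd() /
 * dlm_dispatch(); lksb.sb_status tells us whether the lock was granted
 * or the request was cancelled because the timeout expired. */
static void lock_ast(void *arg)
{
}

int take_resource_lock(dlm_lshandle_t ls)
{
	/* Timeout for the request; the kernel tracks this in centiseconds
	 * (lkb_timeout_cs), check libdlm for the unit it expects here. */
	uint64_t timeout = 500;

	memset(&lksb, 0, sizeof(lksb));

	/* LKF_TIMEOUT is assumed to mirror the kernel's DLM_LKF_TIMEOUT
	 * (the exflags 40000 in the panic output above). */
	return dlm_ls_lockx(ls, LKM_EXMODE, &lksb, LKF_TIMEOUT,
			    "my_resource", strlen("my_resource"),
			    0 /* parent */, lock_ast, &lksb,
			    NULL /* bast */, NULL /* xid */, &timeout);
}

When the lock cannot be granted before the timeout expires, the kernel
cancels the request from dlm_scan_timeout() on the waiting node, which is
exactly the path that panics here.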
--Marc
> - Alex