Linux 5.4.x DLM Regression - Linux-stable-mirror

17 Feb 2025

Hi,
I noticed there appears to be a regression in DLM (fs/dlm/) when
moving from Linux 5.4.229 to 5.4.288; I get a kernel panic when using
dlm_ls_lockx() (DLM user) with a timeout >0, and the panic occurs when
the timeout is reached (eg, attempting to take a lock on a resource
that is already locked); the host where the timeout occurs is the one
that panics:
...
[  187.976007]
               DLM:  Assertion failed on line 1239 of file fs/dlm/lock.c
               DLM:  assertion:  "!lkb->lkb_status"
               DLM:  time = 4294853632
[  187.976009] lkb: nodeid 2 id 1 remid 2 exflags 40000 flags 800001
sts 1 rq 5 gr -1 wait_type 4 wait_nodeid 2 seq 0
[  187.976009]
[  187.976010] Kernel panic - not syncing: DLM:  Record message above
and reboot.
[  187.976099] CPU: 9 PID: 7409 Comm: dlm_scand Kdump: loaded Tainted:
P           OE     5.4.288-esos.prod #1
[  187.976195] Hardware name: Quantum H2012/H12SSW-NT, BIOS
T20201009143356 10/09/2020
[  187.976282] Call Trace:
[  187.976356]  dump_stack+0x50/0x63
[  187.976429]  panic+0x10c/0x2e3
[  187.976501]  kill_lkb+0x51/0x52
[  187.976570]  kref_put+0x16/0x2f
[  187.976638]  __put_lkb+0x2f/0x95
[  187.976707]  dlm_scan_timeout+0x18b/0x19c
[  187.976779]  ? dlm_uevent+0x19/0x19
[  187.976848]  dlm_scand+0x94/0xd1
[  187.976920]  kthread+0xe4/0xe9
[  187.976988]  ? kthread_flush_worker+0x70/0x70
[  187.977062]  ret_from_fork+0x35/0x40
...
I examined the commits for fs/dlm/ between 5.4.229 and 5.4.288 and
this is the offender:
dlm: replace usage of found with dedicated list iterator variable
[ Upstream commit dc1acd5c94699389a9ed023e94dd860c846ea1f6 ]
Specifically, the change highlighted below in this hunk for
dlm_scan_timeout() in fs/dlm/lock.c:
@@ -1867,27 +1867,28 @@ void dlm_scan_timeout(struct dlm_ls *ls)
                do_cancel = 0;
                do_warn = 0;
                mutex_lock(&ls->ls_timeout_mutex);
-               list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
+               list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) {
wait_us = ktime_to_us(ktime_sub(ktime_get(),
-                                                       lkb->lkb_timestamp));
+                                                       iter->lkb_timestamp));
-                       if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
-                           wait_us >= (lkb->lkb_timeout_cs * 10000))
+                       if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) &&
+                           wait_us >= (iter->lkb_timeout_cs * 10000))
                                do_cancel = 1;
-                       if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
+                       if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
                            wait_us >= dlm_config.ci_timewarn_cs * 10000)
                                do_warn = 1;
if (!do_cancel && !do_warn)
                                continue;
-                       hold_lkb(lkb);
+                       hold_lkb(iter);
+                       lkb = iter;
                        break;
                }
                mutex_unlock(&ls->ls_timeout_mutex);
-               if (!do_cancel && !do_warn)
+               if (!lkb)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        break;
r = lkb->lkb_resource;
Reverting this single line change resolves the kernel panic:
$ diff -Naur fs/dlm/lock.c{.orig,}

--- fs/dlm/lock.c.orig  2024-12-19 12:05:05.000000000 -0500
+++ fs/dlm/lock.c       2025-02-16 21:21:42.544181390 -0500
@@ -1888,7 +1888,7 @@
                }
                mutex_unlock(&ls->ls_timeout_mutex);
-               if (!lkb)
+               if (!do_cancel && !do_warn)
                        break;
r = lkb->lkb_resource;
It appears this same "dlm: replace usage of found with dedicated list
iterator variable" commit was pulled into other stable branches as
well, and I don't see any fix in the latest 5.4.x patch release
(5.4.290).
--Marc