The scand kthread can send dlm messages out, especially dlm remove messages to free memory for unused rsb on other nodes. To send out dlm messages it is required that midcomms is right initialized. This patch moves the midcomms start before scand is started.
Cc: stable@vger.kernel.org Fixes: e7fd41792fc0 ("[DLM] The core of the DLM for GFS2/CLVM") Signed-off-by: Alexander Aring aahringo@redhat.com --- fs/dlm/lockspace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d0b4e2181a5f..99bc96f90779 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -381,23 +381,23 @@ static int threads_start(void) { int error;
- error = dlm_scand_start(); + /* Thread for sending/receiving messages for all lockspace's */ + error = dlm_midcomms_start(); if (error) { - log_print("cannot start dlm_scand thread %d", error); + log_print("cannot start dlm midcomms %d", error); goto fail; }
- /* Thread for sending/receiving messages for all lockspace's */ - error = dlm_midcomms_start(); + error = dlm_scand_start(); if (error) { - log_print("cannot start dlm midcomms %d", error); - goto scand_fail; + log_print("cannot start dlm_scand thread %d", error); + goto midcomms_fail; }
return 0;
- scand_fail: - dlm_scand_stop(); + midcomms_fail: + dlm_midcomms_stop(); fail: return error; }
While working on processing dlm message in softirq context I experienced the following KASAN use-after-free warning:
[ 151.760477] ================================================================== [ 151.761803] BUG: KASAN: use-after-free in dlm_midcomms_commit_mhandle+0x19d/0x4b0 [ 151.763414] Read of size 4 at addr ffff88811a980c60 by task lock_torture/1347
[ 151.765284] CPU: 7 PID: 1347 Comm: lock_torture Not tainted 6.1.0-rc4+ #2828 [ 151.766778] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.16.0-3.module+el8.7.0+16134+e5908aa2 04/01/2014 [ 151.768726] Call Trace: [ 151.769277] <TASK> [ 151.769748] dump_stack_lvl+0x5b/0x86 [ 151.770556] print_report+0x180/0x4c8 [ 151.771378] ? kasan_complete_mode_report_info+0x7c/0x1e0 [ 151.772241] ? dlm_midcomms_commit_mhandle+0x19d/0x4b0 [ 151.773069] kasan_report+0x93/0x1a0 [ 151.773668] ? dlm_midcomms_commit_mhandle+0x19d/0x4b0 [ 151.774514] __asan_load4+0x7e/0xa0 [ 151.775089] dlm_midcomms_commit_mhandle+0x19d/0x4b0 [ 151.775890] ? create_message.isra.29.constprop.64+0x57/0xc0 [ 151.776770] send_common+0x19f/0x1b0 [ 151.777342] ? remove_from_waiters+0x60/0x60 [ 151.778017] ? lock_downgrade+0x410/0x410 [ 151.778648] ? __this_cpu_preempt_check+0x13/0x20 [ 151.779421] ? rcu_lockdep_current_cpu_online+0x88/0xc0 [ 151.780292] _convert_lock+0x46/0x150 [ 151.780893] convert_lock+0x7b/0xc0 [ 151.781459] dlm_lock+0x3ac/0x580 [ 151.781993] ? 0xffffffffc0540000 [ 151.782522] ? torture_stop+0x120/0x120 [dlm_locktorture] [ 151.783379] ? dlm_scan_rsbs+0xa70/0xa70 [ 151.784003] ? preempt_count_sub+0xd6/0x130 [ 151.784661] ? is_module_address+0x47/0x70 [ 151.785309] ? torture_stop+0x120/0x120 [dlm_locktorture] [ 151.786166] ? 0xffffffffc0540000 [ 151.786693] ? lockdep_init_map_type+0xc3/0x360 [ 151.787414] ? 0xffffffffc0540000 [ 151.787947] torture_dlm_lock_sync.isra.3+0xe9/0x150 [dlm_locktorture] [ 151.789004] ? torture_stop+0x120/0x120 [dlm_locktorture] [ 151.789858] ? 0xffffffffc0540000 [ 151.790392] ? lock_torture_cleanup+0x20/0x20 [dlm_locktorture] [ 151.791347] ? delay_tsc+0x94/0xc0 [ 151.791898] torture_ex_iter+0xc3/0xea [dlm_locktorture] [ 151.792735] ? torture_start+0x30/0x30 [dlm_locktorture] [ 151.793606] lock_torture+0x177/0x270 [dlm_locktorture] [ 151.794448] ? torture_dlm_lock_sync.isra.3+0x150/0x150 [dlm_locktorture] [ 151.795539] ? lock_torture_stats+0x80/0x80 [dlm_locktorture] [ 151.796476] ? do_raw_spin_lock+0x11e/0x1e0 [ 151.797152] ? mark_held_locks+0x34/0xb0 [ 151.797784] ? _raw_spin_unlock_irqrestore+0x30/0x70 [ 151.798581] ? __kthread_parkme+0x79/0x110 [ 151.799246] ? trace_preempt_on+0x2a/0xf0 [ 151.799902] ? __kthread_parkme+0x79/0x110 [ 151.800579] ? preempt_count_sub+0xd6/0x130 [ 151.801271] ? __kasan_check_read+0x11/0x20 [ 151.801963] ? __kthread_parkme+0xec/0x110 [ 151.802630] ? lock_torture_stats+0x80/0x80 [dlm_locktorture] [ 151.803569] kthread+0x192/0x1d0 [ 151.804104] ? kthread_complete_and_exit+0x30/0x30 [ 151.804881] ret_from_fork+0x1f/0x30 [ 151.805480] </TASK>
[ 151.806111] Allocated by task 1347: [ 151.806681] kasan_save_stack+0x26/0x50 [ 151.807308] kasan_set_track+0x25/0x30 [ 151.807920] kasan_save_alloc_info+0x1e/0x30 [ 151.808609] __kasan_slab_alloc+0x63/0x80 [ 151.809263] kmem_cache_alloc+0x1ad/0x830 [ 151.809916] dlm_allocate_mhandle+0x17/0x20 [ 151.810590] dlm_midcomms_get_mhandle+0x96/0x260 [ 151.811344] _create_message+0x95/0x180 [ 151.811994] create_message.isra.29.constprop.64+0x57/0xc0 [ 151.812880] send_common+0x129/0x1b0 [ 151.813467] _convert_lock+0x46/0x150 [ 151.814074] convert_lock+0x7b/0xc0 [ 151.814648] dlm_lock+0x3ac/0x580 [ 151.815199] torture_dlm_lock_sync.isra.3+0xe9/0x150 [dlm_locktorture] [ 151.816258] torture_ex_iter+0xc3/0xea [dlm_locktorture] [ 151.817129] lock_torture+0x177/0x270 [dlm_locktorture] [ 151.817986] kthread+0x192/0x1d0 [ 151.818518] ret_from_fork+0x1f/0x30
[ 151.819369] Freed by task 1336: [ 151.819890] kasan_save_stack+0x26/0x50 [ 151.820514] kasan_set_track+0x25/0x30 [ 151.821128] kasan_save_free_info+0x2e/0x50 [ 151.821812] __kasan_slab_free+0x107/0x1a0 [ 151.822483] kmem_cache_free+0x204/0x5e0 [ 151.823152] dlm_free_mhandle+0x18/0x20 [ 151.823781] dlm_mhandle_release+0x2e/0x40 [ 151.824454] rcu_core+0x583/0x1330 [ 151.825047] rcu_core_si+0xe/0x20 [ 151.825594] __do_softirq+0xf4/0x5c2
[ 151.826450] Last potentially related work creation: [ 151.827238] kasan_save_stack+0x26/0x50 [ 151.827870] __kasan_record_aux_stack+0xa2/0xc0 [ 151.828609] kasan_record_aux_stack_noalloc+0xb/0x20 [ 151.829415] call_rcu+0x4c/0x760 [ 151.829954] dlm_mhandle_delete+0x97/0xb0 [ 151.830718] dlm_process_incoming_buffer+0x2fc/0xb30 [ 151.831524] process_dlm_messages+0x16e/0x470 [ 151.832245] process_one_work+0x505/0xa10 [ 151.832905] worker_thread+0x67/0x650 [ 151.833507] kthread+0x192/0x1d0 [ 151.834046] ret_from_fork+0x1f/0x30
[ 151.834900] The buggy address belongs to the object at ffff88811a980c30 which belongs to the cache dlm_mhandle of size 88 [ 151.836894] The buggy address is located 48 bytes inside of 88-byte region [ffff88811a980c30, ffff88811a980c88)
[ 151.839007] The buggy address belongs to the physical page: [ 151.839904] page:0000000076cf5d62 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11a980 [ 151.841378] flags: 0x8000000000000200(slab|zone=2) [ 151.842141] raw: 8000000000000200 0000000000000000 dead000000000122 ffff8881089b43c0 [ 151.843401] raw: 0000000000000000 0000000000220022 00000001ffffffff 0000000000000000 [ 151.844640] page dumped because: kasan: bad access detected
[ 151.845822] Memory state around the buggy address: [ 151.846602] ffff88811a980b00: fb fb fb fb fc fc fc fc fa fb fb fb fb fb fb fb [ 151.847761] ffff88811a980b80: fb fb fb fc fc fc fc fa fb fb fb fb fb fb fb fb [ 151.848921] >ffff88811a980c00: fb fb fc fc fc fc fa fb fb fb fb fb fb fb fb fb [ 151.850076] ^ [ 151.851085] ffff88811a980c80: fb fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb [ 151.852269] ffff88811a980d00: fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb fc [ 151.853428] ================================================================== [ 151.855618] Disabling lock debugging due to kernel taint
It accessing a mhandle in dlm_midcomms_commit_mhandle() and the mhandle was freed by a call_rcu() call in dlm_process_incoming_buffer(), dlm_mhandle_delete(). For me it looks like it got freed because a ack of this messages was received. Now there is a short race between commit the dlm message to be transmitted and getting an ack back. If the ack is faster than returning from dlm_midcomms_commit_msg_3_2() we run into a use-after free because we still need to reference the mhandle when calling srcu_read_unlock().
To avoid that, we don't allow that mhandle getting freed between dlm_midcomms_commit_msg_3_2() and srcu_read_unlock() by using rcu read lock. We can do that because mhandle is protected by rcu handling.
Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring aahringo@redhat.com --- fs/dlm/midcomms.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fc015a6abe17..2e60d9a2c883 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1214,8 +1214,15 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, dlm_free_mhandle(mh); break; case DLM_VERSION_3_2: + /* held rcu read lock here, because we sending the + * dlm message out, when we do that we could receive + * an ack back which releases the mhandle and we + * get a use after free. + */ + rcu_read_lock(); dlm_midcomms_commit_msg_3_2(mh, name, namelen); srcu_read_unlock(&nodes_srcu, mh->idx); + rcu_read_unlock(); break; default: srcu_read_unlock(&nodes_srcu, mh->idx);
If we release a midcomms node structure there should be nothing left inside the dlm midcomms send queue. However sometimes we have this case because I believe some DLM_FIN message didn't got acked... if we run into a timeout handling then we should be sure there is no pending send dlm message inside this queue when releasing midcomms node structure.
Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring aahringo@redhat.com --- fs/dlm/midcomms.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 2e60d9a2c883..a3eb19c8cec5 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -1402,6 +1402,7 @@ static void midcomms_node_release(struct rcu_head *rcu) struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); + dlm_send_queue_flush(node); kfree(node); }
This patch changes to set the stop tx flag before we commit the dlm message. This flag will report about unexpected transmissions after we send the DLM_FIN messages out which should be the last message which is send out. When we commit the dlm fin message it could be that we already got an ack back and the FIN until CLOSED state change already happened. We should not set this flag when we are in CLOSED state, to avoid this race we simple set the tx flag before the state change can be in progress by moving it before dlm_midcomms_commit_mhandle().
Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring aahringo@redhat.com --- fs/dlm/midcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index a3eb19c8cec5..9d459d5bf800 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -406,6 +406,7 @@ static int dlm_send_fin(struct midcomms_node *node, if (!mh) return -ENOMEM;
+ set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); mh->ack_rcv = ack_rcv;
m_header = (struct dlm_header *)ppc; @@ -417,7 +418,6 @@ static int dlm_send_fin(struct midcomms_node *node,
pr_debug("sending fin msg to node %d\n", node->nodeid); dlm_midcomms_commit_mhandle(mh, NULL, 0); - set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
return 0; }
Similar like the stop tx flag the rx flag should warn about dlm message being received at DLM_FIN state change and we are not assuming any other application DLM messages anymore. If we receive a FIN message and we are in the state DLM_FIN_WAIT2 we call midcomms_node_reset() which puts the midcomms node into DLM_CLOSED state. Afterwards we should not the DLM_NODE_FLAG_STOP_RX flag what we are currently doing. This patch handles to set the DLM_NODE_FLAG_STOP_RX flag only in those cases if we receive a FIN message and we don't assume other dlm application messages to be received anymore.
Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring aahringo@redhat.com --- fs/dlm/midcomms.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 9d459d5bf800..1fef99214204 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -524,6 +524,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, break; case DLM_FIN_WAIT1: node->state = DLM_CLOSING; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; @@ -544,8 +545,6 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, return; } spin_unlock(&node->state_lock); - - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); break; default: WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
This patch moves the send fin handling which should appear in a specific state change into the state change handling while the per node state_lock is held. I expierenced issues when other messages because we changed the state and a fin message was send out in a different state.
Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring aahringo@redhat.com --- fs/dlm/midcomms.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-)
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 1fef99214204..fd9f413cabcd 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -402,7 +402,7 @@ static int dlm_send_fin(struct midcomms_node *node, struct dlm_mhandle *mh; char *ppc;
- mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_NOFS, &ppc); + mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_ATOMIC, &ppc); if (!mh) return -ENOMEM;
@@ -518,8 +518,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, node->state = DLM_LAST_ACK; pr_debug("switch node %d to state %s case 1\n", node->nodeid, dlm_state_str(node->state)); - spin_unlock(&node->state_lock); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); } break; case DLM_FIN_WAIT1: @@ -563,12 +563,6 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", seq, node->seq_next, node->nodeid); } - - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); }
static struct midcomms_node * @@ -1368,11 +1362,11 @@ void dlm_midcomms_remove_member(int nodeid) case DLM_CLOSE_WAIT: /* passive shutdown DLM_LAST_ACK case 2 */ node->state = DLM_LAST_ACK; - spin_unlock(&node->state_lock); - pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); - goto send_fin; + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + break; case DLM_LAST_ACK: /* probably receive fin caught it, do nothing */ break; @@ -1388,12 +1382,6 @@ void dlm_midcomms_remove_member(int nodeid) spin_unlock(&node->state_lock);
srcu_read_unlock(&nodes_srcu, idx); - return; - -send_fin: - set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); - dlm_send_fin(node, dlm_pas_fin_ack_rcv); - srcu_read_unlock(&nodes_srcu, idx); }
static void midcomms_node_release(struct rcu_head *rcu) @@ -1425,6 +1413,7 @@ static void midcomms_shutdown(struct midcomms_node *node) node->state = DLM_FIN_WAIT1; pr_debug("switch node %d to state %s case 2\n", node->nodeid, dlm_state_str(node->state)); + dlm_send_fin(node, dlm_act_fin_ack_rcv); break; case DLM_CLOSED: /* we have what we want */ @@ -1438,12 +1427,8 @@ static void midcomms_shutdown(struct midcomms_node *node) } spin_unlock(&node->state_lock);
- if (node->state == DLM_FIN_WAIT1) { - dlm_send_fin(node, dlm_act_fin_ack_rcv); - - if (DLM_DEBUG_FENCE_TERMINATION) - msleep(5000); - } + if (DLM_DEBUG_FENCE_TERMINATION) + msleep(5000);
/* wait for other side dlm + fin */ ret = wait_event_timeout(node->shutdown_wait,
This patch moves to send a ack back for receiving a FIN message only when we are in valid states. In other cases and there might be a sender waiting for a ack we just let it timeout at the senders time and hopefully all other cleanups will remove the FIN message on their sending queue. As an example we should never send out an ACK being in LAST_ACK state or we cannot assume a working socket communication when we are in CLOSED state.
Cc: stable@vger.kernel.org Fixes: 489d8e559c65 ("fs: dlm: add reliable connection if reconnect") Signed-off-by: Alexander Aring aahringo@redhat.com --- fs/dlm/midcomms.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fd9f413cabcd..ecfb3beb0bb8 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -375,7 +375,7 @@ static int dlm_send_ack(int nodeid, uint32_t seq) struct dlm_msg *msg; char *ppc;
- msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_NOFS, &ppc, + msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_ATOMIC, &ppc, NULL, NULL); if (!msg) return -ENOMEM; @@ -498,15 +498,14 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
switch (p->header.h_cmd) { case DLM_FIN: - /* send ack before fin */ - dlm_send_ack(node->nodeid, node->seq_next); - spin_lock(&node->state_lock); pr_debug("receive fin msg from node %d with state %s\n", node->nodeid, dlm_state_str(node->state));
switch (node->state) { case DLM_ESTABLISHED: + dlm_send_ack(node->nodeid, node->seq_next); + node->state = DLM_CLOSE_WAIT; pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); @@ -523,12 +522,14 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, } break; case DLM_FIN_WAIT1: + dlm_send_ack(node->nodeid, node->seq_next); node->state = DLM_CLOSING; set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_FIN_WAIT2: + dlm_send_ack(node->nodeid, node->seq_next); midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state));
linux-stable-mirror@lists.linaro.org