When running a data-check and encountering a normal I/O error, raid10d handles the error while one resync I/O has already been added to conf->retry_list and is waiting for raid10d to handle it. The barrier therefore never drops to zero, and the normal I/O (the read-error retry) gets stuck in wait_barrier() in raid10_read_request(). After this, the resync thread gets stuck in raise_barrier() and other processes get stuck in wait_barrier(). Ignore the barrier for read-error retries in raid10_read_request() to avoid this deadlock. This patch is for kernel linux-4.19.y.
processA md0_raid10 md0_resync processB ------------------------------------------------------------------------- | | | | read io error | | | | handle_read_error raise_barrier | | | (nr_pending=1,barrier=1) | | | wait_barrier | | (nr_waiting=1,barrier=1) allow_barrier | | (nr_pending=0) | | | | | conf->retry_list | | | | wait_barrier (nr_waiting=2,barrier=1)
[ 1452.065519] INFO: task md0_raid10:381 blocked for more than 120 seconds. [ 1452.065852] Tainted: G OE K 4.19.280 #2 [ 1452.066018] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1452.066189] md0_raid10 D 0 381 2 0x80000000 [ 1452.066191] Call Trace: [ 1452.066197] __schedule+0x3f8/0x8b0 [ 1452.066199] schedule+0x36/0x80 [ 1452.066201] wait_barrier+0x150/0x1b0 [ 1452.066203] ? wait_woken+0x80/0x80 [ 1452.066205] raid10_read_request+0xa8/0x510 [ 1452.066206] handle_read_error+0xa9/0x220 [ 1452.066207] ? pick_next_task_fair+0x15d/0x610 [ 1452.066208] raid10d+0xa01/0x1510 [ 1452.066210] ? schedule+0x36/0x80 [ 1452.066211] md_thread+0x133/0x180 [ 1452.066212] ? md_thread+0x133/0x180 [ 1452.066213] ? wait_woken+0x80/0x80 [ 1452.066214] kthread+0x105/0x140
Signed-off-by: linminggui <linminggui1@bigo.sg> --- drivers/md/raid10.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9f9cd2f..9f00400 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1137,6 +1137,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; + bool error_retry = false;
if (slot >= 0 && r10_bio->devs[slot].rdev) { /* @@ -1153,6 +1154,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, */ gfp = GFP_NOIO | __GFP_HIGH;
+ error_retry = true; + atomic_inc(&conf->nr_pending); + rcu_read_lock(); disk = r10_bio->devs[slot].devnum; err_rdev = rcu_dereference(conf->mirrors[disk].rdev); @@ -1169,8 +1173,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. + * Ignore barrier if this is an error retry. */ - wait_barrier(conf); + if (!error_retry) + wait_barrier(conf);
sectors = r10_bio->sectors; while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && @@ -1181,12 +1187,14 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, * pass */ raid10_log(conf->mddev, "wait reshape"); - allow_barrier(conf); + if (!error_retry) + allow_barrier(conf); wait_event(conf->wait_barrier, conf->reshape_progress <= bio->bi_iter.bi_sector || conf->reshape_progress >= bio->bi_iter.bi_sector + sectors); - wait_barrier(conf); + if (!error_retry) + wait_barrier(conf); }
rdev = read_balance(conf, r10_bio, &max_sectors); @@ -1208,9 +1216,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct bio *split = bio_split(bio, max_sectors, gfp, &conf->bio_split); bio_chain(split, bio); - allow_barrier(conf); + if (!error_retry) + allow_barrier(conf); generic_make_request(bio); - wait_barrier(conf); + if (!error_retry) + wait_barrier(conf); bio = split; r10_bio->master_bio = bio; r10_bio->sectors = max_sectors;