[PATCH 5.15 195/476] md: When assembling the array, consult the superblock of the freshest device

21 Feb 2024

5.15-stable review patch.  If anyone has any objections, please let me know.
------------------
From: Alex Lyakas <alex.lyakas@zadara.com>
[ Upstream commit dc1cc22ed58f11d58d8553c5ec5f11cbfc3e3039 ]
When assembling the array, both the kernel and mdadm allow devices to have an
event counter difference of 1 and still consider them up-to-date.
However, a device whose event count is behind by 1 may in fact not be up-to-date,
and an array resync with such a device may cause data corruption.
To avoid this, consult the superblock of the freshest device about the status
of any device whose event counter is behind by 1.
Signed-off-by: Alex Lyakas <alex.lyakas@zadara.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/1702470271-16073-1-git-send-email-alex.lyakas@zada...
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/md/md.c | 54 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 10 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index aae9ec78c0e8..ff65e5eddfa1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1162,6 +1162,7 @@ struct super_type  {
    				  struct md_rdev *refdev,
    				  int minor_version);
    int		    (*validate_super)(struct mddev *mddev,
+					      struct md_rdev *freshest,
    				      struct md_rdev *rdev);
    void		    (*sync_super)(struct mddev *mddev,
    				  struct md_rdev *rdev);
@@ -1300,8 +1301,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
/*
  * validate_super for 0.90.0
+ * note: we are not using "freshest" for 0.9 superblock
  */
-static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
 {
    mdp_disk_t *desc;
    mdp_super_t *sb = page_address(rdev->sb_page);
@@ -1816,7 +1818,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
    return ret;
 }
-static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
 {
    struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
    __u64 ev1 = le64_to_cpu(sb->events);
@@ -1912,13 +1914,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
    	}
    } else if (mddev->pers == NULL) {
    	/* Insist of good event counter while assembling, except for
-		 * spares (which don't need an event count) */
-		++ev1;
+		 * spares (which don't need an event count).
+		 * Similar to mdadm, we allow event counter difference of 1
+		 * from the freshest device.
+		 */
    	if (rdev->desc_nr >= 0 &&
    	    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
    	    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
    	     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
-			if (ev1 < mddev->events)
+			if (ev1 + 1 < mddev->events)
    			return -EINVAL;
    } else if (mddev->bitmap) {
    	/* If adding to array with a bitmap, then we can accept an
@@ -1939,8 +1943,38 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
    	    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
    		role = MD_DISK_ROLE_SPARE;
    		rdev->desc_nr = -1;
-		} else
+		} else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
+			/*
+			 * If we are assembling, and our event counter is smaller than the
+			 * highest event counter, we cannot trust our superblock about the role.
+			 * It could happen that our rdev was marked as Faulty, and all other
+			 * superblocks were updated with +1 event counter.
+			 * Then, before the next superblock update, which typically happens when
+			 * remove_and_add_spares() removes the device from the array, there was
+			 * a crash or reboot.
+			 * If we allow current rdev without consulting the freshest superblock,
+			 * we could cause data corruption.
+			 * Note that in this case our event counter is smaller by 1 than the
+			 * highest, otherwise, this rdev would not be allowed into array;
+			 * both kernel and mdadm allow event counter difference of 1.
+			 */
+			struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
+			u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
+
+			if (rdev->desc_nr >= freshest_max_dev) {
+				/* this is unexpected, better not proceed */
+				pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
+						mdname(mddev), rdev->bdev, rdev->desc_nr,
+						freshest->bdev, freshest_max_dev);
+				return -EUCLEAN;
+			}
+
+			role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
+			pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
+				     mdname(mddev), rdev->bdev, role, role, freshest->bdev);
+		} else {
    		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+		}
    	switch(role) {
    	case MD_DISK_ROLE_SPARE: /* spare */
    		break;
@@ -2890,7 +2924,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
    	 * and should be added immediately.
    	 */
    	super_types[mddev->major_version].
-			validate_super(mddev, rdev);
+			validate_super(mddev, NULL/*freshest*/, rdev);
    	if (add_journal)
    		mddev_suspend(mddev);
    	err = mddev->pers->hot_add_disk(mddev, rdev);
@@ -3809,7 +3843,7 @@ static int analyze_sbs(struct mddev *mddev)
    }
super_types[mddev->major_version].
-		validate_super(mddev, freshest);
+		validate_super(mddev, NULL/*freshest*/, freshest);
i = 0;
    rdev_for_each_safe(rdev, tmp, mddev) {
@@ -3824,7 +3858,7 @@ static int analyze_sbs(struct mddev *mddev)
    	}
    	if (rdev != freshest) {
    		if (super_types[mddev->major_version].
-			    validate_super(mddev, rdev)) {
+			    validate_super(mddev, freshest, rdev)) {
    			pr_warn("md: kicking non-fresh %s from array!\n",
    				bdevname(rdev->bdev,b));
    			md_kick_rdev_from_array(rdev);
@@ -6795,7 +6829,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
    		rdev->saved_raid_disk = rdev->raid_disk;
    	} else
    		super_types[mddev->major_version].
-				validate_super(mddev, rdev);
+				validate_super(mddev, NULL/*freshest*/, rdev);
    	if ((info->state & (1<<MD_DISK_SYNC)) &&
    	     rdev->raid_disk != info->raid_disk) {
    		/* This was a hot-add request, but events doesn't
-- 
2.43.0





    

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

[PATCH 5.15 195/476] md: When assembling the array, consult the superblock of the freshest device