[PATCH 6.1 01/23] xfs: fix interval filtering in multi-step fsmap queries

11 Jun 2025

From: "Darrick J. Wong" <djwong@kernel.org>
[ Upstream commit 63ef7a35912dd743cabd65d5bb95891625c0dd46 ]
I noticed a bug in ranged GETFSMAP queries:
# xfs_io -c 'fsmap -vvvv' /opt
 EXT: DEV  BLOCK-RANGE           OWNER              FILE-OFFSET      AG AG-OFFSET           TOTAL
   0: 8:80 [0..7]:               static fs metadata                  0  (0..7)                  8
<snip>
   9: 8:80 [192..223]:           137                0..31            0  (192..223)             32
# xfs_io -c 'fsmap -vvvv -d 208 208' /opt
#
That's not right -- we asked which mapping covers block 208, and we should've
received a mapping for inode 137 offset 16.  Instead, we get nothing.
The root cause of this problem is a mis-interaction between the fsmap
code and how btree ranged queries work.  xfs_btree_query_range returns
any btree record that overlaps with the query interval, even if the
record starts before or ends after the interval.  Similarly, GETFSMAP is
supposed to return a recordset containing all records that overlap the
range queried.
However, it's possible that the recordset is larger than the buffer that
the caller provided to convey mappings to userspace.  In /that/ case,
userspace is supposed to copy the last record returned to fmh_keys[0]
and call GETFSMAP again.  In this case, we do not want to return
mappings that we have already supplied to the caller.  The call to
xfs_btree_query_range is the same, but now we ignore any records that
start before fmh_keys[0].
Unfortunately, we didn't implement the filtering predicate correctly.
The predicate should only be called when we're calling back for more
records.  Accomplish this by setting info->low.rm_blockcount to a
nonzero value and ensuring that it is cleared as necessary.  As a
result, we no longer want to adjust dkeys[0] in the main setup function
because that's confusing.
This patch doesn't touch the logdev/rtbitmap backends because they have
bigger problems that will be addressed by subsequent patches.
Found via xfs/556 with parent pointers enabled.
Fixes: e89c041338ed ("xfs: implement the GETFSMAP ioctl")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
Acked-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/xfs/xfs_fsmap.c | 67 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 19 deletions(-)

diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index a5b9754c62d1..2011f1bf7ce0 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -160,11 +160,18 @@ struct xfs_getfsmap_info {
    struct xfs_buf		*agf_bp;	/* AGF, for refcount queries */
    struct xfs_perag	*pag;		/* AG info, if applicable */
    xfs_daddr_t		next_daddr;	/* next daddr we expect */
    u64			missing_owner;	/* owner of holes */
    u32			dev;		/* device id */
-	struct xfs_rmap_irec	low;		/* low rmap key */
+	/*
+	 * Low rmap key for the query.  If low.rm_blockcount is nonzero, this
+	 * is the second (or later) call to retrieve the recordset in pieces.
+	 * xfs_getfsmap_rec_before_start will compare all records retrieved
+	 * by the rmapbt query to filter out any records that start before
+	 * the last record.
+	 */
+	struct xfs_rmap_irec	low;
    struct xfs_rmap_irec	high;		/* high rmap key */
    bool			last;		/* last extent? */
 };
/* Associate a device with a getfsmap handler. */
@@ -235,34 +242,45 @@ xfs_getfsmap_format(
rec = &info->fsmap_recs[info->head->fmh_entries++];
    xfs_fsmap_from_internal(rec, xfm);
 }
+static inline bool
+xfs_getfsmap_rec_before_start(
+	struct xfs_getfsmap_info	*info,
+	const struct xfs_rmap_irec	*rec,
+	xfs_daddr_t			rec_daddr)
+{
+	if (info->low.rm_blockcount)
+		return xfs_rmap_compare(rec, &info->low) < 0;
+	return false;
+}
+
 /*
  * Format a reverse mapping for getfsmap, having translated rm_startblock
  * into the appropriate daddr units.
  */
 STATIC int
 xfs_getfsmap_helper(
    struct xfs_trans		*tp,
    struct xfs_getfsmap_info	*info,
    const struct xfs_rmap_irec	*rec,
    xfs_daddr_t			rec_daddr)
 {
    struct xfs_fsmap		fmr;
    struct xfs_mount		*mp = tp->t_mountp;
    bool				shared;
    int				error;
if (fatal_signal_pending(current))
    	return -EINTR;
/*
     * Filter out records that start before our startpoint, if the
     * caller requested that.
     */
-	if (xfs_rmap_compare(rec, &info->low) < 0) {
+	if (xfs_getfsmap_rec_before_start(info, rec, rec_daddr)) {
    	rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
    	if (info->next_daddr < rec_daddr)
    		info->next_daddr = rec_daddr;
    	return 0;
    }
@@ -604,13 +622,31 @@ __xfs_getfsmap_datadev(
    info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
    info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
    error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
    if (error)
    	return error;
-	info->low.rm_blockcount = 0;
+	info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
    xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+	/* Adjust the low key if we are continuing from where we left off. */
+	if (info->low.rm_blockcount == 0) {
+		/* empty */
+	} else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) ||
+		   (info->low.rm_flags & (XFS_RMAP_ATTR_FORK |
+					  XFS_RMAP_BMBT_BLOCK |
+					  XFS_RMAP_UNWRITTEN))) {
+		info->low.rm_startblock += info->low.rm_blockcount;
+		info->low.rm_owner = 0;
+		info->low.rm_offset = 0;
+
+		start_fsb += info->low.rm_blockcount;
+		if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs)
+			return 0;
+	} else {
+		info->low.rm_offset += info->low.rm_blockcount;
+	}
+
    info->high.rm_startblock = -1U;
    info->high.rm_owner = ULLONG_MAX;
    info->high.rm_offset = ULLONG_MAX;
    info->high.rm_blockcount = 0;
    info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
@@ -657,16 +693,12 @@ __xfs_getfsmap_datadev(
/*
    	 * Set the AG low key to the start of the AG prior to
    	 * moving on to the next AG.
    	 */
-		if (pag->pag_agno == start_ag) {
-			info->low.rm_startblock = 0;
-			info->low.rm_owner = 0;
-			info->low.rm_offset = 0;
-			info->low.rm_flags = 0;
-		}
+		if (pag->pag_agno == start_ag)
+			memset(&info->low, 0, sizeof(info->low));
/*
    	 * If this is the last AG, report any gap at the end of it
    	 * before we drop the reference to the perag when the loop
    	 * terminates.
@@ -899,25 +931,21 @@ xfs_getfsmap(
     *
     * If the low key mapping refers to file data, the same physical
     * blocks could be mapped to several other files/offsets.
     * According to rmapbt record ordering, the minimal next
     * possible record for the block range is the next starting
-	 * offset in the same inode. Therefore, bump the file offset to
-	 * continue the search appropriately.  For all other low key
-	 * mapping types (attr blocks, metadata), bump the physical
-	 * offset as there can be no other mapping for the same physical
-	 * block range.
+	 * offset in the same inode. Therefore, each fsmap backend bumps
+	 * the file offset to continue the search appropriately.  For
+	 * all other low key mapping types (attr blocks, metadata), each
+	 * fsmap backend bumps the physical offset as there can be no
+	 * other mapping for the same physical block range.
     */
    dkeys[0] = head->fmh_keys[0];
    if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) {
-		dkeys[0].fmr_physical += dkeys[0].fmr_length;
-		dkeys[0].fmr_owner = 0;
    	if (dkeys[0].fmr_offset)
    		return -EINVAL;
-	} else
-		dkeys[0].fmr_offset += dkeys[0].fmr_length;
-	dkeys[0].fmr_length = 0;
+	}
    memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap));
if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
    	return -EINVAL;
@@ -958,10 +986,11 @@ xfs_getfsmap(
    		break;
info.dev = handlers[i].dev;
    	info.last = false;
    	info.pag = NULL;
+		info.low.rm_blockcount = 0;
    	error = handlers[i].fn(tp, dkeys, &info);
    	if (error)
    		break;
    	xfs_trans_cancel(tp);
    	tp = NULL;
-- 
2.50.0.rc1.591.g9c95f17f64-goog



    

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

[PATCH 6.1 01/23] xfs: fix interval filtering in multi-step fsmap queries