From: "Darrick J. Wong" djwong@kernel.org
[ Upstream commit 63ef7a35912dd743cabd65d5bb95891625c0dd46 ]
I noticed a bug in ranged GETFSMAP queries:
# xfs_io -c 'fsmap -vvvv' /opt EXT: DEV BLOCK-RANGE OWNER FILE-OFFSET AG AG-OFFSET TOTAL 0: 8:80 [0..7]: static fs metadata 0 (0..7) 8 <snip> 9: 8:80 [192..223]: 137 0..31 0 (192..223) 32 # xfs_io -c 'fsmap -vvvv -d 208 208' /opt #
That's not right -- we asked what block maps block 208, and we should've received a mapping for inode 137 offset 16. Instead, we get nothing.
The root cause of this problem is a mis-interaction between the fsmap code and how btree ranged queries work. xfs_btree_query_range returns any btree record that overlaps with the query interval, even if the record starts before or ends after the interval. Similarly, GETFSMAP is supposed to return a recordset containing all records that overlap the range queried.
However, it's possible that the recordset is larger than the buffer that the caller provided to convey mappings to userspace. In /that/ case, userspace is supposed to copy the last record returned to fmh_keys[0] and call GETFSMAP again. In this case, we do not want to return mappings that we have already supplied to the caller. The call to xfs_btree_query_range is the same, but now we ignore any records that start before fmh_keys[0].
Unfortunately, we didn't implement the filtering predicate correctly. The predicate should only be called when we're calling back for more records. Accomplish this by setting info->low.rm_blockcount to a nonzero value and ensuring that it is cleared as necessary. As a result, we no longer want to adjust dkeys[0] in the main setup function because that's confusing.
This patch doesn't touch the logdev/rtbitmap backends because they have bigger problems that will be addressed by subsequent patches.
Found via xfs/556 with parent pointers enabled.
Fixes: e89c041338ed ("xfs: implement the GETFSMAP ioctl") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: "Darrick J. Wong" djwong@kernel.org --- fs/xfs/xfs_fsmap.c | 67 +++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 19 deletions(-)
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index a5b9754c62d1..2011f1bf7ce0 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -160,11 +160,18 @@ struct xfs_getfsmap_info { struct xfs_buf *agf_bp; /* AGF, for refcount queries */ struct xfs_perag *pag; /* AG info, if applicable */ xfs_daddr_t next_daddr; /* next daddr we expect */ u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ - struct xfs_rmap_irec low; /* low rmap key */ + /* + * Low rmap key for the query. If low.rm_blockcount is nonzero, this + * is the second (or later) call to retrieve the recordset in pieces. + * xfs_getfsmap_rec_before_start will compare all records retrieved + * by the rmapbt query to filter out any records that start before + * the last record. + */ + struct xfs_rmap_irec low; struct xfs_rmap_irec high; /* high rmap key */ bool last; /* last extent? */ };
/* Associate a device with a getfsmap handler. */ @@ -235,34 +242,45 @@ xfs_getfsmap_format(
rec = &info->fsmap_recs[info->head->fmh_entries++]; xfs_fsmap_from_internal(rec, xfm); }
+static inline bool +xfs_getfsmap_rec_before_start( + struct xfs_getfsmap_info *info, + const struct xfs_rmap_irec *rec, + xfs_daddr_t rec_daddr) +{ + if (info->low.rm_blockcount) + return xfs_rmap_compare(rec, &info->low) < 0; + return false; +} + /* * Format a reverse mapping for getfsmap, having translated rm_startblock * into the appropriate daddr units. */ STATIC int xfs_getfsmap_helper( struct xfs_trans *tp, struct xfs_getfsmap_info *info, const struct xfs_rmap_irec *rec, xfs_daddr_t rec_daddr) { struct xfs_fsmap fmr; struct xfs_mount *mp = tp->t_mountp; bool shared; int error;
if (fatal_signal_pending(current)) return -EINTR;
/* * Filter out records that start before our startpoint, if the * caller requested that. */ - if (xfs_rmap_compare(rec, &info->low) < 0) { + if (xfs_getfsmap_rec_before_start(info, rec, rec_daddr)) { rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; } @@ -604,13 +622,31 @@ __xfs_getfsmap_datadev( info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); if (error) return error; - info->low.rm_blockcount = 0; + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+ /* Adjust the low key if we are continuing from where we left off. */ + if (info->low.rm_blockcount == 0) { + /* empty */ + } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) || + (info->low.rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN))) { + info->low.rm_startblock += info->low.rm_blockcount; + info->low.rm_owner = 0; + info->low.rm_offset = 0; + + start_fsb += info->low.rm_blockcount; + if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs) + return 0; + } else { + info->low.rm_offset += info->low.rm_blockcount; + } + info->high.rm_startblock = -1U; info->high.rm_owner = ULLONG_MAX; info->high.rm_offset = ULLONG_MAX; info->high.rm_blockcount = 0; info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; @@ -657,16 +693,12 @@ __xfs_getfsmap_datadev(
/* * Set the AG low key to the start of the AG prior to * moving on to the next AG. */ - if (pag->pag_agno == start_ag) { - info->low.rm_startblock = 0; - info->low.rm_owner = 0; - info->low.rm_offset = 0; - info->low.rm_flags = 0; - } + if (pag->pag_agno == start_ag) + memset(&info->low, 0, sizeof(info->low));
/* * If this is the last AG, report any gap at the end of it * before we drop the reference to the perag when the loop * terminates. @@ -899,25 +931,21 @@ xfs_getfsmap( * * If the low key mapping refers to file data, the same physical * blocks could be mapped to several other files/offsets. * According to rmapbt record ordering, the minimal next * possible record for the block range is the next starting - * offset in the same inode. Therefore, bump the file offset to - * continue the search appropriately. For all other low key - * mapping types (attr blocks, metadata), bump the physical - * offset as there can be no other mapping for the same physical - * block range. + * offset in the same inode. Therefore, each fsmap backend bumps + * the file offset to continue the search appropriately. For + * all other low key mapping types (attr blocks, metadata), each + * fsmap backend bumps the physical offset as there can be no + * other mapping for the same physical block range. */ dkeys[0] = head->fmh_keys[0]; if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { - dkeys[0].fmr_physical += dkeys[0].fmr_length; - dkeys[0].fmr_owner = 0; if (dkeys[0].fmr_offset) return -EINVAL; - } else - dkeys[0].fmr_offset += dkeys[0].fmr_length; - dkeys[0].fmr_length = 0; + } memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap));
if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) return -EINVAL;
@@ -958,10 +986,11 @@ xfs_getfsmap( break;
info.dev = handlers[i].dev; info.last = false; info.pag = NULL; + info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); if (error) break; xfs_trans_cancel(tp); tp = NULL;