Hi Greg,
This 5.4.y backport series contains XFS fixes from v5.10. The patchset has been acked by Darrick.
Brian Foster (1):
  xfs: sync lazy sb accounting on quiesce of read-only mounts

Christoph Hellwig (8):
  xfs: remove the xfs_efi_log_item_t typedef
  xfs: remove the xfs_efd_log_item_t typedef
  xfs: remove the xfs_inode_log_item_t typedef
  xfs: factor out a xfs_defer_create_intent helper
  xfs: merge the ->log_item defer op into ->create_intent
  xfs: merge the ->diff_items defer op into ->create_intent
  xfs: turn dfp_intent into a xfs_log_item
  xfs: refactor xfs_defer_finish_noroll

Darrick J. Wong (15):
  xfs: log new intent items created as part of finishing recovered intent items
  xfs: proper replay of deferred ops queued during log recovery
  xfs: xfs_defer_capture should absorb remaining block reservations
  xfs: xfs_defer_capture should absorb remaining transaction reservation
  xfs: clean up bmap intent item recovery checking
  xfs: clean up xfs_bui_item_recover iget/trans_alloc/ilock ordering
  xfs: fix an incore inode UAF in xfs_bui_recover
  xfs: change the order in which child and parent defer ops are finished
  xfs: periodically relog deferred intent items
  xfs: expose the log push threshold
  xfs: only relog deferred intent items if free space in the log gets low
  xfs: fix missing CoW blocks writeback conversion retry
  xfs: ensure inobt record walks always make forward progress
  xfs: fix the forward progress assertion in xfs_iwalk_run_callbacks
  xfs: prevent UAF in xfs_log_item_in_current_chkpt

Dave Chinner (1):
  xfs: fix finobt btree block recovery ordering
 fs/xfs/libxfs/xfs_defer.c       | 358 ++++++++++++++++++++++++--------
 fs/xfs/libxfs/xfs_defer.h       |  49 ++++-
 fs/xfs/libxfs/xfs_inode_fork.c  |   2 +-
 fs/xfs/libxfs/xfs_trans_inode.c |   2 +-
 fs/xfs/xfs_aops.c               |   4 +-
 fs/xfs/xfs_bmap_item.c          | 238 +++++++++++----------
 fs/xfs/xfs_bmap_item.h          |   3 +-
 fs/xfs/xfs_extfree_item.c       | 175 +++++++++-------
 fs/xfs/xfs_extfree_item.h       |  18 +-
 fs/xfs/xfs_icreate_item.c       |   1 +
 fs/xfs/xfs_inode.c              |   4 +-
 fs/xfs/xfs_inode_item.c         |   2 +-
 fs/xfs/xfs_inode_item.h         |   4 +-
 fs/xfs/xfs_iwalk.c              |  27 ++-
 fs/xfs/xfs_log.c                |  68 ++++--
 fs/xfs/xfs_log.h                |   3 +
 fs/xfs/xfs_log_cil.c            |   8 +-
 fs/xfs/xfs_log_recover.c        | 160 ++++++++------
 fs/xfs/xfs_mount.c              |   3 +-
 fs/xfs/xfs_refcount_item.c      | 173 ++++++++-------
 fs/xfs/xfs_refcount_item.h      |   3 +-
 fs/xfs/xfs_rmap_item.c          | 161 +++++++-------
 fs/xfs/xfs_rmap_item.h          |   3 +-
 fs/xfs/xfs_stats.c              |   4 +
 fs/xfs/xfs_stats.h              |   1 +
 fs/xfs/xfs_super.c              |   8 +-
 fs/xfs/xfs_trace.h              |   1 +
 fs/xfs/xfs_trans.h              |  10 +
 28 files changed, 946 insertions(+), 547 deletions(-)
From: Christoph Hellwig <hch@lst.de>
commit 82ff450b2d936d778361a1de43eb078cc043c7fe upstream.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_extfree_item.c |  2 +-
 fs/xfs/xfs_extfree_item.h | 10 +++++-----
 fs/xfs/xfs_log_recover.c  |  4 ++--
 fs/xfs/xfs_super.c        |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a05a1074e8f8..d3ee862086fb 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -161,7 +161,7 @@ xfs_efi_init(

 	ASSERT(nextents > 0);
 	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
-		size = (uint)(sizeof(xfs_efi_log_item_t) +
+		size = (uint)(sizeof(struct xfs_efi_log_item) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
 		efip = kmem_zalloc(size, 0);
 	} else {
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 16aaab06d4ec..b9b567f35575 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -50,13 +50,13 @@ struct kmem_zone;
  * of commit failure or log I/O errors. Note that the EFD is not inserted in the
  * AIL, so at this point both the EFI and EFD are freed.
  */
-typedef struct xfs_efi_log_item {
+struct xfs_efi_log_item {
 	struct xfs_log_item	efi_item;
 	atomic_t		efi_refcount;
 	atomic_t		efi_next_extent;
 	unsigned long		efi_flags;	/* misc flags */
 	xfs_efi_log_format_t	efi_format;
-} xfs_efi_log_item_t;
+};

 /*
  * This is the "extent free done" log item.  It is used to log
@@ -65,7 +65,7 @@ typedef struct xfs_efi_log_item {
  */
 typedef struct xfs_efd_log_item {
 	struct xfs_log_item	efd_item;
-	xfs_efi_log_item_t	*efd_efip;
+	struct xfs_efi_log_item *efd_efip;
 	uint			efd_next_extent;
 	xfs_efd_log_format_t	efd_format;
 } xfs_efd_log_item_t;
@@ -78,10 +78,10 @@ typedef struct xfs_efd_log_item {
 extern struct kmem_zone	*xfs_efi_zone;
 extern struct kmem_zone	*xfs_efd_zone;

-xfs_efi_log_item_t	*xfs_efi_init(struct xfs_mount *, uint);
+struct xfs_efi_log_item	*xfs_efi_init(struct xfs_mount *, uint);
 int			xfs_efi_copy_format(xfs_log_iovec_t *buf,
 					    xfs_efi_log_format_t *dst_efi_fmt);
-void			xfs_efi_item_free(xfs_efi_log_item_t *);
+void			xfs_efi_item_free(struct xfs_efi_log_item *);
 void			xfs_efi_release(struct xfs_efi_log_item *);

 int			xfs_efi_recover(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 46b1e255f55f..cffa9b695de8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3384,7 +3384,7 @@ xlog_recover_efd_pass2(
 	struct xlog_recover_item	*item)
 {
 	xfs_efd_log_format_t	*efd_formatp;
-	xfs_efi_log_item_t	*efip = NULL;
+	struct xfs_efi_log_item	*efip = NULL;
 	struct xfs_log_item	*lip;
 	uint64_t		efi_id;
 	struct xfs_ail_cursor	cur;
@@ -3405,7 +3405,7 @@ xlog_recover_efd_pass2(
 	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
 	while (lip != NULL) {
 		if (lip->li_type == XFS_LI_EFI) {
-			efip = (xfs_efi_log_item_t *)lip;
+			efip = (struct xfs_efi_log_item *)lip;
 			if (efip->efi_format.efi_id == efi_id) {
 				/*
 				 * Drop the EFD reference to the EFI. This
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f1407900aeef..b86612699a15 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1920,7 +1920,7 @@ xfs_init_zones(void)
 	if (!xfs_efd_zone)
 		goto out_destroy_buf_item_zone;

-	xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+	xfs_efi_zone = kmem_zone_init((sizeof(struct xfs_efi_log_item) +
 			((XFS_EFI_MAX_FAST_EXTENTS - 1) *
 				sizeof(xfs_extent_t))),
 			"xfs_efi_item");
 	if (!xfs_efi_zone)
From: Christoph Hellwig <hch@lst.de>
commit c84e819090f39e96e4d432c9047a50d2424f99e0 upstream.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_extfree_item.h | 4 ++--
 fs/xfs/xfs_super.c        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index b9b567f35575..a2a736a77fa9 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -63,12 +63,12 @@ struct xfs_efi_log_item {
  * the fact that some extents earlier mentioned in an efi item
  * have been freed.
  */
-typedef struct xfs_efd_log_item {
+struct xfs_efd_log_item {
 	struct xfs_log_item	efd_item;
 	struct xfs_efi_log_item *efd_efip;
 	uint			efd_next_extent;
 	xfs_efd_log_format_t	efd_format;
-} xfs_efd_log_item_t;
+};

 /*
  * Max number of extents in fast allocation path.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b86612699a15..9b2d7e4e263e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1914,7 +1914,7 @@ xfs_init_zones(void)
 	if (!xfs_buf_item_zone)
 		goto out_destroy_trans_zone;

-	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+	xfs_efd_zone = kmem_zone_init((sizeof(struct xfs_efd_log_item) +
 			((XFS_EFD_MAX_FAST_EXTENTS - 1) *
 				sizeof(xfs_extent_t))),
 			"xfs_efd_item");
 	if (!xfs_efd_zone)
From: Christoph Hellwig <hch@lst.de>
commit fd9cbe51215198ccffa64169c98eae35b0916088 upstream.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_inode_fork.c  | 2 +-
 fs/xfs/libxfs/xfs_trans_inode.c | 2 +-
 fs/xfs/xfs_inode.c              | 4 ++--
 fs/xfs/xfs_inode_item.c         | 2 +-
 fs/xfs/xfs_inode_item.h         | 4 ++--
 fs/xfs/xfs_super.c              | 4 ++--
 6 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 15d6f947620f..93357072b19d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -592,7 +592,7 @@ void
 xfs_iflush_fork(
 	xfs_inode_t		*ip,
 	xfs_dinode_t		*dip,
-	xfs_inode_log_item_t	*iip,
+	struct xfs_inode_log_item *iip,
 	int			whichfork)
 {
 	char			*cp;
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 0ba7368b9a5f..1d0e78e0099d 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -27,7 +27,7 @@ xfs_trans_ijoin(
 	struct xfs_inode	*ip,
 	uint			lock_flags)
 {
-	xfs_inode_log_item_t	*iip;
+	struct xfs_inode_log_item *iip;

 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	if (ip->i_itemp == NULL)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e5a90a0b8f8a..02f77a359972 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2555,7 +2555,7 @@ xfs_ifree_cluster(
 	xfs_daddr_t		blkno;
 	xfs_buf_t		*bp;
 	xfs_inode_t		*ip;
-	xfs_inode_log_item_t	*iip;
+	struct xfs_inode_log_item *iip;
 	struct xfs_log_item	*lip;
 	struct xfs_perag	*pag;
 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
@@ -2617,7 +2617,7 @@ xfs_ifree_cluster(
 		 */
 		list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
 			if (lip->li_type == XFS_LI_INODE) {
-				iip = (xfs_inode_log_item_t *)lip;
+				iip = (struct xfs_inode_log_item *)lip;
 				ASSERT(iip->ili_logged == 1);
 				lip->li_cb = xfs_istale_done;
 				xfs_trans_ail_copy_lsn(mp->m_ail,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 76a60526af94..83b8f5655636 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -781,7 +781,7 @@ xfs_iflush_abort(
 	xfs_inode_t		*ip,
 	bool			stale)
 {
-	xfs_inode_log_item_t	*iip = ip->i_itemp;
+	struct xfs_inode_log_item *iip = ip->i_itemp;

 	if (iip) {
 		if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) {
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 07a60e74c39c..ad667fd4ae62 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -13,7 +13,7 @@ struct xfs_bmbt_rec;
 struct xfs_inode;
 struct xfs_mount;

-typedef struct xfs_inode_log_item {
+struct xfs_inode_log_item {
 	struct xfs_log_item	ili_item;	/* common portion */
 	struct xfs_inode	*ili_inode;	/* inode ptr */
 	xfs_lsn_t		ili_flush_lsn;	/* lsn at last flush */
@@ -23,7 +23,7 @@ typedef struct xfs_inode_log_item {
 	unsigned int		ili_last_fields; /* fields when flushed */
 	unsigned int		ili_fields;	/* fields to be logged */
 	unsigned int		ili_fsync_fields; /* logged since last fsync */
-} xfs_inode_log_item_t;
+};

 static inline int xfs_inode_clean(xfs_inode_t *ip)
 {
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9b2d7e4e263e..9e73d2b29911 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1934,8 +1934,8 @@ xfs_init_zones(void)
 		goto out_destroy_efi_zone;

 	xfs_ili_zone =
-		kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
-					KM_ZONE_SPREAD, NULL);
+		kmem_zone_init_flags(sizeof(struct xfs_inode_log_item),
+			"xfs_ili", KM_ZONE_SPREAD, NULL);
 	if (!xfs_ili_zone)
 		goto out_destroy_inode_zone;
 	xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
From: Christoph Hellwig <hch@lst.de>
commit e046e949486ec92d83b2ccdf0e7e9144f74ef028 upstream.
Create a helper that encapsulates the whole logic to create a defer intent. This reorders some of the work that was done, but none of that has an effect on the operation as only fields that don't directly interact are affected.
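As a reading aid, here is the new helper in condensed form, lifted from the hunk below; the comments are added here and are not part of the patch:

static void
xfs_defer_create_intent(
	struct xfs_trans		*tp,
	struct xfs_defer_pending	*dfp,
	bool				sort)
{
	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];
	struct list_head		*li;

	/* Sort the work items only on the initial intent-creation path. */
	if (sort)
		list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);

	/* Log the intent item, then log every work item attached to it. */
	dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
	list_for_each(li, &dfp->dfp_work)
		ops->log_item(tp, dfp->dfp_intent, li);
}

xfs_defer_create_intents() calls this with sort == true, while the relog path in xfs_defer_finish_noroll() passes sort == false, since a work list that is already being finished must not be reordered.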
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_defer.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 8cc3faa62404..a799cd61d85e 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -178,6 +178,23 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
 	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
 };

+static void
+xfs_defer_create_intent(
+	struct xfs_trans		*tp,
+	struct xfs_defer_pending	*dfp,
+	bool				sort)
+{
+	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];
+	struct list_head		*li;
+
+	if (sort)
+		list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
+
+	dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
+	list_for_each(li, &dfp->dfp_work)
+		ops->log_item(tp, dfp->dfp_intent, li);
+}
+
 /*
  * For each pending item in the intake list, log its intent item and the
  * associated extents, then add the entire intake list to the end of
@@ -187,17 +204,11 @@ STATIC void
 xfs_defer_create_intents(
 	struct xfs_trans	*tp)
 {
-	struct list_head	*li;
 	struct xfs_defer_pending	*dfp;
-	const struct xfs_defer_op_type *ops;

 	list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
-		ops = defer_op_types[dfp->dfp_type];
-		dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
 		trace_xfs_defer_create_intent(tp->t_mountp, dfp);
-		list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
-		list_for_each(li, &dfp->dfp_work)
-			ops->log_item(tp, dfp->dfp_intent, li);
+		xfs_defer_create_intent(tp, dfp, true);
 	}
 }

@@ -427,17 +438,13 @@ xfs_defer_finish_noroll(
 		}
 		if (error == -EAGAIN) {
 			/*
-			 * Caller wants a fresh transaction, so log a
-			 * new log intent item to replace the old one
-			 * and roll the transaction.  See "Requesting
-			 * a Fresh Transaction while Finishing
-			 * Deferred Work" above.
+			 * Caller wants a fresh transaction, so log a new log
+			 * intent item to replace the old one and roll the
+			 * transaction.  See "Requesting a Fresh Transaction
+			 * while Finishing Deferred Work" above.
 			 */
-			dfp->dfp_intent = ops->create_intent(*tp,
-					dfp->dfp_count);
 			dfp->dfp_done = NULL;
-			list_for_each(li, &dfp->dfp_work)
-				ops->log_item(*tp, dfp->dfp_intent, li);
+			xfs_defer_create_intent(*tp, dfp, false);
 		} else {
 			/* Done with the dfp, free it. */
 			list_del(&dfp->dfp_list);
From: Christoph Hellwig <hch@lst.de>
commit c1f09188e8de0ae65433cb9c8ace4feb66359bcc upstream.
These are always called together, and by merging them we reduce the number of indirect calls, improve type safety and in general clean up the code a bit.
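The resulting shape of each ->create_intent implementation, condensed here from the BUI hunk below with an added comment (the other three item types follow the same pattern):

STATIC void *
xfs_bmap_update_create_intent(
	struct xfs_trans		*tp,
	struct list_head		*items,
	unsigned int			count)
{
	struct xfs_bui_log_item		*buip = xfs_bui_init(tp->t_mountp);
	struct xfs_bmap_intent		*bmap;

	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);

	xfs_trans_add_item(tp, &buip->bui_item);
	/* Log every queued mapping into the freshly created intent item. */
	list_for_each_entry(bmap, items, bi_list)
		xfs_bmap_update_log_item(tp, buip, bmap);
	return buip;
}

The former ->log_item callbacks survive as ordinary static helpers that take the concrete intent and work item types, so the void pointer casts and container_of() conversions go away.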
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_defer.c  |  6 ++---
 fs/xfs/libxfs/xfs_defer.h  |  4 ++--
 fs/xfs/xfs_bmap_item.c     | 47 +++++++++++++++---------------------
 fs/xfs/xfs_extfree_item.c  | 49 ++++++++++++++++----------------------
 fs/xfs/xfs_refcount_item.c | 48 ++++++++++++++++---------------------
 fs/xfs/xfs_rmap_item.c     | 48 ++++++++++++++++---------------------
 6 files changed, 83 insertions(+), 119 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index a799cd61d85e..081380daa4b3 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -185,14 +185,12 @@ xfs_defer_create_intent(
 	bool				sort)
 {
 	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];
-	struct list_head		*li;

 	if (sort)
 		list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);

-	dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
-	list_for_each(li, &dfp->dfp_work)
-		ops->log_item(tp, dfp->dfp_intent, li);
+	dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
+			dfp->dfp_count);
 }

 /*
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7c28d7608ac6..d6a4577c25b0 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -50,8 +50,8 @@ struct xfs_defer_op_type {
 	void (*finish_cleanup)(struct xfs_trans *, void *, int);
 	void (*cancel_item)(struct list_head *);
 	int (*diff_items)(void *, struct list_head *, struct list_head *);
-	void *(*create_intent)(struct xfs_trans *, uint);
-	void (*log_item)(struct xfs_trans *, void *, struct list_head *);
+	void *(*create_intent)(struct xfs_trans *tp, struct list_head *items,
+			unsigned int count);
 	unsigned int		max_items;
 };

diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 243e5e0f82a3..b6f9aa73f000 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -278,27 +278,6 @@ xfs_bmap_update_diff_items(
 	return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
 }

-/* Get an BUI. */
-STATIC void *
-xfs_bmap_update_create_intent(
-	struct xfs_trans		*tp,
-	unsigned int			count)
-{
-	struct xfs_bui_log_item		*buip;
-
-	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
-	ASSERT(tp != NULL);
-
-	buip = xfs_bui_init(tp->t_mountp);
-	ASSERT(buip != NULL);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	xfs_trans_add_item(tp, &buip->bui_item);
-	return buip;
-}
-
 /* Set the map extent flags for this mapping. */
 static void
 xfs_trans_set_bmap_flags(
@@ -326,16 +305,12 @@ xfs_trans_set_bmap_flags(
 STATIC void
 xfs_bmap_update_log_item(
 	struct xfs_trans		*tp,
-	void				*intent,
-	struct list_head		*item)
+	struct xfs_bui_log_item		*buip,
+	struct xfs_bmap_intent		*bmap)
 {
-	struct xfs_bui_log_item		*buip = intent;
-	struct xfs_bmap_intent		*bmap;
 	uint				next_extent;
 	struct xfs_map_extent		*map;

-	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);

@@ -355,6 +330,23 @@ xfs_bmap_update_log_item(
 			bmap->bi_bmap.br_state);
 }

+STATIC void *
+xfs_bmap_update_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count)
+{
+	struct xfs_bui_log_item		*buip = xfs_bui_init(tp->t_mountp);
+	struct xfs_bmap_intent		*bmap;
+
+	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+
+	xfs_trans_add_item(tp, &buip->bui_item);
+	list_for_each_entry(bmap, items, bi_list)
+		xfs_bmap_update_log_item(tp, buip, bmap);
+	return buip;
+}
+
 /* Get an BUD so we can process all the deferred rmap updates. */
 STATIC void *
 xfs_bmap_update_create_done(
@@ -419,7 +411,6 @@ xfs_bmap_update_cancel_item(
 const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
 	.diff_items	= xfs_bmap_update_diff_items,
 	.create_intent	= xfs_bmap_update_create_intent,
 	.abort_intent	= xfs_bmap_update_abort_intent,
-	.log_item	= xfs_bmap_update_log_item,
 	.create_done	= xfs_bmap_update_create_done,
 	.finish_item	= xfs_bmap_update_finish_item,
 	.cancel_item	= xfs_bmap_update_cancel_item,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d3ee862086fb..45bc0a88d942 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -412,41 +412,16 @@ xfs_extent_free_diff_items(
 			XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
 }

-/* Get an EFI. */
-STATIC void *
-xfs_extent_free_create_intent(
-	struct xfs_trans		*tp,
-	unsigned int			count)
-{
-	struct xfs_efi_log_item		*efip;
-
-	ASSERT(tp != NULL);
-	ASSERT(count > 0);
-
-	efip = xfs_efi_init(tp->t_mountp, count);
-	ASSERT(efip != NULL);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	xfs_trans_add_item(tp, &efip->efi_item);
-	return efip;
-}
-
 /* Log a free extent to the intent item. */
 STATIC void
 xfs_extent_free_log_item(
 	struct xfs_trans		*tp,
-	void				*intent,
-	struct list_head		*item)
+	struct xfs_efi_log_item		*efip,
+	struct xfs_extent_free_item	*free)
 {
-	struct xfs_efi_log_item		*efip = intent;
-	struct xfs_extent_free_item	*free;
 	uint				next_extent;
 	struct xfs_extent		*extp;

-	free = container_of(item, struct xfs_extent_free_item, xefi_list);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);

@@ -462,6 +437,24 @@ xfs_extent_free_log_item(
 	extp->ext_len = free->xefi_blockcount;
 }

+STATIC void *
+xfs_extent_free_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_efi_log_item		*efip = xfs_efi_init(mp, count);
+	struct xfs_extent_free_item	*free;
+
+	ASSERT(count > 0);
+
+	xfs_trans_add_item(tp, &efip->efi_item);
+	list_for_each_entry(free, items, xefi_list)
+		xfs_extent_free_log_item(tp, efip, free);
+	return efip;
+}
+
 /* Get an EFD so we can process all the free extents. */
 STATIC void *
 xfs_extent_free_create_done(
@@ -516,7 +509,6 @@ xfs_extent_free_cancel_item(
 const struct xfs_defer_op_type xfs_extent_free_defer_type = {
 	.diff_items	= xfs_extent_free_diff_items,
 	.create_intent	= xfs_extent_free_create_intent,
 	.abort_intent	= xfs_extent_free_abort_intent,
-	.log_item	= xfs_extent_free_log_item,
 	.create_done	= xfs_extent_free_create_done,
 	.finish_item	= xfs_extent_free_finish_item,
 	.cancel_item	= xfs_extent_free_cancel_item,
@@ -582,7 +574,6 @@ xfs_agfl_free_finish_item(
 /* sub-type with special handling for AGFL deferred frees */
 const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
 	.max_items	= XFS_EFI_MAX_FAST_EXTENTS,
 	.diff_items	= xfs_extent_free_diff_items,
 	.create_intent	= xfs_extent_free_create_intent,
 	.abort_intent	= xfs_extent_free_abort_intent,
-	.log_item	= xfs_extent_free_log_item,
 	.create_done	= xfs_extent_free_create_done,
 	.finish_item	= xfs_agfl_free_finish_item,
 	.cancel_item	= xfs_extent_free_cancel_item,
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index d5708d40ad87..254cbb808035 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -284,27 +284,6 @@ xfs_refcount_update_diff_items(
 			XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
 }

-/* Get an CUI. */
-STATIC void *
-xfs_refcount_update_create_intent(
-	struct xfs_trans		*tp,
-	unsigned int			count)
-{
-	struct xfs_cui_log_item		*cuip;
-
-	ASSERT(tp != NULL);
-	ASSERT(count > 0);
-
-	cuip = xfs_cui_init(tp->t_mountp, count);
-	ASSERT(cuip != NULL);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	xfs_trans_add_item(tp, &cuip->cui_item);
-	return cuip;
-}
-
 /* Set the phys extent flags for this reverse mapping. */
 static void
 xfs_trans_set_refcount_flags(
@@ -328,16 +307,12 @@ xfs_trans_set_refcount_flags(
 STATIC void
 xfs_refcount_update_log_item(
 	struct xfs_trans		*tp,
-	void				*intent,
-	struct list_head		*item)
+	struct xfs_cui_log_item		*cuip,
+	struct xfs_refcount_intent	*refc)
 {
-	struct xfs_cui_log_item		*cuip = intent;
-	struct xfs_refcount_intent	*refc;
 	uint				next_extent;
 	struct xfs_phys_extent		*ext;

-	refc = container_of(item, struct xfs_refcount_intent, ri_list);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);

@@ -354,6 +329,24 @@ xfs_refcount_update_log_item(
 	xfs_trans_set_refcount_flags(ext, refc->ri_type);
 }

+STATIC void *
+xfs_refcount_update_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_cui_log_item		*cuip = xfs_cui_init(mp, count);
+	struct xfs_refcount_intent	*refc;
+
+	ASSERT(count > 0);
+
+	xfs_trans_add_item(tp, &cuip->cui_item);
+	list_for_each_entry(refc, items, ri_list)
+		xfs_refcount_update_log_item(tp, cuip, refc);
+	return cuip;
+}
+
 /* Get an CUD so we can process all the deferred refcount updates. */
 STATIC void *
 xfs_refcount_update_create_done(
@@ -432,7 +425,6 @@ xfs_refcount_update_cancel_item(
 const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
 	.diff_items	= xfs_refcount_update_diff_items,
 	.create_intent	= xfs_refcount_update_create_intent,
 	.abort_intent	= xfs_refcount_update_abort_intent,
-	.log_item	= xfs_refcount_update_log_item,
 	.create_done	= xfs_refcount_update_create_done,
 	.finish_item	= xfs_refcount_update_finish_item,
 	.finish_cleanup	= xfs_refcount_update_finish_cleanup,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 02f84d9a511c..adcfbe171d11 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -352,41 +352,16 @@ xfs_rmap_update_diff_items(
 			XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
 }

-/* Get an RUI. */
-STATIC void *
-xfs_rmap_update_create_intent(
-	struct xfs_trans		*tp,
-	unsigned int			count)
-{
-	struct xfs_rui_log_item		*ruip;
-
-	ASSERT(tp != NULL);
-	ASSERT(count > 0);
-
-	ruip = xfs_rui_init(tp->t_mountp, count);
-	ASSERT(ruip != NULL);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	xfs_trans_add_item(tp, &ruip->rui_item);
-	return ruip;
-}
-
 /* Log rmap updates in the intent item. */
 STATIC void
 xfs_rmap_update_log_item(
 	struct xfs_trans		*tp,
-	void				*intent,
-	struct list_head		*item)
+	struct xfs_rui_log_item		*ruip,
+	struct xfs_rmap_intent		*rmap)
 {
-	struct xfs_rui_log_item		*ruip = intent;
-	struct xfs_rmap_intent		*rmap;
 	uint				next_extent;
 	struct xfs_map_extent		*map;

-	rmap = container_of(item, struct xfs_rmap_intent, ri_list);
-
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);

@@ -406,6 +381,24 @@ xfs_rmap_update_log_item(
 			rmap->ri_bmap.br_state);
 }

+STATIC void *
+xfs_rmap_update_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_rui_log_item		*ruip = xfs_rui_init(mp, count);
+	struct xfs_rmap_intent		*rmap;
+
+	ASSERT(count > 0);
+
+	xfs_trans_add_item(tp, &ruip->rui_item);
+	list_for_each_entry(rmap, items, ri_list)
+		xfs_rmap_update_log_item(tp, ruip, rmap);
+	return ruip;
+}
+
 /* Get an RUD so we can process all the deferred rmap updates. */
 STATIC void *
 xfs_rmap_update_create_done(
@@ -476,7 +469,6 @@ xfs_rmap_update_cancel_item(
 const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
 	.diff_items	= xfs_rmap_update_diff_items,
 	.create_intent	= xfs_rmap_update_create_intent,
 	.abort_intent	= xfs_rmap_update_abort_intent,
-	.log_item	= xfs_rmap_update_log_item,
 	.create_done	= xfs_rmap_update_create_done,
 	.finish_item	= xfs_rmap_update_finish_item,
 	.finish_cleanup	= xfs_rmap_update_finish_cleanup,
From: Christoph Hellwig <hch@lst.de>
commit d367a868e46b025a8ced8e00ef2b3a3c2f3bf732 upstream.
This avoids a per-item indirect call, and also simplifies the interface a bit.
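Condensed from the EFI hunk below, the sort now happens inside ->create_intent, keyed off the new bool argument (the comment is added here):

	xfs_trans_add_item(tp, &efip->efi_item);
	/* Sort on first creation only; a relogged list must keep its order. */
	if (sort)
		list_sort(mp, items, xfs_extent_free_diff_items);
	list_for_each_entry(free, items, xefi_list)
		xfs_extent_free_log_item(tp, efip, free);

xfs_defer_create_intent() simply forwards its own sort flag, so the "don't reorder while relogging" behaviour established earlier in the series is preserved.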
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_defer.c  | 5 +----
 fs/xfs/libxfs/xfs_defer.h  | 3 +--
 fs/xfs/xfs_bmap_item.c     | 9 ++++++---
 fs/xfs/xfs_extfree_item.c  | 7 ++++---
 fs/xfs/xfs_refcount_item.c | 6 ++++--
 fs/xfs/xfs_rmap_item.c     | 6 ++++--
 6 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 081380daa4b3..f5a3c5262933 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -186,11 +186,8 @@ xfs_defer_create_intent(
 {
 	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];

-	if (sort)
-		list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
-
 	dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
-			dfp->dfp_count);
+			dfp->dfp_count, sort);
 }

 /*
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index d6a4577c25b0..660f5c3821d6 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -49,9 +49,8 @@ struct xfs_defer_op_type {
 			void **);
 	void (*finish_cleanup)(struct xfs_trans *, void *, int);
 	void (*cancel_item)(struct list_head *);
-	int (*diff_items)(void *, struct list_head *, struct list_head *);
 	void *(*create_intent)(struct xfs_trans *tp, struct list_head *items,
-			unsigned int count);
+			unsigned int count, bool sort);
 	unsigned int		max_items;
 };

diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index b6f9aa73f000..f1d1fee01198 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -334,14 +334,18 @@ STATIC void *
 xfs_bmap_update_create_intent(
 	struct xfs_trans		*tp,
 	struct list_head		*items,
-	unsigned int			count)
+	unsigned int			count,
+	bool				sort)
 {
-	struct xfs_bui_log_item		*buip = xfs_bui_init(tp->t_mountp);
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_bui_log_item		*buip = xfs_bui_init(mp);
 	struct xfs_bmap_intent		*bmap;

 	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);

 	xfs_trans_add_item(tp, &buip->bui_item);
+	if (sort)
+		list_sort(mp, items, xfs_bmap_update_diff_items);
 	list_for_each_entry(bmap, items, bi_list)
 		xfs_bmap_update_log_item(tp, buip, bmap);
 	return buip;
@@ -408,7 +412,6 @@ xfs_bmap_update_cancel_item(

 const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
 	.max_items	= XFS_BUI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_bmap_update_diff_items,
 	.create_intent	= xfs_bmap_update_create_intent,
 	.abort_intent	= xfs_bmap_update_abort_intent,
 	.create_done	= xfs_bmap_update_create_done,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 45bc0a88d942..6667344eda9d 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -441,7 +441,8 @@ STATIC void *
 xfs_extent_free_create_intent(
 	struct xfs_trans		*tp,
 	struct list_head		*items,
-	unsigned int			count)
+	unsigned int			count,
+	bool				sort)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_efi_log_item		*efip = xfs_efi_init(mp, count);
@@ -450,6 +451,8 @@ xfs_extent_free_create_intent(
 	ASSERT(count > 0);

 	xfs_trans_add_item(tp, &efip->efi_item);
+	if (sort)
+		list_sort(mp, items, xfs_extent_free_diff_items);
 	list_for_each_entry(free, items, xefi_list)
 		xfs_extent_free_log_item(tp, efip, free);
 	return efip;
@@ -506,7 +509,6 @@ xfs_extent_free_cancel_item(

 const struct xfs_defer_op_type xfs_extent_free_defer_type = {
 	.max_items	= XFS_EFI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_extent_free_diff_items,
 	.create_intent	= xfs_extent_free_create_intent,
 	.abort_intent	= xfs_extent_free_abort_intent,
 	.create_done	= xfs_extent_free_create_done,
@@ -571,7 +573,6 @@ xfs_agfl_free_finish_item(
 /* sub-type with special handling for AGFL deferred frees */
 const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
 	.max_items	= XFS_EFI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_extent_free_diff_items,
 	.create_intent	= xfs_extent_free_create_intent,
 	.abort_intent	= xfs_extent_free_abort_intent,
 	.create_done	= xfs_extent_free_create_done,
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 254cbb808035..2941b9379843 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -333,7 +333,8 @@ STATIC void *
 xfs_refcount_update_create_intent(
 	struct xfs_trans		*tp,
 	struct list_head		*items,
-	unsigned int			count)
+	unsigned int			count,
+	bool				sort)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_cui_log_item		*cuip = xfs_cui_init(mp, count);
@@ -342,6 +343,8 @@ xfs_refcount_update_create_intent(
 	ASSERT(count > 0);

 	xfs_trans_add_item(tp, &cuip->cui_item);
+	if (sort)
+		list_sort(mp, items, xfs_refcount_update_diff_items);
 	list_for_each_entry(refc, items, ri_list)
 		xfs_refcount_update_log_item(tp, cuip, refc);
 	return cuip;
@@ -422,7 +425,6 @@ xfs_refcount_update_cancel_item(

 const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
 	.max_items	= XFS_CUI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_refcount_update_diff_items,
 	.create_intent	= xfs_refcount_update_create_intent,
 	.abort_intent	= xfs_refcount_update_abort_intent,
 	.create_done	= xfs_refcount_update_create_done,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index adcfbe171d11..2867bb6d17be 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -385,7 +385,8 @@ STATIC void *
 xfs_rmap_update_create_intent(
 	struct xfs_trans		*tp,
 	struct list_head		*items,
-	unsigned int			count)
+	unsigned int			count,
+	bool				sort)
 {
 	struct xfs_mount		*mp = tp->t_mountp;
 	struct xfs_rui_log_item		*ruip = xfs_rui_init(mp, count);
@@ -394,6 +395,8 @@ xfs_rmap_update_create_intent(
 	ASSERT(count > 0);

 	xfs_trans_add_item(tp, &ruip->rui_item);
+	if (sort)
+		list_sort(mp, items, xfs_rmap_update_diff_items);
 	list_for_each_entry(rmap, items, ri_list)
 		xfs_rmap_update_log_item(tp, ruip, rmap);
 	return ruip;
@@ -466,7 +469,6 @@ xfs_rmap_update_cancel_item(

 const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
 	.max_items	= XFS_RUI_MAX_FAST_EXTENTS,
-	.diff_items	= xfs_rmap_update_diff_items,
 	.create_intent	= xfs_rmap_update_create_intent,
 	.abort_intent	= xfs_rmap_update_abort_intent,
 	.create_done	= xfs_rmap_update_create_done,
From: Christoph Hellwig <hch@lst.de>
commit 13a8333339072b8654c1d2c75550ee9f41ee15de upstream.
All defer op instances place their own extension of the log item into the dfp_intent field. Replace that with an xfs_log_item to improve type safety and make the code easier to follow.
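Since every intent type embeds a struct xfs_log_item, the defer ops convert back with their existing container macros; for example, from the BUI hunks below:

STATIC void
xfs_bmap_update_abort_intent(
	struct xfs_log_item	*intent)
{
	xfs_bui_release(BUI_ITEM(intent));
}

and ->create_intent now returns &buip->bui_item instead of the raw buip pointer, with the same conversion applied for EFIs, CUIs and RUIs.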
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_defer.h  | 11 ++++++-----
 fs/xfs/xfs_bmap_item.c     | 12 ++++++------
 fs/xfs/xfs_extfree_item.c  | 12 ++++++------
 fs/xfs/xfs_refcount_item.c | 12 ++++++------
 fs/xfs/xfs_rmap_item.c     | 12 ++++++------
 5 files changed, 30 insertions(+), 29 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 660f5c3821d6..7b6cc3808a91 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -28,7 +28,7 @@ enum xfs_defer_ops_type {
 struct xfs_defer_pending {
 	struct list_head	dfp_list;	/* pending items */
 	struct list_head	dfp_work;	/* work items */
-	void			*dfp_intent;	/* log intent item */
+	struct xfs_log_item	*dfp_intent;	/* log intent item */
 	void			*dfp_done;	/* log done item */
 	unsigned int		dfp_count;	/* # extent items */
 	enum xfs_defer_ops_type	dfp_type;
@@ -43,14 +43,15 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);

 /* Description of a deferred type. */
 struct xfs_defer_op_type {
-	void (*abort_intent)(void *);
-	void *(*create_done)(struct xfs_trans *, void *, unsigned int);
+	struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
+			struct list_head *items, unsigned int count, bool sort);
+	void (*abort_intent)(struct xfs_log_item *intent);
+	void *(*create_done)(struct xfs_trans *tp, struct xfs_log_item *intent,
+			unsigned int count);
 	int (*finish_item)(struct xfs_trans *, struct list_head *, void *,
 			void **);
 	void (*finish_cleanup)(struct xfs_trans *, void *, int);
 	void (*cancel_item)(struct list_head *);
-	void *(*create_intent)(struct xfs_trans *tp, struct list_head *items,
-			unsigned int count, bool sort);
 	unsigned int		max_items;
 };

diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index f1d1fee01198..f4d5c5d661ea 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -330,7 +330,7 @@ xfs_bmap_update_log_item(
 			bmap->bi_bmap.br_state);
 }

-STATIC void *
+static struct xfs_log_item *
 xfs_bmap_update_create_intent(
 	struct xfs_trans		*tp,
 	struct list_head		*items,
@@ -348,17 +348,17 @@ xfs_bmap_update_create_intent(
 		list_sort(mp, items, xfs_bmap_update_diff_items);
 	list_for_each_entry(bmap, items, bi_list)
 		xfs_bmap_update_log_item(tp, buip, bmap);
-	return buip;
+	return &buip->bui_item;
 }

 /* Get an BUD so we can process all the deferred rmap updates. */
 STATIC void *
 xfs_bmap_update_create_done(
 	struct xfs_trans		*tp,
-	void				*intent,
+	struct xfs_log_item		*intent,
 	unsigned int			count)
 {
-	return xfs_trans_get_bud(tp, intent);
+	return xfs_trans_get_bud(tp, BUI_ITEM(intent));
 }

 /* Process a deferred rmap update. */
@@ -394,9 +394,9 @@ xfs_bmap_update_finish_item(
 /* Abort all pending BUIs. */
 STATIC void
 xfs_bmap_update_abort_intent(
-	void				*intent)
+	struct xfs_log_item		*intent)
 {
-	xfs_bui_release(intent);
+	xfs_bui_release(BUI_ITEM(intent));
 }

 /* Cancel a deferred rmap update. */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6667344eda9d..a9316fdb3bb4 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -437,7 +437,7 @@ xfs_extent_free_log_item(
 	extp->ext_len = free->xefi_blockcount;
 }

-STATIC void *
+static struct xfs_log_item *
 xfs_extent_free_create_intent(
 	struct xfs_trans		*tp,
 	struct list_head		*items,
@@ -455,17 +455,17 @@ xfs_extent_free_create_intent(
 		list_sort(mp, items, xfs_extent_free_diff_items);
 	list_for_each_entry(free, items, xefi_list)
 		xfs_extent_free_log_item(tp, efip, free);
-	return efip;
+	return &efip->efi_item;
 }

 /* Get an EFD so we can process all the free extents. */
 STATIC void *
 xfs_extent_free_create_done(
 	struct xfs_trans		*tp,
-	void				*intent,
+	struct xfs_log_item		*intent,
 	unsigned int			count)
 {
-	return xfs_trans_get_efd(tp, intent, count);
+	return xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
 }

 /* Process a free extent. */
@@ -491,9 +491,9 @@ xfs_extent_free_finish_item(
 /* Abort all pending EFIs. */
 STATIC void
 xfs_extent_free_abort_intent(
-	void				*intent)
+	struct xfs_log_item		*intent)
 {
-	xfs_efi_release(intent);
+	xfs_efi_release(EFI_ITEM(intent));
 }

 /* Cancel a free extent. */
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 2941b9379843..a8d6864d58e6 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -329,7 +329,7 @@ xfs_refcount_update_log_item(
 	xfs_trans_set_refcount_flags(ext, refc->ri_type);
 }

-STATIC void *
+static struct xfs_log_item *
 xfs_refcount_update_create_intent(
 	struct xfs_trans		*tp,
 	struct list_head		*items,
@@ -347,17 +347,17 @@ xfs_refcount_update_create_intent(
 		list_sort(mp, items, xfs_refcount_update_diff_items);
 	list_for_each_entry(refc, items, ri_list)
 		xfs_refcount_update_log_item(tp, cuip, refc);
-	return cuip;
+	return &cuip->cui_item;
 }

 /* Get an CUD so we can process all the deferred refcount updates. */
 STATIC void *
 xfs_refcount_update_create_done(
 	struct xfs_trans		*tp,
-	void				*intent,
+	struct xfs_log_item		*intent,
 	unsigned int			count)
 {
-	return xfs_trans_get_cud(tp, intent);
+	return xfs_trans_get_cud(tp, CUI_ITEM(intent));
 }

 /* Process a deferred refcount update. */
@@ -407,9 +407,9 @@ xfs_refcount_update_finish_cleanup(
 /* Abort all pending CUIs. */
 STATIC void
 xfs_refcount_update_abort_intent(
-	void				*intent)
+	struct xfs_log_item		*intent)
 {
-	xfs_cui_release(intent);
+	xfs_cui_release(CUI_ITEM(intent));
 }

 /* Cancel a deferred refcount update. */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 2867bb6d17be..70d58557d779 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -381,7 +381,7 @@ xfs_rmap_update_log_item(
 			rmap->ri_bmap.br_state);
 }

-STATIC void *
+static struct xfs_log_item *
 xfs_rmap_update_create_intent(
 	struct xfs_trans		*tp,
 	struct list_head		*items,
@@ -399,17 +399,17 @@ xfs_rmap_update_create_intent(
 		list_sort(mp, items, xfs_rmap_update_diff_items);
 	list_for_each_entry(rmap, items, ri_list)
 		xfs_rmap_update_log_item(tp, ruip, rmap);
-	return ruip;
+	return &ruip->rui_item;
 }

 /* Get an RUD so we can process all the deferred rmap updates. */
 STATIC void *
 xfs_rmap_update_create_done(
 	struct xfs_trans		*tp,
-	void				*intent,
+	struct xfs_log_item		*intent,
 	unsigned int			count)
 {
-	return xfs_trans_get_rud(tp, intent);
+	return xfs_trans_get_rud(tp, RUI_ITEM(intent));
 }

 /* Process a deferred rmap update. */
@@ -451,9 +451,9 @@ xfs_rmap_update_finish_cleanup(
 /* Abort all pending RUIs. */
 STATIC void
 xfs_rmap_update_abort_intent(
-	void				*intent)
+	struct xfs_log_item		*intent)
 {
-	xfs_rui_release(intent);
+	xfs_rui_release(RUI_ITEM(intent));
 }

 /* Cancel a deferred rmap update. */
From: Christoph Hellwig <hch@lst.de>
commit bb47d79750f1a68a75d4c7defc2da934ba31de14 upstream.
Split out a helper that operates on a single xfs_defer_pending structure to untangle the code.
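The heart of the new helper is the -EAGAIN handling, condensed here from the hunk below with comments added:

	error = ops->finish_item(tp, li, dfp->dfp_done, &state);
	if (error == -EAGAIN) {
		/*
		 * Put the work item back on the list and relog the
		 * intent so a fresh transaction can pick it up again.
		 */
		list_add(li, &dfp->dfp_work);
		dfp->dfp_count++;
		dfp->dfp_done = NULL;
		xfs_defer_create_intent(tp, dfp, false);
	}

With this factored out, xfs_defer_finish_noroll() shrinks to a loop that creates intents, rolls the transaction, and calls xfs_defer_finish_one() on the first pending item, funnelling every failure through a single out_shutdown path.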
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_defer.c | 128 ++++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 69 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index f5a3c5262933..ad7ed5f39d04 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -359,6 +359,53 @@ xfs_defer_cancel_list(
 	}
 }

+/*
+ * Log an intent-done item for the first pending intent, and finish the work
+ * items.
+ */
+static int
+xfs_defer_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_defer_pending	*dfp)
+{
+	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];
+	void				*state = NULL;
+	struct list_head		*li, *n;
+	int				error;
+
+	trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
+
+	dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+	list_for_each_safe(li, n, &dfp->dfp_work) {
+		list_del(li);
+		dfp->dfp_count--;
+		error = ops->finish_item(tp, li, dfp->dfp_done, &state);
+		if (error == -EAGAIN) {
+			/*
+			 * Caller wants a fresh transaction; put the work item
+			 * back on the list and log a new log intent item to
+			 * replace the old one.  See "Requesting a Fresh
+			 * Transaction while Finishing Deferred Work" above.
+			 */
+			list_add(li, &dfp->dfp_work);
+			dfp->dfp_count++;
+			dfp->dfp_done = NULL;
+			xfs_defer_create_intent(tp, dfp, false);
+		}
+
+		if (error)
+			goto out;
+	}
+
+	/* Done with the dfp, free it. */
+	list_del(&dfp->dfp_list);
+	kmem_free(dfp);
+out:
+	if (ops->finish_cleanup)
+		ops->finish_cleanup(tp, state, error);
+	return error;
+}
+
 /*
  * Finish all the pending work.  This involves logging intent items for
  * any work items that wandered in since the last transaction roll (if
@@ -372,11 +419,7 @@ xfs_defer_finish_noroll(
 	struct xfs_trans	**tp)
 {
 	struct xfs_defer_pending	*dfp;
-	struct list_head	*li;
-	struct list_head	*n;
-	void			*state;
 	int			error = 0;
-	const struct xfs_defer_op_type *ops;
 	LIST_HEAD(dop_pending);

 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -385,83 +428,30 @@ xfs_defer_finish_noroll(

 	/* Until we run out of pending work to finish... */
 	while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
-		/* log intents and pull in intake items */
 		xfs_defer_create_intents(*tp);
 		list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);

-		/*
-		 * Roll the transaction.
-		 */
 		error = xfs_defer_trans_roll(tp);
 		if (error)
-			goto out;
+			goto out_shutdown;

-		/* Log an intent-done item for the first pending item. */
 		dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
 				       dfp_list);
-		ops = defer_op_types[dfp->dfp_type];
-		trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
-		dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
-				dfp->dfp_count);
-
-		/* Finish the work items. */
-		state = NULL;
-		list_for_each_safe(li, n, &dfp->dfp_work) {
-			list_del(li);
-			dfp->dfp_count--;
-			error = ops->finish_item(*tp, li, dfp->dfp_done,
-					&state);
-			if (error == -EAGAIN) {
-				/*
-				 * Caller wants a fresh transaction;
-				 * put the work item back on the list
-				 * and jump out.
-				 */
-				list_add(li, &dfp->dfp_work);
-				dfp->dfp_count++;
-				break;
-			} else if (error) {
-				/*
-				 * Clean up after ourselves and jump out.
-				 * xfs_defer_cancel will take care of freeing
-				 * all these lists and stuff.
-				 */
-				if (ops->finish_cleanup)
-					ops->finish_cleanup(*tp, state, error);
-				goto out;
-			}
-		}
-		if (error == -EAGAIN) {
-			/*
-			 * Caller wants a fresh transaction, so log a new log
-			 * intent item to replace the old one and roll the
-			 * transaction.  See "Requesting a Fresh Transaction
-			 * while Finishing Deferred Work" above.
-			 */
-			dfp->dfp_done = NULL;
-			xfs_defer_create_intent(*tp, dfp, false);
-		} else {
-			/* Done with the dfp, free it. */
-			list_del(&dfp->dfp_list);
-			kmem_free(dfp);
-		}
-
-		if (ops->finish_cleanup)
-			ops->finish_cleanup(*tp, state, error);
-	}
-
-out:
-	if (error) {
-		xfs_defer_trans_abort(*tp, &dop_pending);
-		xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
-		trace_xfs_defer_finish_error(*tp, error);
-		xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
-		xfs_defer_cancel(*tp);
-		return error;
+		error = xfs_defer_finish_one(*tp, dfp);
+		if (error && error != -EAGAIN)
+			goto out_shutdown;
 	}

 	trace_xfs_defer_finish_done(*tp, _RET_IP_);
 	return 0;
+
+out_shutdown:
+	xfs_defer_trans_abort(*tp, &dop_pending);
+	xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+	trace_xfs_defer_finish_error(*tp, error);
+	xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
+	xfs_defer_cancel(*tp);
+	return error;
 }

 int
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 93293bcbde93567efaf4e6bcd58cad270e1fcbf5 upstream.
[ Slightly edited fs/xfs/xfs_bmap_item.c and fs/xfs/xfs_refcount_item.c to resolve merge conflicts ]
During a code inspection, I found a serious bug in the log intent item recovery code when an intent item cannot complete all the work and decides to requeue itself to get that done. When this happens, the item recovery creates a new incore deferred op representing the remaining work and attaches it to the transaction that it allocated. At the end of _item_recover, it moves the entire chain of deferred ops to the dummy parent_tp that xlog_recover_process_intents passed to it, but fails to log a new intent item for the remaining work before committing the transaction for the single unit of work.
xlog_finish_defer_ops logs those new intent items once recovery has finished dealing with the intent items that it recovered, but this isn't sufficient. If the log is forced to disk after a recovered log item decides to requeue itself and the system goes down before we call xlog_finish_defer_ops, the second log recovery will never see the new intent item and therefore has no idea that there was more work to do. It will finish recovery leaving the filesystem in a corrupted state.
The same logic applies to /any/ deferred ops added during intent item recovery, not just the one handling the remaining work.
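The visible fix in the diff below is therefore twofold: xfs_defer_create_intent() only logs an intent when one is not already attached (and the requeue path clears dfp_intent so a fresh one gets logged), and recovery switches from xfs_defer_move() to a capture helper that relogs before handing the chain over:

void
xfs_defer_capture(
	struct xfs_trans	*dtp,
	struct xfs_trans	*stp)
{
	/* Log intents for the captured work, then move the chain to dtp. */
	xfs_defer_create_intents(stp);
	xfs_defer_move(dtp, stp);
}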
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_defer.c  | 26 ++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_defer.h  |  6 ++++++
 fs/xfs/xfs_bmap_item.c     |  2 +-
 fs/xfs/xfs_refcount_item.c |  2 +-
 4 files changed, 32 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index ad7ed5f39d04..4991b02f4993 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -186,8 +186,9 @@ xfs_defer_create_intent(
 {
 	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];

-	dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
-			dfp->dfp_count, sort);
+	if (!dfp->dfp_intent)
+		dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
+				dfp->dfp_count, sort);
 }

 /*
@@ -390,6 +391,7 @@ xfs_defer_finish_one(
 			list_add(li, &dfp->dfp_work);
 			dfp->dfp_count++;
 			dfp->dfp_done = NULL;
+			dfp->dfp_intent = NULL;
 			xfs_defer_create_intent(tp, dfp, false);
 		}

@@ -552,3 +554,23 @@ xfs_defer_move(

 	xfs_defer_reset(stp);
 }
+
+/*
+ * Prepare a chain of fresh deferred ops work items to be completed later.  Log
+ * recovery requires the ability to put off until later the actual finishing
+ * work so that it can process unfinished items recovered from the log in
+ * correct order.
+ *
+ * Create and log intent items for all the work that we're capturing so that we
+ * can be assured that the items will get replayed if the system goes down
+ * before log recovery gets a chance to finish the work it put off.  Then we
+ * move the chain from stp to dtp.
+ */
+void
+xfs_defer_capture(
+	struct xfs_trans	*dtp,
+	struct xfs_trans	*stp)
+{
+	xfs_defer_create_intents(stp);
+	xfs_defer_move(dtp, stp);
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7b6cc3808a91..bc3098044c41 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -61,4 +61,10 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
 extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;

+/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This doesn't normally happen except log recovery.
+ */
+void xfs_defer_capture(struct xfs_trans *dtp, struct xfs_trans *stp);
+
 #endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index f4d5c5d661ea..8cbee34b5eaa 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -541,7 +541,7 @@ xfs_bui_recover(
 	}

 	set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
-	xfs_defer_move(parent_tp, tp);
+	xfs_defer_capture(parent_tp, tp);
 	error = xfs_trans_commit(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_irele(ip);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index a8d6864d58e6..7c674bc7ddf6 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -574,7 +574,7 @@ xfs_cui_recover(
 	xfs_refcount_finish_one_cleanup(tp, rcur, error);
 	set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
-	xfs_defer_move(parent_tp, tp);
+	xfs_defer_capture(parent_tp, tp);
 	error = xfs_trans_commit(tp);
 	return error;
From: Dave Chinner <dchinner@redhat.com>
commit 671459676ab0e1d371c8d6b184ad1faa05b6941e upstream.
[ In 5.4.y, xlog_recover_get_buf_lsn() is defined inside fs/xfs/xfs_log_recover.c ]
Nathan popped up on #xfs and pointed out that we fail to handle finobt btree blocks in xlog_recover_get_buf_lsn(). This means they always fall through the entire magic number matching code to "recover immediately". Whilst most of the time this is the correct behaviour, occasionally it will be incorrect and could potentially overwrite more recent metadata because we don't check the LSN in the on disk metadata at all.
This bug has been present since the finobt was first introduced, and is a potential cause of the occasional xfs_iget_check_free_state() failures we see that indicate that the inode btree state does not match the on disk inode state.
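For context, xlog_recover_get_buf_lsn() matches the buffer's magic number to work out where the LSN lives in the on-disk header before deciding whether to replay the buffer; a rough sketch of the btree-block case this patch extends (the LSN/uuid extraction is paraphrased from the 5.4 sources rather than quoted from the hunk below, so treat the details as illustrative):

	case XFS_FIBT_CRC_MAGIC:
	case XFS_FIBT_MAGIC:
	case XFS_IBT_CRC_MAGIC:
	case XFS_IBT_MAGIC: {
		struct xfs_btree_block *btb = blk;

		/* Short-form (per-AG) btree blocks keep their LSN here. */
		lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
		uuid = &btb->bb_u.s.bb_uuid;
		break;
	}

Before the fix, finobt magic numbers matched none of the cases, so the function fell through to the "recover immediately" default regardless of how recent the on-disk block was.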
Fixes: aafc3c246529 ("xfs: support the XFS_BTNUM_FINOBT free inode btree type")
Reported-by: Nathan Scott <nathans@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/xfs_log_recover.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index cffa9b695de8..0d920c363939 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2206,6 +2206,8 @@ xlog_recover_get_buf_lsn(
 	case XFS_ABTC_MAGIC:
 	case XFS_RMAP_CRC_MAGIC:
 	case XFS_REFC_CRC_MAGIC:
+	case XFS_FIBT_CRC_MAGIC:
+	case XFS_FIBT_MAGIC:
 	case XFS_IBT_CRC_MAGIC:
 	case XFS_IBT_MAGIC: {
 		struct xfs_btree_block *btb = blk;
From: "Darrick J. Wong" darrick.wong@oracle.com
commit e6fff81e487089e47358a028526a9f63cdbcd503 upstream.
When we replay unfinished intent items that have been recovered from the log, it's possible that the replay will cause the creation of more deferred work items. As outlined in commit 509955823cc9c ("xfs: log recovery should replay deferred ops in order"), later work items have an implicit ordering dependency on earlier work items. Therefore, recovery must replay the items (both recovered and created) in the same order that they would have been during normal operation.
For log recovery, we enforce this ordering by using an empty transaction to collect deferred ops that get created in the process of recovering a log intent item to prevent them from being committed before the rest of the recovered intent items. After we finish committing all the recovered log items, we allocate a transaction with an enormous block reservation, splice our huge list of created deferred ops into that transaction, and commit it, thereby finishing all those ops.
This is /really/ hokey -- it's the one place in XFS where we allow nested transactions; the splicing of the defer ops list is inelegant and has to be done twice per recovery function; and the broken way we handle inode pointers and block reservations causes subtle use-after-free and allocator problems that will be fixed by this patch and the two patches after it.
Therefore, replace the hokey empty transaction with a structure designed to capture each chain of deferred ops that are created as part of recovering a single unfinished log intent. Finally, refactor the loop that replays those chains to do so using one transaction per chain.
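The new capture object, condensed from the defer hunks below (the field comments are taken from the patch):

struct xfs_defer_capture {
	/* List of other capture structures. */
	struct list_head	dfc_list;

	/* Deferred ops state saved from the transaction. */
	struct list_head	dfc_dfops;
	unsigned int		dfc_tpflags;
};

Each intent item's ->recover function now finishes with

	error = xfs_defer_ops_capture_and_commit(tp, capture_list);

which logs intents for whatever is left in tp->t_dfops, parks that state in a capture structure on capture_list, and commits tp. After all recovered intents have been processed, the caller walks capture_list, allocates one permanent transaction per entry, reattaches the chain with xfs_defer_ops_continue(), and finishes it normally.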
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Acked-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_defer.c  |  89 ++++++++++++++++++++--
 fs/xfs/libxfs/xfs_defer.h  |  19 ++++-
 fs/xfs/xfs_bmap_item.c     |  18 ++---
 fs/xfs/xfs_bmap_item.h     |   3 +-
 fs/xfs/xfs_extfree_item.c  |   9 ++-
 fs/xfs/xfs_extfree_item.h  |   4 +-
 fs/xfs/xfs_log_recover.c   | 151 +++++++++++++++++++++----------------
 fs/xfs/xfs_refcount_item.c |  18 ++---
 fs/xfs/xfs_refcount_item.h |   3 +-
 fs/xfs/xfs_rmap_item.c     |   8 +-
 fs/xfs/xfs_rmap_item.h     |   3 +-
 11 files changed, 213 insertions(+), 112 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 4991b02f4993..0448197d3b71 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -563,14 +563,89 @@ xfs_defer_move(
  *
  * Create and log intent items for all the work that we're capturing so that we
  * can be assured that the items will get replayed if the system goes down
- * before log recovery gets a chance to finish the work it put off.  Then we
- * move the chain from stp to dtp.
+ * before log recovery gets a chance to finish the work it put off.  The entire
+ * deferred ops state is transferred to the capture structure and the
+ * transaction is then ready for the caller to commit it.  If there are no
+ * intent items to capture, this function returns NULL.
  */
+static struct xfs_defer_capture *
+xfs_defer_ops_capture(
+	struct xfs_trans		*tp)
+{
+	struct xfs_defer_capture	*dfc;
+
+	if (list_empty(&tp->t_dfops))
+		return NULL;
+
+	/* Create an object to capture the defer ops. */
+	dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+	INIT_LIST_HEAD(&dfc->dfc_list);
+	INIT_LIST_HEAD(&dfc->dfc_dfops);
+
+	xfs_defer_create_intents(tp);
+
+	/* Move the dfops chain and transaction state to the capture struct. */
+	list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
+	dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+	tp->t_flags &= ~XFS_TRANS_LOWMODE;
+
+	return dfc;
+}
+
+/* Release all resources that we used to capture deferred ops. */
 void
-xfs_defer_capture(
-	struct xfs_trans	*dtp,
-	struct xfs_trans	*stp)
+xfs_defer_ops_release(
+	struct xfs_mount		*mp,
+	struct xfs_defer_capture	*dfc)
 {
-	xfs_defer_create_intents(stp);
-	xfs_defer_move(dtp, stp);
+	xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+	kmem_free(dfc);
+}
+
+/*
+ * Capture any deferred ops and commit the transaction.  This is the last step
+ * needed to finish a log intent item that we recovered from the log.
+ */
+int
+xfs_defer_ops_capture_and_commit(
+	struct xfs_trans		*tp,
+	struct list_head		*capture_list)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_defer_capture	*dfc;
+	int				error;
+
+	/* If we don't capture anything, commit transaction and exit. */
+	dfc = xfs_defer_ops_capture(tp);
+	if (!dfc)
+		return xfs_trans_commit(tp);
+
+	/* Commit the transaction and add the capture structure to the list. */
+	error = xfs_trans_commit(tp);
+	if (error) {
+		xfs_defer_ops_release(mp, dfc);
+		return error;
+	}
+
+	list_add_tail(&dfc->dfc_list, capture_list);
+	return 0;
+}
+
+/*
+ * Attach a chain of captured deferred ops to a new transaction and free the
+ * capture structure.
+ */
+void
+xfs_defer_ops_continue(
+	struct xfs_defer_capture	*dfc,
+	struct xfs_trans		*tp)
+{
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+	ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+	/* Move captured dfops chain and state to the transaction. */
+	list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+	tp->t_flags |= dfc->dfc_tpflags;
+
+	kmem_free(dfc);
 }
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index bc3098044c41..2c27f439298d 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -7,6 +7,7 @@
 #define __XFS_DEFER_H__

 struct xfs_defer_op_type;
+struct xfs_defer_capture;

 /*
  * Header for deferred operation list.
@@ -61,10 +62,26 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
 extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
 extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;

+/*
+ * This structure enables a dfops user to detach the chain of deferred
+ * operations from a transaction so that they can be continued later.
+ */
+struct xfs_defer_capture {
+	/* List of other capture structures. */
+	struct list_head	dfc_list;
+
+	/* Deferred ops state saved from the transaction. */
+	struct list_head	dfc_dfops;
+	unsigned int		dfc_tpflags;
+};
+
 /*
  * Functions to capture a chain of deferred operations and continue them later.
  * This doesn't normally happen except log recovery.
  */
-void xfs_defer_capture(struct xfs_trans *dtp, struct xfs_trans *stp);
+int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+		struct list_head *capture_list);
+void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp);
+void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);

 #endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 8cbee34b5eaa..e83729bf4997 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -425,8 +425,8 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
  */
 int
 xfs_bui_recover(
-	struct xfs_trans		*parent_tp,
-	struct xfs_bui_log_item		*buip)
+	struct xfs_bui_log_item		*buip,
+	struct list_head		*capture_list)
 {
 	int				error = 0;
 	unsigned int			bui_type;
@@ -442,7 +442,7 @@ xfs_bui_recover(
 	struct xfs_trans		*tp;
 	struct xfs_inode		*ip = NULL;
 	struct xfs_bmbt_irec		irec;
-	struct xfs_mount		*mp = parent_tp->t_mountp;
+	struct xfs_mount		*mp = buip->bui_item.li_mountp;

 	ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));

@@ -491,12 +491,7 @@ xfs_bui_recover(
 			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
 	if (error)
 		return error;
-	/*
-	 * Recovery stashes all deferred ops during intent processing and
-	 * finishes them on completion. Transfer current dfops state to this
-	 * transaction and transfer the result back before we return.
-	 */
-	xfs_defer_move(tp, parent_tp);
+
 	budp = xfs_trans_get_bud(tp, buip);

 	/* Grab the inode. */
@@ -541,15 +536,12 @@ xfs_bui_recover(
 	}

 	set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
-	xfs_defer_capture(parent_tp, tp);
-	error = xfs_trans_commit(tp);
+	error = xfs_defer_ops_capture_and_commit(tp, capture_list);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_irele(ip);
-
 	return error;

 err_inode:
-	xfs_defer_move(parent_tp, tp);
 	xfs_trans_cancel(tp);
 	if (ip) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index ad479cc73de8..a95e99c26979 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -77,6 +77,7 @@ extern struct kmem_zone	*xfs_bud_zone;
 struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
 void xfs_bui_item_free(struct xfs_bui_log_item *);
 void xfs_bui_release(struct xfs_bui_log_item *);
-int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip);
+int xfs_bui_recover(struct xfs_bui_log_item *buip,
+		struct list_head *capture_list);

 #endif	/* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a9316fdb3bb4..2db85c2c6d99 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -586,9 +586,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
  */
 int
 xfs_efi_recover(
-	struct xfs_mount	*mp,
-	struct xfs_efi_log_item	*efip)
+	struct xfs_efi_log_item	*efip,
+	struct list_head	*capture_list)
 {
+	struct xfs_mount	*mp = efip->efi_item.li_mountp;
 	struct xfs_efd_log_item	*efdp;
 	struct xfs_trans	*tp;
 	int			i;
@@ -637,8 +638,8 @@ xfs_efi_recover(
 	}

 	set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
-	error = xfs_trans_commit(tp);
-	return error;
+
+	return xfs_defer_ops_capture_and_commit(tp, capture_list);

 abort_error:
 	xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index a2a736a77fa9..883f0f1d8989 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -84,7 +84,7 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf,
 void xfs_efi_item_free(struct xfs_efi_log_item *);
 void xfs_efi_release(struct xfs_efi_log_item *);

-int xfs_efi_recover(struct xfs_mount *mp,
-		struct xfs_efi_log_item *efip);
+int xfs_efi_recover(struct xfs_efi_log_item *efip,
+		struct list_head *capture_list);

 #endif	/* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0d920c363939..388a2ec2d879 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4587,9 +4587,9 @@ xlog_recover_process_data(
 /* Recover the EFI if necessary. */
 STATIC int
 xlog_recover_process_efi(
-	struct xfs_mount		*mp,
 	struct xfs_ail			*ailp,
-	struct xfs_log_item		*lip)
+	struct xfs_log_item		*lip,
+	struct list_head		*capture_list)
 {
 	struct xfs_efi_log_item		*efip;
 	int				error;
@@ -4602,7 +4602,7 @@ xlog_recover_process_efi(
 		return 0;
spin_unlock(&ailp->ail_lock); - error = xfs_efi_recover(mp, efip); + error = xfs_efi_recover(efip, capture_list); spin_lock(&ailp->ail_lock);
return error; @@ -4627,9 +4627,9 @@ xlog_recover_cancel_efi( /* Recover the RUI if necessary. */ STATIC int xlog_recover_process_rui( - struct xfs_mount *mp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct list_head *capture_list) { struct xfs_rui_log_item *ruip; int error; @@ -4642,7 +4642,7 @@ xlog_recover_process_rui( return 0;
spin_unlock(&ailp->ail_lock); - error = xfs_rui_recover(mp, ruip); + error = xfs_rui_recover(ruip, capture_list); spin_lock(&ailp->ail_lock);
return error; @@ -4667,9 +4667,9 @@ xlog_recover_cancel_rui( /* Recover the CUI if necessary. */ STATIC int xlog_recover_process_cui( - struct xfs_trans *parent_tp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct list_head *capture_list) { struct xfs_cui_log_item *cuip; int error; @@ -4682,7 +4682,7 @@ xlog_recover_process_cui( return 0;
spin_unlock(&ailp->ail_lock); - error = xfs_cui_recover(parent_tp, cuip); + error = xfs_cui_recover(cuip, capture_list); spin_lock(&ailp->ail_lock);
return error; @@ -4707,9 +4707,9 @@ xlog_recover_cancel_cui( /* Recover the BUI if necessary. */ STATIC int xlog_recover_process_bui( - struct xfs_trans *parent_tp, struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip, + struct list_head *capture_list) { struct xfs_bui_log_item *buip; int error; @@ -4722,7 +4722,7 @@ xlog_recover_process_bui( return 0;
spin_unlock(&ailp->ail_lock); - error = xfs_bui_recover(parent_tp, buip); + error = xfs_bui_recover(buip, capture_list); spin_lock(&ailp->ail_lock);
return error; @@ -4761,37 +4761,65 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) /* Take all the collected deferred ops and finish them in order. */ static int xlog_finish_defer_ops( - struct xfs_trans *parent_tp) + struct xfs_mount *mp, + struct list_head *capture_list) { - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_defer_capture *dfc, *next; struct xfs_trans *tp; int64_t freeblks; - uint resblks; - int error; + uint64_t resblks; + int error = 0;
- /* - * We're finishing the defer_ops that accumulated as a result of - * recovering unfinished intent items during log recovery. We - * reserve an itruncate transaction because it is the largest - * permanent transaction type. Since we're the only user of the fs - * right now, take 93% (15/16) of the available free blocks. Use - * weird math to avoid a 64-bit division. - */ - freeblks = percpu_counter_sum(&mp->m_fdblocks); - if (freeblks <= 0) - return -ENOSPC; - resblks = min_t(int64_t, UINT_MAX, freeblks); - resblks = (resblks * 15) >> 4; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - /* transfer all collected dfops to this transaction */ - xfs_defer_move(tp, parent_tp); + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + /* + * We're finishing the defer_ops that accumulated as a result + * of recovering unfinished intent items during log recovery. + * We reserve an itruncate transaction because it is the + * largest permanent transaction type. Since we're the only + * user of the fs right now, take 93% (15/16) of the available + * free blocks. Use weird math to avoid a 64-bit division. + */ + freeblks = percpu_counter_sum(&mp->m_fdblocks); + if (freeblks <= 0) + return -ENOSPC; + + resblks = min_t(uint64_t, UINT_MAX, freeblks); + resblks = (resblks * 15) >> 4; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + /* + * Transfer to this new transaction all the dfops we captured + * from recovering a single intent item. + */ + list_del_init(&dfc->dfc_list); + xfs_defer_ops_continue(dfc, tp); + + error = xfs_trans_commit(tp); + if (error) + return error; + }
- return xfs_trans_commit(tp); + ASSERT(list_empty(capture_list)); + return 0; }
+/* Release all the captured defer ops and capture structures in this list. */ +static void +xlog_abort_defer_ops( + struct xfs_mount *mp, + struct list_head *capture_list) +{ + struct xfs_defer_capture *dfc; + struct xfs_defer_capture *next; + + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + list_del_init(&dfc->dfc_list); + xfs_defer_ops_release(mp, dfc); + } +} /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -4812,35 +4840,23 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_trans *parent_tp; + LIST_HEAD(capture_list); struct xfs_ail_cursor cur; struct xfs_log_item *lip; struct xfs_ail *ailp; - int error; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif
- /* - * The intent recovery handlers commit transactions to complete recovery - * for individual intents, but any new deferred operations that are - * queued during that process are held off until the very end. The - * purpose of this transaction is to serve as a container for deferred - * operations. Each intent recovery handler must transfer dfops here - * before its local transaction commits, and we'll finish the entire - * list below. - */ - error = xfs_trans_alloc_empty(log->l_mp, &parent_tp); - if (error) - return error; - ailp = log->l_ailp; spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - while (lip != NULL) { + for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + lip != NULL; + lip = xfs_trans_ail_cursor_next(ailp, &cur)) { /* * We're done when we see something other than an intent. * There should be no intents left in the AIL now. @@ -4862,35 +4878,40 @@ xlog_recover_process_intents(
/* * NOTE: If your intent processing routine can create more - * deferred ops, you /must/ attach them to the dfops in this - * routine or else those subsequent intents will get + * deferred ops, you /must/ attach them to the capture list in + * the recover routine or else those subsequent intents will be * replayed in the wrong order! */ switch (lip->li_type) { case XFS_LI_EFI: - error = xlog_recover_process_efi(log->l_mp, ailp, lip); + error = xlog_recover_process_efi(ailp, lip, &capture_list); break; case XFS_LI_RUI: - error = xlog_recover_process_rui(log->l_mp, ailp, lip); + error = xlog_recover_process_rui(ailp, lip, &capture_list); break; case XFS_LI_CUI: - error = xlog_recover_process_cui(parent_tp, ailp, lip); + error = xlog_recover_process_cui(ailp, lip, &capture_list); break; case XFS_LI_BUI: - error = xlog_recover_process_bui(parent_tp, ailp, lip); + error = xlog_recover_process_bui(ailp, lip, &capture_list); break; } if (error) - goto out; - lip = xfs_trans_ail_cursor_next(ailp, &cur); + break; } -out: + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); - if (!error) - error = xlog_finish_defer_ops(parent_tp); - xfs_trans_cancel(parent_tp); + if (error) + goto err;
+ error = xlog_finish_defer_ops(log->l_mp, &capture_list); + if (error) + goto err; + + return 0; +err: + xlog_abort_defer_ops(log->l_mp, &capture_list); return error; }
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7c674bc7ddf6..c071f8600e8e 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -439,8 +439,8 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { */ int xfs_cui_recover( - struct xfs_trans *parent_tp, - struct xfs_cui_log_item *cuip) + struct xfs_cui_log_item *cuip, + struct list_head *capture_list) { int i; int error = 0; @@ -456,7 +456,7 @@ xfs_cui_recover( xfs_extlen_t new_len; struct xfs_bmbt_irec irec; bool requeue_only = false; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = cuip->cui_item.li_mountp;
ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
@@ -511,12 +511,7 @@ xfs_cui_recover( mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; - /* - * Recovery stashes all deferred ops during intent processing and - * finishes them on completion. Transfer current dfops state to this - * transaction and transfer the result back before we return. - */ - xfs_defer_move(tp, parent_tp); + cudp = xfs_trans_get_cud(tp, cuip);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) { @@ -574,13 +569,10 @@ xfs_cui_recover(
xfs_refcount_finish_one_cleanup(tp, rcur, error); set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); - xfs_defer_capture(parent_tp, tp); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); - xfs_defer_move(parent_tp, tp); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index e47530f30489..de5f48ff4f74 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -80,6 +80,7 @@ extern struct kmem_zone *xfs_cud_zone; struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); void xfs_cui_item_free(struct xfs_cui_log_item *); void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip); +int xfs_cui_recover(struct xfs_cui_log_item *cuip, + struct list_head *capture_list);
#endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 70d58557d779..5bdf1f5e51b8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -483,9 +483,10 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { */ int xfs_rui_recover( - struct xfs_mount *mp, - struct xfs_rui_log_item *ruip) + struct xfs_rui_log_item *ruip, + struct list_head *capture_list) { + struct xfs_mount *mp = ruip->rui_item.li_mountp; int i; int error = 0; struct xfs_map_extent *rmap; @@ -592,8 +593,7 @@ xfs_rui_recover(
xfs_rmap_finish_one_cleanup(tp, rcur, error); set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error: xfs_rmap_finish_one_cleanup(tp, rcur, error); diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 8708e4a5aa5c..5cf4acb0e915 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -82,6 +82,7 @@ int xfs_rui_copy_format(struct xfs_log_iovec *buf, struct xfs_rui_log_format *dst_rui_fmt); void xfs_rui_item_free(struct xfs_rui_log_item *); void xfs_rui_release(struct xfs_rui_log_item *); -int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip); +int xfs_rui_recover(struct xfs_rui_log_item *ruip, + struct list_head *capture_list);
#endif /* __XFS_RMAP_ITEM_H__ */
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 4f9a60c48078c0efa3459678fa8d6e050e8ada5d upstream.
When xfs_defer_capture extracts the deferred ops and transaction state from a transaction, it should record the remaining block reservations so that when we continue the dfops chain, we can reserve the same number of blocks to use. We capture the reservations for both data and realtime volumes.
This adds the requirement that every log intent item recovery function must be careful to reserve enough blocks to handle both itself and all defer ops that it can queue. On the other hand, this enables us to do away with the handwaving block estimation nonsense that was going on in xlog_finish_defer_ops.
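To make the hand-off concrete, here is a minimal userspace sketch, not kernel code: the field names mirror the diff below, but the toy structs and the capture_resv() helper are invented for illustration. Only the unused part of each reservation is carried into the capture structure, and the continuation transaction later reserves exactly that much again.

#include <stdio.h>

/* Toy stand-ins for the transaction and capture structures. */
struct toy_trans {
	unsigned int blk_res, blk_res_used;	/* data device blocks */
	unsigned int rtx_res, rtx_res_used;	/* realtime extents */
};

struct toy_capture {
	unsigned int blkres;	/* unused data block reservation */
	unsigned int rtxres;	/* unused rt extent reservation */
};

/* Record only the reservation that was never consumed. */
static void capture_resv(const struct toy_trans *tp, struct toy_capture *dfc)
{
	dfc->blkres = tp->blk_res - tp->blk_res_used;
	dfc->rtxres = tp->rtx_res - tp->rtx_res_used;
}

int main(void)
{
	struct toy_trans tp = { .blk_res = 32, .blk_res_used = 20,
				.rtx_res = 8, .rtx_res_used = 8 };
	struct toy_capture dfc;

	capture_resv(&tp, &dfc);
	printf("continue with %u data blocks, %u rt extents\n",
	       dfc.blkres, dfc.rtxres);	/* prints 12 and 0 */
	return 0;
}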
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/libxfs/xfs_defer.c | 4 ++++ fs/xfs/libxfs/xfs_defer.h | 4 ++++ fs/xfs/xfs_log_recover.c | 21 +++------------------ 3 files changed, 11 insertions(+), 18 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 0448197d3b71..4c36ab9dd33e 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -589,6 +589,10 @@ xfs_defer_ops_capture( dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; tp->t_flags &= ~XFS_TRANS_LOWMODE;
+ /* Capture the remaining block reservations along with the dfops. */ + dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; + dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; + return dfc; }
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 2c27f439298d..7b0794ad58ca 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -73,6 +73,10 @@ struct xfs_defer_capture { /* Deferred ops state saved from the transaction. */ struct list_head dfc_dfops; unsigned int dfc_tpflags; + + /* Block reservations for the data and rt devices. */ + unsigned int dfc_blkres; + unsigned int dfc_rtxres; };
/* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 388a2ec2d879..a591420a2c89 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4766,27 +4766,12 @@ xlog_finish_defer_ops( { struct xfs_defer_capture *dfc, *next; struct xfs_trans *tp; - int64_t freeblks; - uint64_t resblks; int error = 0;
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { - /* - * We're finishing the defer_ops that accumulated as a result - * of recovering unfinished intent items during log recovery. - * We reserve an itruncate transaction because it is the - * largest permanent transaction type. Since we're the only - * user of the fs right now, take 93% (15/16) of the available - * free blocks. Use weird math to avoid a 64-bit division. - */ - freeblks = percpu_counter_sum(&mp->m_fdblocks); - if (freeblks <= 0) - return -ENOSPC; - - resblks = min_t(uint64_t, UINT_MAX, freeblks); - resblks = (resblks * 15) >> 4; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, - 0, XFS_TRANS_RESERVE, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + dfc->dfc_blkres, dfc->dfc_rtxres, + XFS_TRANS_RESERVE, &tp); if (error) return error;
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 929b92f64048d90d23e40a59c47adf59f5026903 upstream.
When xfs_defer_capture extracts the deferred ops and transaction state from a transaction, it should record the transaction reservation type from the old transaction so that when we continue the dfops chain, we still use the same reservation parameters.
Doing this means that the log item recovery functions get to determine the transaction reservation instead of abusing tr_itruncate in yet another part of xfs.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/libxfs/xfs_defer.c | 3 +++ fs/xfs/libxfs/xfs_defer.h | 3 +++ fs/xfs/xfs_log_recover.c | 17 ++++++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 4c36ab9dd33e..d92863773736 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -593,6 +593,9 @@ xfs_defer_ops_capture( dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+ /* Preserve the log reservation size. */ + dfc->dfc_logres = tp->t_log_res; + return dfc; }
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7b0794ad58ca..d5b7494513e8 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -77,6 +77,9 @@ struct xfs_defer_capture { /* Block reservations for the data and rt devices. */ unsigned int dfc_blkres; unsigned int dfc_rtxres; + + /* Log reservation saved from the transaction. */ + unsigned int dfc_logres; };
/* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a591420a2c89..1e6ef00b833a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4769,9 +4769,20 @@ xlog_finish_defer_ops( int error = 0;
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - dfc->dfc_blkres, dfc->dfc_rtxres, - XFS_TRANS_RESERVE, &tp); + struct xfs_trans_res resv; + + /* + * Create a new transaction reservation from the captured + * information. Set logcount to 1 to force the new transaction + * to regrant every roll so that we can make forward progress + * in recovery no matter how full the log might be. + */ + resv.tr_logres = dfc->dfc_logres; + resv.tr_logcount = 1; + resv.tr_logflags = XFS_TRANS_PERM_LOG_RES; + + error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, + dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); if (error) return error;
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 919522e89f8e71fc6a8f8abe17be4011573c6ea0 upstream.
The bmap intent item checking code in xfs_bui_item_recover is spread all over the function. We should check the recovered log item at the top before we allocate any resources or do anything else, so do that.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/xfs_bmap_item.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e83729bf4997..381dd4f078b0 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -434,9 +434,7 @@ xfs_bui_recover( xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; xfs_filblks_t count; - bool op_ok; struct xfs_bud_log_item *budp; - enum xfs_bmap_intent_type type; int whichfork; xfs_exntst_t state; struct xfs_trans *tp; @@ -462,16 +460,19 @@ xfs_bui_recover( XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, XFS_INO_TO_FSB(mp, bmap->me_owner))); - switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + switch (bui_type) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: - op_ok = true; break; default: - op_ok = false; - break; + return -EFSCORRUPTED; } - if (!op_ok || startblock_fsb == 0 || + if (startblock_fsb == 0 || bmap->me_len == 0 || inode_fsb == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || @@ -502,32 +503,17 @@ xfs_bui_recover( if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY);
- /* Process deferred bmap item. */ - state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - switch (bui_type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - type = bui_type; - break; - default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - error = -EFSCORRUPTED; - goto err_inode; - } xfs_trans_ijoin(tp, ip, 0);
count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork, - bmap->me_startoff, bmap->me_startblock, &count, state); + error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, + whichfork, bmap->me_startoff, bmap->me_startblock, + &count, state); if (error) goto err_inode;
if (count > 0) { - ASSERT(type == XFS_BMAP_UNMAP); + ASSERT(bui_type == XFS_BMAP_UNMAP); irec.br_startblock = bmap->me_startblock; irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff;
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 64a3f3315bc60f710a0a25c1798ac0ea58c6fa1f upstream.
In most places in XFS, we have a specific order in which we gather resources: grab the inode, allocate a transaction, then lock the inode. xfs_bui_item_recover doesn't do it in that order, so fix it to be more consistent. This also makes the error bailout code a bit less weird.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/xfs_bmap_item.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-)
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 381dd4f078b0..f7015eabfdc9 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -22,6 +22,7 @@ #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_error.h" +#include "xfs_quota.h"
kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; @@ -488,21 +489,26 @@ xfs_bui_recover( return -EFSCORRUPTED; }
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + /* Grab the inode. */ + error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip); if (error) return error;
- budp = xfs_trans_get_bud(tp, buip); - - /* Grab the inode. */ - error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); + error = xfs_qm_dqattach(ip); if (error) - goto err_inode; + goto err_rele;
if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY);
+ /* Allocate transaction and do the work. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + if (error) + goto err_rele; + + budp = xfs_trans_get_bud(tp, buip); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0);
count = bmap->me_len; @@ -510,7 +516,7 @@ xfs_bui_recover( whichfork, bmap->me_startoff, bmap->me_startblock, &count, state); if (error) - goto err_inode; + goto err_cancel;
if (count > 0) { ASSERT(bui_type == XFS_BMAP_UNMAP); @@ -522,16 +528,20 @@ xfs_bui_recover( }
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + /* Commit transaction, which frees the transaction. */ error = xfs_defer_ops_capture_and_commit(tp, capture_list); + if (error) + goto err_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); - return error; + return 0;
-err_inode: +err_cancel: xfs_trans_cancel(tp); - if (ip) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - } +err_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +err_rele: + xfs_irele(ip); return error; }
From: "Darrick J. Wong" darrick.wong@oracle.com
commit ff4ab5e02a0447dd1e290883eb6cd7d94848e590 upstream.
In xfs_bui_item_recover, there exists a use-after-free bug with regards to the inode that is involved in the bmap replay operation. If the mapping operation does not complete, we call xfs_bmap_unmap_extent to create a deferred op to finish the unmapping work, and we retain a pointer to the incore inode.
Unfortunately, the very next thing we do is commit the transaction and drop the inode. If reclaim tears down the inode before we try to finish the defer ops, we dereference garbage and blow up. Therefore, create a way to join inodes to the defer ops freezer so that we can maintain the xfs_inode reference until we're done with the inode.
Note: This imposes the requirement that there be enough memory to keep every incore inode in memory throughout recovery.
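A rough userspace model of why the extra reference matters (ihold/irele mimic the kernel helpers in name only; the refcounting below is a toy): the capture structure takes a second reference, so the reference dropped at transaction commit no longer frees the inode out from under the captured dfops.

#include <stdio.h>

/* Toy inode with a reference count. */
struct toy_inode {
	int refcount;
	const char *name;
};

static void ihold(struct toy_inode *ip)
{
	ip->refcount++;
}

static void irele(struct toy_inode *ip)
{
	if (--ip->refcount == 0)
		printf("%s reclaimed\n", ip->name);
}

int main(void)
{
	struct toy_inode ip = { .refcount = 1, .name = "inode 42" };
	struct toy_inode *capture_ip;

	/* Capture: take an extra reference before the commit. */
	ihold(&ip);
	capture_ip = &ip;

	/* Commit drops the recovery function's reference: 2 -> 1. */
	irele(&ip);

	/* The continued dfops still hold a live inode. */
	printf("continuing dfops on %s (refcount %d)\n",
	       capture_ip->name, capture_ip->refcount);
	irele(capture_ip);	/* final release: 1 -> 0, now reclaimed */
	return 0;
}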
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Brian Foster bfoster@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/libxfs/xfs_defer.c | 43 +++++++++++++++++++++++++++++++++----- fs/xfs/libxfs/xfs_defer.h | 11 ++++++++-- fs/xfs/xfs_bmap_item.c | 7 +++++-- fs/xfs/xfs_extfree_item.c | 2 +- fs/xfs/xfs_log_recover.c | 7 ++++++- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_rmap_item.c | 2 +- 7 files changed, 61 insertions(+), 13 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index d92863773736..714756931317 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -16,6 +16,7 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h"
/* * Deferred Operations in XFS @@ -567,10 +568,14 @@ xfs_defer_move( * deferred ops state is transferred to the capture structure and the * transaction is then ready for the caller to commit it. If there are no * intent items to capture, this function returns NULL. + * + * If capture_ip is not NULL, the capture structure will obtain an extra + * reference to the inode. */ static struct xfs_defer_capture * xfs_defer_ops_capture( - struct xfs_trans *tp) + struct xfs_trans *tp, + struct xfs_inode *capture_ip) { struct xfs_defer_capture *dfc;
@@ -596,6 +601,15 @@ xfs_defer_ops_capture( /* Preserve the log reservation size. */ dfc->dfc_logres = tp->t_log_res;
+ /* + * Grab an extra reference to this inode and attach it to the capture + * structure. + */ + if (capture_ip) { + ihold(VFS_I(capture_ip)); + dfc->dfc_capture_ip = capture_ip; + } + return dfc; }
@@ -606,24 +620,33 @@ xfs_defer_ops_release( struct xfs_defer_capture *dfc) { xfs_defer_cancel_list(mp, &dfc->dfc_dfops); + if (dfc->dfc_capture_ip) + xfs_irele(dfc->dfc_capture_ip); kmem_free(dfc); }
/* * Capture any deferred ops and commit the transaction. This is the last step - * needed to finish a log intent item that we recovered from the log. + * needed to finish a log intent item that we recovered from the log. If any + * of the deferred ops operate on an inode, the caller must pass in that inode + * so that the reference can be transferred to the capture structure. The + * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling + * xfs_defer_ops_continue. */ int xfs_defer_ops_capture_and_commit( struct xfs_trans *tp, + struct xfs_inode *capture_ip, struct list_head *capture_list) { struct xfs_mount *mp = tp->t_mountp; struct xfs_defer_capture *dfc; int error;
+ ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL)); + /* If we don't capture anything, commit transaction and exit. */ - dfc = xfs_defer_ops_capture(tp); + dfc = xfs_defer_ops_capture(tp, capture_ip); if (!dfc) return xfs_trans_commit(tp);
@@ -640,16 +663,26 @@ xfs_defer_ops_capture_and_commit(
/* * Attach a chain of captured deferred ops to a new transaction and free the - * capture structure. + * capture structure. If an inode was captured, it will be passed back to the + * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. + * The caller now owns the inode reference. */ void xfs_defer_ops_continue( struct xfs_defer_capture *dfc, - struct xfs_trans *tp) + struct xfs_trans *tp, + struct xfs_inode **captured_ipp) { ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+ /* Lock and join the captured inode to the new transaction. */ + if (dfc->dfc_capture_ip) { + xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0); + } + *captured_ipp = dfc->dfc_capture_ip; + /* Move captured dfops chain and state to the transaction. */ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); tp->t_flags |= dfc->dfc_tpflags; diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index d5b7494513e8..4c3248d47a35 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -80,6 +80,12 @@ struct xfs_defer_capture {
/* Log reservation saved from the transaction. */ unsigned int dfc_logres; + + /* + * An inode reference that must be maintained to complete the deferred + * work. + */ + struct xfs_inode *dfc_capture_ip; };
/* @@ -87,8 +93,9 @@ struct xfs_defer_capture { * This doesn't normally happen except log recovery. */ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, - struct list_head *capture_list); -void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp); + struct xfs_inode *capture_ip, struct list_head *capture_list); +void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, + struct xfs_inode **captured_ipp); void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
#endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index f7015eabfdc9..888449ac8b75 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -528,8 +528,11 @@ xfs_bui_recover( }
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); - /* Commit transaction, which frees the transaction. */ - error = xfs_defer_ops_capture_and_commit(tp, capture_list); + /* + * Commit transaction, which frees the transaction and saves the inode + * for later replay activities. + */ + error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list); if (error) goto err_unlock;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 2db85c2c6d99..0333b20afafd 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -639,7 +639,7 @@ xfs_efi_recover(
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- return xfs_defer_ops_capture_and_commit(tp, capture_list); + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error: xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1e6ef00b833a..6c60cdd10d33 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4766,6 +4766,7 @@ xlog_finish_defer_ops( { struct xfs_defer_capture *dfc, *next; struct xfs_trans *tp; + struct xfs_inode *ip; int error = 0;
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { @@ -4791,9 +4792,13 @@ xlog_finish_defer_ops( * from recovering a single intent item. */ list_del_init(&dfc->dfc_list); - xfs_defer_ops_continue(dfc, tp); + xfs_defer_ops_continue(dfc, tp, &ip);
error = xfs_trans_commit(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); + } if (error) return error; } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index c071f8600e8e..98f67dd64ce8 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -569,7 +569,7 @@ xfs_cui_recover(
xfs_refcount_finish_one_cleanup(tp, rcur, error); set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); - return xfs_defer_ops_capture_and_commit(tp, capture_list); + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 5bdf1f5e51b8..32f580fa1877 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -593,7 +593,7 @@ xfs_rui_recover(
xfs_rmap_finish_one_cleanup(tp, rcur, error); set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); - return xfs_defer_ops_capture_and_commit(tp, capture_list); + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error: xfs_rmap_finish_one_cleanup(tp, rcur, error);
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 27dada070d59c28a441f1907d2cec891b17dcb26 upstream.
The defer ops code has been finishing items in the wrong order -- if a top level defer op creates items A and B, and finishing item A creates more defer ops A1 and A2, we'll put the new items on the end of the chain and process them in the order A B A1 A2. This is kind of weird, since it's convenient for programmers to be able to think of A and B as an ordered sequence where all the sub-tasks for A must finish before we move on to B, e.g. A A1 A2 B.
Right now, our log intent items are not so complex that this matters, but this will become important for the atomic extent swapping patchset. In order to maintain correct reference counting of extents, we have to unmap and remap extents in that order, and we want to complete that work before moving on to the next range that the user wants to swap. This patch fixes defer ops to satisfy that requirement.
The primary symptom of the incorrect order was noticed in an early performance analysis of the atomic extent swap code. An astonishingly large number of deferred work items accumulated when userspace requested an atomic update of two very fragmented files. The cause of this was traced to the same ordering bug in the inner loop of xfs_defer_finish_noroll.
If the ->finish_item method of a deferred operation queues new deferred operations, those new deferred ops are appended to the tail of the pending work list. To illustrate, say that a caller creates a transaction t0 with four deferred operations D0-D3. The first thing defer ops does is roll the transaction to t1, leaving us with:
t1: D0(t0), D1(t0), D2(t0), D3(t0)
Let's say that finishing each of D0-D3 will create two new deferred ops. After finish D0 and roll, we'll have the following chain:
t2: D1(t0), D2(t0), D3(t0), d4(t1), d5(t1)
d4 and d5 were logged to t1. Notice that while we're about to start work on D1, we haven't actually completed all the work implied by D0 being finished. So far we've been careful (or lucky) to structure the dfops callers such that D1 doesn't depend on d4 or d5 being finished, but this is a potential logic bomb.
There's a second problem lurking. Let's see what happens as we finish D1-D3:
t3: D2(t0), D3(t0), d4(t1), d5(t1), d6(t2), d7(t2) t4: D3(t0), d4(t1), d5(t1), d6(t2), d7(t2), d8(t3), d9(t3) t5: d4(t1), d5(t1), d6(t2), d7(t2), d8(t3), d9(t3), d10(t4), d11(t4)
Let's say that d4-d11 are simple work items that don't queue any other operations, which means that we can complete each of them in turn; finishing d4 and rolling gives us t6:
t6: d5(t1), d6(t2), d7(t2), d8(t3), d9(t3), d10(t4), d11(t4) t7: d6(t2), d7(t2), d8(t3), d9(t3), d10(t4), d11(t4) ... t11: d10(t4), d11(t4) t12: d11(t4) <done>
When we try to roll to transaction #12, we're holding defer op d11, which we logged way back in t4. This means that the tail of the log is pinned at t4. If the log is very small or there are a lot of other threads updating metadata, this means that we might have wrapped the log and cannot complete the roll to t12 because there isn't enough space left before we'd run into t4.
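The space check behind "cannot complete the roll" is ordinary ring-buffer arithmetic. A simplified sketch (the real log tracks cycle numbers and LSNs rather than plain byte offsets, so this is only a model):

#include <stdio.h>

/*
 * Free space in a circular log of 'size' bytes. Once the head wraps
 * past zero, only the bytes up to the pinned tail remain usable.
 */
static unsigned long log_space_left(unsigned long size,
				    unsigned long head, unsigned long tail)
{
	return head >= tail ? size - (head - tail) : tail - head;
}

int main(void)
{
	/* Tail pinned at "t4", head nearly wrapped around to it. */
	unsigned long size = 100, tail = 4, head = 96, need = 10;

	if (log_space_left(size, head, tail) < need)
		printf("cannot roll: only %lu bytes free before the tail\n",
		       log_space_left(size, head, tail));
	return 0;
}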
Let's shift back to the original failure. I mentioned before that I discovered this flaw while developing the atomic file update code. In that scenario, we have a defer op (D0) that finds a range of file blocks to remap, creates a handful of new defer ops to do that, and then asks to be continued with however much work remains.
So, D0 is the original swapext deferred op. The first thing defer ops does is rolls to t1:
t1: D0(t0)
We try to finish D0, logging d1 and d2 in the process, but can't get all the work done. We log a done item and a new intent item for the work that D0 still has to do, and roll to t2:
t2: D0'(t1), d1(t1), d2(t1)
We roll and try to finish D0', but still can't get all the work done, so we log a done item and a new intent item for it, requeue D0 a second time, and roll to t3:
t3: D0''(t2), d1(t1), d2(t1), d3(t2), d4(t2)
If it takes 48 more rolls to complete D0, then we'll finally dispense with D0 in t50:
t50: D0<50 primes>(t49), d1(t1), ..., d102(t50)
We then try to roll again to get a chain like this:
t51: d1(t1), d2(t1), ..., d101(t50), d102(t50) ... t152: d102(t50) <done>
Notice that in rolling to transaction #51, we're holding on to a log intent item for d1 that was logged in transaction #1. This means that the tail of the log is pinned at t1. If the log is very small or there are a lot of other threads updating metadata, this means that we might have wrapped the log and cannot roll to t51 because there isn't enough space left before we'd run into t1. This is of course problem #2 again.
But notice the third problem with this scenario: we have 102 defer ops tied to this transaction! Each of these items are backed by pinned kernel memory, which means that we risk OOM if the chains get too long.
Yikes. Problem #1 is a subtle logic bomb that could hit someone in the future; problem #2 applies (rarely) to the current upstream, and problem #3 applies to work in progress.
This is not how incremental deferred operations were supposed to work. The dfops design of logging in the same transaction an intent-done item and a new intent item for the work remaining was to make it so that we only have to juggle enough deferred work items to finish that one small piece of work. Deferred log item recovery will find that first unfinished work item and restart it, no matter how many other intent items might follow it in the log. Therefore, it's ok to put the new intents at the start of the dfops chain.
For the first example, the chains look like this:
t2: d4(t1), d5(t1), D1(t0), D2(t0), D3(t0) t3: d5(t1), D1(t0), D2(t0), D3(t0) ... t9: d9(t7), D3(t0) t10: D3(t0) t11: d10(t10), d11(t10) t12: d11(t10)
For the second example, the chains look like this:
t1: D0(t0) t2: d1(t1), d2(t1), D0'(t1) t3: d2(t1), D0'(t1) t4: D0'(t1) t5: d1(t4), d2(t4), D0''(t4) ... t148: D0<50 primes>(t147) t149: d101(t148), d102(t148) t150: d102(t148) <done>
This actually sucks more for pinning the log tail (we try to roll to t10 while holding an intent item that was logged in t0) but we've solved problem #1. We've also reduced the maximum chain length from:
sum(all the new items) + nr_original_items
to:
max(new items that each original item creates) + nr_original_items

For example, with four original items that each create two children, the bound drops from 8 + 4 = 12 pending items to 2 + 4 = 6.
This solves problem #3 by sharply reducing the number of defer ops that can be attached to a transaction at any given time. The change makes the problem of log tail pinning worse, but it is an improvement we need before we can solve problem #2. Actually solving #2, however, is left to the next patch.
Note that a subsequent analysis of some hard-to-trigger reflink and COW livelocks on extremely fragmented filesystems (or systems running a lot of IO threads) showed the same symptoms -- uncomfortably large numbers of incore deferred work items and occasional stalls in the transaction grant code while waiting for log reservations. I think this patch and the next one will also solve these problems.
As originally written, the code used list_splice_tail_init instead of list_splice_init, so change that, and leave a short comment explaining our actions.
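To show the one-line difference that note describes, here is a self-contained userspace demo; the list helpers are minimal reimplementations of the kernel's list.h primitives, written out only so the example compiles on its own:

#include <stddef.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h->prev = h;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void __list_splice(struct list_head *list, struct list_head *prev,
			  struct list_head *next)
{
	struct list_head *first = list->next, *last = list->prev;

	first->prev = prev;
	prev->next = first;
	last->next = next;
	next->prev = last;
}

/* Old behavior: new children land after the remaining parent items. */
static void list_splice_tail_init(struct list_head *list,
				  struct list_head *head)
{
	if (list->next != list) {
		__list_splice(list, head->prev, head);
		INIT_LIST_HEAD(list);
	}
}

/* New behavior: new children land ahead of the remaining parent items. */
static void list_splice_init(struct list_head *list, struct list_head *head)
{
	if (list->next != list) {
		__list_splice(list, head, head->next);
		INIT_LIST_HEAD(list);
	}
}

struct item {
	const char *name;
	struct list_head list;
};

static void dump(const char *label, struct list_head *head)
{
	struct list_head *p;

	printf("%s:", label);
	for (p = head->next; p != head; p = p->next)
		printf(" %s", container_of(p, struct item, list)->name);
	printf("\n");
}

int main(void)
{
	struct item parents[] = { {"D1"}, {"D2"}, {"D3"} };
	struct item kids[] = { {"d4"}, {"d5"} };
	struct list_head pending, dfops;
	int pass, i;

	for (pass = 0; pass < 2; pass++) {
		INIT_LIST_HEAD(&pending);
		INIT_LIST_HEAD(&dfops);
		for (i = 0; i < 3; i++)
			list_add_tail(&parents[i].list, &pending);
		for (i = 0; i < 2; i++)
			list_add_tail(&kids[i].list, &dfops);
		if (pass == 0) {
			list_splice_tail_init(&dfops, &pending);
			dump("tail splice (old)", &pending);
		} else {
			list_splice_init(&dfops, &pending);
			dump("head splice (new)", &pending);
		}
	}
	/* Prints: tail splice (old): D1 D2 D3 d4 d5
	 *         head splice (new): d4 d5 D1 D2 D3 */
	return 0;
}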
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/libxfs/xfs_defer.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 714756931317..c817b8924f9a 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -431,8 +431,17 @@ xfs_defer_finish_noroll(
/* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { + /* + * Deferred items that are created in the process of finishing + * other deferred work items should be queued at the head of + * the pending list, which puts them ahead of the deferred work + * that was created by the caller. This keeps the number of + * pending work items to a minimum, which decreases the amount + * of time that any one intent item can stick around in memory, + * pinning the log tail. + */ xfs_defer_create_intents(*tp); - list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); + list_splice_init(&(*tp)->t_dfops, &dop_pending);
error = xfs_defer_trans_roll(tp); if (error)
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 4e919af7827a6adfc28e82cd6c4ffcfcc3dd6118 upstream.
[ Modify xfs_{bmap|extfree|refcount|rmap}_item.c to fix merge conflicts ]
There's a subtle design flaw in the deferred log item code that can lead to pinning the log tail. Taking up the defer ops chain examples from the previous commit, we can get trapped in sequences like this:
Caller hands us a transaction t0 with D0-D3 attached. The defer ops chain will look like the following if the transaction rolls succeed:
t1: D0(t0), D1(t0), D2(t0), D3(t0) t2: d4(t1), d5(t1), D1(t0), D2(t0), D3(t0) t3: d5(t1), D1(t0), D2(t0), D3(t0) ... t9: d9(t7), D3(t0) t10: D3(t0) t11: d10(t10), d11(t10) t12: d11(t10)
In transaction 9, we finish d9 and try to roll to t10 while holding onto an intent item for D3 that we logged in t0.
The previous commit changed the order in which we place new defer ops in the defer ops processing chain to reduce the maximum chain length. Now make xfs_defer_finish_noroll capable of relogging the entire chain periodically so that we can always move the log tail forward. Most chains will never get relogged, except for operations that generate very long chains (large extents containing many blocks with different sharing levels) or are on filesystems with small logs and a lot of ongoing metadata updates.
Callers are now required to ensure that the transaction reservation is large enough to handle logging done items and new intent items for the maximum possible chain length. Most callers are careful to keep the chain lengths low, so the overhead should be minimal.
The decision to relog an intent item is made based on whether the intent was logged in a previous checkpoint, since there's no point in relogging an intent into the same checkpoint.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/libxfs/xfs_defer.c | 42 +++++++++++++++ fs/xfs/xfs_bmap_item.c | 83 +++++++++++++++++++---------- fs/xfs/xfs_extfree_item.c | 104 +++++++++++++++++++++++-------------- fs/xfs/xfs_refcount_item.c | 95 +++++++++++++++++++++------------ fs/xfs/xfs_rmap_item.c | 93 +++++++++++++++++++++------------ fs/xfs/xfs_stats.c | 4 ++ fs/xfs/xfs_stats.h | 1 + fs/xfs/xfs_trace.h | 1 + fs/xfs/xfs_trans.h | 10 ++++ 9 files changed, 300 insertions(+), 133 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c817b8924f9a..b0b382323413 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -17,6 +17,7 @@ #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_log.h"
/* * Deferred Operations in XFS @@ -361,6 +362,42 @@ xfs_defer_cancel_list( } }
+/* + * Prevent a log intent item from pinning the tail of the log by logging a + * done item to release the intent item; and then log a new intent item. + * The caller should provide a fresh transaction and roll it after we're done. + */ +static int +xfs_defer_relog( + struct xfs_trans **tpp, + struct list_head *dfops) +{ + struct xfs_defer_pending *dfp; + + ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); + + list_for_each_entry(dfp, dfops, dfp_list) { + /* + * If the log intent item for this deferred op is not a part of + * the current log checkpoint, relog the intent item to keep + * the log tail moving forward. We're ok with this being racy + * because an incorrect decision means we'll be a little slower + * at pushing the tail. + */ + if (dfp->dfp_intent == NULL || + xfs_log_item_in_current_chkpt(dfp->dfp_intent)) + continue; + + trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); + XFS_STATS_INC((*tpp)->t_mountp, defer_relog); + dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); + } + + if ((*tpp)->t_flags & XFS_TRANS_DIRTY) + return xfs_defer_trans_roll(tpp); + return 0; +} + /* * Log an intent-done item for the first pending intent, and finish the work * items. @@ -447,6 +484,11 @@ xfs_defer_finish_noroll( if (error) goto out_shutdown;
+ /* Possibly relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; + dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); error = xfs_defer_finish_one(*tp, dfp); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 888449ac8b75..7b0c4d9679d9 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -125,34 +125,6 @@ xfs_bui_item_release( xfs_bui_release(BUI_ITEM(lip)); }
-static const struct xfs_item_ops xfs_bui_item_ops = { - .iop_size = xfs_bui_item_size, - .iop_format = xfs_bui_item_format, - .iop_unpin = xfs_bui_item_unpin, - .iop_release = xfs_bui_item_release, -}; - -/* - * Allocate and initialize an bui item with the given number of extents. - */ -struct xfs_bui_log_item * -xfs_bui_init( - struct xfs_mount *mp) - -{ - struct xfs_bui_log_item *buip; - - buip = kmem_zone_zalloc(xfs_bui_zone, 0); - - xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); - buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; - buip->bui_format.bui_id = (uintptr_t)(void *)buip; - atomic_set(&buip->bui_next_extent, 0); - atomic_set(&buip->bui_refcount, 2); - - return buip; -} - static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_bud_log_item, bud_item); @@ -548,3 +520,58 @@ xfs_bui_recover( xfs_irele(ip); return error; } + +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_bui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_bud_log_item *budp; + struct xfs_bui_log_item *buip; + struct xfs_map_extent *extp; + unsigned int count; + + count = BUI_ITEM(intent)->bui_format.bui_nextents; + extp = BUI_ITEM(intent)->bui_format.bui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); + + buip = xfs_bui_init(tp->t_mountp); + memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); + atomic_set(&buip->bui_next_extent, count); + xfs_trans_add_item(tp, &buip->bui_item); + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; +} + +static const struct xfs_item_ops xfs_bui_item_ops = { + .iop_size = xfs_bui_item_size, + .iop_format = xfs_bui_item_format, + .iop_unpin = xfs_bui_item_unpin, + .iop_release = xfs_bui_item_release, + .iop_relog = xfs_bui_item_relog, +}; + +/* + * Allocate and initialize an bui item with the given number of extents. + */ +struct xfs_bui_log_item * +xfs_bui_init( + struct xfs_mount *mp) + +{ + struct xfs_bui_log_item *buip; + + buip = kmem_zone_zalloc(xfs_bui_zone, 0); + + xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); + buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; + buip->bui_format.bui_id = (uintptr_t)(void *)buip; + atomic_set(&buip->bui_next_extent, 0); + atomic_set(&buip->bui_refcount, 2); + + return buip; +} diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 0333b20afafd..de3cdce892fd 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -139,44 +139,6 @@ xfs_efi_item_release( xfs_efi_release(EFI_ITEM(lip)); }
-static const struct xfs_item_ops xfs_efi_item_ops = { - .iop_size = xfs_efi_item_size, - .iop_format = xfs_efi_item_format, - .iop_unpin = xfs_efi_item_unpin, - .iop_release = xfs_efi_item_release, -}; - - -/* - * Allocate and initialize an efi item with the given number of extents. - */ -struct xfs_efi_log_item * -xfs_efi_init( - struct xfs_mount *mp, - uint nextents) - -{ - struct xfs_efi_log_item *efip; - uint size; - - ASSERT(nextents > 0); - if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(struct xfs_efi_log_item) + - ((nextents - 1) * sizeof(xfs_extent_t))); - efip = kmem_zalloc(size, 0); - } else { - efip = kmem_zone_zalloc(xfs_efi_zone, 0); - } - - xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); - efip->efi_format.efi_nextents = nextents; - efip->efi_format.efi_id = (uintptr_t)(void *)efip; - atomic_set(&efip->efi_next_extent, 0); - atomic_set(&efip->efi_refcount, 2); - - return efip; -} - /* * Copy an EFI format buffer from the given buf, and into the destination * EFI format structure. @@ -645,3 +607,69 @@ xfs_efi_recover( xfs_trans_cancel(tp); return error; } + +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_efi_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_efd_log_item *efdp; + struct xfs_efi_log_item *efip; + struct xfs_extent *extp; + unsigned int count; + + count = EFI_ITEM(intent)->efi_format.efi_nextents; + extp = EFI_ITEM(intent)->efi_format.efi_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); + efdp->efd_next_extent = count; + memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + + efip = xfs_efi_init(tp->t_mountp, count); + memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); + atomic_set(&efip->efi_next_extent, count); + xfs_trans_add_item(tp, &efip->efi_item); + set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; +} + +static const struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_unpin = xfs_efi_item_unpin, + .iop_release = xfs_efi_item_release, + .iop_relog = xfs_efi_item_relog, +}; + +/* + * Allocate and initialize an efi item with the given number of extents. + */ +struct xfs_efi_log_item * +xfs_efi_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_efi_log_item *efip; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(struct xfs_efi_log_item) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efip = kmem_zalloc(size, 0); + } else { + efip = kmem_zone_zalloc(xfs_efi_zone, 0); + } + + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); + efip->efi_format.efi_nextents = nextents; + efip->efi_format.efi_id = (uintptr_t)(void *)efip; + atomic_set(&efip->efi_next_extent, 0); + atomic_set(&efip->efi_refcount, 2); + + return efip; +} diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 98f67dd64ce8..fa1018a6e677 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -123,40 +123,6 @@ xfs_cui_item_release( xfs_cui_release(CUI_ITEM(lip)); }
-static const struct xfs_item_ops xfs_cui_item_ops = { - .iop_size = xfs_cui_item_size, - .iop_format = xfs_cui_item_format, - .iop_unpin = xfs_cui_item_unpin, - .iop_release = xfs_cui_item_release, -}; - -/* - * Allocate and initialize an cui item with the given number of extents. - */ -struct xfs_cui_log_item * -xfs_cui_init( - struct xfs_mount *mp, - uint nextents) - -{ - struct xfs_cui_log_item *cuip; - - ASSERT(nextents > 0); - if (nextents > XFS_CUI_MAX_FAST_EXTENTS) - cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), - 0); - else - cuip = kmem_zone_zalloc(xfs_cui_zone, 0); - - xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); - cuip->cui_format.cui_nextents = nextents; - cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; - atomic_set(&cuip->cui_next_extent, 0); - atomic_set(&cuip->cui_refcount, 2); - - return cuip; -} - static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_cud_log_item, cud_item); @@ -576,3 +542,64 @@ xfs_cui_recover( xfs_trans_cancel(tp); return error; } + +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_cui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_cud_log_item *cudp; + struct xfs_cui_log_item *cuip; + struct xfs_phys_extent *extp; + unsigned int count; + + count = CUI_ITEM(intent)->cui_format.cui_nextents; + extp = CUI_ITEM(intent)->cui_format.cui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); + + cuip = xfs_cui_init(tp->t_mountp, count); + memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); + atomic_set(&cuip->cui_next_extent, count); + xfs_trans_add_item(tp, &cuip->cui_item); + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; +} + +static const struct xfs_item_ops xfs_cui_item_ops = { + .iop_size = xfs_cui_item_size, + .iop_format = xfs_cui_item_format, + .iop_unpin = xfs_cui_item_unpin, + .iop_release = xfs_cui_item_release, + .iop_relog = xfs_cui_item_relog, +}; + +/* + * Allocate and initialize an cui item with the given number of extents. + */ +struct xfs_cui_log_item * +xfs_cui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_cui_log_item *cuip; + + ASSERT(nextents > 0); + if (nextents > XFS_CUI_MAX_FAST_EXTENTS) + cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), + 0); + else + cuip = kmem_zone_zalloc(xfs_cui_zone, 0); + + xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); + cuip->cui_format.cui_nextents = nextents; + cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; + atomic_set(&cuip->cui_next_extent, 0); + atomic_set(&cuip->cui_refcount, 2); + + return cuip; +} diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 32f580fa1877..ba1dbb6c4063 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -122,39 +122,6 @@ xfs_rui_item_release( xfs_rui_release(RUI_ITEM(lip)); }
-static const struct xfs_item_ops xfs_rui_item_ops = { - .iop_size = xfs_rui_item_size, - .iop_format = xfs_rui_item_format, - .iop_unpin = xfs_rui_item_unpin, - .iop_release = xfs_rui_item_release, -}; - -/* - * Allocate and initialize an rui item with the given number of extents. - */ -struct xfs_rui_log_item * -xfs_rui_init( - struct xfs_mount *mp, - uint nextents) - -{ - struct xfs_rui_log_item *ruip; - - ASSERT(nextents > 0); - if (nextents > XFS_RUI_MAX_FAST_EXTENTS) - ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); - else - ruip = kmem_zone_zalloc(xfs_rui_zone, 0); - - xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); - ruip->rui_format.rui_nextents = nextents; - ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; - atomic_set(&ruip->rui_next_extent, 0); - atomic_set(&ruip->rui_refcount, 2); - - return ruip; -} - /* * Copy an RUI format buffer from the given buf, and into the destination * RUI format structure. The RUI/RUD items were designed not to need any @@ -600,3 +567,63 @@ xfs_rui_recover( xfs_trans_cancel(tp); return error; } + +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_rui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_rud_log_item *rudp; + struct xfs_rui_log_item *ruip; + struct xfs_map_extent *extp; + unsigned int count; + + count = RUI_ITEM(intent)->rui_format.rui_nextents; + extp = RUI_ITEM(intent)->rui_format.rui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); + + ruip = xfs_rui_init(tp->t_mountp, count); + memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); + atomic_set(&ruip->rui_next_extent, count); + xfs_trans_add_item(tp, &ruip->rui_item); + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; +} + +static const struct xfs_item_ops xfs_rui_item_ops = { + .iop_size = xfs_rui_item_size, + .iop_format = xfs_rui_item_format, + .iop_unpin = xfs_rui_item_unpin, + .iop_release = xfs_rui_item_release, + .iop_relog = xfs_rui_item_relog, +}; + +/* + * Allocate and initialize an rui item with the given number of extents. + */ +struct xfs_rui_log_item * +xfs_rui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_rui_log_item *ruip; + + ASSERT(nextents > 0); + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); + else + ruip = kmem_zone_zalloc(xfs_rui_zone, 0); + + xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); + ruip->rui_format.rui_nextents = nextents; + ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; + atomic_set(&ruip->rui_next_extent, 0); + atomic_set(&ruip->rui_refcount, 2); + + return ruip; +} diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f70f1255220b..20e0534a772c 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) uint64_t xs_xstrat_bytes = 0; uint64_t xs_write_bytes = 0; uint64_t xs_read_bytes = 0; + uint64_t defer_relog = 0;
static const struct xstats_entry { char *desc; @@ -70,10 +71,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; + defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; }
len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", + defer_relog); len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 34d704f703d2..43ffba74f045 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -137,6 +137,7 @@ struct __xfsstats { uint64_t xs_xstrat_bytes; uint64_t xs_write_bytes; uint64_t xs_read_bytes; + uint64_t defer_relog; };
#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t)) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f94908125e8f..4b5818395406 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2418,6 +2418,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 64d7f171ebd3..941647027f00 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -77,6 +77,8 @@ struct xfs_item_ops { void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); + struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, + struct xfs_trans *tp); };
/* @@ -244,4 +246,12 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
extern kmem_zone_t *xfs_trans_zone;
+static inline struct xfs_log_item * +xfs_trans_item_relog( + struct xfs_log_item *lip, + struct xfs_trans *tp) +{ + return lip->li_ops->iop_relog(lip, tp); +} + #endif /* __XFS_TRANS_H__ */
From: "Darrick J. Wong" darrick.wong@oracle.com
commit ed1575daf71e4e21d8ae735b6e687c95454aaa17 upstream.
Separate the computation of the log push threshold and the push logic in xlog_grant_push_ail. This enables higher level code to determine (for example) that it is holding on to a logged intent item and the log is so busy that it is more than 75% full. In that case, it would be desirable to move the log item towards the head to release the tail, which we will cover in the next patch.
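As an illustrative sketch (not part of this patch; the real consumer is added two patches later in xfs_defer_relog()), a caller holding a logged intent item could use the new helper like this, where "log" is the struct xlog of interest:

	/* Ask the log how far the tail would need to be pushed. */
	xfs_lsn_t	threshold_lsn;

	threshold_lsn = xlog_grant_push_threshold(log, 0);
	if (threshold_lsn == NULLCOMMITLSN)
		return;		/* all three free space thresholds are met */

	/*
	 * The log is past the 75% high water mark; any item logged at an
	 * LSN behind threshold_lsn is helping to pin the log tail.
	 */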
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/xfs_icreate_item.c | 1 + fs/xfs/xfs_log.c | 40 +++++++++++++++++++++++++++++---------- fs/xfs/xfs_log.h | 2 ++ 3 files changed, 33 insertions(+), 10 deletions(-)
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 3ebd1b7f49d8..7d940b289db5 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -10,6 +10,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_icreate_item.h" +#include "xfs_log_priv.h" #include "xfs_log.h"
kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 63c0f1e9d101..ebbf9b9c8504 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1537,14 +1537,14 @@ xlog_commit_record( }
/* - * Push on the buffer cache code if we ever use more than 75% of the on-disk - * log space. This code pushes on the lsn which would supposedly free up - * the 25% which we want to leave free. We may need to adopt a policy which - * pushes on an lsn which is further along in the log once we reach the high - * water mark. In this manner, we would be creating a low water mark. + * Compute the LSN that we'd need to push the log tail towards in order to have + * (a) enough on-disk log space to log the number of bytes specified, (b) at + * least 25% of the log space free, and (c) at least 256 blocks free. If the + * log free space already meets all three thresholds, this function returns + * NULLCOMMITLSN. */ -STATIC void -xlog_grant_push_ail( +xfs_lsn_t +xlog_grant_push_threshold( struct xlog *log, int need_bytes) { @@ -1570,7 +1570,7 @@ xlog_grant_push_ail( free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); free_threshold = max(free_threshold, 256); if (free_blocks >= free_threshold) - return; + return NULLCOMMITLSN;
xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, &threshold_block); @@ -1590,13 +1590,33 @@ xlog_grant_push_ail( if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) threshold_lsn = last_sync_lsn;
+ return threshold_lsn; +} + +/* + * Push the tail of the log if we need to do so to maintain the free log space + * thresholds set out by xlog_grant_push_threshold. We may need to adopt a + * policy which pushes on an lsn which is further along in the log once we + * reach the high water mark. In this manner, we would be creating a low water + * mark. + */ +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn; + + threshold_lsn = xlog_grant_push_threshold(log, need_bytes); + if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) + return; + /* * Get the transaction layer to kick the dirty buffers out to * disk asynchronously. No point in trying to do this if * the filesystem is shutting down. */ - if (!XLOG_FORCED_SHUTDOWN(log)) - xfs_ail_push(log->l_ailp, threshold_lsn); + xfs_ail_push(log->l_ailp, threshold_lsn); }
/* diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e06805160f..4ede2163beb2 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -146,4 +146,6 @@ void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xfs_log_in_recovery(struct xfs_mount *);
+xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); + #endif /* __XFS_LOG_H__ */
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 74f4d6a1e065c92428c5b588099e307a582d79d9 upstream.
Now that we have the ability to ask the log how far the tail needs to be pushed to maintain its free space targets, augment the decision to relog an intent item so that we only do it if the log has hit the 75% full threshold. There's no point in relogging an intent into the same checkpoint, and there's no need to relog if there's plenty of free space in the log.
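Assembled from the hunks below into one place (a sketch of the per-item gate in xfs_defer_relog(), not the literal final function body), the decision works out to roughly:

	/* Skip items that cannot or need not be relogged. */
	if (dfp->dfp_intent == NULL ||
	    xfs_log_item_in_current_chkpt(dfp->dfp_intent))
		continue;

	/* Sample the push threshold once per call. */
	if (threshold_lsn == NULLCOMMITLSN) {
		threshold_lsn = xlog_grant_push_threshold(log, 0);
		if (threshold_lsn == NULLCOMMITLSN)
			break;	/* log under 75% full, nothing to do */
	}

	/* Intents at or past the threshold are not pinning the tail. */
	if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
		continue;

	dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);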
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Brian Foster bfoster@redhat.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/libxfs/xfs_defer.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index b0b382323413..3a78a189ea01 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -372,7 +372,10 @@ xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) { + struct xlog *log = (*tpp)->t_mountp->m_log; struct xfs_defer_pending *dfp; + xfs_lsn_t threshold_lsn = NULLCOMMITLSN; +
ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -388,6 +391,19 @@ xfs_defer_relog( xfs_log_item_in_current_chkpt(dfp->dfp_intent)) continue;
+ /* + * Figure out where we need the tail to be in order to maintain + * the minimum required free space in the log. Only sample + * the log threshold once per call. + */ + if (threshold_lsn == NULLCOMMITLSN) { + threshold_lsn = xlog_grant_push_threshold(log, 0); + if (threshold_lsn == NULLCOMMITLSN) + break; + } + if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) + continue; + trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
From: "Darrick J. Wong" darrick.wong@oracle.com
commit c2f09217a4305478c55adc9a98692488dd19cd32 upstream.
[ Set xfs_writepage_ctx->fork to XFS_DATA_FORK since 5.4.y tracks the current extent's fork in this variable ]
In commit 7588cbeec6df, we tried to fix a race stemming from the lack of coordination between the higher level code paths that want to allocate and remap CoW fork extents into the data fork. Christoph cites as examples the always_cow mode, and a directio write completion racing with writeback.
According to the comments before the goto retry, we want to restart the lookup to catch the extent in the data fork, but we don't actually reset whichfork or cow_fsb, which means the second try executes using stale information. Up until now I think we've gotten lucky that either there's something left in the CoW fork to cause cow_fsb to be reset, or the data/CoW fork sequence numbers have advanced enough to force a fresh lookup from the data fork. However, if we reach the retry with a stable but empty CoW fork and a stable data fork, neither of those things happens. The retry foolishly re-calls xfs_convert_blocks on the CoW fork, which fails again. This time, we toss the write.
I've recently been working on extending reflink to the realtime device. When the realtime extent size is larger than a single block, we have to force the page cache to CoW the entire rt extent if a write (or fallocate) is not aligned with the rt extent size. The strategy I've chosen to deal with this is derived from Dave's blocksize > pagesize series: dirtying around the write range, and ensuring that writeback always starts mapping on an rt extent boundary. This has brought this race front and center, since generic/522 blows up immediately.
However, I'm pretty sure this is a bug outright, independent of that.
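The shape of the fix, sketched here for orientation (the diff below is authoritative): the retry label must reinitialize the per-pass lookup state so the second pass genuinely starts over from the data fork instead of inheriting CoW fork state from the failed pass.

	retry:
		/* Reset per-pass state; never reuse it from the failed pass. */
		cow_fsb = NULLFILEOFF;
		wpc->fork = XFS_DATA_FORK;
		xfs_ilock(ip, XFS_ILOCK_SHARED);
		/* ... the CoW/data fork lookup proceeds under the lock ... */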
Fixes: 7588cbeec6df ("xfs: retry COW fork delalloc conversion when no extent was found") Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/xfs_aops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f16d5f196c6b..5d9f8e4c4cde 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -495,7 +495,7 @@ xfs_map_blocks( ssize_t count = i_blocksize(inode); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); - xfs_fileoff_t cow_fsb = NULLFILEOFF; + xfs_fileoff_t cow_fsb; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; int retries = 0; @@ -529,6 +529,8 @@ xfs_map_blocks( * landed in a hole and we skip the block. */ retry: + cow_fsb = NULLFILEOFF; + wpc->fork = XFS_DATA_FORK; xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS));
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 27c14b5daa82861220d6fa6e27b51f05f21ffaa7 upstream.
[ In xfs_iwalk_ag(), replace a call to XFS_IS_CORRUPT() with a call to ASSERT() ]
The aim of the inode btree record iterator function is to call a callback on every record in the btree. To avoid having to tear down and recreate the inode btree cursor around every callback, it caches a certain number of records in a memory buffer. After each batch of callback invocations, we have to perform a btree lookup to find the next record after where we left off.
However, if the keys of the inode btree are corrupt, the lookup might put us in the wrong part of the inode btree, causing the walk function to loop forever. Therefore, we add extra cursor tracking to make sure that we never go backwards, either when performing the lookup or when jumping to the next inobt record. This also fixes an off-by-one error: upon resume, the lookup should have been for the inode /after/ the point at which we stopped.
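To illustrate with made-up numbers: an inobt record with ir_startino at agino 64 covers inodes 64-127, so the old code resumed its XFS_LOOKUP_GE search at agino 127 and could land on the very record it had just processed. Resuming at lastino + 1 = 128 moves the walk strictly past the cached batch, and the new lastino comparison turns a backwards jump caused by corrupt keys into -EFSCORRUPTED instead of an endless loop.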
Found by fuzzing xfs/460 with keys[2].startino = ones causing bulkstat and quotacheck to hang.
Fixes: a211432c27ff ("xfs: create simplified inode walk function") Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/xfs_iwalk.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index aa375cf53021..1f53af6b0112 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -55,6 +55,9 @@ struct xfs_iwalk_ag { /* Where do we start the traversal? */ xfs_ino_t startino;
+ /* What was the last inode number we saw when iterating the inobt? */ + xfs_ino_t lastino; + /* Array of inobt records we cache. */ struct xfs_inobt_rec_incore *recs;
@@ -300,6 +303,9 @@ xfs_iwalk_ag_start( return error; XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1);
+ iwag->lastino = XFS_AGINO_TO_INO(mp, agno, + irec->ir_startino + XFS_INODES_PER_CHUNK - 1); + /* * If the LE lookup yielded an inobt record before the cursor position, * skip it and see if there's another one after it. @@ -346,15 +352,17 @@ xfs_iwalk_run_callbacks( struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; struct xfs_inobt_rec_incore *irec; - xfs_agino_t restart; + xfs_agino_t next_agino; int error;
+ next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1; + ASSERT(iwag->nr_recs > 0);
/* Delete cursor but remember the last record we cached... */ xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); irec = &iwag->recs[iwag->nr_recs - 1]; - restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; + ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
error = xfs_iwalk_ag_recs(iwag); if (error) @@ -371,7 +379,7 @@ xfs_iwalk_run_callbacks( if (error) return error;
- return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); + return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more); }
/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ @@ -395,6 +403,7 @@ xfs_iwalk_ag(
while (!error && has_more) { struct xfs_inobt_rec_incore *irec; + xfs_ino_t rec_fsino;
cond_resched(); if (xfs_pwork_want_abort(&iwag->pwork)) @@ -406,6 +415,15 @@ xfs_iwalk_ag( if (error || !has_more) break;
+ /* Make sure that we always move forward. */ + rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino); + if (iwag->lastino != NULLFSINO && iwag->lastino >= rec_fsino) { + ASSERT(iwag->lastino < rec_fsino); + error = -EFSCORRUPTED; + goto out; + } + iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1; + /* No allocated inodes in this chunk; skip it. */ if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) { error = xfs_btree_increment(cur, 0, &has_more); @@ -534,6 +552,7 @@ xfs_iwalk( .trim_start = 1, .skip_empty = 1, .pwork = XFS_PWORK_SINGLE_THREADED, + .lastino = NULLFSINO, }; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -622,6 +641,7 @@ xfs_iwalk_threaded( iwag->data = data; iwag->startino = startino; iwag->sz_recs = xfs_iwalk_prefetch(inode_records); + iwag->lastino = NULLFSINO; xfs_pwork_queue(&pctl, &iwag->pwork); startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); if (flags & XFS_INOBT_WALK_SAME_AG) @@ -695,6 +715,7 @@ xfs_inobt_walk( .startino = startino, .sz_recs = xfs_inobt_walk_prefetch(inobt_records), .pwork = XFS_PWORK_SINGLE_THREADED, + .lastino = NULLFSINO, }; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error;
From: "Darrick J. Wong" darrick.wong@oracle.com
commit a5336d6bb2d02d0e9d4d3c8be04b80b8b68d56c8 upstream.
In commit 27c14b5daa82 we started tracking the last inode seen during an inode walk to avoid infinite loops if a corrupt inobt record happens to have a lower ir_startino than the record preceding it. Unfortunately, the assertion trips over the case where there are completely empty inobt records (which can happen quite easily on 64k page filesystems) because we advance the tracking cursor without actually putting the empty record into the processing buffer. Fix the assert to allow for this case.
Reported-by: zlang@redhat.com Fixes: 27c14b5daa82 ("xfs: ensure inobt record walks always make forward progress") Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Zorro Lang zlang@redhat.com Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/xfs_iwalk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 1f53af6b0112..cc5c0c835884 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -362,7 +362,7 @@ xfs_iwalk_run_callbacks( /* Delete cursor but remember the last record we cached... */ xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); irec = &iwag->recs[iwag->nr_recs - 1]; - ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK); + ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
error = xfs_iwalk_ag_recs(iwag); if (error)
From: "Darrick J. Wong" djwong@kernel.org
commit f8d92a66e810acbef6ddbc0bd0cbd9b117ce8acd upstream.
[ Continue to interpret xfs_log_item->li_seq as an LSN rather than a CIL sequence number. ]
While I was running with KASAN and lockdep enabled, I stumbled upon a KASAN report about a UAF to a freed CIL checkpoint. Looking at the comment for xfs_log_item_in_current_chkpt, it seems pretty obvious to me that the original patch to xfs_defer_finish_noroll should have done something to lock the CIL to prevent it from switching CIL contexts while the predicate runs.
For upper level code that needs to know if a given log item is new enough not to need relogging, add a new wrapper that takes the CIL context lock long enough to sample the current CIL context. This is kind of racy in that the CIL can switch contexts immediately after sampling, but that's ok because the consequence is that the defer ops code is a little slow to relog items.
================================================================== BUG: KASAN: use-after-free in xfs_log_item_in_current_chkpt+0x139/0x160 [xfs] Read of size 8 at addr ffff88804ea5f608 by task fsstress/527999
CPU: 1 PID: 527999 Comm: fsstress Tainted: G D 5.16.0-rc4-xfsx #rc4 Call Trace: <TASK> dump_stack_lvl+0x45/0x59 print_address_description.constprop.0+0x1f/0x140 kasan_report.cold+0x83/0xdf xfs_log_item_in_current_chkpt+0x139/0x160 xfs_defer_finish_noroll+0x3bb/0x1e30 __xfs_trans_commit+0x6c8/0xcf0 xfs_reflink_remap_extent+0x66f/0x10e0 xfs_reflink_remap_blocks+0x2dd/0xa90 xfs_file_remap_range+0x27b/0xc30 vfs_dedupe_file_range_one+0x368/0x420 vfs_dedupe_file_range+0x37c/0x5d0 do_vfs_ioctl+0x308/0x1260 __x64_sys_ioctl+0xa1/0x170 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f2c71a2950b Code: 0f 1e fa 48 8b 05 85 39 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 55 39 0d 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe8c0e03c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00005600862a8740 RCX: 00007f2c71a2950b RDX: 00005600862a7be0 RSI: 00000000c0189436 RDI: 0000000000000004 RBP: 000000000000000b R08: 0000000000000027 R09: 0000000000000003 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000005a R13: 00005600862804a8 R14: 0000000000016000 R15: 00005600862a8a20 </TASK>
Allocated by task 464064: kasan_save_stack+0x1e/0x50 __kasan_kmalloc+0x81/0xa0 kmem_alloc+0xcd/0x2c0 [xfs] xlog_cil_ctx_alloc+0x17/0x1e0 [xfs] xlog_cil_push_work+0x141/0x13d0 [xfs] process_one_work+0x7f6/0x1380 worker_thread+0x59d/0x1040 kthread+0x3b0/0x490 ret_from_fork+0x1f/0x30
Freed by task 51: kasan_save_stack+0x1e/0x50 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0xed/0x130 slab_free_freelist_hook+0x7f/0x160 kfree+0xde/0x340 xlog_cil_committed+0xbfd/0xfe0 [xfs] xlog_cil_process_committed+0x103/0x1c0 [xfs] xlog_state_do_callback+0x45d/0xbd0 [xfs] xlog_ioend_work+0x116/0x1c0 [xfs] process_one_work+0x7f6/0x1380 worker_thread+0x59d/0x1040 kthread+0x3b0/0x490 ret_from_fork+0x1f/0x30
Last potentially related work creation: kasan_save_stack+0x1e/0x50 __kasan_record_aux_stack+0xb7/0xc0 insert_work+0x48/0x2e0 __queue_work+0x4e7/0xda0 queue_work_on+0x69/0x80 xlog_cil_push_now.isra.0+0x16b/0x210 [xfs] xlog_cil_force_seq+0x1b7/0x850 [xfs] xfs_log_force_seq+0x1c7/0x670 [xfs] xfs_file_fsync+0x7c1/0xa60 [xfs] __x64_sys_fsync+0x52/0x80 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae
The buggy address belongs to the object at ffff88804ea5f600 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 8 bytes inside of 256-byte region [ffff88804ea5f600, ffff88804ea5f700) The buggy address belongs to the page: page:ffffea00013a9780 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88804ea5ea00 pfn:0x4ea5e head:ffffea00013a9780 order:1 compound_mapcount:0 flags: 0x4fff80000010200(slab|head|node=1|zone=1|lastcpupid=0xfff) raw: 04fff80000010200 ffffea0001245908 ffffea00011bd388 ffff888004c42b40 raw: ffff88804ea5ea00 0000000000100009 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff88804ea5f500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88804ea5f580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88804ea5f600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^ ffff88804ea5f680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88804ea5f700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ==================================================================
Fixes: 4e919af7827a ("xfs: periodically relog deferred intent items") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/xfs_log_cil.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 550fd5de2404..ae9b8efcfa54 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1178,21 +1178,19 @@ xlog_cil_force_lsn( */ bool xfs_log_item_in_current_chkpt( - struct xfs_log_item *lip) + struct xfs_log_item *lip) { - struct xfs_cil_ctx *ctx; + struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp;
if (list_empty(&lip->li_cil)) return false;
- ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; - /* * li_seq is written on the first commit of a log item to record the * first checkpoint it is written to. Hence if it is different to the * current sequence, we're in a new checkpoint. */ - if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + if (XFS_LSN_CMP(lip->li_seq, READ_ONCE(cil->xc_current_sequence)) != 0) return false; return true; }
From: Brian Foster bfoster@redhat.com
commit 50d25484bebe94320c49dd1347d3330c7063bbdb upstream.
[ Modify xfs_log_unmount_write() to return zero when the log is in a read-only state ]
xfs_log_sbcount() syncs the superblock specifically to accumulate the in-core percpu superblock counters and commit them to disk. This is required to maintain filesystem consistency across quiesce (freeze, read-only mount/remount) or unmount when lazy superblock accounting is enabled because individual transactions do not update the superblock directly.
This mechanism works as expected for writable mounts, but xfs_log_sbcount() skips the update for read-only mounts. Read-only mounts otherwise still allow log recovery and write out an unmount record during log quiesce. If a read-only mount performs log recovery, it can modify the in-core superblock counters and write an unmount record when the filesystem unmounts without ever syncing the in-core counters. This leaves the filesystem with a clean log but in an inconsistent state with regard to lazy sb counters.
Update xfs_log_sbcount() to use the same logic xfs_log_unmount_write() uses to determine when to write an unmount record. This ensures that lazy accounting is always synced before the log is cleaned. Refactor this logic into a new helper to distinguish between a writable filesystem and a writable log. Specifically, the log is writable unless the filesystem is mounted with the norecovery mount option, the underlying log device is read-only, or the filesystem is shutdown. Drop the freeze state check because the update is already allowed during the freezing process and no context calls this function on an already frozen fs. Also, retain the shutdown check in xfs_log_unmount_write() to catch the case where the preceding log force might have triggered a shutdown.
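Summarizing the resulting predicates (a derived overview, not text from the patch):

	mount state                    fs writable    log writable
	read-write                     yes            yes
	frozen (during freeze)         no             yes
	read-only, recovery allowed    no             yes
	read-only, norecovery          no             no
	read-only log device           no             no
	forced shutdown                no             no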
Signed-off-by: Brian Foster bfoster@redhat.com Reviewed-by: Gao Xiang hsiangkao@redhat.com Reviewed-by: Allison Henderson allison.henderson@oracle.com Reviewed-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Bill O'Donnell billodo@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org --- fs/xfs/xfs_log.c | 28 ++++++++++++++++++++-------- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_mount.c | 3 +-- 3 files changed, 22 insertions(+), 10 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ebbf9b9c8504..03a52b3919b8 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -369,6 +369,25 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) tic->t_res_num++; }
+bool +xfs_log_writable( + struct xfs_mount *mp) +{ + /* + * Never write to the log on norecovery mounts, if the block device is + * read-only, or if the filesystem is shutdown. Read-only mounts still + * allow internal writes for log recovery and unmount purposes, so don't + * restrict that case here. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return false; + if (xfs_readonly_buftarg(mp->m_log->l_targ)) + return false; + if (XFS_FORCED_SHUTDOWN(mp)) + return false; + return true; +} + /* * Replenish the byte reservation required by moving the grant write head. */ @@ -895,15 +914,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) #endif int error;
- /* - * Don't write out unmount record on norecovery mounts or ro devices. - * Or, if we are doing a forced umount (typically because of IO errors). - */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY || - xfs_readonly_buftarg(log->l_targ)) { - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + if (!xfs_log_writable(mp)) return 0; - }
error = xfs_log_force(mp, XFS_LOG_SYNC); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 4ede2163beb2..dc9229e7ddaa 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -132,6 +132,7 @@ int xfs_log_reserve(struct xfs_mount *mp, int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); +bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bbcf48a625b2..2860966af6c2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1218,8 +1218,7 @@ xfs_fs_writable( int xfs_log_sbcount(xfs_mount_t *mp) { - /* allow this to proceed during the freeze sequence... */ - if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) + if (!xfs_log_writable(mp)) return 0;
/*