commit 51232df5e4b268936beccde5248f312a316800be upstream.
When the managed cache is enabled, the last reference count of a workgroup must be used for its workstation.
Otherwise, it can lead to incorrect (un)freezes in the reclaim path, which would be harmful.
A typical race is as follows:

Thread 1 (In the reclaim path)        Thread 2
workgroup_freeze(grp, 1)                                  refcnt = 1
...
workgroup_unfreeze(grp, 1)                                refcnt = 1
                                      workgroup_get(grp)  refcnt = 2 (x)
workgroup_put(grp)                                        refcnt = 1 (x)
                                      ...unexpected behaviors
* grp is detached but still used, which violates the cache-managed freeze constraint.
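To make the window concrete, here is a minimal user-space C11 model of the freeze/refcount protocol described above. All names (wg_try_to_freeze(), WG_FROZEN, and so on) and the frozen magic value are illustrative stand-ins, not the actual erofs symbols, and the two-thread interleaving is collapsed into one thread purely to show the count transitions:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WG_FROZEN (-38)	/* stand-in for the real "locked" magic value */

struct workgroup { atomic_int refcount; };

/* freeze only succeeds when the count is exactly `expect`, i.e. when
 * the reclaimer believes it holds the last reference */
static bool wg_try_to_freeze(struct workgroup *g, int expect)
{
	int old = expect;

	return atomic_compare_exchange_strong(&g->refcount, &old, WG_FROZEN);
}

static void wg_unfreeze(struct workgroup *g, int value)
{
	atomic_store(&g->refcount, value);
}

/* a lookup-side get must refuse frozen (<= 0) groups; otherwise the
 * exact interleaving from the diagram above becomes possible */
static bool wg_get(struct workgroup *g)
{
	int cur = atomic_load(&g->refcount);

	do {
		if (cur <= 0)
			return false;
	} while (!atomic_compare_exchange_weak(&g->refcount, &cur, cur + 1));
	return true;
}

int main(void)
{
	struct workgroup g = { .refcount = 1 };

	/* Thread 1: freeze at refcnt == 1, fail to reclaim, unfreeze */
	if (wg_try_to_freeze(&g, 1))
		wg_unfreeze(&g, 1);

	/* Thread 2: get()/put() in the window after the unfreeze; the
	 * counts look fine (2, then 1), but if Thread 1 goes on to
	 * detach the group based on its stale view, grp ends up
	 * detached yet still referenced (the "(x)" steps above) */
	if (wg_get(&g))
		atomic_fetch_sub(&g.refcount, 1);

	printf("final refcount = %d\n", atomic_load(&g.refcount));
	return 0;
}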
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
---
change log v3:
 - avoid using the xa_tag APIs since Matthew rejected the first XArray
   patch in the previous version again;
RESEND as a ping due to the lack of response from Matthew and Greg for half a month; patch 2/2 needs to be backported to linux-4.19.
change log v2:
 - fix the tagged pointer APIs so they work properly, as pointed out by Matthew;
Thanks,
Gao Xiang
 drivers/staging/erofs/internal.h |   1 +
 drivers/staging/erofs/utils.c    | 139 ++++++++++++++++++++++++++++-----------
 2 files changed, 101 insertions(+), 39 deletions(-)
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index c70f0c5237ea..58d8cbc3f921 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -260,6 +260,7 @@ static inline bool erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt)
 }

 #define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+#define __erofs_workgroup_put(grp)	atomic_dec(&(grp)->refcount)

 extern int erofs_workgroup_put(struct erofs_workgroup *grp);
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
index dd2ac9dbc4b4..2d96820da62e 100644
--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -87,12 +87,21 @@ int erofs_register_workgroup(struct super_block *sb,
 	grp = (void *)((unsigned long)grp |
 		1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);

-	err = radix_tree_insert(&sbi->workstn_tree,
-		grp->index, grp);
+	/*
+	 * Bump up the reference count before making this workgroup
+	 * visible to other users in order to avoid potential UAF
+	 * when it is not serialized by erofs_workstn_lock.
+	 */
+	__erofs_workgroup_get(grp);

-	if (!err) {
-		__erofs_workgroup_get(grp);
-	}
+	err = radix_tree_insert(&sbi->workstn_tree,
+				grp->index, grp);
+	if (unlikely(err))
+		/*
+		 * it's safe to decrease here since the workgroup isn't
+		 * visible and its refcount is >= 2 (cannot be frozen).
+		 */
+		__erofs_workgroup_put(grp);

 	erofs_workstn_unlock(sbi);
 	radix_tree_preload_end();
@@ -101,19 +110,99 @@ int erofs_register_workgroup(struct super_block *sb,
 extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);

+static void __erofs_workgroup_free(struct erofs_workgroup *grp)
+{
+	atomic_long_dec(&erofs_global_shrink_cnt);
+	erofs_workgroup_free_rcu(grp);
+}
+
 int erofs_workgroup_put(struct erofs_workgroup *grp)
 {
 	int count = atomic_dec_return(&grp->refcount);

 	if (count == 1)
 		atomic_long_inc(&erofs_global_shrink_cnt);
-	else if (!count) {
-		atomic_long_dec(&erofs_global_shrink_cnt);
-		erofs_workgroup_free_rcu(grp);
-	}
+	else if (!count)
+		__erofs_workgroup_free(grp);
 	return count;
 }

+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+/* for the cache-managed case, customized reclaim paths exist */
+static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
+{
+	erofs_workgroup_unfreeze(grp, 0);
+	__erofs_workgroup_free(grp);
+}
+
+bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+				    struct erofs_workgroup *grp,
+				    bool cleanup)
+{
+	void *entry;
+
+	/*
+	 * if the managed cache is enabled, the refcount of workgroups
+	 * themselves can be < 0 (frozen), so there is no guarantee
+	 * that all refcounts are > 0.
+	 */
+	if (!erofs_workgroup_try_to_freeze(grp, 1))
+		return false;
+
+	/*
+	 * note that all cached pages should be unlinked before the
+	 * workgroup is deleted from the radix tree; otherwise some
+	 * cached pages of an orphan old workgroup could still be
+	 * linked after the new one is available.
+	 */
+	if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
+		erofs_workgroup_unfreeze(grp, 1);
+		return false;
+	}
+
+	/*
+	 * it is impossible to fail after the workgroup is frozen,
+	 * however in order to avoid some race conditions, add a
+	 * DBG_BUGON to observe this in advance.
+	 */
+	entry = radix_tree_delete(&sbi->workstn_tree, grp->index);
+	DBG_BUGON((void *)((unsigned long)entry &
+		  ~RADIX_TREE_EXCEPTIONAL_ENTRY) != grp);
+
+	/*
+	 * if the managed cache is enabled, the last refcount
+	 * should indicate the related workstation.
+	 */
+	erofs_workgroup_unfreeze_final(grp);
+	return true;
+}
+
+#else
+/* for the nocache case, no customized reclaim path at all */
+bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+				    struct erofs_workgroup *grp,
+				    bool cleanup)
+{
+	int cnt = atomic_read(&grp->refcount);
+	void *entry;
+
+	DBG_BUGON(cnt <= 0);
+	DBG_BUGON(cleanup && cnt != 1);
+
+	if (cnt > 1)
+		return false;
+
+	entry = radix_tree_delete(&sbi->workstn_tree, grp->index);
+	DBG_BUGON((void *)((unsigned long)entry &
+		  ~RADIX_TREE_EXCEPTIONAL_ENTRY) != grp);
+
+	/* (rarely) could be grabbed again when freeing */
+	erofs_workgroup_put(grp);
+	return true;
+}
+
+#endif
+
 unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
@@ -130,44 +219,16 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 		batch, first_index, PAGEVEC_SIZE);
 	for (i = 0; i < found; ++i) {
-		int cnt;
 		struct erofs_workgroup *grp = (void *)
 			((unsigned long)batch[i] &
 				~RADIX_TREE_EXCEPTIONAL_ENTRY);

 		first_index = grp->index + 1;

-		cnt = atomic_read(&grp->refcount);
-		BUG_ON(cnt <= 0);
-
-		if (cleanup)
-			BUG_ON(cnt != 1);
-
-#ifndef EROFS_FS_HAS_MANAGED_CACHE
-		else if (cnt > 1)
-#else
-		if (!erofs_workgroup_try_to_freeze(grp, 1))
-#endif
+		/* try to shrink each valid workgroup */
+		if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
 			continue;

-		if (radix_tree_delete(&sbi->workstn_tree,
-			grp->index) != grp) {
-#ifdef EROFS_FS_HAS_MANAGED_CACHE
-skip:
-			erofs_workgroup_unfreeze(grp, 1);
-#endif
-			continue;
-		}
-
-#ifdef EROFS_FS_HAS_MANAGED_CACHE
-		if (erofs_try_to_free_all_cached_pages(sbi, grp))
-			goto skip;
-
-		erofs_workgroup_unfreeze(grp, 1);
-#endif
-		/* (rarely) grabbed again when freeing */
-		erofs_workgroup_put(grp);
-
 		++freed;
 		if (unlikely(!--nr_shrink))
 			break;
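For readers following the register path, the first utils.c hunk reduces to the following stand-alone C sketch. tree_insert() and every other name here are hypothetical placeholders so the snippet compiles on its own; the only point being made is the get-before-publish ordering that the patch switches to:

#include <stdatomic.h>
#include <stdio.h>

struct workgroup { atomic_int refcount; unsigned long index; };

/* stand-in for radix_tree_insert(); always succeeds in this sketch */
static int tree_insert(unsigned long index, struct workgroup *g)
{
	(void)index; (void)g;
	return 0;
}

static int register_workgroup(struct workgroup *grp)
{
	int err;

	/* take the extra reference BEFORE publishing the group: once it
	 * is visible in the tree, a concurrent lookup may get()/put() it
	 * at any time, and a count that is still 1 could be frozen (or
	 * dropped to zero and freed) underneath us */
	atomic_fetch_add(&grp->refcount, 1);

	err = tree_insert(grp->index, grp);
	if (err)
		/* safe to undo: the group was never visible to other
		 * threads, so nobody can have frozen it meanwhile
		 * (refcount >= 2) */
		atomic_fetch_sub(&grp->refcount, 1);

	return err;
}

int main(void)
{
	struct workgroup g = { .refcount = 1, .index = 0 };

	if (!register_workgroup(&g))
		printf("registered, refcount = %d\n",
		       atomic_load(&g.refcount));
	return 0;
}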
On Mon, Mar 18, 2019 at 03:26:34PM +0800, Gao Xiang wrote:
> commit 51232df5e4b268936beccde5248f312a316800be upstream.
>
> When the managed cache is enabled, the last reference count of a
> workgroup must be used for its workstation.
>
> Otherwise, it can lead to incorrect (un)freezes in the reclaim path,
> which would be harmful.
>
> A typical race is as follows:
>
> Thread 1 (In the reclaim path)        Thread 2
> workgroup_freeze(grp, 1)                                  refcnt = 1
> ...
> workgroup_unfreeze(grp, 1)                                refcnt = 1
>                                       workgroup_get(grp)  refcnt = 2 (x)
> workgroup_put(grp)                                        refcnt = 1 (x)
>                                       ...unexpected behaviors
>
> * grp is detached but still used, which violates the cache-managed
>   freeze constraint.
>
> Reviewed-by: Chao Yu <yuchao0@huawei.com>
> Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
>
> change log v3:
>  - avoid using the xa_tag APIs since Matthew rejected the first XArray
>    patch in the previous version again;
>
> RESEND as a ping due to the lack of response from Matthew and Greg for
> half a month; patch 2/2 needs to be backported to linux-4.19.
I've queued this up now, thanks.
greg k-h