From: Kairui Song <kasong@tencent.com>
This series fixes the page cache corruption issue reported by Christian Theune [1]. The issue is reported to affect kernels back to 5.19. Of the currently maintained branches, 6.1 and 6.6 are affected; the fix is already included in 6.10.
This series can be applied to both 6.1 and 6.6.
Patch 3/3 is the fixing patch. It was initially submitted and merged as an optimization, but it was later found to also fix the corruption, by handling the race correctly.
Patches 1/3 and 2/3 are required by 3/3.
Patch 3/3 includes some unit test code, which makes the LOC of the backport a bit higher, but it should be OK to keep, since it is just test code.
Note that there still seems to be an unresolved problem in Link [1], but that should be a different issue; the commits being backported have been well tested and fix the corruption issue just fine.
Link: https://lore.kernel.org/linux-mm/A5A976CB-DB57-4513-A700-656580488AB6@flying... [1]
Kairui Song (3):
  mm/filemap: return early if failed to allocate memory for split
  lib/xarray: introduce a new helper xas_get_order
  mm/filemap: optimize filemap folio adding
 include/linux/xarray.h |  6 +++
 lib/test_xarray.c      | 93 ++++++++++++++++++++++++++++++++++++++++++
 lib/xarray.c           | 49 ++++++++++++++--------
 mm/filemap.c           | 50 ++++++++++++++++++-----
 4 files changed, 169 insertions(+), 29 deletions(-)
From: Kairui Song <kasong@tencent.com>
commit de60fd8ddeda2b41fbe11df11733838c5f684616 upstream.
xas_split_alloc could fail with -ENOMEM. In such a case, it should abort early instead of keeping going and failing at the xas_split below.
Link: https://lkml.kernel.org/r/20240416071722.45997-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240415171857.19244-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240415171857.19244-2-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Stable-dep-of: 6758c1128ceb ("mm/filemap: optimize filemap folio adding")
---
 mm/filemap.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 2809b1174f04..f85c13a1b739 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -867,9 +867,12 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
 		void *entry, *old = NULL;
 
-		if (order > folio_order(folio))
+		if (order > folio_order(folio)) {
 			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
 					order, gfp);
+			if (xas_error(&xas))
+				goto error;
+		}
 		xas_lock_irq(&xas);
 		xas_for_each_conflict(&xas, entry) {
 			old = entry;
From: Kairui Song <kasong@tencent.com>
commit a4864671ca0bf51c8e78242951741df52c06766f upstream.
It can be used after xas_load to check the order of loaded entries. Compared to xa_get_order, it saves an XA_STATE and avoids a rewalk.
Add a new test for xas_get_order. To make the test work, we have to export xas_get_order with EXPORT_SYMBOL_GPL.
Also fix a sparse warning by checking the slot value with xa_entry instead of accessing it directly, as suggested by Matthew Wilcox.
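For illustration, a minimal usage sketch of the intended calling convention (not part of the patch; the mapping and index variables are placeholders):

	XA_STATE(xas, &mapping->i_pages, index);
	void *entry;
	int order = 0;

	rcu_read_lock();
	entry = xas_load(&xas);
	if (entry)
		/* Reuse the walk already done by xas_load(); no rewalk. */
		order = xas_get_order(&xas);
	rcu_read_unlock();

This is the same pattern the reworked xa_get_order below uses internally.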
[kasong@tencent.com: simplify comment, sparse warning fix, per Matthew Wilcox]
Link: https://lkml.kernel.org/r/20240416071722.45997-4-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240415171857.19244-4-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Stable-dep-of: 6758c1128ceb ("mm/filemap: optimize filemap folio adding")
---
 include/linux/xarray.h |  6 ++++++
 lib/test_xarray.c      | 34 +++++++++++++++++++++++++++++
 lib/xarray.c           | 49 ++++++++++++++++++++++++++----------------
 3 files changed, 71 insertions(+), 18 deletions(-)
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 44dd6d6e01bc..0e2feb72e9e5 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1530,6 +1530,7 @@ void xas_create_range(struct xa_state *);
 
 #ifdef CONFIG_XARRAY_MULTI
 int xa_get_order(struct xarray *, unsigned long index);
+int xas_get_order(struct xa_state *xas);
 void xas_split(struct xa_state *, void *entry, unsigned int order);
 void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
 #else
@@ -1538,6 +1539,11 @@ static inline int xa_get_order(struct xarray *xa, unsigned long index)
 	return 0;
 }
 
+static inline int xas_get_order(struct xa_state *xas)
+{
+	return 0;
+}
+
 static inline void xas_split(struct xa_state *xas, void *entry,
 		unsigned int order)
 {
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index e77d4856442c..2e229012920b 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -1756,6 +1756,39 @@ static noinline void check_get_order(struct xarray *xa)
 	}
 }
 
+static noinline void check_xas_get_order(struct xarray *xa)
+{
+	XA_STATE(xas, xa, 0);
+
+	unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
+	unsigned int order;
+	unsigned long i, j;
+
+	for (order = 0; order < max_order; order++) {
+		for (i = 0; i < 10; i++) {
+			xas_set_order(&xas, i << order, order);
+			do {
+				xas_lock(&xas);
+				xas_store(&xas, xa_mk_value(i));
+				xas_unlock(&xas);
+			} while (xas_nomem(&xas, GFP_KERNEL));
+
+			for (j = i << order; j < (i + 1) << order; j++) {
+				xas_set_order(&xas, j, 0);
+				rcu_read_lock();
+				xas_load(&xas);
+				XA_BUG_ON(xa, xas_get_order(&xas) != order);
+				rcu_read_unlock();
+			}
+
+			xas_lock(&xas);
+			xas_set_order(&xas, i << order, order);
+			xas_store(&xas, NULL);
+			xas_unlock(&xas);
+		}
+	}
+}
+
 static noinline void check_destroy(struct xarray *xa)
 {
 	unsigned long index;
@@ -1805,6 +1838,7 @@ static int xarray_checks(void)
 	check_reserve(&xa0);
 	check_multi_store(&array);
 	check_get_order(&array);
+	check_xas_get_order(&array);
 	check_xa_alloc();
 	check_find(&array);
 	check_find_entry(&array);
diff --git a/lib/xarray.c b/lib/xarray.c
index e9bd29826e8b..341878f98c5b 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1752,39 +1752,52 @@ void *xa_store_range(struct xarray *xa, unsigned long first,
 EXPORT_SYMBOL(xa_store_range);
 
 /**
- * xa_get_order() - Get the order of an entry.
- * @xa: XArray.
- * @index: Index of the entry.
+ * xas_get_order() - Get the order of an entry.
+ * @xas: XArray operation state.
+ *
+ * Called after xas_load, the xas should not be in an error state.
  *
  * Return: A number between 0 and 63 indicating the order of the entry.
  */
-int xa_get_order(struct xarray *xa, unsigned long index)
+int xas_get_order(struct xa_state *xas)
 {
-	XA_STATE(xas, xa, index);
-	void *entry;
 	int order = 0;
 
-	rcu_read_lock();
-	entry = xas_load(&xas);
-
-	if (!entry)
-		goto unlock;
-
-	if (!xas.xa_node)
-		goto unlock;
+	if (!xas->xa_node)
+		return 0;
 
 	for (;;) {
-		unsigned int slot = xas.xa_offset + (1 << order);
+		unsigned int slot = xas->xa_offset + (1 << order);
 
 		if (slot >= XA_CHUNK_SIZE)
 			break;
-		if (!xa_is_sibling(xas.xa_node->slots[slot]))
+		if (!xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot)))
 			break;
 		order++;
 	}
 
-	order += xas.xa_node->shift;
-unlock:
+	order += xas->xa_node->shift;
+	return order;
+}
+EXPORT_SYMBOL_GPL(xas_get_order);
+
+/**
+ * xa_get_order() - Get the order of an entry.
+ * @xa: XArray.
+ * @index: Index of the entry.
+ *
+ * Return: A number between 0 and 63 indicating the order of the entry.
+ */
+int xa_get_order(struct xarray *xa, unsigned long index)
+{
+	XA_STATE(xas, xa, index);
+	int order = 0;
+	void *entry;
+
+	rcu_read_lock();
+	entry = xas_load(&xas);
+	if (entry)
+		order = xas_get_order(&xas);
 	rcu_read_unlock();
 
 	return order;
From: Kairui Song <kasong@tencent.com>
commit 6758c1128ceb45d1a35298912b974eb4895b7dd9 upstream.
Instead of doing multiple tree walks, do one optimistic range check with the lock held, and exit if raced with another insertion. If a shadow entry exists, check it with the new xas_get_order helper before releasing the lock, to avoid a redundant tree walk just to get its order.
Drop the lock and do the allocation only if a split is needed.
In the best case, it only needs to walk the tree once. If it needs to allocate and split, three walks are issued (one for the initial ranged conflict check and order retrieval, one for the re-check after allocation, and one for the insert after the split).
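In other words, the insertion path now behaves roughly like the sketch below. This is a simplified illustration only: scan_conflicts_and_get_order and split_needed are hypothetical stand-ins for logic that is open-coded in the real patch, and error handling is omitted; see the diff below for the actual code.

	for (;;) {
		xas_lock_irq(&xas);
		/* Walk 1: conflict check, picking up the shadow order in the same pass */
		order = scan_conflicts_and_get_order(&xas);	/* xas_for_each_conflict() + xas_get_order() */
		if (split_needed(order) && !alloced_order) {
			xas_unlock_irq(&xas);
			xas_split_alloc(&xas, old, order, gfp);	/* allocate without the lock held */
			alloced_order = order;
			xas_reset(&xas);
			continue;	/* Walk 2: re-check, the entry may have changed */
		}
		if (split_needed(order)) {
			xas_split(&xas, old, order);
			xas_reset(&xas);	/* Walk 3: the store below re-walks for the insert */
		}
		xas_store(&xas, folio);
		xas_unlock_irq(&xas);
		break;
	}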
Testing with 4K pages, in an 8G cgroup, with 16G brd as block device:
echo 3 > /proc/sys/vm/drop_caches
fio -name=cached --numjobs=16 --filename=/mnt/test.img \
  --buffered=1 --ioengine=mmap --rw=randread --time_based \
  --ramp_time=30s --runtime=5m --group_reporting
Before:
bw (  MiB/s): min= 1027, max= 3520, per=100.00%, avg=2445.02, stdev=18.90, samples=8691
iops        : min=263001, max=901288, avg=625924.36, stdev=4837.28, samples=8691
After (+7.3%):
bw (  MiB/s): min=  493, max= 3947, per=100.00%, avg=2625.56, stdev=25.74, samples=8651
iops        : min=126454, max=1010681, avg=672142.61, stdev=6590.48, samples=8651
Test result with THP (do a THP randread first, then switch to 4K pages, in the hope that it issues a lot of splitting):
echo 3 > /proc/sys/vm/drop_caches
fio -name=cached --numjobs=16 --filename=/mnt/test.img \
  --buffered=1 --ioengine=mmap -thp=1 --readonly \
  --rw=randread --time_based --ramp_time=30s --runtime=10m \
  --group_reporting

fio -name=cached --numjobs=16 --filename=/mnt/test.img \
  --buffered=1 --ioengine=mmap \
  --rw=randread --time_based --runtime=5s --group_reporting
Before:
bw (  KiB/s): min= 4141, max=14202, per=100.00%, avg=7935.51, stdev=96.85, samples=18976
iops        : min= 1029, max= 3548, avg=1979.52, stdev=24.23, samples=18976
READ: bw=4545B/s (4545B/s), 4545B/s-4545B/s (4545B/s-4545B/s), io=64.0KiB (65.5kB), run=14419-14419msec
After (+12.5%):
bw (  KiB/s): min= 4611, max=15370, per=100.00%, avg=8928.74, stdev=105.17, samples=19146
iops        : min= 1151, max= 3842, avg=2231.27, stdev=26.29, samples=19146
READ: bw=4635B/s (4635B/s), 4635B/s-4635B/s (4635B/s-4635B/s), io=64.0KiB (65.5kB), run=14137-14137msec
The performance is better for both 4K (+7.3%) and THP (+12.5%) cached read.
Link: https://lkml.kernel.org/r/20240415171857.19244-5-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Closes: https://lore.kernel.org/linux-mm/A5A976CB-DB57-4513-A700-656580488AB6@flying...
[ kasong@tencent.com: minor adjustment of variable declarations ]
---
 lib/test_xarray.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/filemap.c      | 53 +++++++++++++++++++++++++++++++-----------------
 2 files changed, 98 insertions(+), 14 deletions(-)
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 2e229012920b..542926da61a3 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -1789,6 +1789,64 @@ static noinline void check_xas_get_order(struct xarray *xa)
 	}
 }
 
+static noinline void check_xas_conflict_get_order(struct xarray *xa)
+{
+	XA_STATE(xas, xa, 0);
+
+	void *entry;
+	int only_once;
+	unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
+	unsigned int order;
+	unsigned long i, j, k;
+
+	for (order = 0; order < max_order; order++) {
+		for (i = 0; i < 10; i++) {
+			xas_set_order(&xas, i << order, order);
+			do {
+				xas_lock(&xas);
+				xas_store(&xas, xa_mk_value(i));
+				xas_unlock(&xas);
+			} while (xas_nomem(&xas, GFP_KERNEL));
+
+			/*
+			 * Ensure xas_get_order works with xas_for_each_conflict.
+			 */
+			j = i << order;
+			for (k = 0; k < order; k++) {
+				only_once = 0;
+				xas_set_order(&xas, j + (1 << k), k);
+				xas_lock(&xas);
+				xas_for_each_conflict(&xas, entry) {
+					XA_BUG_ON(xa, entry != xa_mk_value(i));
+					XA_BUG_ON(xa, xas_get_order(&xas) != order);
+					only_once++;
+				}
+				XA_BUG_ON(xa, only_once != 1);
+				xas_unlock(&xas);
+			}
+
+			if (order < max_order - 1) {
+				only_once = 0;
+				xas_set_order(&xas, (i & ~1UL) << order, order + 1);
+				xas_lock(&xas);
+				xas_for_each_conflict(&xas, entry) {
+					XA_BUG_ON(xa, entry != xa_mk_value(i));
+					XA_BUG_ON(xa, xas_get_order(&xas) != order);
+					only_once++;
+				}
+				XA_BUG_ON(xa, only_once != 1);
+				xas_unlock(&xas);
+			}
+
+			xas_set_order(&xas, i << order, order);
+			xas_lock(&xas);
+			xas_store(&xas, NULL);
+			xas_unlock(&xas);
+		}
+	}
+}
+
+
 static noinline void check_destroy(struct xarray *xa)
 {
 	unsigned long index;
@@ -1839,6 +1897,7 @@ static int xarray_checks(void)
 	check_multi_store(&array);
 	check_get_order(&array);
 	check_xas_get_order(&array);
+	check_xas_conflict_get_order(&array);
 	check_xa_alloc();
 	check_find(&array);
 	check_find_entry(&array);
diff --git a/mm/filemap.c b/mm/filemap.c
index f85c13a1b739..d3b925232a59 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -841,6 +841,8 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 {
 	XA_STATE(xas, &mapping->i_pages, index);
 	int huge = folio_test_hugetlb(folio);
+	void *alloced_shadow = NULL;
+	int alloced_order = 0;
 	bool charged = false;
 	long nr = 1;
@@ -863,16 +865,10 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 	folio->mapping = mapping;
 	folio->index = xas.xa_index;
 
-	do {
-		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+	for (;;) {
+		int order = -1, split_order = 0;
 		void *entry, *old = NULL;
 
-		if (order > folio_order(folio)) {
-			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
-					order, gfp);
-			if (xas_error(&xas))
-				goto error;
-		}
 		xas_lock_irq(&xas);
 		xas_for_each_conflict(&xas, entry) {
 			old = entry;
@@ -880,19 +876,33 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 				xas_set_err(&xas, -EEXIST);
 				goto unlock;
 			}
+			/*
+			 * If a larger entry exists,
+			 * it will be the first and only entry iterated.
+			 */
+			if (order == -1)
+				order = xas_get_order(&xas);
+		}
+
+		/* entry may have changed before we re-acquire the lock */
+		if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
+			xas_destroy(&xas);
+			alloced_order = 0;
 		}
 
 		if (old) {
-			if (shadowp)
-				*shadowp = old;
-			/* entry may have been split before we acquired lock */
-			order = xa_get_order(xas.xa, xas.xa_index);
-			if (order > folio_order(folio)) {
+			if (order > 0 && order > folio_order(folio)) {
 				/* How to handle large swap entries? */
 				BUG_ON(shmem_mapping(mapping));
+				if (!alloced_order) {
+					split_order = order;
+					goto unlock;
+				}
 				xas_split(&xas, old, order);
 				xas_reset(&xas);
 			}
+			if (shadowp)
+				*shadowp = old;
 		}
 
 		xas_store(&xas, folio);
@@ -908,9 +918,24 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 			__lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
 		}
 
+unlock:
 		xas_unlock_irq(&xas);
-	} while (xas_nomem(&xas, gfp));
+
+		/* split needed, alloc here and retry. */
+		if (split_order) {
+			xas_split_alloc(&xas, old, split_order, gfp);
+			if (xas_error(&xas))
+				goto error;
+			alloced_shadow = old;
+			alloced_order = split_order;
+			xas_reset(&xas);
+			continue;
+		}
+
+		if (!xas_nomem(&xas, gfp))
+			break;
+	}
 
 	if (xas_error(&xas))
 		goto error;
On Wed, Oct 02, 2024 at 05:06:22AM +0800, Kairui Song wrote:
All now queued up, thanks.
greg k-h