Instead of computing the number of descriptor blocks a transaction can
have each time we need it (currently only when starting each
transaction, but this will become more frequent later), precompute the
number once during journal initialization, together with the maximum
transaction size. We redo the precomputation whenever the journal
feature set is updated, similarly to the computation of
journal->j_revoke_records_per_block.
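As a rough worked example of the precomputed values (all numbers are
assumed for illustration and not taken from this patch: 4KB journal
blocks, checksum v3 enabled so journal_tag_bytes() is 16, and a
32768-block journal with no fast commit area):
  tag_space      = 4096 - 12 (journal_header_t) - 16 (UUID) - 4 (block tail)
                 = 4064
  tags_per_block = (4064 - 16 bytes of slack) / 16 = 253
  max_txn_bufs   = (32768 - 0) / 4 = 8192
  overhead       = 1 (commit block) + DIV_ROUND_UP(8192, 253) = 1 + 33 = 34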
CC: stable(a)vger.kernel.org
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/jbd2/journal.c | 61 ++++++++++++++++++++++++++++++++-----------
fs/jbd2/transaction.c | 24 +----------------
include/linux/jbd2.h | 7 +++++
3 files changed, 54 insertions(+), 38 deletions(-)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1bb73750d307..ae5b544ed0cc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1451,6 +1451,48 @@ static int journal_revoke_records_per_block(journal_t *journal)
return space / record_size;
}
+static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
+{
+ return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
+}
+
+/*
+ * Base amount of descriptor blocks we reserve for each transaction.
+ */
+static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+{
+ int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+ int tags_per_block;
+
+ /* Subtract UUID */
+ tag_space -= 16;
+ if (jbd2_journal_has_csum_v2or3(journal))
+ tag_space -= sizeof(struct jbd2_journal_block_tail);
+ /* Commit code leaves a slack space of 16 bytes at the end of block */
+ tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+ /*
+ * Revoke descriptors are accounted separately so we need to reserve
+ * space for commit block and normal transaction descriptor blocks.
+ */
+ return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal),
+ tags_per_block);
+}
+
+/*
+ * Initialize number of blocks each transaction reserves for its bookkeeping
+ * and maximum number of blocks a transaction can use. This needs to be called
+ * after the journal size and the fastcommit area size are initialized.
+ */
+static void jbd2_journal_init_transaction_limits(journal_t *journal)
+{
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
+ journal->j_transaction_overhead_buffers =
+ jbd2_descriptor_blocks_per_trans(journal);
+ journal->j_max_transaction_buffers =
+ jbd2_journal_get_max_txn_bufs(journal);
+}
+
/*
* Load the on-disk journal superblock and read the key fields into the
* journal_t.
@@ -1492,8 +1534,8 @@ static int journal_load_superblock(journal_t *journal)
if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
+ /* After journal features are set, we can compute transaction limits */
+ jbd2_journal_init_transaction_limits(journal);
if (jbd2_has_feature_fast_commit(journal)) {
journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
@@ -1698,11 +1740,6 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
return journal;
}
-static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
-{
- return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
-}
-
/*
* Given a journal_t structure, initialise the various fields for
* startup of a new journaling session. We use this both when creating
@@ -1748,8 +1785,6 @@ static int journal_reset(journal_t *journal)
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;
- journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
-
/*
* Now that journal recovery is done, turn fast commits off here. This
* way, if fast commit was enabled before the crash but if now FS has
@@ -2290,8 +2325,6 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
journal->j_fc_first = journal->j_last + 1;
journal->j_fc_off = 0;
journal->j_free = journal->j_last - journal->j_first;
- journal->j_max_transaction_buffers =
- jbd2_journal_get_max_txn_bufs(journal);
return 0;
}
@@ -2379,8 +2412,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
unlock_buffer(journal->j_sb_buffer);
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
+ jbd2_journal_init_transaction_limits(journal);
return 1;
#undef COMPAT_FEATURE_ON
@@ -2411,8 +2443,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
sb->s_feature_compat &= ~cpu_to_be32(compat);
sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
sb->s_feature_incompat &= ~cpu_to_be32(incompat);
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
+ jbd2_journal_init_transaction_limits(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cb0b8d6fc0c6..a095f1a3114b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -62,28 +62,6 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
kmem_cache_free(transaction_cache, transaction);
}
-/*
- * Base amount of descriptor blocks we reserve for each transaction.
- */
-static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
-{
- int tag_space = journal->j_blocksize - sizeof(journal_header_t);
- int tags_per_block;
-
- /* Subtract UUID */
- tag_space -= 16;
- if (jbd2_journal_has_csum_v2or3(journal))
- tag_space -= sizeof(struct jbd2_journal_block_tail);
- /* Commit code leaves a slack space of 16 bytes at the end of block */
- tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
- /*
- * Revoke descriptors are accounted separately so we need to reserve
- * space for commit block and normal transaction descriptor blocks.
- */
- return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
- tags_per_block);
-}
-
/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
@@ -109,7 +87,7 @@ static void jbd2_get_transaction(journal_t *journal,
transaction->t_expires = jiffies + journal->j_commit_interval;
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
- jbd2_descriptor_blocks_per_trans(journal) +
+ journal->j_transaction_overhead_buffers +
atomic_read(&journal->j_reserved_credits));
atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f91b930abe20..b900c642210c 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1085,6 +1085,13 @@ struct journal_s
*/
int j_revoke_records_per_block;
+ /**
+ * @j_transaction_overhead_buffers:
+ *
+ * Number of blocks each transaction needs for its own bookkeeping
+ */
+ int j_transaction_overhead_buffers;
+
/**
* @j_commit_interval:
*
--
2.35.3
Commit 9f356e5a4f12 ("jbd2: Account descriptor blocks into
t_outstanding_credits") started to account descriptor blocks into the
transaction's outstanding credits. However, it didn't appropriately
decrease the maximum amount of credits available to userspace. Thus if
the filesystem requests a transaction smaller than
j_max_transaction_buffers, but large enough that once descriptor blocks
are added the size exceeds j_max_transaction_buffers, we confuse
add_transaction_credits() into thinking previous handles have grown the
transaction too much, and we enter an infinite journal commit loop in
start_this_handle() -> add_transaction_credits() trying to create a
transaction with enough credits available.
Fix the problem by properly accounting for transaction space reserved
for descriptor blocks when verifying requested transaction handle size.
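To illustrate with the assumed numbers from the previous patch (8192
for j_max_transaction_buffers, 34 overhead blocks): a handle requesting
8170 credits used to pass the size check (8170 <= 8192), yet even a
fresh transaction already carries the 34 descriptor credits, so
34 + 8170 = 8204 > 8192 holds no matter how many commits we trigger.
With this fix the request is checked against 8192 - 34 = 8158 instead
and is refused up front rather than looping forever.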
CC: stable(a)vger.kernel.org
Fixes: 9f356e5a4f12 ("jbd2: Account descriptor blocks into t_outstanding_credits")
Reported-by: Alexander Coffin <alex.coffin(a)maticrobots.com>
Link: https://lore.kernel.org/all/CA+hUFcuGs04JHZ_WzA1zGN57+ehL2qmHOt5a7RMpo+rv6V…
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/jbd2/transaction.c | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a095f1a3114b..66513c18ca29 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -191,6 +191,13 @@ static void sub_reserved_credits(journal_t *journal, int blocks)
wake_up(&journal->j_wait_reserved);
}
+/* Maximum number of blocks for user transaction payload */
+static int jbd2_max_user_trans_buffers(journal_t *journal)
+{
+ return journal->j_max_transaction_buffers -
+ journal->j_transaction_overhead_buffers;
+}
+
/*
* Wait until we can add credits for handle to the running transaction. Called
* with j_state_lock held for reading. Returns 0 if handle joined the running
@@ -240,12 +247,12 @@ __must_hold(&journal->j_state_lock)
* big to fit this handle? Wait until reserved credits are freed.
*/
if (atomic_read(&journal->j_reserved_credits) + total >
- journal->j_max_transaction_buffers) {
+ jbd2_max_user_trans_buffers(journal)) {
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + total <=
- journal->j_max_transaction_buffers);
+ jbd2_max_user_trans_buffers(journal));
__acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -285,14 +292,14 @@ __must_hold(&journal->j_state_lock)
needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
/* We allow at most half of a transaction to be reserved */
- if (needed > journal->j_max_transaction_buffers / 2) {
+ if (needed > jbd2_max_user_trans_buffers(journal) / 2) {
sub_reserved_credits(journal, rsv_blocks);
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + rsv_blocks
- <= journal->j_max_transaction_buffers / 2);
+ <= jbd2_max_user_trans_buffers(journal) / 2);
__acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -322,12 +329,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
* size and limit the number of total credits to not exceed maximum
* transaction size per operation.
*/
- if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
- (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
+ if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 ||
+ rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
printk(KERN_ERR "JBD2: %s wants too many credits "
"credits:%d rsv_credits:%d max:%d\n",
current->comm, blocks, rsv_blocks,
- journal->j_max_transaction_buffers);
+ jbd2_max_user_trans_buffers(journal));
WARN_ON(1);
return -ENOSPC;
}
--
2.35.3
On some PinePhones the video output sometimes freezes (flips between two
frames) [1]. The reason for this behaviour appears to be that PLL-MIPI
is operated outside its limits, and that the GPU is not running at a
fixed rate.
In this patch series I propose the following changes:
1. sunxi-ng: Adhere to the following constraints given in the
   Allwinner A64 Manual regarding PLL-MIPI (see the sketch after this
   list):
   * M/N <= 3
   * (PLL_VIDEO0)/M >= 24MHz
   * 500MHz <= clockrate <= 1400MHz
2. Remove two operating points from the A64 DTS OPPs, so that the GPU
   runs at a fixed rate of 432 MHz.
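For illustration, a minimal standalone sketch of the first set of
constraints (a hypothetical helper, not code from this series; the
series itself implements the limits via minimum/maximum rate support in
ccu_common and ratio/parent-rate constraints in ccu_nkm):

  /* Hypothetical check mirroring the A64 manual's PLL-MIPI limits. */
  static bool pll_mipi_params_valid(unsigned long rate,
                                    unsigned long parent_rate,
                                    unsigned long n, unsigned long m)
  {
          if (rate < 500000000UL || rate > 1400000000UL)
                  return false;   /* 500MHz <= clockrate <= 1400MHz */
          if (m > 3 * n)
                  return false;   /* M/N <= 3 */
          if (parent_rate / m < 24000000UL)
                  return false;   /* (PLL_VIDEO0)/M >= 24MHz */
          return true;
  }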
Note that when pinning the GPU to 432 MHz the issue [1] completely
disappears for me. I've searched the BSP and could not find any
indication that supports the idea of having the three OPPs. The only
frequency I found in the BSPs for A64 is 432 MHz, which has also proven
stable for me.
I very much appreciate your feedback!
[1] https://gitlab.com/postmarketOS/pmaports/-/issues/805
Signed-off-by: Frank Oltmanns <frank(a)oltmanns.dev>
---
Changes in v4:
- sunxi-ng: common: Address review comments.
- Link to v3: https://lore.kernel.org/r/20240304-pinephone-pll-fixes-v3-0-94ab828f269a@ol…
Changes in v3:
- dts: Pin GPU to 432 MHz.
- nkm and a64: Move minimum and maximum rate handling to the common part
of the sunxi-ng driver.
- Removed st7703 patch from series.
- Link to v2: https://lore.kernel.org/r/20240205-pinephone-pll-fixes-v2-0-96a46a2d8c9b@ol…
Changes in v2:
- dts: Increase minimum GPU frequency to 192 MHz.
- nkm and a64: Add minimum and maximum rate for PLL-MIPI.
- nkm: Use the same approach for skipping invalid rates in
ccu_nkm_find_best() as in ccu_nkm_find_best_with_parent_adj().
- nkm: Improve names for ratio struct members and hence get rid of
describing comments.
- nkm and a64: Correct description in the commit messages: M/N <= 3
- Remove patches for nm as they were not needed.
- st7703: Rework the commit message to cover more background for the
change.
- Link to v1: https://lore.kernel.org/r/20231218-pinephone-pll-fixes-v1-0-e238b6ed6dc1@ol…
---
Frank Oltmanns (5):
clk: sunxi-ng: common: Support minimum and maximum rate
clk: sunxi-ng: a64: Set minimum and maximum rate for PLL-MIPI
clk: sunxi-ng: nkm: Support constraints on m/n ratio and parent rate
clk: sunxi-ng: a64: Add constraints on PLL-MIPI's n/m ratio and parent rate
arm64: dts: allwinner: a64: Run GPU at 432 MHz
arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi | 8 --------
drivers/clk/sunxi-ng/ccu-sun50i-a64.c | 14 +++++++++-----
drivers/clk/sunxi-ng/ccu_common.c | 19 +++++++++++++++++++
drivers/clk/sunxi-ng/ccu_common.h | 3 +++
drivers/clk/sunxi-ng/ccu_nkm.c | 21 +++++++++++++++++++++
drivers/clk/sunxi-ng/ccu_nkm.h | 2 ++
6 files changed, 54 insertions(+), 13 deletions(-)
---
base-commit: dcb6c8ee6acc6c347caec1e73fb900c0f4ff9806
change-id: 20231218-pinephone-pll-fixes-0ccdfde273e4
Best regards,
--
Frank Oltmanns <frank(a)oltmanns.dev>
The patch titled
Subject: mm/shmem: disable PMD-sized page cache if needed
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-shmem-disable-pmd-sized-page-cache-if-needed.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Gavin Shan <gshan(a)redhat.com>
Subject: mm/shmem: disable PMD-sized page cache if needed
Date: Thu, 27 Jun 2024 10:39:52 +1000
For shmem files, it's possible that a PMD-sized page cache can't be
supported by the xarray. For example, a 512MB page cache on ARM64 with a
64KB base page size can't be supported by the xarray. This leads to
errors such as the following when this sort of xarray entry is split.
WARNING: CPU: 34 PID: 7578 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 \
nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject \
nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \
ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse xfs \
libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 sha1_ce virtio_net \
net_failover virtio_console virtio_blk failover dimlib virtio_mmio
CPU: 34 PID: 7578 Comm: test Kdump: loaded Tainted: G W 6.10.0-rc5-gavin+ #9
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x720
sp : ffff8000882af5f0
x29: ffff8000882af5f0 x28: ffff8000882af650 x27: ffff8000882af768
x26: 0000000000000cc0 x25: 000000000000000d x24: ffff00010625b858
x23: ffff8000882af650 x22: ffffffdfc0900000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0900000 x18: 0000000000000000
x17: 0000000000000000 x16: 0000018000000000 x15: 52f8004000000000
x14: 0000e00000000000 x13: 0000000000002000 x12: 0000000000000020
x11: 52f8000000000000 x10: 52f8e1c0ffff6000 x9 : ffffbeb9619a681c
x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff00010b02ddb0
x5 : ffffbeb96395e378 x4 : 0000000000000000 x3 : 0000000000000cc0
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
xas_split_alloc+0xf8/0x128
split_huge_page_to_list_to_order+0x1c4/0x720
truncate_inode_partial_folio+0xdc/0x160
shmem_undo_range+0x2bc/0x6a8
shmem_fallocate+0x134/0x430
vfs_fallocate+0x124/0x2e8
ksys_fallocate+0x4c/0xa0
__arm64_sys_fallocate+0x24/0x38
invoke_syscall.constprop.0+0x7c/0xd8
do_el0_svc+0xb4/0xd0
el0_svc+0x44/0x1d8
el0t_64_sync_handler+0x134/0x150
el0t_64_sync+0x17c/0x180
Fix it by disabling the PMD-sized page cache when HPAGE_PMD_ORDER is
larger than MAX_PAGECACHE_ORDER. As Matthew Wilcox pointed out, the page
cache in a shmem file wasn't represented by multi-index entries, and so
didn't have this limitation when an xarray entry was split, until commit
6b24ca4a1a8d ("mm: Use multi-index entries in the page cache").
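For context, the arithmetic behind the failure (with the default
XA_CHUNK_SHIFT of 6): a 64KB base page gives
HPAGE_PMD_ORDER = ilog2(512MB / 64KB) = 13, while xas_split_alloc() can
only split entries up to order 2 * XA_CHUNK_SHIFT = 12, which is what
trips the WARN_ON above. The earlier patch in this series
(mm-filemap-make-max_pagecache_order-acceptable-to-xarray.patch) caps
MAX_PAGECACHE_ORDER at an order the xarray can split, so on such
configurations HPAGE_PMD_ORDER (13) exceeds MAX_PAGECACHE_ORDER and
shmem_is_huge() now returns false.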
Link: https://lkml.kernel.org/r/20240627003953.1262512-5-gshan@redhat.com
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Darrick J. Wong <djwong(a)kernel.org>
Cc: Don Dutile <ddutile(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: William Kucharski <william.kucharski(a)oracle.com>
Cc: Zhenyu Zhang <zhenyzha(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [5.17+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
--- a/mm/shmem.c~mm-shmem-disable-pmd-sized-page-cache-if-needed
+++ a/mm/shmem.c
@@ -541,8 +541,9 @@ static bool shmem_confirm_swap(struct ad
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
-bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
- struct mm_struct *mm, unsigned long vm_flags)
+static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
+ bool shmem_huge_force, struct mm_struct *mm,
+ unsigned long vm_flags)
{
loff_t i_size;
@@ -573,6 +574,16 @@ bool shmem_is_huge(struct inode *inode,
}
}
+bool shmem_is_huge(struct inode *inode, pgoff_t index,
+ bool shmem_huge_force, struct mm_struct *mm,
+ unsigned long vm_flags)
+{
+ if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
+ return false;
+
+ return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags);
+}
+
#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
_
Patches currently in -mm which might be from gshan(a)redhat.com are
mm-filemap-make-max_pagecache_order-acceptable-to-xarray.patch
mm-readahead-limit-page-cache-size-in-page_cache_ra_order.patch
mm-filemap-skip-to-create-pmd-sized-page-cache-if-needed.patch
mm-shmem-disable-pmd-sized-page-cache-if-needed.patch
The patch titled
Subject: mm/filemap: skip to create PMD-sized page cache if needed
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-filemap-skip-to-create-pmd-sized-page-cache-if-needed.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Gavin Shan <gshan(a)redhat.com>
Subject: mm/filemap: skip to create PMD-sized page cache if needed
Date: Thu, 27 Jun 2024 10:39:51 +1000
On ARM64, HPAGE_PMD_ORDER is 13 when the base page size is 64KB. Such a
PMD-sized page cache can't be supported by the xarray, as the following
error messages indicate.
------------[ cut here ]------------
WARNING: CPU: 35 PID: 7484 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \
nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \
ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm \
fuse xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 \
sha1_ce virtio_net net_failover virtio_console virtio_blk failover \
dimlib virtio_mmio
CPU: 35 PID: 7484 Comm: test Kdump: loaded Tainted: G W 6.10.0-rc5-gavin+ #9
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x720
sp : ffff800087a4f6c0
x29: ffff800087a4f6c0 x28: ffff800087a4f720 x27: 000000001fffffff
x26: 0000000000000c40 x25: 000000000000000d x24: ffff00010625b858
x23: ffff800087a4f720 x22: ffffffdfc0780000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0780000 x18: 000000001ff40000
x17: 00000000ffffffff x16: 0000018000000000 x15: 51ec004000000000
x14: 0000e00000000000 x13: 0000000000002000 x12: 0000000000000020
x11: 51ec000000000000 x10: 51ece1c0ffff8000 x9 : ffffbeb961a44d28
x8 : 0000000000000003 x7 : ffffffdfc0456420 x6 : ffff0000e1aa6eb8
x5 : 20bf08b4fe778fca x4 : ffffffdfc0456420 x3 : 0000000000000c40
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
xas_split_alloc+0xf8/0x128
split_huge_page_to_list_to_order+0x1c4/0x720
truncate_inode_partial_folio+0xdc/0x160
truncate_inode_pages_range+0x1b4/0x4a8
truncate_pagecache_range+0x84/0xa0
xfs_flush_unmap_range+0x70/0x90 [xfs]
xfs_file_fallocate+0xfc/0x4d8 [xfs]
vfs_fallocate+0x124/0x2e8
ksys_fallocate+0x4c/0xa0
__arm64_sys_fallocate+0x24/0x38
invoke_syscall.constprop.0+0x7c/0xd8
do_el0_svc+0xb4/0xd0
el0_svc+0x44/0x1d8
el0t_64_sync_handler+0x134/0x150
el0t_64_sync+0x17c/0x180
Fix it by skipping the allocation of PMD-sized page cache when its order
is larger than MAX_PAGECACHE_ORDER. In this specific case, we fall back
to the regular path, where the readahead window is determined by the
BDI's sysfs file (read_ahead_kb).
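For scale (assuming the default read_ahead_kb of 128 and the 64KB base
page size above): the fallback path reads ahead at most 128KB, i.e. 2
base pages per batch, instead of the single 512MB
(HPAGE_PMD_NR = 8192 pages) PMD-sized readahead the VM_HUGEPAGE path
would have attempted.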
Link: https://lkml.kernel.org/r/20240627003953.1262512-4-gshan@redhat.com
Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings")
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Suggested-by: David Hildenbrand <david(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Darrick J. Wong <djwong(a)kernel.org>
Cc: Don Dutile <ddutile(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: William Kucharski <william.kucharski(a)oracle.com>
Cc: Zhenyu Zhang <zhenyzha(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [5.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/filemap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/filemap.c~mm-filemap-skip-to-create-pmd-sized-page-cache-if-needed
+++ a/mm/filemap.c
@@ -3124,7 +3124,7 @@ static struct file *do_sync_mmap_readahe
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
- if (vm_flags & VM_HUGEPAGE) {
+ if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
ra->size = HPAGE_PMD_NR;
_