The quilt patch titled
Subject: mm: optimize the redundant loop of mm_update_owner_next()
has been removed from the -mm tree. Its filename was
mm-optimize-the-redundant-loop-of-mm_update_owner_next.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Jinliang Zheng <alexjlzheng(a)tencent.com>
Subject: mm: optimize the redundant loop of mm_update_owner_next()
Date: Thu, 20 Jun 2024 20:21:24 +0800
When mm_update_owner_next() races with swapoff (try_to_unuse()) or with
/proc, ptrace or page migration (get_task_mm()), it is impossible for the
loop to find an appropriate task_struct whose mm_struct matches the target
mm_struct.
If this race is combined with the stress-ng-zombie and stress-ng-dup
tests, such a long loop can easily cause a hard lockup in write_lock_irq()
on tasklist_lock.
Recognize this situation in advance and exit early.
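For orientation, a condensed sketch of the final "search everything" loop
in kernel/exit.c (not the exact upstream code; the earlier child/sibling
searches, the retry logic and the new-owner assignment are elided) shows
where the early exit sits:
	struct task_struct *g, *c;

	read_lock(&tasklist_lock);
	for_each_process(g) {
		/*
		 * Early exit: if nobody else holds an mm_users reference,
		 * no other task can be using this mm, so scanning the rest
		 * of the task list under tasklist_lock is pointless.
		 */
		if (atomic_read(&mm->mm_users) <= 1)
			break;
		if (g->flags & PF_KTHREAD)
			continue;
		for_each_thread(g, c) {
			if (c->mm == mm)
				goto assign_new_owner;
			if (c->mm)
				break;
		}
	}
	read_unlock(&tasklist_lock);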
Link: https://lkml.kernel.org/r/20240620122123.3877432-1-alexjlzheng@tencent.com
Signed-off-by: Jinliang Zheng <alexjlzheng(a)tencent.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Mateusz Guzik <mjguzik(a)gmail.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Oleg Nesterov <oleg(a)redhat.com>
Cc: Tycho Andersen <tandersen(a)netflix.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/exit.c | 2 ++
1 file changed, 2 insertions(+)
--- a/kernel/exit.c~mm-optimize-the-redundant-loop-of-mm_update_owner_next
+++ a/kernel/exit.c
@@ -484,6 +484,8 @@ retry:
* Search through everything else, we should not get here often.
*/
for_each_process(g) {
+ if (atomic_read(&mm->mm_users) <= 1)
+ break;
if (g->flags & PF_KTHREAD)
continue;
for_each_thread(g, c) {
_
Patches currently in -mm which might be from alexjlzheng(a)tencent.com are
The quilt patch titled
Subject: mm/gup: clear the LRU flag of a page before adding to LRU batch
has been removed from the -mm tree. Its filename was
mm-gup-clear-the-lru-flag-of-a-page-before-adding-to-lru-batch.patch
This patch was dropped because an alternative patch was or shall be merged
------------------------------------------------------
From: yangge <yangge1116(a)126.com>
Subject: mm/gup: clear the LRU flag of a page before adding to LRU batch
Date: Sat, 22 Jun 2024 14:48:04 +0800
If a large amount of CMA memory is configured in the system (for example,
CMA memory accounts for 50% of system memory), starting a virtual machine
calls pin_user_pages_remote(..., FOLL_LONGTERM, ...) to pin memory.
Normally, if a page is present and lies in the CMA area,
pin_user_pages_remote() will migrate it from the CMA area to a non-CMA
area because of the FOLL_LONGTERM flag. But the current code causes the
migration to fail due to unexpected page refcounts, and eventually the
virtual machine fails to start.
Adding a page to an LRU batch increases its refcount by one; removing it
from the batch decreases it by one. Page migration requires that the page
not be referenced by anything other than its page mapping. Before
migrating a page we should therefore try to drain it from the LRU batch in
case it sits there, but folio_test_lru() is not sufficient to tell whether
the page is in an LRU batch or not; if it is, the migration will fail.
To solve this, we modify the logic for adding a page to an LRU batch:
before adding the page, clear its LRU flag, so that folio_test_lru() can
be used to check whether the page is in an LRU batch. Keeping the LRU flag
of a page invisible for a long time appears to be fine: when a new page is
allocated from the buddy allocator and added to an LRU batch, its LRU flag
is likewise not visible for a long time.
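The same guard is applied at every batching entry point in mm/swap.c;
condensed, the pattern looks like this (names as in mm/swap.c; the
per-call-site precondition checks are elided):
	folio_get(folio);
	/*
	 * Clear the LRU flag before queueing the folio on a per-CPU batch.
	 * From now on, !folio_test_lru() reliably covers both "isolated"
	 * and "sitting in an LRU batch".
	 */
	if (!folio_test_clear_lru(folio)) {
		/* Already isolated or already in someone else's batch. */
		folio_put(folio);
		return;
	}

	local_lock(&cpu_fbatches.lock);
	fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
	folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
	local_unlock(&cpu_fbatches.lock);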
Link: https://lkml.kernel.org/r/1719038884-1903-1-git-send-email-yangge1116@126.c…
Signed-off-by: yangge <yangge1116(a)126.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Alex Shi <alex.shi(a)linux.alibaba.com>
Cc: Shaohua Li <shli(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/swap.c | 43 +++++++++++++++++++++++++++++++------------
1 file changed, 31 insertions(+), 12 deletions(-)
--- a/mm/swap.c~mm-gup-clear-the-lru-flag-of-a-page-before-adding-to-lru-batch
+++ a/mm/swap.c
@@ -212,10 +212,6 @@ static void folio_batch_move_lru(struct
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- /* block memcg migration while the folio moves between lru */
- if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
- continue;
-
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -256,11 +252,16 @@ static void lru_move_tail_fn(struct lruv
void folio_rotate_reclaimable(struct folio *folio)
{
if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
- !folio_test_unevictable(folio) && folio_test_lru(folio)) {
+ !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
unsigned long flags;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock_irqsave(&lru_rotate.lock, flags);
fbatch = this_cpu_ptr(&lru_rotate.fbatch);
folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
@@ -353,11 +354,15 @@ static void folio_activate_drain(int cpu
void folio_activate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.activate);
folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
@@ -701,6 +706,11 @@ void deactivate_file_folio(struct folio
return;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
@@ -717,11 +727,16 @@ void deactivate_file_folio(struct folio
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
- (folio_test_active(folio) || lru_gen_enabled())) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) ||
+ lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
@@ -738,12 +753,16 @@ void folio_deactivate(struct folio *foli
*/
void folio_mark_lazyfree(struct folio *folio)
{
- if (folio_test_lru(folio) && folio_test_anon(folio) &&
- folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
_
Patches currently in -mm which might be from yangge1116(a)126.com are
try_grab_folio() is supposed to be used in the fast path; it elevates the
folio refcount with an add-ref-unless-zero operation. In the slow path we
are guaranteed to have at least one stable reference, so a simple atomic
add could be used. The performance difference should be trivial, but the
misuse may be confusing and misleading.
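Put differently, the distinction is which refcount primitive is safe; an
illustrative sketch (hypothetical helper name, using folio_ref_add_unless()
and folio_ref_add() from <linux/page_ref.h>; not the patch's code):
	/* Illustrative sketch only, not the patch's code. */
	static struct folio *grab_refs_sketch(struct folio *folio, int refs,
					      bool fast)
	{
		if (fast) {
			/*
			 * GUP-fast holds no stable reference; the folio may
			 * be freed concurrently, so "add ref unless zero" is
			 * the only safe way to take references.
			 */
			if (!folio_ref_add_unless(folio, refs, 0))
				return NULL;
		} else {
			/*
			 * GUP-slow already holds a stable reference, so a
			 * plain atomic add is sufficient and can never
			 * observe a zero refcount.
			 */
			folio_ref_add(folio, refs);
		}
		return folio;
	}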
In another thread [1] a kernel warning was reported when pinning a folio
in CMA memory while launching an SEV virtual machine. The splat looks like:
[ 464.325306] WARNING: CPU: 13 PID: 6734 at mm/gup.c:1313 __get_user_pages+0x423/0x520
[ 464.325464] CPU: 13 PID: 6734 Comm: qemu-kvm Kdump: loaded Not tainted 6.6.33+ #6
[ 464.325477] RIP: 0010:__get_user_pages+0x423/0x520
[ 464.325515] Call Trace:
[ 464.325520] <TASK>
[ 464.325523] ? __get_user_pages+0x423/0x520
[ 464.325528] ? __warn+0x81/0x130
[ 464.325536] ? __get_user_pages+0x423/0x520
[ 464.325541] ? report_bug+0x171/0x1a0
[ 464.325549] ? handle_bug+0x3c/0x70
[ 464.325554] ? exc_invalid_op+0x17/0x70
[ 464.325558] ? asm_exc_invalid_op+0x1a/0x20
[ 464.325567] ? __get_user_pages+0x423/0x520
[ 464.325575] __gup_longterm_locked+0x212/0x7a0
[ 464.325583] internal_get_user_pages_fast+0xfb/0x190
[ 464.325590] pin_user_pages_fast+0x47/0x60
[ 464.325598] sev_pin_memory+0xca/0x170 [kvm_amd]
[ 464.325616] sev_mem_enc_register_region+0x81/0x130 [kvm_amd]
Per the analysis done by yangge, starting the SEV virtual machine calls
pin_user_pages_fast(..., FOLL_LONGTERM, ...) to pin the memory. But the
page is in the CMA area, so fast GUP fails and falls back to the slow
path due to the longterm pinnable check in try_grab_folio().
The slow path will try to pin the pages and then migrate them out of the
CMA area. But the slow path also uses try_grab_folio() to pin the page,
so it fails on the same check and the above warning is triggered.
[1] https://lore.kernel.org/linux-mm/1719478388-31917-1-git-send-email-yangge11…
Fixes: 57edfcfd3419 ("mm/gup: accelerate thp gup even for "pages != NULL"")
Cc: <stable(a)vger.kernel.org> [6.6+]
Reported-by: yangge <yangge1116(a)126.com>
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
---
mm/gup.c | 278 +++++++++++++++++++++++++----------------------
mm/huge_memory.c | 2 +-
mm/internal.h | 3 +-
3 files changed, 148 insertions(+), 135 deletions(-)
v2:
1. Fixed the build warning
2. Reworked the commit log to include the bug report and analysis (reworded by me)
from yangge
3. Rebased onto Linus's latest tree
diff --git a/mm/gup.c b/mm/gup.c
index ca0f5cedce9b..6be165224c1e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -97,95 +97,6 @@ static inline struct folio *try_get_folio(struct page *page, int refs)
return folio;
}
-/**
- * try_grab_folio() - Attempt to get or pin a folio.
- * @page: pointer to page to be grabbed
- * @refs: the value to (effectively) add to the folio's refcount
- * @flags: gup flags: these are the FOLL_* flag values.
- *
- * "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
- *
- * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
- * same time. (That's true throughout the get_user_pages*() and
- * pin_user_pages*() APIs.) Cases:
- *
- * FOLL_GET: folio's refcount will be incremented by @refs.
- *
- * FOLL_PIN on large folios: folio's refcount will be incremented by
- * @refs, and its pincount will be incremented by @refs.
- *
- * FOLL_PIN on single-page folios: folio's refcount will be incremented by
- * @refs * GUP_PIN_COUNTING_BIAS.
- *
- * Return: The folio containing @page (with refcount appropriately
- * incremented) for success, or NULL upon failure. If neither FOLL_GET
- * nor FOLL_PIN was set, that's considered failure, and furthermore,
- * a likely bug in the caller, so a warning is also emitted.
- */
-struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
-{
- struct folio *folio;
-
- if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
- return NULL;
-
- if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
- return NULL;
-
- if (flags & FOLL_GET)
- return try_get_folio(page, refs);
-
- /* FOLL_PIN is set */
-
- /*
- * Don't take a pin on the zero page - it's not going anywhere
- * and it is used in a *lot* of places.
- */
- if (is_zero_page(page))
- return page_folio(page);
-
- folio = try_get_folio(page, refs);
- if (!folio)
- return NULL;
-
- /*
- * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
- * right zone, so fail and let the caller fall back to the slow
- * path.
- */
- if (unlikely((flags & FOLL_LONGTERM) &&
- !folio_is_longterm_pinnable(folio))) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
- return NULL;
- }
-
- /*
- * When pinning a large folio, use an exact count to track it.
- *
- * However, be sure to *also* increment the normal folio
- * refcount field at least once, so that the folio really
- * is pinned. That's why the refcount from the earlier
- * try_get_folio() is left intact.
- */
- if (folio_test_large(folio))
- atomic_add(refs, &folio->_pincount);
- else
- folio_ref_add(folio,
- refs * (GUP_PIN_COUNTING_BIAS - 1));
- /*
- * Adjust the pincount before re-checking the PTE for changes.
- * This is essentially a smp_mb() and is paired with a memory
- * barrier in folio_try_share_anon_rmap_*().
- */
- smp_mb__after_atomic();
-
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
-
- return folio;
-}
-
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
if (flags & FOLL_PIN) {
@@ -203,28 +114,31 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
}
/**
- * try_grab_page() - elevate a page's refcount by a flag-dependent amount
- * @page: pointer to page to be grabbed
- * @flags: gup flags: these are the FOLL_* flag values.
+ * try_grab_folio() - add a folio's refcount by a flag-dependent amount
+ * @folio: pointer to folio to be grabbed
+ * @refs: the value to (effectively) add to the folio's refcount
+ * @flags: gup flags: these are the FOLL_* flag values
*
* This might not do anything at all, depending on the flags argument.
*
* "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
*
* Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
- * time. Cases: please see the try_grab_folio() documentation, with
- * "refs=1".
+ * time.
*
* Return: 0 for success, or if no action was required (if neither FOLL_PIN
* nor FOLL_GET was set, nothing is done). A negative error code for failure:
*
- * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not
+ * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not
* be grabbed.
+ *
+ * It is called when we have a stable reference for the folio, typically in
+ * GUP slow path.
*/
-int __must_check try_grab_page(struct page *page, unsigned int flags)
+int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags)
{
- struct folio *folio = page_folio(page);
+ struct page *page = &folio->page;
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
return -ENOMEM;
@@ -233,7 +147,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags)
return -EREMOTEIO;
if (flags & FOLL_GET)
- folio_ref_inc(folio);
+ folio_ref_add(folio, refs);
else if (flags & FOLL_PIN) {
/*
* Don't take a pin on the zero page - it's not going anywhere
@@ -243,18 +157,18 @@ int __must_check try_grab_page(struct page *page, unsigned int flags)
return 0;
/*
- * Similar to try_grab_folio(): be sure to *also*
- * increment the normal page refcount field at least once,
+ * Increment the normal page refcount field at least once,
* so that the page really is pinned.
*/
if (folio_test_large(folio)) {
- folio_ref_add(folio, 1);
- atomic_add(1, &folio->_pincount);
+ folio_ref_add(folio, refs);
+ atomic_add(refs, &folio->_pincount);
} else {
- folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+ folio_ref_add(folio,
+ refs * GUP_PIN_COUNTING_BIAS);
}
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
}
return 0;
@@ -535,7 +449,7 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
*/
static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz,
unsigned long addr, unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
unsigned long pte_end;
struct page *page;
@@ -558,9 +472,15 @@ static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz
page = pte_page(pte);
refs = record_subpages(page, sz, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
- if (!folio)
- return 0;
+ if (fast) {
+ folio = try_grab_folio_fast(page, refs, flags);
+ if (!folio)
+ return 0;
+ } else {
+ folio = page_folio(page);
+ if (try_grab_folio(folio, refs, flags))
+ return 0;
+ }
if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
gup_put_folio(folio, refs, flags);
@@ -588,7 +508,7 @@ static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz
static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
pte_t *ptep;
unsigned long sz = 1UL << hugepd_shift(hugepd);
@@ -598,7 +518,7 @@ static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
- ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr);
+ ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr, fast);
if (ret != 1)
return ret;
} while (ptep++, addr = next, addr != end);
@@ -625,7 +545,7 @@ static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
ptep = hugepte_offset(hugepd, addr, pdshift);
ptl = huge_pte_lock(h, vma->vm_mm, ptep);
ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE,
- flags, &page, &nr);
+ flags, &page, &nr, false);
spin_unlock(ptl);
if (ret == 1) {
@@ -642,7 +562,7 @@ static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
return 0;
}
@@ -729,7 +649,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
page = ERR_PTR(ret);
else
@@ -806,7 +726,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
return ERR_PTR(ret);
@@ -968,8 +888,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
- ret = try_grab_page(page, flags);
+ /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (unlikely(ret)) {
page = ERR_PTR(ret);
goto out;
@@ -1233,7 +1153,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
goto unmap;
*page = pte_page(entry);
}
- ret = try_grab_page(*page, gup_flags);
+ ret = try_grab_folio(page_folio(*page), 1, gup_flags);
if (unlikely(ret))
goto unmap;
out:
@@ -1636,20 +1556,19 @@ static long __get_user_pages(struct mm_struct *mm,
* pages.
*/
if (page_increm > 1) {
- struct folio *folio;
+ struct folio *folio = page_folio(page);
/*
* Since we already hold refcount on the
* large folio, this should never fail.
*/
- folio = try_grab_folio(page, page_increm - 1,
- foll_flags);
- if (WARN_ON_ONCE(!folio)) {
+ if (try_grab_folio(folio, page_increm - 1,
+ foll_flags)) {
/*
* Release the 1st page ref if the
* folio is problematic, fail hard.
*/
- gup_put_folio(page_folio(page), 1,
+ gup_put_folio(folio, 1,
foll_flags);
ret = -EFAULT;
goto out;
@@ -2797,6 +2716,101 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
#ifdef CONFIG_HAVE_GUP_FAST
+/**
+ * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
+ * @page: pointer to page to be grabbed
+ * @refs: the value to (effectively) add to the folio's refcount
+ * @flags: gup flags: these are the FOLL_* flag values.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
+ * same time. (That's true throughout the get_user_pages*() and
+ * pin_user_pages*() APIs.) Cases:
+ *
+ * FOLL_GET: folio's refcount will be incremented by @refs.
+ *
+ * FOLL_PIN on large folios: folio's refcount will be incremented by
+ * @refs, and its pincount will be incremented by @refs.
+ *
+ * FOLL_PIN on single-page folios: folio's refcount will be incremented by
+ * @refs * GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: The folio containing @page (with refcount appropriately
+ * incremented) for success, or NULL upon failure. If neither FOLL_GET
+ * nor FOLL_PIN was set, that's considered failure, and furthermore,
+ * a likely bug in the caller, so a warning is also emitted.
+ *
+ * It uses add ref unless zero to elevate the folio refcount and must be called
+ * in fast path only.
+ */
+static struct folio *try_grab_folio_fast(struct page *page, int refs,
+ unsigned int flags)
+{
+ struct folio *folio;
+
+ /* Raise warn if it is not called in fast GUP */
+ VM_WARN_ON_ONCE(!irqs_disabled());
+
+ if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
+ return NULL;
+
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return NULL;
+
+ if (flags & FOLL_GET)
+ return try_get_folio(page, refs);
+
+ /* FOLL_PIN is set */
+
+ /*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return page_folio(page);
+
+ folio = try_get_folio(page, refs);
+ if (!folio)
+ return NULL;
+
+ /*
+ * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+ * right zone, so fail and let the caller fall back to the slow
+ * path.
+ */
+ if (unlikely((flags & FOLL_LONGTERM) &&
+ !folio_is_longterm_pinnable(folio))) {
+ if (!put_devmap_managed_folio_refs(folio, refs))
+ folio_put_refs(folio, refs);
+ return NULL;
+ }
+
+ /*
+ * When pinning a large folio, use an exact count to track it.
+ *
+ * However, be sure to *also* increment the normal folio
+ * refcount field at least once, so that the folio really
+ * is pinned. That's why the refcount from the earlier
+ * try_get_folio() is left intact.
+ */
+ if (folio_test_large(folio))
+ atomic_add(refs, &folio->_pincount);
+ else
+ folio_ref_add(folio,
+ refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in folio_try_share_anon_rmap_*().
+ */
+ smp_mb__after_atomic();
+
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+
+ return folio;
+}
/*
* Used in the GUP-fast path to determine whether GUP is permitted to work on
@@ -2962,7 +2976,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- folio = try_grab_folio(page, 1, flags);
+ folio = try_grab_folio_fast(page, 1, flags);
if (!folio)
goto pte_unmap;
@@ -3049,7 +3063,7 @@ static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
break;
}
- folio = try_grab_folio(page, 1, flags);
+ folio = try_grab_folio_fast(page, 1, flags);
if (!folio) {
gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
break;
@@ -3138,7 +3152,7 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
page = pmd_page(orig);
refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3182,7 +3196,7 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
page = pud_page(orig);
refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3222,7 +3236,7 @@ static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
page = pgd_page(orig);
refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3276,7 +3290,7 @@ static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
* pmd format and THP pmd format
*/
if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, flags, pages, nr) != 1)
+ PMD_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
pages, nr))
@@ -3306,7 +3320,7 @@ static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, flags, pages, nr) != 1)
+ PUD_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
pages, nr))
@@ -3333,7 +3347,7 @@ static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
BUILD_BUG_ON(p4d_leaf(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, flags, pages, nr) != 1)
+ P4D_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
pages, nr))
@@ -3362,7 +3376,7 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, flags, pages, nr) != 1)
+ PGDIR_SHIFT, next, flags, pages, nr, true) != 1)
return;
} else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
pages, nr))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index db7946a0a28c..2120f7478e55 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1331,7 +1331,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
page = ERR_PTR(ret);
diff --git a/mm/internal.h b/mm/internal.h
index 6902b7dd8509..52db9219b2db 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1182,8 +1182,7 @@ int migrate_device_coherent_page(struct page *page);
/*
* mm/gup.c
*/
-struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
-int __must_check try_grab_page(struct page *page, unsigned int flags);
+int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags);
/*
* mm/huge_memory.c
--
2.41.0
The patch titled
Subject: mm: gup: do not call try_grab_folio() in slow path
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-gup-do-not-call-try_grab_folio-in-slow-path.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yang Shi <yang(a)os.amperecomputing.com>
Subject: mm: gup: do not call try_grab_folio() in slow path
Date: Thu, 27 Jun 2024 16:16:01 -0700
try_grab_folio() is supposed to be used in the fast path; it elevates the
folio refcount with an add-ref-unless-zero operation. In the slow path we
are guaranteed to have at least one stable reference, so a simple atomic
add could be used. The performance difference should be trivial, but the
misuse may be confusing and misleading.
In another thread [1] a kernel warning was reported when pinning a folio
in CMA memory while launching an SEV virtual machine. The splat looks like:
[ 464.325306] WARNING: CPU: 13 PID: 6734 at mm/gup.c:1313 __get_user_pages+0x423/0x520
[ 464.325464] CPU: 13 PID: 6734 Comm: qemu-kvm Kdump: loaded Not tainted 6.6.33+ #6
[ 464.325477] RIP: 0010:__get_user_pages+0x423/0x520
[ 464.325515] Call Trace:
[ 464.325520] <TASK>
[ 464.325523] ? __get_user_pages+0x423/0x520
[ 464.325528] ? __warn+0x81/0x130
[ 464.325536] ? __get_user_pages+0x423/0x520
[ 464.325541] ? report_bug+0x171/0x1a0
[ 464.325549] ? handle_bug+0x3c/0x70
[ 464.325554] ? exc_invalid_op+0x17/0x70
[ 464.325558] ? asm_exc_invalid_op+0x1a/0x20
[ 464.325567] ? __get_user_pages+0x423/0x520
[ 464.325575] __gup_longterm_locked+0x212/0x7a0
[ 464.325583] internal_get_user_pages_fast+0xfb/0x190
[ 464.325590] pin_user_pages_fast+0x47/0x60
[ 464.325598] sev_pin_memory+0xca/0x170 [kvm_amd]
[ 464.325616] sev_mem_enc_register_region+0x81/0x130 [kvm_amd]
Per the analysis done by yangge, starting the SEV virtual machine calls
pin_user_pages_fast(..., FOLL_LONGTERM, ...) to pin the memory. But the
page is in the CMA area, so fast GUP fails and falls back to the slow path
due to the longterm pinnable check in try_grab_folio().
The slow path will try to pin the pages and then migrate them out of the
CMA area. But the slow path also uses try_grab_folio() to pin the page, so
it fails on the same check and the above warning is triggered.
[1] https://lore.kernel.org/linux-mm/1719478388-31917-1-git-send-email-yangge11…
Link: https://lkml.kernel.org/r/20240627231601.1713119-1-yang@os.amperecomputing.…
Fixes: 57edfcfd3419 ("mm/gup: accelerate thp gup even for "pages != NULL"")
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
Reported-by: yangge <yangge1116(a)126.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [6.6+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/gup.c | 278 +++++++++++++++++++++++----------------------
mm/huge_memory.c | 2
mm/internal.h | 3
3 files changed, 148 insertions(+), 135 deletions(-)
--- a/mm/gup.c~mm-gup-do-not-call-try_grab_folio-in-slow-path
+++ a/mm/gup.c
@@ -97,95 +97,6 @@ retry:
return folio;
}
-/**
- * try_grab_folio() - Attempt to get or pin a folio.
- * @page: pointer to page to be grabbed
- * @refs: the value to (effectively) add to the folio's refcount
- * @flags: gup flags: these are the FOLL_* flag values.
- *
- * "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
- *
- * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
- * same time. (That's true throughout the get_user_pages*() and
- * pin_user_pages*() APIs.) Cases:
- *
- * FOLL_GET: folio's refcount will be incremented by @refs.
- *
- * FOLL_PIN on large folios: folio's refcount will be incremented by
- * @refs, and its pincount will be incremented by @refs.
- *
- * FOLL_PIN on single-page folios: folio's refcount will be incremented by
- * @refs * GUP_PIN_COUNTING_BIAS.
- *
- * Return: The folio containing @page (with refcount appropriately
- * incremented) for success, or NULL upon failure. If neither FOLL_GET
- * nor FOLL_PIN was set, that's considered failure, and furthermore,
- * a likely bug in the caller, so a warning is also emitted.
- */
-struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
-{
- struct folio *folio;
-
- if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
- return NULL;
-
- if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
- return NULL;
-
- if (flags & FOLL_GET)
- return try_get_folio(page, refs);
-
- /* FOLL_PIN is set */
-
- /*
- * Don't take a pin on the zero page - it's not going anywhere
- * and it is used in a *lot* of places.
- */
- if (is_zero_page(page))
- return page_folio(page);
-
- folio = try_get_folio(page, refs);
- if (!folio)
- return NULL;
-
- /*
- * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
- * right zone, so fail and let the caller fall back to the slow
- * path.
- */
- if (unlikely((flags & FOLL_LONGTERM) &&
- !folio_is_longterm_pinnable(folio))) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
- return NULL;
- }
-
- /*
- * When pinning a large folio, use an exact count to track it.
- *
- * However, be sure to *also* increment the normal folio
- * refcount field at least once, so that the folio really
- * is pinned. That's why the refcount from the earlier
- * try_get_folio() is left intact.
- */
- if (folio_test_large(folio))
- atomic_add(refs, &folio->_pincount);
- else
- folio_ref_add(folio,
- refs * (GUP_PIN_COUNTING_BIAS - 1));
- /*
- * Adjust the pincount before re-checking the PTE for changes.
- * This is essentially a smp_mb() and is paired with a memory
- * barrier in folio_try_share_anon_rmap_*().
- */
- smp_mb__after_atomic();
-
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
-
- return folio;
-}
-
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
if (flags & FOLL_PIN) {
@@ -203,28 +114,31 @@ static void gup_put_folio(struct folio *
}
/**
- * try_grab_page() - elevate a page's refcount by a flag-dependent amount
- * @page: pointer to page to be grabbed
- * @flags: gup flags: these are the FOLL_* flag values.
+ * try_grab_folio() - add a folio's refcount by a flag-dependent amount
+ * @folio: pointer to folio to be grabbed
+ * @refs: the value to (effectively) add to the folio's refcount
+ * @flags: gup flags: these are the FOLL_* flag values
*
* This might not do anything at all, depending on the flags argument.
*
* "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
*
* Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
- * time. Cases: please see the try_grab_folio() documentation, with
- * "refs=1".
+ * time.
*
* Return: 0 for success, or if no action was required (if neither FOLL_PIN
* nor FOLL_GET was set, nothing is done). A negative error code for failure:
*
- * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not
+ * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not
* be grabbed.
+ *
+ * It is called when we have a stable reference for the folio, typically in
+ * GUP slow path.
*/
-int __must_check try_grab_page(struct page *page, unsigned int flags)
+int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags)
{
- struct folio *folio = page_folio(page);
+ struct page *page = &folio->page;
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
return -ENOMEM;
@@ -233,7 +147,7 @@ int __must_check try_grab_page(struct pa
return -EREMOTEIO;
if (flags & FOLL_GET)
- folio_ref_inc(folio);
+ folio_ref_add(folio, refs);
else if (flags & FOLL_PIN) {
/*
* Don't take a pin on the zero page - it's not going anywhere
@@ -243,18 +157,18 @@ int __must_check try_grab_page(struct pa
return 0;
/*
- * Similar to try_grab_folio(): be sure to *also*
- * increment the normal page refcount field at least once,
+ * Increment the normal page refcount field at least once,
* so that the page really is pinned.
*/
if (folio_test_large(folio)) {
- folio_ref_add(folio, 1);
- atomic_add(1, &folio->_pincount);
+ folio_ref_add(folio, refs);
+ atomic_add(refs, &folio->_pincount);
} else {
- folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+ folio_ref_add(folio,
+ refs * GUP_PIN_COUNTING_BIAS);
}
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
}
return 0;
@@ -535,7 +449,7 @@ static unsigned long hugepte_addr_end(un
*/
static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz,
unsigned long addr, unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
unsigned long pte_end;
struct page *page;
@@ -558,9 +472,15 @@ static int gup_hugepte(struct vm_area_st
page = pte_page(pte);
refs = record_subpages(page, sz, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
- if (!folio)
- return 0;
+ if (fast) {
+ folio = try_grab_folio_fast(page, refs, flags);
+ if (!folio)
+ return 0;
+ } else {
+ folio = page_folio(page);
+ if (try_grab_folio(folio, refs, flags))
+ return 0;
+ }
if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
gup_put_folio(folio, refs, flags);
@@ -588,7 +508,7 @@ static int gup_hugepte(struct vm_area_st
static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
pte_t *ptep;
unsigned long sz = 1UL << hugepd_shift(hugepd);
@@ -598,7 +518,7 @@ static int gup_hugepd(struct vm_area_str
ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
- ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr);
+ ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr, fast);
if (ret != 1)
return ret;
} while (ptep++, addr = next, addr != end);
@@ -625,7 +545,7 @@ static struct page *follow_hugepd(struct
ptep = hugepte_offset(hugepd, addr, pdshift);
ptl = huge_pte_lock(h, vma->vm_mm, ptep);
ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE,
- flags, &page, &nr);
+ flags, &page, &nr, false);
spin_unlock(ptl);
if (ret == 1) {
@@ -642,7 +562,7 @@ static struct page *follow_hugepd(struct
static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
return 0;
}
@@ -729,7 +649,7 @@ static struct page *follow_huge_pud(stru
gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
page = ERR_PTR(ret);
else
@@ -806,7 +726,7 @@ static struct page *follow_huge_pmd(stru
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
return ERR_PTR(ret);
@@ -968,8 +888,8 @@ static struct page *follow_page_pte(stru
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
- ret = try_grab_page(page, flags);
+ /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (unlikely(ret)) {
page = ERR_PTR(ret);
goto out;
@@ -1233,7 +1153,7 @@ static int get_gate_page(struct mm_struc
goto unmap;
*page = pte_page(entry);
}
- ret = try_grab_page(*page, gup_flags);
+ ret = try_grab_folio(page_folio(*page), 1, gup_flags);
if (unlikely(ret))
goto unmap;
out:
@@ -1636,20 +1556,19 @@ next_page:
* pages.
*/
if (page_increm > 1) {
- struct folio *folio;
+ struct folio *folio = page_folio(page);
/*
* Since we already hold refcount on the
* large folio, this should never fail.
*/
- folio = try_grab_folio(page, page_increm - 1,
- foll_flags);
- if (WARN_ON_ONCE(!folio)) {
+ if (try_grab_folio(folio, page_increm - 1,
+ foll_flags)) {
/*
* Release the 1st page ref if the
* folio is problematic, fail hard.
*/
- gup_put_folio(page_folio(page), 1,
+ gup_put_folio(folio, 1,
foll_flags);
ret = -EFAULT;
goto out;
@@ -2797,6 +2716,101 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
#ifdef CONFIG_HAVE_GUP_FAST
+/**
+ * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
+ * @page: pointer to page to be grabbed
+ * @refs: the value to (effectively) add to the folio's refcount
+ * @flags: gup flags: these are the FOLL_* flag values.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
+ * same time. (That's true throughout the get_user_pages*() and
+ * pin_user_pages*() APIs.) Cases:
+ *
+ * FOLL_GET: folio's refcount will be incremented by @refs.
+ *
+ * FOLL_PIN on large folios: folio's refcount will be incremented by
+ * @refs, and its pincount will be incremented by @refs.
+ *
+ * FOLL_PIN on single-page folios: folio's refcount will be incremented by
+ * @refs * GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: The folio containing @page (with refcount appropriately
+ * incremented) for success, or NULL upon failure. If neither FOLL_GET
+ * nor FOLL_PIN was set, that's considered failure, and furthermore,
+ * a likely bug in the caller, so a warning is also emitted.
+ *
+ * It uses add ref unless zero to elevate the folio refcount and must be called
+ * in fast path only.
+ */
+static struct folio *try_grab_folio_fast(struct page *page, int refs,
+ unsigned int flags)
+{
+ struct folio *folio;
+
+ /* Raise warn if it is not called in fast GUP */
+ VM_WARN_ON_ONCE(!irqs_disabled());
+
+ if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
+ return NULL;
+
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return NULL;
+
+ if (flags & FOLL_GET)
+ return try_get_folio(page, refs);
+
+ /* FOLL_PIN is set */
+
+ /*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return page_folio(page);
+
+ folio = try_get_folio(page, refs);
+ if (!folio)
+ return NULL;
+
+ /*
+ * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+ * right zone, so fail and let the caller fall back to the slow
+ * path.
+ */
+ if (unlikely((flags & FOLL_LONGTERM) &&
+ !folio_is_longterm_pinnable(folio))) {
+ if (!put_devmap_managed_folio_refs(folio, refs))
+ folio_put_refs(folio, refs);
+ return NULL;
+ }
+
+ /*
+ * When pinning a large folio, use an exact count to track it.
+ *
+ * However, be sure to *also* increment the normal folio
+ * refcount field at least once, so that the folio really
+ * is pinned. That's why the refcount from the earlier
+ * try_get_folio() is left intact.
+ */
+ if (folio_test_large(folio))
+ atomic_add(refs, &folio->_pincount);
+ else
+ folio_ref_add(folio,
+ refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in folio_try_share_anon_rmap_*().
+ */
+ smp_mb__after_atomic();
+
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+
+ return folio;
+}
/*
* Used in the GUP-fast path to determine whether GUP is permitted to work on
@@ -2962,7 +2976,7 @@ static int gup_fast_pte_range(pmd_t pmd,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- folio = try_grab_folio(page, 1, flags);
+ folio = try_grab_folio_fast(page, 1, flags);
if (!folio)
goto pte_unmap;
@@ -3049,7 +3063,7 @@ static int gup_fast_devmap_leaf(unsigned
break;
}
- folio = try_grab_folio(page, 1, flags);
+ folio = try_grab_folio_fast(page, 1, flags);
if (!folio) {
gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
break;
@@ -3138,7 +3152,7 @@ static int gup_fast_pmd_leaf(pmd_t orig,
page = pmd_page(orig);
refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3182,7 +3196,7 @@ static int gup_fast_pud_leaf(pud_t orig,
page = pud_page(orig);
refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3222,7 +3236,7 @@ static int gup_fast_pgd_leaf(pgd_t orig,
page = pgd_page(orig);
refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3276,7 +3290,7 @@ static int gup_fast_pmd_range(pud_t *pud
* pmd format and THP pmd format
*/
if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, flags, pages, nr) != 1)
+ PMD_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
pages, nr))
@@ -3306,7 +3320,7 @@ static int gup_fast_pud_range(p4d_t *p4d
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, flags, pages, nr) != 1)
+ PUD_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
pages, nr))
@@ -3333,7 +3347,7 @@ static int gup_fast_p4d_range(pgd_t *pgd
BUILD_BUG_ON(p4d_leaf(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, flags, pages, nr) != 1)
+ P4D_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
pages, nr))
@@ -3362,7 +3376,7 @@ static void gup_fast_pgd_range(unsigned
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, flags, pages, nr) != 1)
+ PGDIR_SHIFT, next, flags, pages, nr, true) != 1)
return;
} else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
pages, nr))
--- a/mm/huge_memory.c~mm-gup-do-not-call-try_grab_folio-in-slow-path
+++ a/mm/huge_memory.c
@@ -1331,7 +1331,7 @@ struct page *follow_devmap_pmd(struct vm
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
page = ERR_PTR(ret);
--- a/mm/internal.h~mm-gup-do-not-call-try_grab_folio-in-slow-path
+++ a/mm/internal.h
@@ -1182,8 +1182,7 @@ int migrate_device_coherent_page(struct
/*
* mm/gup.c
*/
-struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
-int __must_check try_grab_page(struct page *page, unsigned int flags);
+int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags);
/*
* mm/huge_memory.c
_
Patches currently in -mm which might be from yang(a)os.amperecomputing.com are
mm-page_ref-remove-folio_try_get_rcu.patch
mm-gup-do-not-call-try_grab_folio-in-slow-path.patch
hugetlbfs-add-mte-support.patch
Read callbacks registered with the nvmem core expect 0 to be returned on
success and a negative value to be returned on failure.
On reads, abx80x_nvmem_xfer() calls i2c_smbus_read_i2c_block_data(),
which, per its API description, returns the number of bytes read on
success; this return value is currently treated as an error and returned
to nvmem even on success.
Fix this by handling all possible values returned by
i2c_smbus_read_i2c_block_data().
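The contract in a nutshell (kernel-style sketch with a hypothetical
read_block() helper, for illustration only; not the driver's exact code):
	static int example_nvmem_read(void *priv, unsigned int off,
				      void *val, size_t count)
	{
		u8 *buf = val;

		while (count) {
			/* returns bytes transferred, or a negative errno */
			int ret = read_block(priv, off, buf, count);

			if (ret < 0)
				return ret;	/* propagate the error */
			if (ret == 0)
				return -EIO;	/* nothing was transferred */
			off += ret;
			buf += ret;
			count -= ret;
		}
		return 0;	/* nvmem expects 0 on success, not a byte count */
	}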
Fixes: e90ff8ede777 ("rtc: abx80x: Add nvmem support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Joy Chakraborty <joychakr(a)google.com>
---
drivers/rtc/rtc-abx80x.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/rtc/rtc-abx80x.c b/drivers/rtc/rtc-abx80x.c
index fde2b8054c2e..1298962402ff 100644
--- a/drivers/rtc/rtc-abx80x.c
+++ b/drivers/rtc/rtc-abx80x.c
@@ -705,14 +705,18 @@ static int abx80x_nvmem_xfer(struct abx80x_priv *priv, unsigned int offset,
if (ret)
return ret;
- if (write)
+ if (write) {
ret = i2c_smbus_write_i2c_block_data(priv->client, reg,
len, val);
- else
+ if (ret)
+ return ret;
+ } else {
ret = i2c_smbus_read_i2c_block_data(priv->client, reg,
len, val);
- if (ret)
- return ret;
+ if (ret <= 0)
+ return ret ? ret : -EIO;
+ len = ret;
+ }
offset += len;
val += len;
--
2.45.2.505.gda0bf45e8d-goog
Read/write callbacks registered with the nvmem core expect 0 to be
returned on success and a negative value to be returned on failure.
isl1208_nvmem_read()/isl1208_nvmem_write() currently return the number of
bytes read/written on success; fix them to return 0 on success and a
negative value on failure.
Fixes: c3544f6f51ed ("rtc: isl1208: Add new style nvmem support to driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Joy Chakraborty <joychakr(a)google.com>
---
drivers/rtc/rtc-isl1208.c | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/drivers/rtc/rtc-isl1208.c b/drivers/rtc/rtc-isl1208.c
index e50c23ee1646..206f96b90f58 100644
--- a/drivers/rtc/rtc-isl1208.c
+++ b/drivers/rtc/rtc-isl1208.c
@@ -775,14 +775,13 @@ static int isl1208_nvmem_read(void *priv, unsigned int off, void *buf,
{
struct isl1208_state *isl1208 = priv;
struct i2c_client *client = to_i2c_client(isl1208->rtc->dev.parent);
- int ret;
/* nvmem sanitizes offset/count for us, but count==0 is possible */
if (!count)
return count;
- ret = isl1208_i2c_read_regs(client, ISL1208_REG_USR1 + off, buf,
+
+ return isl1208_i2c_read_regs(client, ISL1208_REG_USR1 + off, buf,
count);
- return ret == 0 ? count : ret;
}
static int isl1208_nvmem_write(void *priv, unsigned int off, void *buf,
@@ -790,15 +789,13 @@ static int isl1208_nvmem_write(void *priv, unsigned int off, void *buf,
{
struct isl1208_state *isl1208 = priv;
struct i2c_client *client = to_i2c_client(isl1208->rtc->dev.parent);
- int ret;
/* nvmem sanitizes off/count for us, but count==0 is possible */
if (!count)
return count;
- ret = isl1208_i2c_set_regs(client, ISL1208_REG_USR1 + off, buf,
- count);
- return ret == 0 ? count : ret;
+ return isl1208_i2c_set_regs(client, ISL1208_REG_USR1 + off, buf,
+ count);
}
static const struct nvmem_config isl1208_nvmem_config = {
--
2.45.2.505.gda0bf45e8d-goog
Read/write callbacks registered with the nvmem core expect 0 to be
returned on success and a negative value to be returned on failure.
cmos_nvram_read()/cmos_nvram_write() currently return the number of bytes
read or written; fix them to return 0 on success and -EIO in case the
requested number of bytes was not read or written.
Fixes: 8b5b7958fd1c ("rtc: cmos: use generic nvmem")
Cc: stable(a)vger.kernel.org
Signed-off-by: Joy Chakraborty <joychakr(a)google.com>
---
drivers/rtc/rtc-cmos.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index 7d99cd2c37a0..35dca2accbb8 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -643,11 +643,10 @@ static int cmos_nvram_read(void *priv, unsigned int off, void *val,
size_t count)
{
unsigned char *buf = val;
- int retval;
off += NVRAM_OFFSET;
spin_lock_irq(&rtc_lock);
- for (retval = 0; count; count--, off++, retval++) {
+ for (; count; count--, off++) {
if (off < 128)
*buf++ = CMOS_READ(off);
else if (can_bank2)
@@ -657,7 +656,7 @@ static int cmos_nvram_read(void *priv, unsigned int off, void *val,
}
spin_unlock_irq(&rtc_lock);
- return retval;
+ return count ? -EIO : 0;
}
static int cmos_nvram_write(void *priv, unsigned int off, void *val,
@@ -665,7 +664,6 @@ static int cmos_nvram_write(void *priv, unsigned int off, void *val,
{
struct cmos_rtc *cmos = priv;
unsigned char *buf = val;
- int retval;
/* NOTE: on at least PCs and Ataris, the boot firmware uses a
* checksum on part of the NVRAM data. That's currently ignored
@@ -674,7 +672,7 @@ static int cmos_nvram_write(void *priv, unsigned int off, void *val,
*/
off += NVRAM_OFFSET;
spin_lock_irq(&rtc_lock);
- for (retval = 0; count; count--, off++, retval++) {
+ for (; count; count--, off++) {
/* don't trash RTC registers */
if (off == cmos->day_alrm
|| off == cmos->mon_alrm
@@ -689,7 +687,7 @@ static int cmos_nvram_write(void *priv, unsigned int off, void *val,
}
spin_unlock_irq(&rtc_lock);
- return retval;
+ return count ? -EIO : 0;
}
/*----------------------------------------------------------------*/
--
2.45.2.505.gda0bf45e8d-goog
Do not disable broadcast if we are using address 0 (broadcast) to
communicate with this device. Otherwise the proper driver will be used,
but no communication will be possible and no link changes will be
detected.
There are two scenarios where we can run into this situation:
- the PHY is bootstrapped for address 0
- no PHY address is known and Linux is scanning the MDIO bus, so the first
  responding and attached device will be on address 0.
The fixes tag points to the latest refactoring, not to the initial point
where kszphy_broadcast_disable() was introduced.
Fixes: 79e498a9c7da0 ("net: phy: micrel: Restore led_mode and clk_sel on resume")
Cc: stable(a)vger.kernel.org
Signed-off-by: Oleksij Rempel <o.rempel(a)pengutronix.de>
---
drivers/net/phy/micrel.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 81c20eb4b54b9..67c2e611150d2 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -590,7 +590,7 @@ static int kszphy_config_init(struct phy_device *phydev)
type = priv->type;
- if (type && type->has_broadcast_disable)
+ if (type && type->has_broadcast_disable && phydev->mdio.addr != 0)
kszphy_broadcast_disable(phydev);
if (type && type->has_nand_tree_disable)
--
2.39.2