From: Xiubo Li <xiubli(a)redhat.com>
If a client sends out a cap update dropping caps with the prior 'seq'
just before an incoming cap revoke request, then the client may drop
the revoke because it believes it's already released the requested
capabilities.
This causes the MDS to wait indefinitely for the client to respond
to the revoke. It's therefore always a good idea to ack the cap
revoke request with the bumped up 'seq'.
Currently if the cap->issued equals to the newcaps the check_caps()
will do nothing, we should force flush the caps.
Cc: stable(a)vger.kernel.org
Link: https://tracker.ceph.com/issues/61782
Signed-off-by: Xiubo Li <xiubli(a)redhat.com>
---
fs/ceph/caps.c | 16 ++++++++++++----
fs/ceph/super.h | 7 ++++---
2 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 24c31f795938..ba5809cf8f02 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2024,6 +2024,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* CHECK_CAPS_AUTHONLY - we should only check the auth cap
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
+ * CHECK_CAPS_FLUSH_FORCE - we should flush any caps immediately, without
+ * further delay.
*/
void ceph_check_caps(struct ceph_inode_info *ci, int flags)
{
@@ -2105,7 +2107,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags)
}
doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s "
- "flushing %s issued %s revoking %s retain %s %s%s%s\n",
+ "flushing %s issued %s revoking %s retain %s %s%s%s%s\n",
inode, ceph_vinop(inode), ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
@@ -2113,7 +2115,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags)
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
- (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
+ (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "",
+ (flags & CHECK_CAPS_FLUSH_FORCE) ? " FLUSH_FORCE" : "");
/*
* If we no longer need to hold onto old our caps, and we may
@@ -2223,6 +2226,9 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags)
goto ack;
}
+ if (flags & CHECK_CAPS_FLUSH_FORCE)
+ goto ack;
+
/* things we might delay */
if ((cap->issued & ~retain) == 0)
continue; /* nope, all good */
@@ -3518,6 +3524,7 @@ static void handle_cap_grant(struct inode *inode,
bool queue_invalidate = false;
bool deleted_inode = false;
bool fill_inline = false;
+ int flags = 0;
/*
* If there is at least one crypto block then we'll trust
@@ -3751,6 +3758,7 @@ static void handle_cap_grant(struct inode *inode,
/* don't let check_caps skip sending a response to MDS for revoke msgs */
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
cap->mds_wanted = 0;
+ flags |= CHECK_CAPS_FLUSH_FORCE;
if (cap == ci->i_auth_cap)
check_caps = 1; /* check auth cap only */
else
@@ -3806,9 +3814,9 @@ static void handle_cap_grant(struct inode *inode,
mutex_unlock(&session->s_mutex);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
+ ceph_check_caps(ci, flags | CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NOINVAL);
+ ceph_check_caps(ci, flags | CHECK_CAPS_NOINVAL);
}
/*
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b0b368ed3018..831e8ec4d5da 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -200,9 +200,10 @@ struct ceph_cap {
struct list_head caps_item;
};
-#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */
-#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */
-#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */
+#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */
+#define CHECK_CAPS_FLUSH_FORCE 8 /* force flush any caps */
struct ceph_cap_flush {
u64 tid;
--
2.45.1
From: Baokun Li <libaokun1(a)huawei.com>
[ Upstream commit 9e8e819f8f272c4e5dcd0bd6c7450e36481ed139 ]
When setting values of type unsigned int through sysfs, we use kstrtoul()
to parse it and then truncate part of it as the final set value, when the
set value is greater than UINT_MAX, the set value will not match what we
see because of the truncation. As follows:
$ echo 4294967296 > /sys/fs/ext4/sda/mb_max_linear_groups
$ cat /sys/fs/ext4/sda/mb_max_linear_groups
0
So we use kstrtouint() to parse the attr_pointer_ui type to avoid the
inconsistency described above. In addition, a judgment is added to avoid
setting s_resv_clusters less than 0.
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240319113325.3110393-2-libaokun1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Stable-dep-of: 13df4d44a3aa ("ext4: fix slab-out-of-bounds in ext4_mb_find_good_group_avg_frag_lists()")
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
fs/ext4/sysfs.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 6d332dff79dd..ca820620b974 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -104,7 +104,7 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi,
int ret;
ret = kstrtoull(skip_spaces(buf), 0, &val);
- if (ret || val >= clusters)
+ if (ret || val >= clusters || (s64)val < 0)
return -EINVAL;
atomic64_set(&sbi->s_resv_clusters, val);
@@ -451,7 +451,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
s_kobj);
struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
void *ptr = calc_ptr(a, sbi);
- unsigned long t;
+ unsigned int t;
+ unsigned long lt;
int ret;
switch (a->attr_id) {
@@ -460,7 +461,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
case attr_pointer_ui:
if (!ptr)
return 0;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtouint(skip_spaces(buf), 0, &t);
if (ret)
return ret;
if (a->attr_ptr == ptr_ext4_super_block_offset)
@@ -471,10 +472,10 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
case attr_pointer_ul:
if (!ptr)
return 0;
- ret = kstrtoul(skip_spaces(buf), 0, &t);
+ ret = kstrtoul(skip_spaces(buf), 0, <);
if (ret)
return ret;
- *((unsigned long *) ptr) = t;
+ *((unsigned long *) ptr) = lt;
return len;
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
--
2.39.2
V2 main changes:
- Add Rob's reviewed-by in the binding patch.
- Re-name the error out labels and new RXWM macro.
- In #3 patch, add one fix tag, and CC stable kernel.
Based on i.MX8QM HSIO PHY driver, refine i.MX8QM SATA driver by using PHY
interface.
[PATCH v2 1/4] dt-bindings: ata: Add i.MX8QM AHCI compatible string
[PATCH v2 2/4] ata: ahci_imx: Clean up code by using i.MX8Q HSIO PHY
[PATCH v2 3/4] ata: ahci_imx: Enlarge RX water mark for i.MX8QM SATA
[PATCH v2 4/4] ata: ahci_imx: Correct the email address
Documentation/devicetree/bindings/ata/imx-sata.yaml | 47 +++++++++++
drivers/ata/ahci_imx.c | 406 ++++++++++++++++++++++++-----------------------------------------------------------------
2 files changed, 155 insertions(+), 298 deletions(-)
The patch titled
Subject: mm/mglru: fix ineffective protection calculation
has been added to the -mm mm-unstable branch. Its filename is
mm-mglru-fix-ineffective-protection-calculation.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yu Zhao <yuzhao(a)google.com>
Subject: mm/mglru: fix ineffective protection calculation
Date: Fri, 12 Jul 2024 17:29:56 -0600
mem_cgroup_calculate_protection() is not stateless and should only be used
as part of a top-down tree traversal. shrink_one() traverses the per-node
memcg LRU instead of the root_mem_cgroup tree, and therefore it should not
call mem_cgroup_calculate_protection().
The existing misuse in shrink_one() can cause ineffective protection of
sub-trees that are grandchildren of root_mem_cgroup. Fix it by reusing
lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to
calculate the protection.
Previously lru_gen_age_node() opportunistically skips the first pass,
i.e., when scan_control->priority is DEF_PRIORITY. On the second pass,
lruvec_is_sizable() uses appropriate scan_control->priority, set by
set_initial_priority() from lru_gen_shrink_node(), to decide whether a
memcg is too small to reclaim from.
Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup tree.
So it should call set_initial_priority() upfront, to make sure
lruvec_is_sizable() uses appropriate scan_control->priority on the first
pass. Otherwise, lruvec_is_reclaimable() can return false negatives and
result in premature OOM kills when min_ttl_ms is used.
Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reported-by: T.J. Mercier <tjmercier(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmscan.c | 82 +++++++++++++++++++++++---------------------------
1 file changed, 38 insertions(+), 44 deletions(-)
--- a/mm/vmscan.c~mm-mglru-fix-ineffective-protection-calculation
+++ a/mm/vmscan.c
@@ -3915,6 +3915,32 @@ done:
* working set protection
******************************************************************************/
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on
+ * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+ * where reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ /*
+ * The estimation is based on LRU pages only, so cap it to prevent
+ * overshoots of shrinker objects by large margins.
+ */
+ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
+}
+
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
@@ -3948,19 +3974,17 @@ static bool lruvec_is_reclaimable(struct
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
- birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
- if (time_is_after_jiffies(birth + min_ttl))
+ if (mem_cgroup_below_min(NULL, memcg))
return false;
if (!lruvec_is_sizable(lruvec, sc))
return false;
- mem_cgroup_calculate_protection(NULL, memcg);
+ /* see the comment on lru_gen_folio */
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
- return !mem_cgroup_below_min(NULL, memcg);
+ return time_is_before_jiffies(birth + min_ttl);
}
/* to protect the working set of the last N jiffies */
@@ -3970,23 +3994,20 @@ static void lru_gen_age_node(struct pgli
{
struct mem_cgroup *memcg;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+ bool reclaimable = !min_ttl;
VM_WARN_ON_ONCE(!current_is_kswapd());
- /* check the order to exclude compaction-induced reclaim */
- if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
- return;
+ set_initial_priority(pgdat, sc);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
- mem_cgroup_iter_break(NULL, memcg);
- return;
- }
+ mem_cgroup_calculate_protection(NULL, memcg);
- cond_resched();
+ if (!reclaimable)
+ reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
/*
@@ -3994,7 +4015,7 @@ static void lru_gen_age_node(struct pgli
* younger than min_ttl. However, another possibility is all memcgs are
* either too small or below min.
*/
- if (mutex_trylock(&oom_lock)) {
+ if (!reclaimable && mutex_trylock(&oom_lock)) {
struct oom_control oc = {
.gfp_mask = sc->gfp_mask,
};
@@ -4786,8 +4807,7 @@ static int shrink_one(struct lruvec *lru
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- mem_cgroup_calculate_protection(NULL, memcg);
-
+ /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
if (mem_cgroup_below_min(NULL, memcg))
return MEMCG_LRU_YOUNG;
@@ -4911,32 +4931,6 @@ static void lru_gen_shrink_lruvec(struct
blk_finish_plug(&plug);
}
-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
-{
- int priority;
- unsigned long reclaimable;
-
- if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
- return;
- /*
- * Determine the initial priority based on
- * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
- * where reclaimed_to_scanned_ratio = inactive / total.
- */
- reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
- reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
-
- /* round down reclaimable and round up sc->nr_to_reclaim */
- priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
-
- /*
- * The estimation is based on LRU pages only, so cap it to prevent
- * overshoots of shrinker objects by large margins.
- */
- sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
-}
-
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct blk_plug plug;
_
Patches currently in -mm which might be from yuzhao(a)google.com are
mm-mglru-fix-ineffective-protection-calculation.patch
The patch titled
Subject: mm/mglru: fix ineffective protection calculation
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-mglru-fix-ineffective-protection-calculation.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yu Zhao <yuzhao(a)google.com>
Subject: mm/mglru: fix ineffective protection calculation
Date: Fri, 12 Jul 2024 17:29:56 -0600
mem_cgroup_calculate_protection() is not stateless and should only be used
as part of a top-down tree traversal. shrink_one() traverses the per-node
memcg LRU instead of the root_mem_cgroup tree, and therefore it should not
call mem_cgroup_calculate_protection().
The existing misuse in shrink_one() can cause ineffective protection of
sub-trees that are grandchildren of root_mem_cgroup. Fix it by reusing
lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to
calculate the protection.
Previously lru_gen_age_node() opportunistically skips the first pass,
i.e., when scan_control->priority is DEF_PRIORITY. On the second pass,
lruvec_is_sizable() uses appropriate scan_control->priority, set by
set_initial_priority() from lru_gen_shrink_node(), to decide whether a
memcg is too small to reclaim from.
Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup tree.
So it should call set_initial_priority() upfront, to make sure
lruvec_is_sizable() uses appropriate scan_control->priority on the first
pass. Otherwise, lruvec_is_reclaimable() can return false negatives and
result in premature OOM kills when min_ttl_ms is used.
Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reported-by: T.J. Mercier <tjmercier(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmscan.c | 30 ++++++++++++------------------
1 file changed, 12 insertions(+), 18 deletions(-)
--- a/mm/vmscan.c~mm-mglru-fix-ineffective-protection-calculation
+++ a/mm/vmscan.c
@@ -3933,19 +3933,17 @@ static bool lruvec_is_reclaimable(struct
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
- birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
- if (time_is_after_jiffies(birth + min_ttl))
+ if (mem_cgroup_below_min(NULL, memcg))
return false;
if (!lruvec_is_sizable(lruvec, sc))
return false;
- mem_cgroup_calculate_protection(NULL, memcg);
+ /* see the comment on lru_gen_folio */
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
- return !mem_cgroup_below_min(NULL, memcg);
+ return time_is_before_jiffies(birth + min_ttl);
}
/* to protect the working set of the last N jiffies */
@@ -3955,23 +3953,20 @@ static void lru_gen_age_node(struct pgli
{
struct mem_cgroup *memcg;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+ bool reclaimable = !min_ttl;
VM_WARN_ON_ONCE(!current_is_kswapd());
- /* check the order to exclude compaction-induced reclaim */
- if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
- return;
+ set_initial_priority(pgdat, sc);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
- mem_cgroup_iter_break(NULL, memcg);
- return;
- }
+ mem_cgroup_calculate_protection(NULL, memcg);
- cond_resched();
+ if (!reclaimable)
+ reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
/*
@@ -3979,7 +3974,7 @@ static void lru_gen_age_node(struct pgli
* younger than min_ttl. However, another possibility is all memcgs are
* either too small or below min.
*/
- if (mutex_trylock(&oom_lock)) {
+ if (!reclaimable && mutex_trylock(&oom_lock)) {
struct oom_control oc = {
.gfp_mask = sc->gfp_mask,
};
@@ -4772,8 +4767,7 @@ static int shrink_one(struct lruvec *lru
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- mem_cgroup_calculate_protection(NULL, memcg);
-
+ /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
if (mem_cgroup_below_min(NULL, memcg))
return MEMCG_LRU_YOUNG;
_
Patches currently in -mm which might be from yuzhao(a)google.com are
mm-mglru-fix-ineffective-protection-calculation.patch
The patch titled
Subject: mm/huge_memory: avoid PMD-size page cache if needed
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Gavin Shan <gshan(a)redhat.com>
Subject: mm/huge_memory: avoid PMD-size page cache if needed
Date: Mon, 15 Jul 2024 10:04:23 +1000
xarray can't support arbitrary page cache size. the largest and supported
page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71
("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray"). However,
it's possible to have 512MB page cache in the huge memory's collapsing
path on ARM64 system whose base page size is 64KB. 512MB page cache is
breaking the limitation and a warning is raised when the xarray entry is
split as shown in the following example.
[root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize
KernelPageSize: 64 kB
[root@dhcp-10-26-1-207 ~]# cat /tmp/test.c
:
int main(int argc, char **argv)
{
const char *filename = TEST_XFS_FILENAME;
int fd = 0;
void *buf = (void *)-1, *p;
int pgsize = getpagesize();
int ret = 0;
if (pgsize != 0x10000) {
fprintf(stdout, "System with 64KB base page size is required!\n");
return -EPERM;
}
system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb");
system("echo 1 > /proc/sys/vm/drop_caches");
/* Open the xfs file */
fd = open(filename, O_RDONLY);
assert(fd > 0);
/* Create VMA */
buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0);
assert(buf != (void *)-1);
fprintf(stdout, "mapped buffer at 0x%p\n", buf);
/* Populate VMA */
ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE);
assert(ret == 0);
ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ);
assert(ret == 0);
/* Collapse VMA */
ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE);
assert(ret == 0);
ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE);
if (ret) {
fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno);
goto out;
}
/* Split xarray entry. Write permission is needed */
munmap(buf, TEST_MEM_SIZE);
buf = (void *)-1;
close(fd);
fd = open(filename, O_RDWR);
assert(fd > 0);
fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
TEST_MEM_SIZE - pgsize, pgsize);
out:
if (buf != (void *)-1)
munmap(buf, TEST_MEM_SIZE);
if (fd > 0)
close(fd);
return ret;
}
[root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test
[root@dhcp-10-26-1-207 ~]# /tmp/test
------------[ cut here ]------------
WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \
nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \
ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse \
xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net \
sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio
CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x780
sp : ffff8000ac32f660
x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0
x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d
x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000
x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000
x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c
x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8
x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
xas_split_alloc+0xf8/0x128
split_huge_page_to_list_to_order+0x1c4/0x780
truncate_inode_partial_folio+0xdc/0x160
truncate_inode_pages_range+0x1b4/0x4a8
truncate_pagecache_range+0x84/0xa0
xfs_flush_unmap_range+0x70/0x90 [xfs]
xfs_file_fallocate+0xfc/0x4d8 [xfs]
vfs_fallocate+0x124/0x2f0
ksys_fallocate+0x4c/0xa0
__arm64_sys_fallocate+0x24/0x38
invoke_syscall.constprop.0+0x7c/0xd8
do_el0_svc+0xb4/0xd0
el0_svc+0x44/0x1d8
el0t_64_sync_handler+0x134/0x150
el0t_64_sync+0x17c/0x180
Fix it by correcting the supported page cache orders, different sets for
DAX and other files. With it corrected, 512MB page cache becomes
disallowed on all non-DAX files on ARM64 system where the base page size
is 64KB. After this patch is applied, the test program fails with error
-EINVAL returned from __thp_vma_allowable_orders() and the madvise()
system call to collapse the page caches.
Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts(a)arm.com>
Acked-by: Zi Yan <ziy(a)nvidia.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <baohua(a)kernel.org>
Cc: Don Dutile <ddutile(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: William Kucharski <william.kucharski(a)oracle.com>
Cc: <stable(a)vger.kernel.org> [5.17+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/huge_mm.h | 12 +++++++++---
mm/huge_memory.c | 12 ++++++++++--
2 files changed, 19 insertions(+), 5 deletions(-)
--- a/include/linux/huge_mm.h~mm-huge_memory-avoid-pmd-size-page-cache-if-needed
+++ a/include/linux/huge_mm.h
@@ -72,14 +72,20 @@ extern struct kobj_attribute shmem_enabl
#define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
/*
- * Mask of all large folio orders supported for file THP.
+ * Mask of all large folio orders supported for file THP. Folios in a DAX
+ * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to
+ * it.
*/
-#define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER))
+#define THP_ORDERS_ALL_FILE_DAX \
+ (BIT(PMD_ORDER) | BIT(PUD_ORDER))
+#define THP_ORDERS_ALL_FILE_DEFAULT \
+ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
/*
* Mask of all large folio orders supported for THP.
*/
-#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
+#define THP_ORDERS_ALL \
+ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
#define TVA_SMAPS (1 << 0) /* Will be used for procfs */
#define TVA_IN_PF (1 << 1) /* Page fault handler */
--- a/mm/huge_memory.c~mm-huge_memory-avoid-pmd-size-page-cache-if-needed
+++ a/mm/huge_memory.c
@@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders
bool smaps = tva_flags & TVA_SMAPS;
bool in_pf = tva_flags & TVA_IN_PF;
bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+ unsigned long supported_orders;
+
/* Check the intersection of requested and supported orders. */
- orders &= vma_is_anonymous(vma) ?
- THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
+ if (vma_is_anonymous(vma))
+ supported_orders = THP_ORDERS_ALL_ANON;
+ else if (vma_is_dax(vma))
+ supported_orders = THP_ORDERS_ALL_FILE_DAX;
+ else
+ supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
+
+ orders &= supported_orders;
if (!orders)
return 0;
_
Patches currently in -mm which might be from gshan(a)redhat.com are
mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch
Hi,
There are two patches for bpf that I think should be backported to
v6.6 kernel to fix a privilege escalation vulnerability in kernelCTF.
bpf: Fix too early release of tcx_entry
1cb6f0bae50441f4b4b32a28315853b279c7404e
selftests/bpf: Extend tcx tests to cover late tcx_entry release
5f1d18de79180deac2822c93e431bbe547f7d3ce
Thanks!
Best regards,
He
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 310d6c15e9104c99d5d9d0ff8e5383a79da7d5e6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024071546-swerve-grew-de52@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
310d6c15e910 ("mm/damon/core: merge regions aggressively when max_nr_regions is unmet")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 310d6c15e9104c99d5d9d0ff8e5383a79da7d5e6 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj(a)kernel.org>
Date: Mon, 24 Jun 2024 10:58:14 -0700
Subject: [PATCH] mm/damon/core: merge regions aggressively when max_nr_regions
is unmet
DAMON keeps the number of regions under max_nr_regions by skipping regions
split operations when doing so can make the number higher than the limit.
It works well for preventing violation of the limit. But, if somehow the
violation happens, it cannot recovery well depending on the situation. In
detail, if the real number of regions having different access pattern is
higher than the limit, the mechanism cannot reduce the number below the
limit. In such a case, the system could suffer from high monitoring
overhead of DAMON.
The violation can actually happen. For an example, the user could reduce
max_nr_regions while DAMON is running, to be lower than the current number
of regions. Fix the problem by repeating the merge operations with
increasing aggressiveness in kdamond_merge_regions() for the case, until
the limit is met.
[sj(a)kernel.org: increase regions merge aggressiveness while respecting min_nr_regions]
Link: https://lkml.kernel.org/r/20240626164753.46270-1-sj@kernel.org
[sj(a)kernel.org: ensure max threshold attempt for max_nr_regions violation]
Link: https://lkml.kernel.org/r/20240627163153.75969-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240624175814.89611-1-sj@kernel.org
Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.15+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 6392f1cc97a3..e66823d6b10b 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1358,14 +1358,31 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
* access frequencies are similar. This is for minimizing the monitoring
* overhead under the dynamically changeable access pattern. If a merge was
* unnecessarily made, later 'kdamond_split_regions()' will revert it.
+ *
+ * The total number of regions could be higher than the user-defined limit,
+ * max_nr_regions for some cases. For example, the user can update
+ * max_nr_regions to a number that lower than the current number of regions
+ * while DAMON is running. For such a case, repeat merging until the limit is
+ * met while increasing @threshold up to possible maximum level.
*/
static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
unsigned long sz_limit)
{
struct damon_target *t;
+ unsigned int nr_regions;
+ unsigned int max_thres;
- damon_for_each_target(t, c)
- damon_merge_regions_of(t, threshold, sz_limit);
+ max_thres = c->attrs.aggr_interval /
+ (c->attrs.sample_interval ? c->attrs.sample_interval : 1);
+ do {
+ nr_regions = 0;
+ damon_for_each_target(t, c) {
+ damon_merge_regions_of(t, threshold, sz_limit);
+ nr_regions += damon_nr_regions(t);
+ }
+ threshold = max(1, threshold * 2);
+ } while (nr_regions > c->attrs.max_nr_regions &&
+ threshold / 2 < max_thres);
}
/*
I have created a bz for this issue as well -
https://bugzilla.redhat.com/show_bug.cgi?id=2293600
In my digging, it seems that the 6.9+ kernels do not have the xpad
kernel module (running # sudo modprobe xpad returns no value)
A workaround was to manually install xone or xpad kernel module and
self sign the key (keeping secure boot intact) however, this is
somewhat tedious since it did work before.