The patch titled
Subject: scripts/gdb: add iteration function for rbtree
has been added to the -mm mm-nonmm-unstable branch. Its filename is
scripts-gdb-add-iteration-function-for-rbtree.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-nonmm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com>
Subject: scripts/gdb: add iteration function for rbtree
Date: Tue, 23 Jul 2024 14:48:58 +0800
Add inorder iteration function for rbtree usage.
This is a preparation patch for the next patch to fix the gdb mounts
issue.
Link: https://lkml.kernel.org/r/20240723064902.124154-3-kuan-ying.lee@canonical.c…
Fixes: 2eea9ce4310d ("mounts: keep list of mounts in an rbtree")
Signed-off-by: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com>
Cc: Jan Kiszka <jan.kiszka(a)siemens.com>
Cc: Kieran Bingham <kbingham(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
scripts/gdb/linux/rbtree.py | 12 ++++++++++++
1 file changed, 12 insertions(+)
--- a/scripts/gdb/linux/rbtree.py~scripts-gdb-add-iteration-function-for-rbtree
+++ a/scripts/gdb/linux/rbtree.py
@@ -9,6 +9,18 @@ from linux import utils
rb_root_type = utils.CachedType("struct rb_root")
rb_node_type = utils.CachedType("struct rb_node")
+def rb_inorder_for_each(root):
+ def inorder(node):
+ if node:
+ yield from inorder(node['rb_left'])
+ yield node
+ yield from inorder(node['rb_right'])
+
+ yield from inorder(root['rb_node'])
+
+def rb_inorder_for_each_entry(root, gdbtype, member):
+ for node in rb_inorder_for_each(root):
+ yield utils.container_of(node, gdbtype, member)
def rb_first(root):
if root.type == rb_root_type.get_type():
_
Patches currently in -mm which might be from kuan-ying.lee(a)canonical.com are
scripts-gdb-fix-timerlist-parsing-issue.patch
scripts-gdb-add-iteration-function-for-rbtree.patch
scripts-gdb-fix-lx-mounts-command-error.patch
scripts-gdb-add-lx-stack_depot_lookup-command.patch
scripts-gdb-add-lx-kasan_mem_to_shadow-command.patch
The patch titled
Subject: scripts/gdb: fix timerlist parsing issue
has been added to the -mm mm-nonmm-unstable branch. Its filename is
scripts-gdb-fix-timerlist-parsing-issue.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-nonmm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com>
Subject: scripts/gdb: fix timerlist parsing issue
Date: Tue, 23 Jul 2024 14:48:57 +0800
Patch series "Fix some GDB command error and add some GDB commands", v3.
Fix some GDB command errors and add some useful GDB commands.
This patch (of 5):
Commit 7988e5ae2be7 ("tick: Split nohz and highres features from
nohz_mode") and commit 7988e5ae2be7 ("tick: Split nohz and highres
features from nohz_mode") move 'tick_stopped' and 'nohz_mode' to flags
field which will break the gdb lx-mounts command:
(gdb) lx-timerlist
Python Exception <class 'gdb.error'>: There is no member named nohz_mode.
Error occurred in Python: There is no member named nohz_mode.
(gdb) lx-timerlist
Python Exception <class 'gdb.error'>: There is no member named tick_stopped.
Error occurred in Python: There is no member named tick_stopped.
We move 'tick_stopped' and 'nohz_mode' to flags field instead.
Link: https://lkml.kernel.org/r/20240723064902.124154-1-kuan-ying.lee@canonical.c…
Link: https://lkml.kernel.org/r/20240723064902.124154-2-kuan-ying.lee@canonical.c…
Fixes: a478ffb2ae23 ("tick: Move individual bit features to debuggable mask accesses")
Fixes: 7988e5ae2be7 ("tick: Split nohz and highres features from nohz_mode")
Signed-off-by: Kuan-Ying Lee <kuan-ying.lee(a)canonical.com>
Cc: Jan Kiszka <jan.kiszka(a)siemens.com>
Cc: Kieran Bingham <kbingham(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
scripts/gdb/linux/timerlist.py | 31 ++++++++++++++++---------------
1 file changed, 16 insertions(+), 15 deletions(-)
--- a/scripts/gdb/linux/timerlist.py~scripts-gdb-fix-timerlist-parsing-issue
+++ a/scripts/gdb/linux/timerlist.py
@@ -87,21 +87,22 @@ def print_cpu(hrtimer_bases, cpu, max_cl
text += "\n"
if constants.LX_CONFIG_TICK_ONESHOT:
- fmts = [(" .{} : {}", 'nohz_mode'),
- (" .{} : {} nsecs", 'last_tick'),
- (" .{} : {}", 'tick_stopped'),
- (" .{} : {}", 'idle_jiffies'),
- (" .{} : {}", 'idle_calls'),
- (" .{} : {}", 'idle_sleeps'),
- (" .{} : {} nsecs", 'idle_entrytime'),
- (" .{} : {} nsecs", 'idle_waketime'),
- (" .{} : {} nsecs", 'idle_exittime'),
- (" .{} : {} nsecs", 'idle_sleeptime'),
- (" .{}: {} nsecs", 'iowait_sleeptime'),
- (" .{} : {}", 'last_jiffies'),
- (" .{} : {}", 'next_timer'),
- (" .{} : {} nsecs", 'idle_expires')]
- text += "\n".join([s.format(f, ts[f]) for s, f in fmts])
+ TS_FLAG_STOPPED = 1 << 1
+ TS_FLAG_NOHZ = 1 << 4
+ text += f" .{'nohz':15s}: {int(bool(ts['flags'] & TS_FLAG_NOHZ))}\n"
+ text += f" .{'last_tick':15s}: {ts['last_tick']}\n"
+ text += f" .{'tick_stopped':15s}: {int(bool(ts['flags'] & TS_FLAG_STOPPED))}\n"
+ text += f" .{'idle_jiffies':15s}: {ts['idle_jiffies']}\n"
+ text += f" .{'idle_calls':15s}: {ts['idle_calls']}\n"
+ text += f" .{'idle_sleeps':15s}: {ts['idle_sleeps']}\n"
+ text += f" .{'idle_entrytime':15s}: {ts['idle_entrytime']} nsecs\n"
+ text += f" .{'idle_waketime':15s}: {ts['idle_waketime']} nsecs\n"
+ text += f" .{'idle_exittime':15s}: {ts['idle_exittime']} nsecs\n"
+ text += f" .{'idle_sleeptime':15s}: {ts['idle_sleeptime']} nsecs\n"
+ text += f" .{'iowait_sleeptime':15s}: {ts['iowait_sleeptime']} nsecs\n"
+ text += f" .{'last_jiffies':15s}: {ts['last_jiffies']}\n"
+ text += f" .{'next_timer':15s}: {ts['next_timer']}\n"
+ text += f" .{'idle_expires':15s}: {ts['idle_expires']} nsecs\n"
text += "\njiffies: {}\n".format(jiffies)
text += "\n"
_
Patches currently in -mm which might be from kuan-ying.lee(a)canonical.com are
scripts-gdb-fix-timerlist-parsing-issue.patch
scripts-gdb-add-iteration-function-for-rbtree.patch
scripts-gdb-fix-lx-mounts-command-error.patch
scripts-gdb-add-lx-stack_depot_lookup-command.patch
scripts-gdb-add-lx-kasan_mem_to_shadow-command.patch
From: Zhang Yi <yi.zhang(a)huawei.com>
commit 4df031ff5876d94b48dd9ee486ba5522382a06b2 upstream
After commit 3da40c7b0898 ("ext4: only call ext4_truncate when size <=
isize"), i_disksize could always be updated to i_size in ext4_setattr(),
and we could sure that i_disksize <= i_size since holding inode lock and
if i_disksize < i_size there are delalloc writes pending in the range
upto i_size. If the end of the current write is <= i_size, there's no
need to touch i_disksize since writeback will push i_disksize upto
i_size eventually. So we can switch to check i_size instead of
i_disksize in ext4_da_write_end() when write to the end of the file.
we also could remove ext4_mark_inode_dirty() together because we defer
inode dirtying to generic_write_end() or ext4_da_write_inline_data_end().
Cc: stable(a)vger.kernel.org
Signed-off-by: Zhang Yi <yi.zhang(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Link: https://lore.kernel.org/r/20210716122024.1105856-2-yi.zhang@huawei.com
Reviewed-by: Cheng Nie <niecheng1(a)uniontech.com>
Signed-off-by: Dandan Zhang <zhangdandan(a)uniontech.com>
Signed-off-by: WangYuli <wangyuli(a)uniontech.com>
---
fs/ext4/inode.c | 34 ++++++++++++++++++----------------
1 file changed, 18 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 646285fbc9fc..d8a8e4ee5ff8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3207,35 +3207,37 @@ static int ext4_da_write_end(struct file *file,
end = start + copied - 1;
/*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
+ * Since we are holding inode lock, we are sure i_disksize <=
+ * i_size. We also know that if i_disksize < i_size, there are
+ * delalloc writes pending in the range upto i_size. If the end of
+ * the current write is <= i_size, there's no need to touch
+ * i_disksize since writeback will push i_disksize upto i_size
+ * eventually. If the end of the current write is > i_size and
+ * inside an allocated block (ext4_da_should_update_i_disksize()
+ * check), we need to update i_disksize here as neither
+ * ext4_writepage() nor certain ext4_writepages() paths not
+ * allocating blocks update i_disksize.
+ *
+ * Note that we defer inode dirtying to generic_write_end() /
+ * ext4_da_write_inline_data_end().
*/
new_i_size = pos + copied;
- if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+ if (copied && new_i_size > inode->i_size) {
if (ext4_has_inline_data(inode) ||
- ext4_da_should_update_i_disksize(page, end)) {
+ ext4_da_should_update_i_disksize(page, end))
ext4_update_i_disksize(inode, new_i_size);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ext4_mark_inode_dirty(handle, inode);
- }
}
if (write_mode != CONVERT_INLINE_DATA &&
ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
ext4_has_inline_data(inode))
- ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+ ret = ext4_da_write_inline_data_end(inode, pos, len, copied,
page);
else
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+ ret = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
- copied = ret2;
- if (ret2 < 0)
- ret = ret2;
+ copied = ret;
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
--
2.43.4
commit 3cad1bc010416c6dd780643476bc59ed742436b9 upstream.
When fcntl_setlk() races with close(), it removes the created lock with
do_lock_file_wait().
However, LSMs can allow the first do_lock_file_wait() that created the lock
while denying the second do_lock_file_wait() that tries to remove the lock.
In theory (but AFAIK not in practice), posix_lock_file() could also fail to
remove a lock due to GFP_KERNEL allocation failure (when splitting a range
in the middle).
After the bug has been triggered, use-after-free reads will occur in
lock_get_status() when userspace reads /proc/locks. This can likely be used
to read arbitrary kernel memory, but can't corrupt kernel memory.
This only affects systems with SELinux / Smack / AppArmor / BPF-LSM in
enforcing mode and only works from some security contexts.
Fix it by calling locks_remove_posix() instead, which is designed to
reliably get rid of POSIX locks associated with the given file and
files_struct and is also used by filp_flush().
Fixes: c293621bbf67 ("[PATCH] stale POSIX lock handling")
Cc: stable(a)kernel.org
Link: https://bugs.chromium.org/p/project-zero/issues/detail?id=2563
Signed-off-by: Jann Horn <jannh(a)google.com>
Link: https://lore.kernel.org/r/20240702-fs-lock-recover-2-v1-1-edd456f63789@goog…
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
[stable fixup: ->c.flc_type was ->fl_type in older kernels]
Signed-off-by: Jann Horn <jannh(a)google.com>
---
fs/locks.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/fs/locks.c b/fs/locks.c
index fb717dae9029..31659a2d9862 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2381,8 +2381,9 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by releasing the
- * lock that was just acquired. There is no need to do that when we're
+ * Detect close/fcntl races and recover by zapping all POSIX locks
+ * associated with this file and our files_struct, just like on
+ * filp_flush(). There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
if (!error && file_lock->fl_type != F_UNLCK &&
@@ -2397,9 +2398,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
- file_lock->fl_type = F_UNLCK;
- error = do_lock_file_wait(filp, cmd, file_lock);
- WARN_ON_ONCE(error);
+ locks_remove_posix(filp, files);
error = -EBADF;
}
}
base-commit: 2eaf5c0d81911ba05bace3a722cbcd708fdbbcba
--
2.45.2.1089.g2a221341d9-goog
Unaccepted memory is considered unusable free memory, which is not
counted as free on the zone watermark check. This causes
get_page_from_freelist() to accept more memory to hit the high
watermark, but it creates problems in the reclaim path.
The reclaim path encounters a failed zone watermark check and attempts
to reclaim memory. This is usually successful, but if there is little or
no reclaimable memory, it can result in endless reclaim with little to
no progress. This can occur early in the boot process, just after start
of the init process when the only reclaimable memory is the page cache
of the init executable and its libraries.
To address this issue, teach shrink_node() and shrink_zones() to accept
memory before attempting to reclaim.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reported-by: Jianxiong Gao <jxgao(a)google.com>
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Cc: stable(a)vger.kernel.org # v6.5+
---
mm/internal.h | 9 +++++++++
mm/page_alloc.c | 8 +-------
mm/vmscan.c | 36 ++++++++++++++++++++++++++++++++++++
3 files changed, 46 insertions(+), 7 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index cc2c5e07fad3..ea55cbad061f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1515,4 +1515,13 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;
+#ifdef CONFIG_UNACCEPTED_MEMORY
+bool try_to_accept_memory(struct zone *zone, unsigned int order);
+#else
+static inline bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+ return false;
+}
+#endif /* CONFIG_UNACCEPTED_MEMORY */
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9ecf99190ea2..9a108c92245f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -287,7 +287,6 @@ EXPORT_SYMBOL(nr_online_nodes);
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
-static bool try_to_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);
@@ -6940,7 +6939,7 @@ static bool try_to_accept_memory_one(struct zone *zone)
return true;
}
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+bool try_to_accept_memory(struct zone *zone, unsigned int order)
{
long to_accept;
int ret = false;
@@ -6999,11 +6998,6 @@ static void accept_page(struct page *page, unsigned int order)
{
}
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
-{
- return false;
-}
-
static inline bool has_unaccepted_memory(void)
{
return false;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34de9cd0d4..b2af1263b1bc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5900,12 +5900,44 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
+#ifdef CONFIG_UNACCEPTED_MEMORY
+static bool node_try_to_accept_memory(pg_data_t *pgdat, struct scan_control *sc)
+{
+ bool progress = false;
+ struct zone *zone;
+ int z;
+
+ for (z = 0; z <= sc->reclaim_idx; z++) {
+ zone = pgdat->node_zones + z;
+ if (!managed_zone(zone))
+ continue;
+
+ if (try_to_accept_memory(zone, sc->order))
+ progress = true;
+ }
+
+ return progress;
+}
+#else
+static inline bool node_try_to_accept_memory(pg_data_t *pgdat,
+ struct scan_control *sc)
+{
+ return false;
+}
+#endif /* CONFIG_UNACCEPTED_MEMORY */
+
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
struct lruvec *target_lruvec;
bool reclaimable = false;
+ /* Try to accept memory before going for reclaim */
+ if (node_try_to_accept_memory(pgdat, sc)) {
+ if (!should_continue_reclaim(pgdat, 0, sc))
+ return;
+ }
+
if (lru_gen_enabled() && root_reclaim(sc)) {
lru_gen_shrink_node(pgdat, sc);
return;
@@ -6118,6 +6150,10 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
GFP_KERNEL | __GFP_HARDWALL))
continue;
+ /* Try to accept memory before going for reclaim */
+ if (try_to_accept_memory(zone, sc->order))
+ continue;
+
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
--
2.43.0