Changelog: v8: Added Reviewed-by tags and addressed comments from Mike Rapoport and Pratyush Yadav. Added "memblock: Unpreserve memory in case of error" to handle rollback if preservation fails halfway through.
This series refactors the Kexec HandOver (KHO) framework to better support in-kernel users like the upcoming Live Update Orchestrator (LUO). The current design, which relies on a notifier chain and debugfs for control, is too restrictive for direct programmatic use.
The core of this rework is the removal of the notifier chain in favor of a direct registration API. This decouples clients from the shutdown-time finalization sequence, allowing them to manage their preserved state more flexibly and at any time.
In support of this new model, this series also:
- Exports kho_finalize() and kho_abort() for programmatic control.
- Makes the debugfs interface optional.
- Introduces APIs to unpreserve memory and fixes a bug in the abort path where client state was being incorrectly discarded. Note that this is an interim step, as a more comprehensive fix is planned as part of the stateless KHO work [1].
- Moves all KHO code into a new kernel/liveupdate/ directory to consolidate live update components.
[1] https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com
Mike Rapoport (Microsoft) (1): kho: drop notifiers
Pasha Tatashin (7):
  kho: allow to drive kho from within kernel
  kho: make debugfs interface optional
  kho: add interfaces to unpreserve folios and page ranges
  kho: don't unpreserve memory during abort
  liveupdate: kho: move to kernel/liveupdate
  liveupdate: kho: move kho debugfs directory to liveupdate
  memblock: Unpreserve memory in case of error
Documentation/core-api/kho/concepts.rst | 2 +- MAINTAINERS | 3 +- include/linux/kexec_handover.h | 53 +- init/Kconfig | 2 + kernel/Kconfig.kexec | 24 - kernel/Makefile | 3 +- kernel/kexec_handover_internal.h | 16 - kernel/liveupdate/Kconfig | 39 ++ kernel/liveupdate/Makefile | 5 + kernel/{ => liveupdate}/kexec_handover.c | 508 +++++++----------- .../{ => liveupdate}/kexec_handover_debug.c | 0 kernel/liveupdate/kexec_handover_debugfs.c | 219 ++++++++ kernel/liveupdate/kexec_handover_internal.h | 56 ++ lib/test_kho.c | 33 +- mm/memblock.c | 82 ++- tools/testing/selftests/kho/init.c | 2 +- tools/testing/selftests/kho/vmtest.sh | 1 + 17 files changed, 590 insertions(+), 458 deletions(-) delete mode 100644 kernel/kexec_handover_internal.h create mode 100644 kernel/liveupdate/Kconfig create mode 100644 kernel/liveupdate/Makefile rename kernel/{ => liveupdate}/kexec_handover.c (80%) rename kernel/{ => liveupdate}/kexec_handover_debug.c (100%) create mode 100644 kernel/liveupdate/kexec_handover_debugfs.c create mode 100644 kernel/liveupdate/kexec_handover_internal.h
base-commit: 72fb0170ef1f45addf726319c52a0562b6913707
Allow the kernel to drive finalize and abort without requiring triggers from userspace.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Reviewed-by: Pratyush Yadav pratyush@kernel.org Reviewed-by: Mike Rapoport (Microsoft) rppt@kernel.org --- include/linux/kexec_handover.h | 15 +++++++ kernel/kexec_handover.c | 75 +++++++++++++++++++++------------- 2 files changed, 61 insertions(+), 29 deletions(-)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 25042c1d8d54..04d0108db98e 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -67,6 +67,10 @@ void kho_memory_init(void);
void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len); + +int kho_finalize(void); +int kho_abort(void); + #else static inline bool kho_is_enabled(void) { @@ -139,6 +143,17 @@ static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) { } + +static inline int kho_finalize(void) +{ + return -EOPNOTSUPP; +} + +static inline int kho_abort(void) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_KEXEC_HANDOVER */
#endif /* LINUX_KEXEC_HANDOVER_H */ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index de4466b47455..6458f369a346 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1087,7 +1087,7 @@ static int kho_out_update_debugfs_fdt(void) return err; }
-static int kho_abort(void) +static int __kho_abort(void) { int err; unsigned long order; @@ -1120,7 +1120,27 @@ static int kho_abort(void) return err; }
-static int kho_finalize(void) +int kho_abort(void) +{ + int ret = 0; + + if (!kho_enable) + return -EOPNOTSUPP; + + guard(mutex)(&kho_out.lock); + if (!kho_out.finalized) + return -ENOENT; + + ret = __kho_abort(); + if (ret) + return ret; + + kho_out.finalized = false; + + return kho_out_update_debugfs_fdt(); +} + +static int __kho_finalize(void) { int err = 0; u64 *preserved_mem_map; @@ -1163,12 +1183,32 @@ static int kho_finalize(void) abort: if (err) { pr_err("Failed to convert KHO state tree: %d\n", err); - kho_abort(); + __kho_abort(); }
return err; }
+int kho_finalize(void) +{ + int ret; + + if (!kho_enable) + return -EOPNOTSUPP; + + guard(mutex)(&kho_out.lock); + if (kho_out.finalized) + return -EEXIST; + + ret = __kho_finalize(); + if (ret) + return ret; + + kho_out.finalized = true; + + return kho_out_update_debugfs_fdt(); +} + static int kho_out_finalize_get(void *data, u64 *val) { mutex_lock(&kho_out.lock); @@ -1178,35 +1218,12 @@ static int kho_out_finalize_get(void *data, u64 *val) return 0; }
-static int kho_out_finalize_set(void *data, u64 _val) +static int kho_out_finalize_set(void *data, u64 val) { - int ret = 0; - bool val = !!_val; - - mutex_lock(&kho_out.lock); - - if (val == kho_out.finalized) { - if (kho_out.finalized) - ret = -EEXIST; - else - ret = -ENOENT; - goto unlock; - } - if (val) - ret = kho_finalize(); + return kho_finalize(); else - ret = kho_abort(); - - if (ret) - goto unlock; - - kho_out.finalized = val; - ret = kho_out_update_debugfs_fdt(); - -unlock: - mutex_unlock(&kho_out.lock); - return ret; + return kho_abort(); }
DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
Currently, KHO is controlled via a debugfs interface, but once LUO is introduced, LUO can control KHO and the debugfs interface becomes optional.
Add a separate config option, CONFIG_KEXEC_HANDOVER_DEBUGFS, that enables the debugfs interface and allows inspecting the tree.
Move all debugfs-related code to a new file to keep the .c files clear of ifdefs.
Co-developed-by: Mike Rapoport (Microsoft) rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) rppt@kernel.org Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Reviewed-by: Pratyush Yadav pratyush@kernel.org --- MAINTAINERS | 3 +- kernel/Kconfig.kexec | 10 ++ kernel/Makefile | 1 + kernel/kexec_handover.c | 226 +++----------------------- kernel/kexec_handover_debugfs.c | 216 ++++++++++++++++++++++++ kernel/kexec_handover_internal.h | 37 +++++ tools/testing/selftests/kho/vmtest.sh | 1 + 7 files changed, 286 insertions(+), 208 deletions(-) create mode 100644 kernel/kexec_handover_debugfs.c
diff --git a/MAINTAINERS b/MAINTAINERS index 5ea78444f035..35d7942b2082 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13815,13 +13815,14 @@ KEXEC HANDOVER (KHO) M: Alexander Graf graf@amazon.com M: Mike Rapoport rppt@kernel.org M: Changyuan Lyu changyuanl@google.com +M: Pasha Tatashin pasha.tatashin@soleen.com L: kexec@lists.infradead.org L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h -F: kernel/kexec_handover.c +F: kernel/kexec_handover* F: tools/testing/selftests/kho/
KEYS-ENCRYPTED diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index c94d36b5fcd9..9308a0fb1419 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -118,6 +118,16 @@ config KEXEC_HANDOVER_DEBUG scenarios and the extra code might be adding overhead it is only optionally enabled.
+config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + depends on KEXEC_HANDOVER + depends on DEBUG_FS + help + Allow to control kexec handover device tree via debugfs + interface, i.e. finalize the state or aborting the finalization. + Also, enables inspecting the KHO fdt trees with the debugfs binary + blobs. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index 9fe722305c9b..2cf7909a74e5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 6458f369a346..da071277d85e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -11,7 +11,6 @@ #include <linux/cleanup.h> #include <linux/cma.h> #include <linux/count_zeros.h> -#include <linux/debugfs.h> #include <linux/kexec.h> #include <linux/kexec_handover.h> #include <linux/libfdt.h> @@ -30,6 +29,7 @@ */ #include "../mm/internal.h" #include "kexec_internal.h" +#include "kexec_handover_internal.h"
#define KHO_FDT_COMPATIBLE "kho-v1" #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" @@ -105,8 +105,6 @@ struct khoser_mem_chunk;
struct kho_serialization { struct page *fdt; - struct list_head fdt_list; - struct dentry *sub_fdt_dir; struct kho_mem_track track; /* First chunk of serialized preserved memory map */ struct khoser_mem_chunk *preserved_mem_map; @@ -114,20 +112,16 @@ struct kho_serialization {
struct kho_out { struct blocking_notifier_head chain_head; - - struct dentry *dir; - struct mutex lock; /* protects KHO FDT finalization */ - struct kho_serialization ser; bool finalized; + struct kho_debugfs dbg; };
static struct kho_out kho_out = { .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), .lock = __MUTEX_INITIALIZER(kho_out.lock), .ser = { - .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), .track = { .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), }, @@ -477,8 +471,8 @@ static void __init kho_mem_deserialize(const void *fdt) * area for early allocations that happen before page allocator is * initialized. */ -static struct kho_scratch *kho_scratch; -static unsigned int kho_scratch_cnt; +struct kho_scratch *kho_scratch; +unsigned int kho_scratch_cnt;
/* * The scratch areas are scaled by default as percent of memory allocated from @@ -674,37 +668,6 @@ static void __init kho_reserve_scratch(void) kho_enable = false; }
-struct fdt_debugfs { - struct list_head list; - struct debugfs_blob_wrapper wrapper; - struct dentry *file; -}; - -static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, - const char *name, const void *fdt) -{ - struct fdt_debugfs *f; - struct dentry *file; - - f = kmalloc(sizeof(*f), GFP_KERNEL); - if (!f) - return -ENOMEM; - - f->wrapper.data = (void *)fdt; - f->wrapper.size = fdt_totalsize(fdt); - - file = debugfs_create_blob(name, 0400, dir, &f->wrapper); - if (IS_ERR(file)) { - kfree(f); - return PTR_ERR(file); - } - - f->file = file; - list_add(&f->list, list); - - return 0; -} - /** * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. * @ser: serialization control object passed by KHO notifiers. @@ -716,7 +679,8 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, * by KHO for the new kernel to retrieve it after kexec. * * A debugfs blob entry is also created at - * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. + * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with + * CONFIG_KEXEC_HANDOVER_DEBUGFS * * Return: 0 on success, error code on failure */ @@ -733,7 +697,7 @@ int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) if (err) return err;
- return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt); + return kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false); } EXPORT_SYMBOL_GPL(kho_add_subtree);
@@ -1064,29 +1028,6 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) } EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
-/* Handling for debug/kho/out */ - -static struct dentry *debugfs_root; - -static int kho_out_update_debugfs_fdt(void) -{ - int err = 0; - struct fdt_debugfs *ff, *tmp; - - if (kho_out.finalized) { - err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir, - "fdt", page_to_virt(kho_out.ser.fdt)); - } else { - list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) { - debugfs_remove(ff->file); - list_del(&ff->list); - kfree(ff); - } - } - - return err; -} - static int __kho_abort(void) { int err; @@ -1136,8 +1077,9 @@ int kho_abort(void) return ret;
kho_out.finalized = false; + kho_debugfs_cleanup(&kho_out.dbg);
- return kho_out_update_debugfs_fdt(); + return 0; }
static int __kho_finalize(void) @@ -1206,92 +1148,23 @@ int kho_finalize(void)
kho_out.finalized = true;
- return kho_out_update_debugfs_fdt(); + return kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + page_to_virt(kho_out.ser.fdt), true); }
-static int kho_out_finalize_get(void *data, u64 *val) +bool kho_finalized(void) { - mutex_lock(&kho_out.lock); - *val = kho_out.finalized; - mutex_unlock(&kho_out.lock); - - return 0; -} - -static int kho_out_finalize_set(void *data, u64 val) -{ - if (val) - return kho_finalize(); - else - return kho_abort(); -} - -DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get, - kho_out_finalize_set, "%llu\n"); - -static int scratch_phys_show(struct seq_file *m, void *v) -{ - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].addr); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(scratch_phys); - -static int scratch_len_show(struct seq_file *m, void *v) -{ - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].size); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(scratch_len); - -static __init int kho_out_debugfs_init(void) -{ - struct dentry *dir, *f, *sub_fdt_dir; - - dir = debugfs_create_dir("out", debugfs_root); - if (IS_ERR(dir)) - return -ENOMEM; - - sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); - if (IS_ERR(sub_fdt_dir)) - goto err_rmdir; - - f = debugfs_create_file("scratch_phys", 0400, dir, NULL, - &scratch_phys_fops); - if (IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("scratch_len", 0400, dir, NULL, - &scratch_len_fops); - if (IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("finalize", 0600, dir, NULL, - &fops_kho_out_finalize); - if (IS_ERR(f)) - goto err_rmdir; - - kho_out.dir = dir; - kho_out.ser.sub_fdt_dir = sub_fdt_dir; - return 0; - -err_rmdir: - debugfs_remove_recursive(dir); - return -ENOENT; + guard(mutex)(&kho_out.lock); + return kho_out.finalized; }
struct kho_in { - struct dentry *dir; phys_addr_t fdt_phys; phys_addr_t scratch_phys; - struct list_head fdt_list; + struct kho_debugfs dbg; };
static struct kho_in kho_in = { - .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list), };
static const void *kho_get_fdt(void) @@ -1355,56 +1228,6 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
-/* Handling for debugfs/kho/in */ - -static __init int kho_in_debugfs_init(const void *fdt) -{ - struct dentry *sub_fdt_dir; - int err, child; - - kho_in.dir = debugfs_create_dir("in", debugfs_root); - if (IS_ERR(kho_in.dir)) - return PTR_ERR(kho_in.dir); - - sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir); - if (IS_ERR(sub_fdt_dir)) { - err = PTR_ERR(sub_fdt_dir); - goto err_rmdir; - } - - err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt); - if (err) - goto err_rmdir; - - fdt_for_each_subnode(child, fdt, 0) { - int len = 0; - const char *name = fdt_get_name(fdt, child, NULL); - const u64 *fdt_phys; - - fdt_phys = fdt_getprop(fdt, child, "fdt", &len); - if (!fdt_phys) - continue; - if (len != sizeof(*fdt_phys)) { - pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n", - name, len); - continue; - } - err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name, - phys_to_virt(*fdt_phys)); - if (err) { - pr_warn("failed to add fdt `%s` to debugfs: %d\n", name, - err); - continue; - } - } - - return 0; - -err_rmdir: - debugfs_remove_recursive(kho_in.dir); - return err; -} - static __init int kho_init(void) { int err = 0; @@ -1419,27 +1242,16 @@ static __init int kho_init(void) goto err_free_scratch; }
- debugfs_root = debugfs_create_dir("kho", NULL); - if (IS_ERR(debugfs_root)) { - err = -ENOENT; + err = kho_debugfs_init(); + if (err) goto err_free_fdt; - }
- err = kho_out_debugfs_init(); + err = kho_out_debugfs_init(&kho_out.dbg); if (err) goto err_free_fdt;
if (fdt) { - err = kho_in_debugfs_init(fdt); - /* - * Failure to create /sys/kernel/debug/kho/in does not prevent - * reviving state from KHO and setting up KHO for the next - * kexec. - */ - if (err) - pr_err("failed exposing handover FDT in debugfs: %d\n", - err); - + kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; }
diff --git a/kernel/kexec_handover_debugfs.c b/kernel/kexec_handover_debugfs.c new file mode 100644 index 000000000000..a91b279f1b23 --- /dev/null +++ b/kernel/kexec_handover_debugfs.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debugfs.c - kexec handover debugfs interfaces + * Copyright (C) 2023 Alexander Graf graf@amazon.com + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport rppt@kernel.org + * Copyright (C) 2025 Google LLC, Changyuan Lyu changyuanl@google.com + * Copyright (C) 2025 Google LLC, Pasha Tatashin pasha.tatashin@soleen.com + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include <linux/init.h> +#include <linux/io.h> +#include <linux/libfdt.h> +#include <linux/mm.h> +#include "kexec_handover_internal.h" + +static struct dentry *debugfs_root; + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct dentry *file; +}; + +static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if (IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) +{ + struct dentry *dir; + + if (root) + dir = dbg->dir; + else + dir = dbg->sub_fdt_dir; + + return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); +} + +void kho_debugfs_cleanup(struct kho_debugfs *dbg) +{ + struct fdt_debugfs *ff, *tmp; + + list_for_each_entry_safe(ff, tmp, &dbg->fdt_list, list) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + } +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + *val = kho_finalized(); + + 
return 0; +} + +static int kho_out_finalize_set(void *data, u64 val) +{ + if (val) + return kho_finalize(); + else + return kho_abort(); +} + +DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) +{ + struct dentry *dir, *sub_fdt_dir; + int err, child; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto err_out; + } + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node %s prop fdt has invalid length: %d\n", + name, len); + continue; + } + err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt %s to debugfs: %d\n", name, + err); + continue; + } + } + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + + return; +err_rmdir: + debugfs_remove_recursive(dir); +err_out: + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. 
+ */ + if (err) + pr_err("failed exposing handover FDT in debugfs: %d\n", err); +} + +__init int kho_out_debugfs_init(struct kho_debugfs *dbg) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &kho_out_finalize_fops); + if (IS_ERR(f)) + goto err_rmdir; + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +__init int kho_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) + return -ENOENT; + return 0; +} diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h index 05e9720ba7b9..28c0e971613d 100644 --- a/kernel/kexec_handover_internal.h +++ b/kernel/kexec_handover_internal.h @@ -2,8 +2,45 @@ #ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H #define LINUX_KEXEC_HANDOVER_INTERNAL_H
+#include <linux/kexec_handover.h> +#include <linux/list.h> #include <linux/types.h>
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +#include <linux/debugfs.h> + +struct kho_debugfs { + struct dentry *dir; + struct dentry *sub_fdt_dir; + struct list_head fdt_list; +}; + +#else +struct kho_debugfs {}; +#endif + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +bool kho_finalized(void); + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +int kho_debugfs_init(void); +void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); +int kho_out_debugfs_init(struct kho_debugfs *dbg); +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root); +void kho_debugfs_cleanup(struct kho_debugfs *dbg); +#else +static inline int kho_debugfs_init(void) { return 0; } +static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, + const void *fdt) { } +static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } +static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) { return 0; } +static inline void kho_debugfs_cleanup(struct kho_debugfs *dbg) {} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ + #ifdef CONFIG_KEXEC_HANDOVER_DEBUG bool kho_scratch_overlap(phys_addr_t phys, size_t size); #else diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh index 3f6c17166846..49fdac8e8b15 100755 --- a/tools/testing/selftests/kho/vmtest.sh +++ b/tools/testing/selftests/kho/vmtest.sh @@ -59,6 +59,7 @@ function build_kernel() { tee "$kconfig" > "$kho_config" <<EOF CONFIG_BLK_DEV_INITRD=y CONFIG_KEXEC_HANDOVER=y +CONFIG_KEXEC_HANDOVER_DEBUGFS=y CONFIG_TEST_KEXEC_HANDOVER=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_VM=y
From: "Mike Rapoport (Microsoft)" rppt@kernel.org
The KHO framework uses a notifier chain as the mechanism for clients to participate in the finalization process. While this works for a single, central state machine, it is too restrictive for kernel-internal components like pstore/reserve_mem or IMA. These components need a simpler, direct way to register their state for preservation (e.g., during their initcall) without being part of a complex, shutdown-time notifier sequence. The notifier model forces all participants into a single finalization flow and makes direct preservation from an arbitrary context difficult. This patch refactors the client participation model by removing the notifier chain and introducing a direct API for managing FDT subtrees.
The core kho_finalize() and kho_abort() state machine remains, but clients now register their data with KHO beforehand.
Signed-off-by: Mike Rapoport (Microsoft) rppt@kernel.org Co-developed-by: Pasha Tatashin pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- include/linux/kexec_handover.h | 28 +----- kernel/kexec_handover.c | 166 +++++++++++++++++-------------- kernel/kexec_handover_debugfs.c | 17 ++-- kernel/kexec_handover_internal.h | 5 +- lib/test_kho.c | 33 +----- mm/memblock.c | 62 +++--------- 6 files changed, 126 insertions(+), 185 deletions(-)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 04d0108db98e..2faf290803ce 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -10,14 +10,7 @@ struct kho_scratch { phys_addr_t size; };
-/* KHO Notifier index */ -enum kho_event { - KEXEC_KHO_FINALIZE = 0, - KEXEC_KHO_ABORT = 1, -}; - struct folio; -struct notifier_block; struct page;
#define DECLARE_KHOSER_PTR(name, type) \ @@ -37,8 +30,6 @@ struct page; (typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \ })
-struct kho_serialization; - struct kho_vmalloc_chunk; struct kho_vmalloc { DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); @@ -57,12 +48,10 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); +int kho_add_subtree(const char *name, void *fdt); +void kho_remove_subtree(void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
-int register_kho_notifier(struct notifier_block *nb); -int unregister_kho_notifier(struct notifier_block *nb); - void kho_memory_init(void);
void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, @@ -114,23 +103,16 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; }
-static inline int kho_add_subtree(struct kho_serialization *ser, - const char *name, void *fdt) +static inline int kho_add_subtree(const char *name, void *fdt) { return -EOPNOTSUPP; }
-static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +static inline void kho_remove_subtree(void *fdt) { - return -EOPNOTSUPP; }
-static inline int register_kho_notifier(struct notifier_block *nb) -{ - return -EOPNOTSUPP; -} - -static inline int unregister_kho_notifier(struct notifier_block *nb) +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) { return -EOPNOTSUPP; } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index da071277d85e..82137eba1474 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -16,7 +16,6 @@ #include <linux/libfdt.h> #include <linux/list.h> #include <linux/memblock.h> -#include <linux/notifier.h> #include <linux/page-isolation.h> #include <linux/vmalloc.h>
@@ -103,29 +102,34 @@ struct kho_mem_track {
struct khoser_mem_chunk;
-struct kho_serialization { - struct page *fdt; - struct kho_mem_track track; - /* First chunk of serialized preserved memory map */ - struct khoser_mem_chunk *preserved_mem_map; +struct kho_sub_fdt { + struct list_head l; + const char *name; + void *fdt; };
struct kho_out { - struct blocking_notifier_head chain_head; - struct mutex lock; /* protects KHO FDT finalization */ - struct kho_serialization ser; + void *fdt; bool finalized; + struct mutex lock; /* protects KHO FDT finalization */ + + struct list_head sub_fdts; + struct mutex fdts_lock; + + struct kho_mem_track track; + /* First chunk of serialized preserved memory map */ + struct khoser_mem_chunk *preserved_mem_map; + struct kho_debugfs dbg; };
static struct kho_out kho_out = { - .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), .lock = __MUTEX_INITIALIZER(kho_out.lock), - .ser = { - .track = { - .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), - }, + .track = { + .orders = XARRAY_INIT(kho_out.track.orders, 0), }, + .sub_fdts = LIST_HEAD_INIT(kho_out.sub_fdts), + .fdts_lock = __MUTEX_INITIALIZER(kho_out.fdts_lock), .finalized = false, };
@@ -369,7 +373,7 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) } }
-static int kho_mem_serialize(struct kho_serialization *ser) +static int kho_mem_serialize(struct kho_out *kho_out) { struct khoser_mem_chunk *first_chunk = NULL; struct khoser_mem_chunk *chunk = NULL; @@ -377,7 +381,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) unsigned long order; int err = -ENOMEM;
- xa_for_each(&ser->track.orders, order, physxa) { + xa_for_each(&kho_out->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys;
@@ -409,7 +413,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) } }
- ser->preserved_mem_map = first_chunk; + kho_out->preserved_mem_map = first_chunk;
return 0;
@@ -670,7 +674,6 @@ static void __init kho_reserve_scratch(void)
/** * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. - * @ser: serialization control object passed by KHO notifiers. * @name: name of the sub tree. * @fdt: the sub tree blob. * @@ -684,34 +687,41 @@ static void __init kho_reserve_scratch(void) * * Return: 0 on success, error code on failure */ -int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) +int kho_add_subtree(const char *name, void *fdt) { - int err = 0; - u64 phys = (u64)virt_to_phys(fdt); - void *root = page_to_virt(ser->fdt); + struct kho_sub_fdt *sub_fdt;
- err |= fdt_begin_node(root, name); - err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); - err |= fdt_end_node(root); + sub_fdt = kmalloc(sizeof(*sub_fdt), GFP_KERNEL); + if (!sub_fdt) + return -ENOMEM;
- if (err) - return err; + INIT_LIST_HEAD(&sub_fdt->l); + sub_fdt->name = name; + sub_fdt->fdt = fdt;
- return kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false); -} -EXPORT_SYMBOL_GPL(kho_add_subtree); + guard(mutex)(&kho_out.fdts_lock); + list_add_tail(&sub_fdt->l, &kho_out.sub_fdts); + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));
-int register_kho_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&kho_out.chain_head, nb); + return 0; } -EXPORT_SYMBOL_GPL(register_kho_notifier); +EXPORT_SYMBOL_GPL(kho_add_subtree);
-int unregister_kho_notifier(struct notifier_block *nb) +void kho_remove_subtree(void *fdt) { - return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); + struct kho_sub_fdt *sub_fdt; + + guard(mutex)(&kho_out.fdts_lock); + list_for_each_entry(sub_fdt, &kho_out.sub_fdts, l) { + if (sub_fdt->fdt == fdt) { + list_del(&sub_fdt->l); + kfree(sub_fdt); + kho_debugfs_fdt_remove(&kho_out.dbg, fdt); + break; + } + } } -EXPORT_SYMBOL_GPL(unregister_kho_notifier); +EXPORT_SYMBOL_GPL(kho_remove_subtree);
/** * kho_preserve_folio - preserve a folio across kexec. @@ -726,7 +736,7 @@ int kho_preserve_folio(struct folio *folio) { const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track;
if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) return -EINVAL; @@ -747,7 +757,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio); */ int kho_preserve_pages(struct page *page, unsigned int nr_pages) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn = start_pfn; @@ -848,7 +858,7 @@ static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur
static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
__kho_unpreserve(track, pfn, pfn + 1); @@ -1030,11 +1040,11 @@ EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
static int __kho_abort(void) { - int err; + int err = 0; unsigned long order; struct kho_mem_phys *physxa;
- xa_for_each(&kho_out.ser.track.orders, order, physxa) { + xa_for_each(&kho_out.track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys;
@@ -1044,17 +1054,13 @@ static int __kho_abort(void) xa_destroy(&physxa->phys_bits); kfree(physxa); } - xa_destroy(&kho_out.ser.track.orders); + xa_destroy(&kho_out.track.orders);
- if (kho_out.ser.preserved_mem_map) { - kho_mem_ser_free(kho_out.ser.preserved_mem_map); - kho_out.ser.preserved_mem_map = NULL; + if (kho_out.preserved_mem_map) { + kho_mem_ser_free(kho_out.preserved_mem_map); + kho_out.preserved_mem_map = NULL; }
- err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, - NULL); - err = notifier_to_errno(err); - if (err) pr_err("Failed to abort KHO finalization: %d\n", err);
@@ -1077,7 +1083,8 @@ int kho_abort(void) return ret;
kho_out.finalized = false; - kho_debugfs_cleanup(&kho_out.dbg); + + kho_debugfs_fdt_remove(&kho_out.dbg, kho_out.fdt);
return 0; } @@ -1086,41 +1093,46 @@ static int __kho_finalize(void) { int err = 0; u64 *preserved_mem_map; - void *fdt = page_to_virt(kho_out.ser.fdt); + void *root = kho_out.fdt; + struct kho_sub_fdt *fdt;
- err |= fdt_create(fdt, PAGE_SIZE); - err |= fdt_finish_reservemap(fdt); - err |= fdt_begin_node(fdt, ""); - err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); + err |= fdt_create(root, PAGE_SIZE); + err |= fdt_finish_reservemap(root); + err |= fdt_begin_node(root, ""); + err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); /** * Reserve the preserved-memory-map property in the root FDT, so * that all property definitions will precede subnodes created by * KHO callers. */ - err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP, + err |= fdt_property_placeholder(root, PROP_PRESERVED_MEMORY_MAP, sizeof(*preserved_mem_map), (void **)&preserved_mem_map); if (err) goto abort;
- err = kho_preserve_folio(page_folio(kho_out.ser.fdt)); + err = kho_preserve_folio(virt_to_folio(kho_out.fdt)); if (err) goto abort;
- err = blocking_notifier_call_chain(&kho_out.chain_head, - KEXEC_KHO_FINALIZE, &kho_out.ser); - err = notifier_to_errno(err); + err = kho_mem_serialize(&kho_out); if (err) goto abort;
- err = kho_mem_serialize(&kho_out.ser); - if (err) - goto abort; + *preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map); + + mutex_lock(&kho_out.fdts_lock); + list_for_each_entry(fdt, &kho_out.sub_fdts, l) { + phys_addr_t phys = virt_to_phys(fdt->fdt);
- *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map); + err |= fdt_begin_node(root, fdt->name); + err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); + err |= fdt_end_node(root); + } + mutex_unlock(&kho_out.fdts_lock);
- err |= fdt_end_node(fdt); - err |= fdt_finish(fdt); + err |= fdt_end_node(root); + err |= fdt_finish(root);
abort: if (err) { @@ -1148,8 +1160,10 @@ int kho_finalize(void)
kho_out.finalized = true;
- return kho_debugfs_fdt_add(&kho_out.dbg, "fdt", - page_to_virt(kho_out.ser.fdt), true); + WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + kho_out.fdt, true)); + + return 0; }
bool kho_finalized(void) @@ -1232,15 +1246,17 @@ static __init int kho_init(void) { int err = 0; const void *fdt = kho_get_fdt(); + struct page *fdt_page;
if (!kho_enable) return 0;
- kho_out.ser.fdt = alloc_page(GFP_KERNEL); - if (!kho_out.ser.fdt) { + fdt_page = alloc_page(GFP_KERNEL); + if (!fdt_page) { err = -ENOMEM; goto err_free_scratch; } + kho_out.fdt = page_to_virt(fdt_page);
err = kho_debugfs_init(); if (err) @@ -1268,8 +1284,8 @@ static __init int kho_init(void) return 0;
err_free_fdt: - put_page(kho_out.ser.fdt); - kho_out.ser.fdt = NULL; + put_page(fdt_page); + kho_out.fdt = NULL; err_free_scratch: for (int i = 0; i < kho_scratch_cnt; i++) { void *start = __va(kho_scratch[i].addr); @@ -1280,7 +1296,7 @@ static __init int kho_init(void) kho_enable = false; return err; } -late_initcall(kho_init); +fs_initcall(kho_init);
static void __init kho_release_scratch(void) { @@ -1416,7 +1432,7 @@ int kho_fill_kimage(struct kimage *image) if (!kho_out.finalized) return 0;
- image->kho.fdt = page_to_phys(kho_out.ser.fdt); + image->kho.fdt = virt_to_phys(kho_out.fdt);
scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; scratch = (struct kexec_buf){ diff --git a/kernel/kexec_handover_debugfs.c b/kernel/kexec_handover_debugfs.c index a91b279f1b23..46e9e6c0791f 100644 --- a/kernel/kexec_handover_debugfs.c +++ b/kernel/kexec_handover_debugfs.c @@ -61,14 +61,17 @@ int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); }
-void kho_debugfs_cleanup(struct kho_debugfs *dbg) +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) { - struct fdt_debugfs *ff, *tmp; - - list_for_each_entry_safe(ff, tmp, &dbg->fdt_list, list) { - debugfs_remove(ff->file); - list_del(&ff->list); - kfree(ff); + struct fdt_debugfs *ff; + + list_for_each_entry(ff, &dbg->fdt_list, list) { + if (ff->wrapper.data == fdt) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + break; + } } }
diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h index 28c0e971613d..17ae101dc6ae 100644 --- a/kernel/kexec_handover_internal.h +++ b/kernel/kexec_handover_internal.h @@ -30,7 +30,7 @@ void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); int kho_out_debugfs_init(struct kho_debugfs *dbg); int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, const void *fdt, bool root); -void kho_debugfs_cleanup(struct kho_debugfs *dbg); +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt); #else static inline int kho_debugfs_init(void) { return 0; } static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, @@ -38,7 +38,8 @@ static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, const void *fdt, bool root) { return 0; } -static inline void kho_debugfs_cleanup(struct kho_debugfs *dbg) {} +static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, + void *fdt) { } #endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */
#ifdef CONFIG_KEXEC_HANDOVER_DEBUG diff --git a/lib/test_kho.c b/lib/test_kho.c index 60cd899ea745..1c6c4ce83666 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -39,33 +39,6 @@ struct kho_test_state {
static struct kho_test_state kho_test_state;
-static int kho_test_notifier(struct notifier_block *self, unsigned long cmd, - void *v) -{ - struct kho_test_state *state = &kho_test_state; - struct kho_serialization *ser = v; - int err = 0; - - switch (cmd) { - case KEXEC_KHO_ABORT: - return NOTIFY_DONE; - case KEXEC_KHO_FINALIZE: - /* Handled below */ - break; - default: - return NOTIFY_BAD; - } - - err |= kho_preserve_folio(state->fdt); - err |= kho_add_subtree(ser, KHO_TEST_FDT, folio_address(state->fdt)); - - return err ? NOTIFY_BAD : NOTIFY_DONE; -} - -static struct notifier_block kho_test_nb = { - .notifier_call = kho_test_notifier, -}; - static int kho_test_save_data(struct kho_test_state *state, void *fdt) { phys_addr_t *folios_info __free(kvfree) = NULL; @@ -120,6 +93,7 @@ static int kho_test_prepare_fdt(struct kho_test_state *state)
fdt = folio_address(state->fdt);
+ err |= kho_preserve_folio(state->fdt); err |= fdt_create(fdt, fdt_size); err |= fdt_finish_reservemap(fdt);
@@ -131,6 +105,7 @@ static int kho_test_prepare_fdt(struct kho_test_state *state)
err |= fdt_finish(fdt);
+ err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt)); if (err) folio_put(state->fdt);
@@ -203,7 +178,7 @@ static int kho_test_save(void) if (err) goto err_free_folios;
- err = register_kho_notifier(&kho_test_nb); + err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt)); if (err) goto err_free_fdt;
@@ -326,7 +301,7 @@ static void kho_test_cleanup(void)
static void __exit kho_test_exit(void) { - unregister_kho_notifier(&kho_test_nb); + kho_remove_subtree(folio_address(kho_test_state.fdt)); kho_test_cleanup(); } module_exit(kho_test_exit); diff --git a/mm/memblock.c b/mm/memblock.c index e23e16618e9b..e3bef9b35d63 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2444,53 +2444,18 @@ int reserve_mem_release_by_name(const char *name) #define MEMBLOCK_KHO_FDT "memblock" #define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" #define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" -static struct page *kho_fdt; - -static int reserve_mem_kho_finalize(struct kho_serialization *ser) -{ - int err = 0, i; - - for (i = 0; i < reserved_mem_count; i++) { - struct reserve_mem_table *map = &reserved_mem_table[i]; - struct page *page = phys_to_page(map->start); - unsigned int nr_pages = map->size >> PAGE_SHIFT; - - err |= kho_preserve_pages(page, nr_pages); - } - - err |= kho_preserve_folio(page_folio(kho_fdt)); - err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt)); - - return notifier_from_errno(err); -} - -static int reserve_mem_kho_notifier(struct notifier_block *self, - unsigned long cmd, void *v) -{ - switch (cmd) { - case KEXEC_KHO_FINALIZE: - return reserve_mem_kho_finalize((struct kho_serialization *)v); - case KEXEC_KHO_ABORT: - return NOTIFY_DONE; - default: - return NOTIFY_BAD; - } -} - -static struct notifier_block reserve_mem_kho_nb = { - .notifier_call = reserve_mem_kho_notifier, -};
static int __init prepare_kho_fdt(void) { int err = 0, i; + struct page *fdt_page; void *fdt;
- kho_fdt = alloc_page(GFP_KERNEL); - if (!kho_fdt) + fdt_page = alloc_page(GFP_KERNEL); + if (!fdt_page) return -ENOMEM;
- fdt = page_to_virt(kho_fdt); + fdt = page_to_virt(fdt_page);
err |= fdt_create(fdt, PAGE_SIZE); err |= fdt_finish_reservemap(fdt); @@ -2499,7 +2464,10 @@ static int __init prepare_kho_fdt(void) err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); for (i = 0; i < reserved_mem_count; i++) { struct reserve_mem_table *map = &reserved_mem_table[i]; + struct page *page = phys_to_page(map->start); + unsigned int nr_pages = map->size >> PAGE_SHIFT;
+ err |= kho_preserve_pages(page, nr_pages); err |= fdt_begin_node(fdt, map->name); err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); @@ -2507,13 +2475,16 @@ static int __init prepare_kho_fdt(void) err |= fdt_end_node(fdt); } err |= fdt_end_node(fdt); - err |= fdt_finish(fdt);
+ err |= kho_preserve_folio(page_folio(fdt_page)); + + if (!err) + err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + if (err) { pr_err("failed to prepare memblock FDT for KHO: %d\n", err); - put_page(kho_fdt); - kho_fdt = NULL; + put_page(fdt_page); }
return err; @@ -2529,13 +2500,6 @@ static int __init reserve_mem_init(void) err = prepare_kho_fdt(); if (err) return err; - - err = register_kho_notifier(&reserve_mem_kho_nb); - if (err) { - put_page(kho_fdt); - kho_fdt = NULL; - } - return err; } late_initcall(reserve_mem_init);
On Fri, Oct 24 2025, Pasha Tatashin wrote:
From: "Mike Rapoport (Microsoft)" rppt@kernel.org
The KHO framework uses a notifier chain as the mechanism for clients to participate in the finalization process. While this works for a single, central state machine, it is too restrictive for kernel-internal components like pstore/reserve_mem or IMA. These components need a simpler, direct way to register their state for preservation (e.g., during their initcall) without being part of a complex, shutdown-time notifier sequence. The notifier model forces all participants into a single finalization flow and makes direct preservation from an arbitrary context difficult. This patch refactors the client participation model by removing the notifier chain and introducing a direct API for managing FDT subtrees.
The core kho_finalize() and kho_abort() state machine remains, but clients now register their data with KHO beforehand.
Signed-off-by: Mike Rapoport (Microsoft) rppt@kernel.org Co-developed-by: Pasha Tatashin pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com
Reviewed-by: Pratyush Yadav pratyush@kernel.org
[...]
On Fri, Oct 24, 2025 at 12:09:57PM -0400, Pasha Tatashin wrote:
From: "Mike Rapoport (Microsoft)" rppt@kernel.org
The KHO framework uses a notifier chain as the mechanism for clients to participate in the finalization process. While this works for a single, central state machine, it is too restrictive for kernel-internal components like pstore/reserve_mem or IMA. These components need a simpler, direct way to register their state for preservation (e.g., during their initcall) without being part of a complex, shutdown-time notifier sequence. The notifier model forces all participants into a single finalization flow and makes direct preservation from an arbitrary context difficult. This patch refactors the client participation model by removing the notifier chain and introducing a direct API for managing FDT subtrees.
The core kho_finalize() and kho_abort() state machine remains, but clients now register their data with KHO beforehand.
Signed-off-by: Mike Rapoport (Microsoft) rppt@kernel.org Co-developed-by: Pasha Tatashin pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com
include/linux/kexec_handover.h | 28 +----- kernel/kexec_handover.c | 166 +++++++++++++++++-------------- kernel/kexec_handover_debugfs.c | 17 ++-- kernel/kexec_handover_internal.h | 5 +- lib/test_kho.c | 33 +----- mm/memblock.c | 62 +++--------- 6 files changed, 126 insertions(+), 185 deletions(-)
diff --git a/lib/test_kho.c b/lib/test_kho.c index 60cd899ea745..1c6c4ce83666 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -120,6 +93,7 @@ static int kho_test_prepare_fdt(struct kho_test_state *state) fdt = folio_address(state->fdt);
- err |= kho_preserve_folio(state->fdt);
We should bail out here, no point creating an fdt if it won't be preserved.
err |= fdt_create(fdt, fdt_size); err |= fdt_finish_reservemap(fdt); @@ -131,6 +105,7 @@ static int kho_test_prepare_fdt(struct kho_test_state *state) err |= fdt_finish(fdt);
- err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt)); if (err) folio_put(state->fdt);
@@ -203,7 +178,7 @@ static int kho_test_save(void) if (err) goto err_free_folios;
- err = register_kho_notifier(&kho_test_nb);
- err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt));
This is the second time we add the same subtree, isn't it?
if (err) goto err_free_fdt; @@ -326,7 +301,7 @@ static void kho_test_cleanup(void) static void __exit kho_test_exit(void) {
- unregister_kho_notifier(&kho_test_nb);
- kho_remove_subtree(folio_address(kho_test_state.fdt)); kho_test_cleanup();
} module_exit(kho_test_exit); diff --git a/mm/memblock.c b/mm/memblock.c index e23e16618e9b..e3bef9b35d63 100644 --- a/mm/memblock.c +++ b/mm/memblock.c static int __init prepare_kho_fdt(void) { int err = 0, i;
- struct page *fdt_page; void *fdt;
- kho_fdt = alloc_page(GFP_KERNEL);
- if (!kho_fdt)
- fdt_page = alloc_page(GFP_KERNEL);
- if (!fdt_page) return -ENOMEM;
- fdt = page_to_virt(kho_fdt);
- fdt = page_to_virt(fdt_page);
err |= fdt_create(fdt, PAGE_SIZE); err |= fdt_finish_reservemap(fdt); @@ -2499,7 +2464,10 @@ static int __init prepare_kho_fdt(void) err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); for (i = 0; i < reserved_mem_count; i++) { struct reserve_mem_table *map = &reserved_mem_table[i];
struct page *page = phys_to_page(map->start);unsigned int nr_pages = map->size >> PAGE_SHIFT;
err |= fdt_begin_node(fdt, map->name); err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); err |= fdt_property(fdt, "start", &map->start, sizeof(map->start));err |= kho_preserve_pages(page, nr_pages);@@ -2507,13 +2475,16 @@ static int __init prepare_kho_fdt(void) err |= fdt_end_node(fdt); } err |= fdt_end_node(fdt);
- err |= fdt_finish(fdt);
- err |= kho_preserve_folio(page_folio(fdt_page));
When looking at the end result after patch 8 it becomes a total mess. Let's move this right after the allocation and make it
err = kho_preserve_folio(page_folio(fdt_page)); if (err) goto err_free_fdt;
- if (!err)
err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt);
and replace this pattern with usual kernel
if (err) goto err_free_fdt;
err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); if (err) goto err_free_fdt;
so that only fdt operations will be a part of
err |= fdt_<function>
sequence.
if (err) { pr_err("failed to prepare memblock FDT for KHO: %d\n", err);
put_page(kho_fdt);kho_fdt = NULL;
}put_page(fdt_page);return err; @@ -2529,13 +2500,6 @@ static int __init reserve_mem_init(void) err = prepare_kho_fdt(); if (err) return err;
- err = register_kho_notifier(&reserve_mem_kho_nb);
- if (err) {
put_page(kho_fdt);kho_fdt = NULL;- }
- return err;
} late_initcall(reserve_mem_init); -- 2.51.1.821.gb6fe4d2222-goog
Allow users of KHO to cancel a previous preservation by adding the necessary interfaces to unpreserve folios and pages.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Reviewed-by: Pratyush Yadav pratyush@kernel.org Reviewed-by: Mike Rapoport (Microsoft) rppt@kernel.org --- include/linux/kexec_handover.h | 12 +++++ kernel/kexec_handover.c | 84 ++++++++++++++++++++++++++++------ 2 files changed, 83 insertions(+), 13 deletions(-)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 2faf290803ce..4ba145713838 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -43,7 +43,9 @@ bool kho_is_enabled(void); bool is_kho_boot(void);
int kho_preserve_folio(struct folio *folio); +int kho_unpreserve_folio(struct folio *folio); int kho_preserve_pages(struct page *page, unsigned int nr_pages); +int kho_unpreserve_pages(struct page *page, unsigned int nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); @@ -76,11 +78,21 @@ static inline int kho_preserve_folio(struct folio *folio) return -EOPNOTSUPP; }
+static inline int kho_unpreserve_folio(struct folio *folio) +{ + return -EOPNOTSUPP; +} + static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) { return -EOPNOTSUPP; }
+static inline int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + return -EOPNOTSUPP; +} + static inline int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) { diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 82137eba1474..994ee0b70757 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -157,26 +157,33 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) return no_free_ptr(elm); }
-static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, - unsigned long end_pfn) +static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) { struct kho_mem_phys_bits *bits; struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order;
- while (pfn < end_pfn) { - const unsigned int order = - min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - const unsigned long pfn_high = pfn >> order; + physxa = xa_load(&track->orders, order); + if (!physxa) + return; + + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (!bits) + return;
- physxa = xa_load(&track->orders, order); - if (!physxa) - continue; + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); +} + +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + unsigned int order;
- bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (!bits) - continue; + while (pfn < end_pfn) { + order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
- clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + __kho_unpreserve_order(track, pfn, order);
pfn += 1 << order; } @@ -745,6 +752,30 @@ int kho_preserve_folio(struct folio *folio) } EXPORT_SYMBOL_GPL(kho_preserve_folio);
+/** + * kho_unpreserve_folio - unpreserve a folio. + * @folio: folio to unpreserve. + * + * Instructs KHO to unpreserve a folio that was preserved by + * kho_preserve_folio() before. The provided @folio (pfn and order) + * must exactly match a previously preserved folio. + * + * Return: 0 on success, error code on failure + */ +int kho_unpreserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.track; + + if (kho_out.finalized) + return -EBUSY; + + __kho_unpreserve_order(track, pfn, order); + return 0; +} +EXPORT_SYMBOL_GPL(kho_unpreserve_folio); + /** * kho_preserve_pages - preserve contiguous pages across kexec * @page: first page in the list. @@ -789,6 +820,33 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_preserve_pages);
+/** + * kho_unpreserve_pages - unpreserve contiguous pages. + * @page: first page in the list. + * @nr_pages: number of pages. + * + * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page. + * This must be called with the same @page and @nr_pages as the corresponding + * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger + * preserved blocks is not supported. + * + * Return: 0 on success, error code on failure + */ +int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + struct kho_mem_track *track = &kho_out.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + + if (kho_out.finalized) + return -EBUSY; + + __kho_unpreserve(track, start_pfn, end_pfn); + + return 0; +} +EXPORT_SYMBOL_GPL(kho_unpreserve_pages); + struct kho_vmalloc_hdr { DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); };
KHO allows clients to preserve memory regions at any point before the KHO state is finalized. The finalization process itself involves KHO performing its own actions, such as serializing the overall preserved memory map.
If this finalization process is aborted, the current implementation destroys KHO's internal memory tracking structures (`kho_out.track.orders`). This behavior effectively unpreserves all memory from KHO's perspective, regardless of whether those preservations were made by clients before the finalization attempt or by KHO itself during finalization.
This premature unpreservation is incorrect. An abort of the finalization process should only undo actions taken by KHO as part of that specific finalization attempt. Individual memory regions preserved by clients prior to finalization should remain preserved, as their lifecycle is managed by the clients themselves. These clients might still need to call kho_unpreserve_folio() or kho_unpreserve_pages() based on their own logic, even after a KHO finalization attempt is aborted.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- kernel/kexec_handover.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-)
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 994ee0b70757..6aa25d304bad 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1098,31 +1098,12 @@ EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
static int __kho_abort(void) { - int err = 0; - unsigned long order; - struct kho_mem_phys *physxa; - - xa_for_each(&kho_out.track.orders, order, physxa) { - struct kho_mem_phys_bits *bits; - unsigned long phys; - - xa_for_each(&physxa->phys_bits, phys, bits) - kfree(bits); - - xa_destroy(&physxa->phys_bits); - kfree(physxa); - } - xa_destroy(&kho_out.track.orders); - if (kho_out.preserved_mem_map) { kho_mem_ser_free(kho_out.preserved_mem_map); kho_out.preserved_mem_map = NULL; }
- if (err) - pr_err("Failed to abort KHO finalization: %d\n", err); - - return err; + return 0; }
int kho_abort(void)
On Fri, Oct 24 2025, Pasha Tatashin wrote:
KHO allows clients to preserve memory regions at any point before the KHO state is finalized. The finalization process itself involves KHO performing its own actions, such as serializing the overall preserved memory map.
If this finalization process is aborted, the current implementation destroys KHO's internal memory tracking structures (`kho_out.ser.track.orders`). This behavior effectively unpreserves all memory from KHO's perspective, regardless of whether those preservations were made by clients before the finalization attempt or by KHO itself during finalization.
This premature unpreservation is incorrect. An abort of the finalization process should only undo actions taken by KHO as part of that specific finalization attempt. Individual memory regions preserved by clients prior to finalization should remain preserved, as their lifecycle is managed by the clients themselves. These clients might still need to call kho_unpreserve_folio() or kho_unpreserve_phys() based on their own logic, even after a KHO finalization attempt is aborted.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com
Reviewed-by: Pratyush Yadav pratyush@kernel.org
[...]
Move KHO to kernel/liveupdate/ in preparation for placing all Live Update core kernel files in the same place.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Reviewed-by: Jason Gunthorpe jgg@nvidia.com Reviewed-by: Mike Rapoport (Microsoft) rppt@kernel.org --- Documentation/core-api/kho/concepts.rst | 2 +- MAINTAINERS | 2 +- init/Kconfig | 2 + kernel/Kconfig.kexec | 34 ---------------- kernel/Makefile | 4 +- kernel/liveupdate/Kconfig | 39 +++++++++++++++++++ kernel/liveupdate/Makefile | 5 +++ kernel/{ => liveupdate}/kexec_handover.c | 4 +- .../{ => liveupdate}/kexec_handover_debug.c | 0 .../{ => liveupdate}/kexec_handover_debugfs.c | 0 .../kexec_handover_internal.h | 0 11 files changed, 51 insertions(+), 41 deletions(-) create mode 100644 kernel/liveupdate/Kconfig create mode 100644 kernel/liveupdate/Makefile rename kernel/{ => liveupdate}/kexec_handover.c (99%) rename kernel/{ => liveupdate}/kexec_handover_debug.c (100%) rename kernel/{ => liveupdate}/kexec_handover_debugfs.c (100%) rename kernel/{ => liveupdate}/kexec_handover_internal.h (100%)
diff --git a/Documentation/core-api/kho/concepts.rst b/Documentation/core-api/kho/concepts.rst index 36d5c05cfb30..d626d1dbd678 100644 --- a/Documentation/core-api/kho/concepts.rst +++ b/Documentation/core-api/kho/concepts.rst @@ -70,5 +70,5 @@ in the FDT. That state is called the KHO finalization phase.
Public API ========== -.. kernel-doc:: kernel/kexec_handover.c +.. kernel-doc:: kernel/liveupdate/kexec_handover.c :export: diff --git a/MAINTAINERS b/MAINTAINERS index 35d7942b2082..400209b74d95 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13822,7 +13822,7 @@ S: Maintained F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h -F: kernel/kexec_handover* +F: kernel/liveupdate/kexec_handover* F: tools/testing/selftests/kho/
KEYS-ENCRYPTED diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49..0605de5d96c0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2138,6 +2138,8 @@ config TRACEPOINTS
source "kernel/Kconfig.kexec"
+source "kernel/liveupdate/Kconfig" + endmenu # General setup
source "arch/Kconfig" diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 9308a0fb1419..15632358bcf7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -94,40 +94,6 @@ config KEXEC_JUMP Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC
-config KEXEC_HANDOVER - bool "kexec handover" - depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE - depends on !DEFERRED_STRUCT_PAGE_INIT - select MEMBLOCK_KHO_SCRATCH - select KEXEC_FILE - select DEBUG_FS - select LIBFDT - select CMA - help - Allow kexec to hand over state across kernels by generating and - passing additional metadata to the target kernel. This is useful - to keep data or state alive across the kexec. For this to work, - both source and target kernels need to have this option enabled. - -config KEXEC_HANDOVER_DEBUG - bool "Enable Kexec Handover debug checks" - depends on KEXEC_HANDOVER_DEBUGFS - help - This option enables extra sanity checks for the Kexec Handover - subsystem. Since, KHO performance is crucial in live update - scenarios and the extra code might be adding overhead it is - only optionally enabled. - -config KEXEC_HANDOVER_DEBUGFS - bool "kexec handover debugfs interface" - depends on KEXEC_HANDOVER - depends on DEBUG_FS - help - Allow to control kexec handover device tree via debugfs - interface, i.e. finalize the state or aborting the finalization. - Also, enables inspecting the KHO fdt trees with the debugfs binary - blobs. 
- config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index 2cf7909a74e5..e83669841b8c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -52,6 +52,7 @@ obj-y += printk/ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ +obj-y += liveupdate/ obj-y += dma/ obj-y += entry/ obj-y += unwind/ @@ -82,9 +83,6 @@ obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o -obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o -obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o -obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig new file mode 100644 index 000000000000..ae8bdd87458a --- /dev/null +++ b/kernel/liveupdate/Kconfig @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Live Update and Kexec HandOver" + +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + depends on !DEFERRED_STRUCT_PAGE_INIT + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select DEBUG_FS + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER_DEBUGFS + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. 
+ +config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + depends on KEXEC_HANDOVER + depends on DEBUG_FS + help + Allow to control kexec handover device tree via debugfs + interface, i.e. finalize the state or aborting the finalization. + Also, enables inspecting the KHO fdt trees with the debugfs binary + blobs. + +endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile new file mode 100644 index 000000000000..f52ce1ebcf86 --- /dev/null +++ b/kernel/liveupdate/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o diff --git a/kernel/kexec_handover.c b/kernel/liveupdate/kexec_handover.c similarity index 99% rename from kernel/kexec_handover.c rename to kernel/liveupdate/kexec_handover.c index 6aa25d304bad..20c7a985828c 100644 --- a/kernel/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -26,8 +26,8 @@ * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. */ -#include "../mm/internal.h" -#include "kexec_internal.h" +#include "../../mm/internal.h" +#include "../kexec_internal.h" #include "kexec_handover_internal.h"
#define KHO_FDT_COMPATIBLE "kho-v1" diff --git a/kernel/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c similarity index 100% rename from kernel/kexec_handover_debug.c rename to kernel/liveupdate/kexec_handover_debug.c diff --git a/kernel/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c similarity index 100% rename from kernel/kexec_handover_debugfs.c rename to kernel/liveupdate/kexec_handover_debugfs.c diff --git a/kernel/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h similarity index 100% rename from kernel/kexec_handover_internal.h rename to kernel/liveupdate/kexec_handover_internal.h
Now that LUO and KHO both live under kernel/liveupdate, it makes sense to also move the kho debugfs files to liveupdate/ in order to keep current and upcoming LUO/KHO features organized.
The old names: /sys/kernel/debug/kho/out/ /sys/kernel/debug/kho/in/
The new names: /sys/kernel/debug/liveupdate/kho_out/ /sys/kernel/debug/liveupdate/kho_in/
Also, export liveupdate_debugfs_root, so that future LUO selftests, kexec telemetry, and other users can use it as well.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- kernel/liveupdate/kexec_handover_debugfs.c | 10 +++++----- kernel/liveupdate/kexec_handover_internal.h | 2 ++ tools/testing/selftests/kho/init.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index 46e9e6c0791f..454b7d34ddc3 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -15,7 +15,7 @@ #include <linux/mm.h> #include "kexec_handover_internal.h"
-static struct dentry *debugfs_root; +struct dentry *liveupdate_debugfs_root;
struct fdt_debugfs { struct list_head list; @@ -118,7 +118,7 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt)
INIT_LIST_HEAD(&dbg->fdt_list);
- dir = debugfs_create_dir("in", debugfs_root); + dir = debugfs_create_dir("kho_in", liveupdate_debugfs_root); if (IS_ERR(dir)) { err = PTR_ERR(dir); goto err_out; @@ -178,7 +178,7 @@ __init int kho_out_debugfs_init(struct kho_debugfs *dbg)
INIT_LIST_HEAD(&dbg->fdt_list);
- dir = debugfs_create_dir("out", debugfs_root); + dir = debugfs_create_dir("kho_out", liveupdate_debugfs_root); if (IS_ERR(dir)) return -ENOMEM;
@@ -212,8 +212,8 @@ __init int kho_out_debugfs_init(struct kho_debugfs *dbg)
__init int kho_debugfs_init(void) { - debugfs_root = debugfs_create_dir("kho", NULL); - if (IS_ERR(debugfs_root)) + liveupdate_debugfs_root = debugfs_create_dir("liveupdate", NULL); + if (IS_ERR(liveupdate_debugfs_root)) return -ENOENT; return 0; } diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index 17ae101dc6ae..92798346fa5a 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -15,6 +15,8 @@ struct kho_debugfs { struct list_head fdt_list; };
+extern struct dentry *liveupdate_debugfs_root; + #else struct kho_debugfs {}; #endif diff --git a/tools/testing/selftests/kho/init.c b/tools/testing/selftests/kho/init.c index 6d9e91d55d68..f0136a30ce8b 100644 --- a/tools/testing/selftests/kho/init.c +++ b/tools/testing/selftests/kho/init.c @@ -11,7 +11,7 @@ /* from arch/x86/include/asm/setup.h */ #define COMMAND_LINE_SIZE 2048
-#define KHO_FINALIZE "/debugfs/kho/out/finalize" +#define KHO_FINALIZE "/debugfs/liveupdate/kho_out/finalize" #define KERNEL_IMAGE "/kernel"
static int mount_filesystems(void)
On Fri, Oct 24, 2025 at 12:10:01PM -0400, Pasha Tatashin wrote:
Now, that LUO and KHO both live under kernel/liveupdate, it makes
And they still don't :/
sense to also move the kho debugfs files to liveupdate/ in order to keep current and upcoming LUO/KHO features organized.
The old names: /sys/kernel/debug/kho/out/ /sys/kernel/debug/kho/in/
The new names: /sys/kernel/debug/liveupdate/kho_out/ /sys/kernel/debug/liveupdate/kho_in/
Also, export the liveupdate_debufs_root, so future LUO selftests, kexec telemtry, and other users could use it as well.
No, this is backwards. If anything it will be kho_debugfs_root. But I don't see why we can't have /sys/kernel/debug/liveupdate alongside /sys/kernel/debug/kho.
@Andrew, please drop this patch for now.
On Sun, 26 Oct 2025 18:32:44 +0200 Mike Rapoport rppt@kernel.org wrote:
@Andrew, please drop this patch for now.
Thanks, I dropped the v8 series.
If there is an error halfway through KHO memory preservation, we should roll back and unpreserve everything that is partially preserved.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Suggested-by: Pratyush Yadav pratyush@kernel.org --- mm/memblock.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/mm/memblock.c b/mm/memblock.c index e3bef9b35d63..5ceaa02af7d6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2447,6 +2447,7 @@ int reserve_mem_release_by_name(const char *name)
static int __init prepare_kho_fdt(void) { + bool fdt_folio_preserved = false; int err = 0, i; struct page *fdt_page; void *fdt; @@ -2462,12 +2463,14 @@ static int __init prepare_kho_fdt(void)
err |= fdt_begin_node(fdt, ""); err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); - for (i = 0; i < reserved_mem_count; i++) { + for (i = 0; !err && i < reserved_mem_count; i++) { struct reserve_mem_table *map = &reserved_mem_table[i]; struct page *page = phys_to_page(map->start); unsigned int nr_pages = map->size >> PAGE_SHIFT;
- err |= kho_preserve_pages(page, nr_pages); + err = kho_preserve_pages(page, nr_pages); + if (err) + break; err |= fdt_begin_node(fdt, map->name); err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); @@ -2477,12 +2480,27 @@ static int __init prepare_kho_fdt(void) err |= fdt_end_node(fdt); err |= fdt_finish(fdt);
- err |= kho_preserve_folio(page_folio(fdt_page)); - if (!err) + err = kho_preserve_folio(page_folio(fdt_page)); + + if (!err) { + fdt_folio_preserved = true; err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + }
if (err) { + int nr_reserve_map_preserved = i; + + for (i = 0; i < nr_reserve_map_preserved; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + struct page *page = phys_to_page(map->start); + unsigned int nr_pages = map->size >> PAGE_SHIFT; + + kho_unpreserve_pages(page, nr_pages); + } + if (fdt_folio_preserved) + kho_unpreserve_folio(page_folio(fdt_page)); + pr_err("failed to prepare memblock FDT for KHO: %d\n", err); put_page(fdt_page); }
On Fri, Oct 24 2025, Pasha Tatashin wrote:
If there is an error half way through KHO memory preservation, we should rollback and unpreserve everything that is partially preserved.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Suggested-by: Pratyush Yadav pratyush@kernel.org
Reviewed-by: Pratyush Yadav pratyush@kernel.org
[...]
On Fri, Oct 24, 2025 at 12:10:02PM -0400, Pasha Tatashin wrote:
If there is an error half way through KHO memory preservation, we should rollback and unpreserve everything that is partially preserved.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Suggested-by: Pratyush Yadav pratyush@kernel.org
mm/memblock.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/mm/memblock.c b/mm/memblock.c index e3bef9b35d63..5ceaa02af7d6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2447,6 +2447,7 @@ int reserve_mem_release_by_name(const char *name) static int __init prepare_kho_fdt(void) {
- bool fdt_folio_preserved = false;
fdt_preserved is enough IMHO.
int err = 0, i; struct page *fdt_page; void *fdt; @@ -2462,12 +2463,14 @@ static int __init prepare_kho_fdt(void) err |= fdt_begin_node(fdt, ""); err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE);
- for (i = 0; i < reserved_mem_count; i++) {
- for (i = 0; !err && i < reserved_mem_count; i++) { struct reserve_mem_table *map = &reserved_mem_table[i]; struct page *page = phys_to_page(map->start); unsigned int nr_pages = map->size >> PAGE_SHIFT;
err |= kho_preserve_pages(page, nr_pages);
err = kho_preserve_pages(page, nr_pages);if (err)break;
Please
goto err_unpreserve;
err |= fdt_begin_node(fdt, map->name); err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); err |= fdt_property(fdt, "start", &map->start, sizeof(map->start));
if (err) goto err_unpreserve;
and drop !err from the loop condition.
@@ -2477,12 +2480,27 @@ static int __init prepare_kho_fdt(void) err |= fdt_end_node(fdt); err |= fdt_finish(fdt);
- err |= kho_preserve_folio(page_folio(fdt_page));
- if (!err)
err = kho_preserve_folio(page_folio(fdt_page));- if (!err) {
err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt);fdt_folio_preserved = true;- }
if (err) {
int nr_reserve_map_preserved = i;
nr_preserved is clear enough. Also, let's declare it before the preservation loop and count it there. Then we can make the loop variable local, which makes it safer against certain side channel attacks. I.e., the loop that preserves the memory would be
for (unsigned int i = 0; i < reserved_mem_count; i++, nr_preserved++)
for (i = 0; i < nr_reserve_map_preserved; i++) {struct reserve_mem_table *map = &reserved_mem_table[i];struct page *page = phys_to_page(map->start);unsigned int nr_pages = map->size >> PAGE_SHIFT;kho_unpreserve_pages(page, nr_pages);}if (fdt_folio_preserved)kho_unpreserve_folio(page_folio(fdt_page));- pr_err("failed to prepare memblock FDT for KHO: %d\n", err); put_page(fdt_page); }
-- 2.51.1.821.gb6fe4d2222-goog
On Sun, Oct 26, 2025 at 12:29 PM Mike Rapoport rppt@kernel.org wrote:
On Fri, Oct 24, 2025 at 12:10:02PM -0400, Pasha Tatashin wrote:
If there is an error half way through KHO memory preservation, we should rollback and unpreserve everything that is partially preserved.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Suggested-by: Pratyush Yadav pratyush@kernel.org
mm/memblock.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/mm/memblock.c b/mm/memblock.c index e3bef9b35d63..5ceaa02af7d6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2447,6 +2447,7 @@ int reserve_mem_release_by_name(const char *name)
static int __init prepare_kho_fdt(void) {
bool fdt_folio_preserved = false;fdt_preserved is enough IMHO.
int err = 0, i; struct page *fdt_page; void *fdt;@@ -2462,12 +2463,14 @@ static int __init prepare_kho_fdt(void)
err |= fdt_begin_node(fdt, ""); err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE);
for (i = 0; i < reserved_mem_count; i++) {
for (i = 0; !err && i < reserved_mem_count; i++) { struct reserve_mem_table *map = &reserved_mem_table[i]; struct page *page = phys_to_page(map->start); unsigned int nr_pages = map->size >> PAGE_SHIFT;
err |= kho_preserve_pages(page, nr_pages);
err = kho_preserve_pages(page, nr_pages);if (err)break;Please
goto err_unpreserve;
While we can do that, we lose some symmetry by not performing fdt_end_node() and fdt_finish(). If the fdt lib ever adds some debugging facility to make sure that open nodes/trees are properly closed, this is going to flag that. I prefer my current implementation.
err |= fdt_begin_node(fdt, map->name); err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); err |= fdt_property(fdt, "start", &map->start, sizeof(map->start));if (err) goto err_unpreserve;and drop !err from the loop condition.
That is going to miss one 'nr_preserved++'. We cannot do that; we could move it to the beginning of the loop, but I prefer keeping err right in the condition.
@@ -2477,12 +2480,27 @@ static int __init prepare_kho_fdt(void) err |= fdt_end_node(fdt); err |= fdt_finish(fdt);
err |= kho_preserve_folio(page_folio(fdt_page));if (!err)
err = kho_preserve_folio(page_folio(fdt_page));if (!err) {fdt_folio_preserved = true; err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt);} if (err) {int nr_reserve_map_preserved = i;nr_preserved is clear enough.
Sure.
Also let's declare it before the preservation loop and count it there. Than we can make loop variable local which makes it safer against certain side channel attacks. I.e the loop that preserves the memory would be
Sure.
for (unsigned int i = 0; i < reserve_mem_count; i++ nr_preserved++)
for (i = 0; i < nr_reserve_map_preserved; i++) {struct reserve_mem_table *map = &reserved_mem_table[i];struct page *page = phys_to_page(map->start);unsigned int nr_pages = map->size >> PAGE_SHIFT;kho_unpreserve_pages(page, nr_pages);}if (fdt_folio_preserved)kho_unpreserve_folio(page_folio(fdt_page));pr_err("failed to prepare memblock FDT for KHO: %d\n", err); put_page(fdt_page); }-- 2.51.1.821.gb6fe4d2222-goog
-- Sincerely yours, Mike.
linux-kselftest-mirror@lists.linaro.org