This series addresses review comments, combines the two series [1] and [2] into one, and adds Reviewed-by tags.
This series refactors the Kexec HandOver (KHO) framework to better support in-kernel users such as the upcoming Live Update Orchestrator (LUO). The current design, which relies on a notifier chain and debugfs for control, is too restrictive for direct programmatic use.
The core of this rework is the removal of the notifier chain in favor of a direct registration API. This decouples clients from the shutdown-time finalization sequence, allowing them to manage their preserved state more flexibly and at any time.
Also, this series fixes a memory corruption bug in KHO that occurs when KFENCE is enabled.
The root cause is that KHO metadata, allocated via kzalloc(), can be randomly serviced by kfence_alloc(). When a kernel boots via KHO, the early memblock allocator is restricted to a "scratch area". This forces the KFENCE pool to be allocated within this scratch area, creating a conflict. If KHO metadata is subsequently placed in this pool, it gets corrupted during the next kexec operation.
[1] https://lore.kernel.org/all/20251007033100.836886-1-pasha.tatashin@soleen.co... [2] https://lore.kernel.org/all/20251015053121.3978358-1-pasha.tatashin@soleen.c...
Mike Rapoport (Microsoft) (1): kho: drop notifiers
Pasha Tatashin (9): kho: allow to drive kho from within kernel kho: make debugfs interface optional kho: add interfaces to unpreserve folios and page ranges kho: don't unpreserve memory during abort liveupdate: kho: move to kernel/liveupdate kho: move kho debugfs directory to liveupdate liveupdate: kho: warn and fail on metadata or preserved memory in scratch area liveupdate: kho: Increase metadata bitmap size to PAGE_SIZE liveupdate: kho: allocate metadata directly from the buddy allocator
Documentation/core-api/kho/concepts.rst | 2 +- MAINTAINERS | 3 +- include/linux/kexec_handover.h | 53 +- init/Kconfig | 2 + kernel/Kconfig.kexec | 15 - kernel/Makefile | 2 +- kernel/liveupdate/Kconfig | 38 ++ kernel/liveupdate/Makefile | 5 + kernel/{ => liveupdate}/kexec_handover.c | 588 +++++++++----------- kernel/liveupdate/kexec_handover_debug.c | 25 + kernel/liveupdate/kexec_handover_debugfs.c | 216 +++++++ kernel/liveupdate/kexec_handover_internal.h | 56 ++ lib/test_kho.c | 30 +- mm/memblock.c | 62 +-- tools/testing/selftests/kho/init.c | 2 +- tools/testing/selftests/kho/vmtest.sh | 1 + 16 files changed, 645 insertions(+), 455 deletions(-) create mode 100644 kernel/liveupdate/Kconfig create mode 100644 kernel/liveupdate/Makefile rename kernel/{ => liveupdate}/kexec_handover.c (78%) create mode 100644 kernel/liveupdate/kexec_handover_debug.c create mode 100644 kernel/liveupdate/kexec_handover_debugfs.c create mode 100644 kernel/liveupdate/kexec_handover_internal.h
base-commit: f406055cb18c6e299c4a783fc1effeb16be41803
Allow finalize and abort to be invoked from kernel modules, so that LUO can drive the KHO sequence via its own state machine.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Reviewed-by: Pratyush Yadav pratyush@kernel.org --- include/linux/kexec_handover.h | 15 +++++++ kernel/kexec_handover.c | 74 ++++++++++++++++++++-------------- 2 files changed, 59 insertions(+), 30 deletions(-)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 25042c1d8d54..04d0108db98e 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -67,6 +67,10 @@ void kho_memory_init(void);
void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len); + +int kho_finalize(void); +int kho_abort(void); + #else static inline bool kho_is_enabled(void) { @@ -139,6 +143,17 @@ static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) { } + +static inline int kho_finalize(void) +{ + return -EOPNOTSUPP; +} + +static inline int kho_abort(void) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_KEXEC_HANDOVER */
#endif /* LINUX_KEXEC_HANDOVER_H */ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 76f0940fb485..76c34ea923f0 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1067,7 +1067,7 @@ static int kho_out_update_debugfs_fdt(void) return err; }
-static int kho_abort(void) +static int __kho_abort(void) { int err; unsigned long order; @@ -1100,7 +1100,27 @@ static int kho_abort(void) return err; }
-static int kho_finalize(void) +int kho_abort(void) +{ + int ret = 0; + + if (!kho_enable) + return -EOPNOTSUPP; + + guard(mutex)(&kho_out.lock); + if (!kho_out.finalized) + return -ENOENT; + + ret = __kho_abort(); + if (ret) + return ret; + + kho_out.finalized = false; + + return kho_out_update_debugfs_fdt(); +} + +static int __kho_finalize(void) { int err = 0; u64 *preserved_mem_map; @@ -1143,12 +1163,32 @@ static int kho_finalize(void) abort: if (err) { pr_err("Failed to convert KHO state tree: %d\n", err); - kho_abort(); + __kho_abort(); }
return err; }
+int kho_finalize(void) +{ + int ret; + + if (!kho_enable) + return -EOPNOTSUPP; + + guard(mutex)(&kho_out.lock); + if (kho_out.finalized) + return -EEXIST; + + ret = __kho_finalize(); + if (ret) + return ret; + + kho_out.finalized = true; + + return kho_out_update_debugfs_fdt(); +} + static int kho_out_finalize_get(void *data, u64 *val) { mutex_lock(&kho_out.lock); @@ -1160,33 +1200,7 @@ static int kho_out_finalize_get(void *data, u64 *val)
static int kho_out_finalize_set(void *data, u64 _val) { - int ret = 0; - bool val = !!_val; - - mutex_lock(&kho_out.lock); - - if (val == kho_out.finalized) { - if (kho_out.finalized) - ret = -EEXIST; - else - ret = -ENOENT; - goto unlock; - } - - if (val) - ret = kho_finalize(); - else - ret = kho_abort(); - - if (ret) - goto unlock; - - kho_out.finalized = val; - ret = kho_out_update_debugfs_fdt(); - -unlock: - mutex_unlock(&kho_out.lock); - return ret; + return (!!_val) ? kho_finalize() : kho_abort(); }
DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
Currently, KHO is controlled via the debugfs interface, but once LUO is introduced, LUO can control KHO, and the debugfs interface becomes optional.
Add a separate config option, CONFIG_KEXEC_HANDOVER_DEBUGFS, that enables the debugfs interface and allows inspecting the tree.
Move all debugfs related code to a new file to keep the .c files clear of ifdefs.
Co-developed-by: Mike Rapoport (Microsoft) rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) rppt@kernel.org Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- MAINTAINERS | 3 +- kernel/Kconfig.kexec | 10 ++ kernel/Makefile | 1 + kernel/kexec_handover.c | 223 +++----------------------- kernel/kexec_handover_debugfs.c | 213 ++++++++++++++++++++++++ kernel/kexec_handover_internal.h | 44 +++++ tools/testing/selftests/kho/vmtest.sh | 1 + 7 files changed, 290 insertions(+), 205 deletions(-) create mode 100644 kernel/kexec_handover_debugfs.c create mode 100644 kernel/kexec_handover_internal.h
diff --git a/MAINTAINERS b/MAINTAINERS index 545a4776795e..54f627a639b1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13775,13 +13775,14 @@ KEXEC HANDOVER (KHO) M: Alexander Graf graf@amazon.com M: Mike Rapoport rppt@kernel.org M: Changyuan Lyu changyuanl@google.com +M: Pasha Tatashin pasha.tatashin@soleen.com L: kexec@lists.infradead.org L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h -F: kernel/kexec_handover.c +F: kernel/kexec_handover* F: tools/testing/selftests/kho/
KEYS-ENCRYPTED diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 422270d64820..03c3aa6263d3 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -109,6 +109,16 @@ config KEXEC_HANDOVER to keep data or state alive across the kexec. For this to work, both source and target kernels need to have this option enabled.
+config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + depends on KEXEC_HANDOVER + depends on DEBUG_FS + help + Allow to control kexec handover device tree via debugfs + interface, i.e. finalize the state or aborting the finalization. + Also, enables inspecting the KHO fdt trees with the debugfs binary + blobs. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index df3dd8291bb6..06bfe691439b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 76c34ea923f0..f3627430b3c3 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -10,7 +10,6 @@
#include <linux/cma.h> #include <linux/count_zeros.h> -#include <linux/debugfs.h> #include <linux/kexec.h> #include <linux/kexec_handover.h> #include <linux/libfdt.h> @@ -28,6 +27,7 @@ */ #include "../mm/internal.h" #include "kexec_internal.h" +#include "kexec_handover_internal.h"
#define KHO_FDT_COMPATIBLE "kho-v1" #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" @@ -101,8 +101,6 @@ struct khoser_mem_chunk;
struct kho_serialization { struct page *fdt; - struct list_head fdt_list; - struct dentry *sub_fdt_dir; struct kho_mem_track track; /* First chunk of serialized preserved memory map */ struct khoser_mem_chunk *preserved_mem_map; @@ -110,20 +108,16 @@ struct kho_serialization {
struct kho_out { struct blocking_notifier_head chain_head; - - struct dentry *dir; - struct mutex lock; /* protects KHO FDT finalization */ - struct kho_serialization ser; bool finalized; + struct kho_debugfs dbg; };
static struct kho_out kho_out = { .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), .lock = __MUTEX_INITIALIZER(kho_out.lock), .ser = { - .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), .track = { .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), }, @@ -465,8 +459,8 @@ static void __init kho_mem_deserialize(const void *fdt) * area for early allocations that happen before page allocator is * initialized. */ -static struct kho_scratch *kho_scratch; -static unsigned int kho_scratch_cnt; +struct kho_scratch *kho_scratch; +unsigned int kho_scratch_cnt;
/* * The scratch areas are scaled by default as percent of memory allocated from @@ -662,37 +656,6 @@ static void __init kho_reserve_scratch(void) kho_enable = false; }
-struct fdt_debugfs { - struct list_head list; - struct debugfs_blob_wrapper wrapper; - struct dentry *file; -}; - -static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, - const char *name, const void *fdt) -{ - struct fdt_debugfs *f; - struct dentry *file; - - f = kmalloc(sizeof(*f), GFP_KERNEL); - if (!f) - return -ENOMEM; - - f->wrapper.data = (void *)fdt; - f->wrapper.size = fdt_totalsize(fdt); - - file = debugfs_create_blob(name, 0400, dir, &f->wrapper); - if (IS_ERR(file)) { - kfree(f); - return PTR_ERR(file); - } - - f->file = file; - list_add(&f->list, list); - - return 0; -} - /** * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. * @ser: serialization control object passed by KHO notifiers. @@ -704,7 +667,8 @@ static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, * by KHO for the new kernel to retrieve it after kexec. * * A debugfs blob entry is also created at - * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. + * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with + * CONFIG_KEXEC_HANDOVER_DEBUGFS * * Return: 0 on success, error code on failure */ @@ -721,7 +685,7 @@ int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) if (err) return err;
- return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt); + return kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false); } EXPORT_SYMBOL_GPL(kho_add_subtree);
@@ -1044,29 +1008,6 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) } EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
-/* Handling for debug/kho/out */ - -static struct dentry *debugfs_root; - -static int kho_out_update_debugfs_fdt(void) -{ - int err = 0; - struct fdt_debugfs *ff, *tmp; - - if (kho_out.finalized) { - err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir, - "fdt", page_to_virt(kho_out.ser.fdt)); - } else { - list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) { - debugfs_remove(ff->file); - list_del(&ff->list); - kfree(ff); - } - } - - return err; -} - static int __kho_abort(void) { int err; @@ -1116,8 +1057,9 @@ int kho_abort(void) return ret;
kho_out.finalized = false; + kho_debugfs_cleanup(&kho_out.dbg);
- return kho_out_update_debugfs_fdt(); + return 0; }
static int __kho_finalize(void) @@ -1186,89 +1128,23 @@ int kho_finalize(void)
kho_out.finalized = true;
- return kho_out_update_debugfs_fdt(); -} - -static int kho_out_finalize_get(void *data, u64 *val) -{ - mutex_lock(&kho_out.lock); - *val = kho_out.finalized; - mutex_unlock(&kho_out.lock); - - return 0; -} - -static int kho_out_finalize_set(void *data, u64 _val) -{ - return (!!_val) ? kho_finalize() : kho_abort(); -} - -DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get, - kho_out_finalize_set, "%llu\n"); - -static int scratch_phys_show(struct seq_file *m, void *v) -{ - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].addr); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(scratch_phys); - -static int scratch_len_show(struct seq_file *m, void *v) -{ - for (int i = 0; i < kho_scratch_cnt; i++) - seq_printf(m, "0x%llx\n", kho_scratch[i].size); - - return 0; + return kho_debugfs_fdt_add(&kho_out.dbg, "fdt", + page_to_virt(kho_out.ser.fdt), true); } -DEFINE_SHOW_ATTRIBUTE(scratch_len);
-static __init int kho_out_debugfs_init(void) +bool kho_finalized(void) { - struct dentry *dir, *f, *sub_fdt_dir; - - dir = debugfs_create_dir("out", debugfs_root); - if (IS_ERR(dir)) - return -ENOMEM; - - sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); - if (IS_ERR(sub_fdt_dir)) - goto err_rmdir; - - f = debugfs_create_file("scratch_phys", 0400, dir, NULL, - &scratch_phys_fops); - if (IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("scratch_len", 0400, dir, NULL, - &scratch_len_fops); - if (IS_ERR(f)) - goto err_rmdir; - - f = debugfs_create_file("finalize", 0600, dir, NULL, - &fops_kho_out_finalize); - if (IS_ERR(f)) - goto err_rmdir; - - kho_out.dir = dir; - kho_out.ser.sub_fdt_dir = sub_fdt_dir; - return 0; - -err_rmdir: - debugfs_remove_recursive(dir); - return -ENOENT; + guard(mutex)(&kho_out.lock); + return kho_out.finalized; }
struct kho_in { - struct dentry *dir; phys_addr_t fdt_phys; phys_addr_t scratch_phys; - struct list_head fdt_list; + struct kho_debugfs dbg; };
static struct kho_in kho_in = { - .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list), };
static const void *kho_get_fdt(void) @@ -1332,56 +1208,6 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
-/* Handling for debugfs/kho/in */ - -static __init int kho_in_debugfs_init(const void *fdt) -{ - struct dentry *sub_fdt_dir; - int err, child; - - kho_in.dir = debugfs_create_dir("in", debugfs_root); - if (IS_ERR(kho_in.dir)) - return PTR_ERR(kho_in.dir); - - sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir); - if (IS_ERR(sub_fdt_dir)) { - err = PTR_ERR(sub_fdt_dir); - goto err_rmdir; - } - - err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt); - if (err) - goto err_rmdir; - - fdt_for_each_subnode(child, fdt, 0) { - int len = 0; - const char *name = fdt_get_name(fdt, child, NULL); - const u64 *fdt_phys; - - fdt_phys = fdt_getprop(fdt, child, "fdt", &len); - if (!fdt_phys) - continue; - if (len != sizeof(*fdt_phys)) { - pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n", - name, len); - continue; - } - err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name, - phys_to_virt(*fdt_phys)); - if (err) { - pr_warn("failed to add fdt `%s` to debugfs: %d\n", name, - err); - continue; - } - } - - return 0; - -err_rmdir: - debugfs_remove_recursive(kho_in.dir); - return err; -} - static __init int kho_init(void) { int err = 0; @@ -1396,27 +1222,16 @@ static __init int kho_init(void) goto err_free_scratch; }
- debugfs_root = debugfs_create_dir("kho", NULL); - if (IS_ERR(debugfs_root)) { - err = -ENOENT; + err = kho_debugfs_init(); + if (err) goto err_free_fdt; - }
- err = kho_out_debugfs_init(); + err = kho_out_debugfs_init(&kho_out.dbg); if (err) goto err_free_fdt;
if (fdt) { - err = kho_in_debugfs_init(fdt); - /* - * Failure to create /sys/kernel/debug/kho/in does not prevent - * reviving state from KHO and setting up KHO for the next - * kexec. - */ - if (err) - pr_err("failed exposing handover FDT in debugfs: %d\n", - err); - + kho_in_debugfs_init(&kho_in.dbg, fdt); return 0; }
diff --git a/kernel/kexec_handover_debugfs.c b/kernel/kexec_handover_debugfs.c new file mode 100644 index 000000000000..96fb9afd8af6 --- /dev/null +++ b/kernel/kexec_handover_debugfs.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debugfs.c - kexec handover debugfs interfaces + * Copyright (C) 2023 Alexander Graf graf@amazon.com + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport rppt@kernel.org + * Copyright (C) 2025 Google LLC, Changyuan Lyu changyuanl@google.com + * Copyright (C) 2025 Google LLC, Pasha Tatashin pasha.tatashin@soleen.com + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include <linux/init.h> +#include <linux/io.h> +#include <linux/libfdt.h> +#include <linux/mm.h> +#include "kexec_handover_internal.h" + +static struct dentry *debugfs_root; + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct dentry *file; +}; + +static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if (IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) +{ + struct dentry *dir; + + if (root) + dir = dbg->dir; + else + dir = dbg->sub_fdt_dir; + + return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); +} + +void kho_debugfs_cleanup(struct kho_debugfs *dbg) +{ + struct fdt_debugfs *ff, *tmp; + + list_for_each_entry_safe(ff, tmp, &dbg->fdt_list, list) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + } +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + *val = kho_finalized(); + + 
return 0; +} + +static int kho_out_finalize_set(void *data, u64 _val) +{ + return (!!_val) ? kho_finalize() : kho_abort(); +} + +DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) +{ + struct dentry *dir, *sub_fdt_dir; + int err, child; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto err_out; + } + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node %s prop fdt has invalid length: %d\n", + name, len); + continue; + } + err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt %s to debugfs: %d\n", name, + err); + continue; + } + } + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + + return; +err_rmdir: + debugfs_remove_recursive(dir); +err_out: + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. 
+ */ + if (err) + pr_err("failed exposing handover FDT in debugfs: %d\n", err); +} + +__init int kho_out_debugfs_init(struct kho_debugfs *dbg) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + INIT_LIST_HEAD(&dbg->fdt_list); + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &kho_out_finalize_fops); + if (IS_ERR(f)) + goto err_rmdir; + + dbg->dir = dir; + dbg->sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +__init int kho_debugfs_init(void) +{ + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) + return -ENOENT; + return 0; +} diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h new file mode 100644 index 000000000000..042c189af768 --- /dev/null +++ b/kernel/kexec_handover_internal.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include <linux/kexec_handover.h> +#include <linux/list.h> +#include <linux/types.h> + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +#include <linux/debugfs.h> + +struct kho_debugfs { + struct dentry *dir; + struct dentry *sub_fdt_dir; + struct list_head fdt_list; +}; + +#else +struct kho_debugfs {}; +#endif + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +bool kho_finalized(void); + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS +int kho_debugfs_init(void); +void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); +int kho_out_debugfs_init(struct kho_debugfs *dbg); +int 
kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root); +void kho_debugfs_cleanup(struct kho_debugfs *dbg); +#else +static inline int kho_debugfs_init(void) { return 0; } +static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, + const void *fdt) { } +static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } +static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, + const void *fdt, bool root) { return 0; } +static inline void kho_debugfs_cleanup(struct kho_debugfs *dbg) {} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh index 3f6c17166846..49fdac8e8b15 100755 --- a/tools/testing/selftests/kho/vmtest.sh +++ b/tools/testing/selftests/kho/vmtest.sh @@ -59,6 +59,7 @@ function build_kernel() { tee "$kconfig" > "$kho_config" <<EOF CONFIG_BLK_DEV_INITRD=y CONFIG_KEXEC_HANDOVER=y +CONFIG_KEXEC_HANDOVER_DEBUGFS=y CONFIG_TEST_KEXEC_HANDOVER=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_VM=y
From: "Mike Rapoport (Microsoft)" rppt@kernel.org
The KHO framework uses a notifier chain as the mechanism for clients to participate in the finalization process. While this works for a single, central state machine, it is too restrictive for kernel-internal components like pstore/reserve_mem or IMA. These components need a simpler, direct way to register their state for preservation (e.g., during their initcall) without being part of a complex, shutdown-time notifier sequence. The notifier model forces all participants into a single finalization flow and makes direct preservation from an arbitrary context difficult. This patch refactors the client participation model by removing the notifier chain and introducing a direct API for managing FDT subtrees.
The core kho_finalize() and kho_abort() state machine remains, but clients now register their data with KHO beforehand.
Signed-off-by: Mike Rapoport (Microsoft) rppt@kernel.org Co-developed-by: Pasha Tatashin pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- include/linux/kexec_handover.h | 28 +----- kernel/kexec_handover.c | 164 +++++++++++++++++-------------- kernel/kexec_handover_debugfs.c | 17 ++-- kernel/kexec_handover_internal.h | 5 +- lib/test_kho.c | 30 +----- mm/memblock.c | 62 +++--------- 6 files changed, 127 insertions(+), 179 deletions(-)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 04d0108db98e..2faf290803ce 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -10,14 +10,7 @@ struct kho_scratch { phys_addr_t size; };
-/* KHO Notifier index */ -enum kho_event { - KEXEC_KHO_FINALIZE = 0, - KEXEC_KHO_ABORT = 1, -}; - struct folio; -struct notifier_block; struct page;
#define DECLARE_KHOSER_PTR(name, type) \ @@ -37,8 +30,6 @@ struct page; (typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \ })
-struct kho_serialization; - struct kho_vmalloc_chunk; struct kho_vmalloc { DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); @@ -57,12 +48,10 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); +int kho_add_subtree(const char *name, void *fdt); +void kho_remove_subtree(void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
-int register_kho_notifier(struct notifier_block *nb); -int unregister_kho_notifier(struct notifier_block *nb); - void kho_memory_init(void);
void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, @@ -114,23 +103,16 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; }
-static inline int kho_add_subtree(struct kho_serialization *ser, - const char *name, void *fdt) +static inline int kho_add_subtree(const char *name, void *fdt) { return -EOPNOTSUPP; }
-static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +static inline void kho_remove_subtree(void *fdt) { - return -EOPNOTSUPP; }
-static inline int register_kho_notifier(struct notifier_block *nb) -{ - return -EOPNOTSUPP; -} - -static inline int unregister_kho_notifier(struct notifier_block *nb) +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) { return -EOPNOTSUPP; } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index f3627430b3c3..e6890fe6a171 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -15,7 +15,6 @@ #include <linux/libfdt.h> #include <linux/list.h> #include <linux/memblock.h> -#include <linux/notifier.h> #include <linux/page-isolation.h> #include <linux/vmalloc.h>
@@ -99,29 +98,34 @@ struct kho_mem_track {
struct khoser_mem_chunk;
-struct kho_serialization { - struct page *fdt; - struct kho_mem_track track; - /* First chunk of serialized preserved memory map */ - struct khoser_mem_chunk *preserved_mem_map; +struct kho_sub_fdt { + struct list_head l; + const char *name; + void *fdt; };
struct kho_out { - struct blocking_notifier_head chain_head; - struct mutex lock; /* protects KHO FDT finalization */ - struct kho_serialization ser; + void *fdt; bool finalized; + struct mutex lock; /* protects KHO FDT finalization */ + + struct list_head sub_fdts; + struct mutex fdts_lock; + + struct kho_mem_track track; + /* First chunk of serialized preserved memory map */ + struct khoser_mem_chunk *preserved_mem_map; + struct kho_debugfs dbg; };
static struct kho_out kho_out = { - .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), .lock = __MUTEX_INITIALIZER(kho_out.lock), - .ser = { - .track = { - .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), - }, + .track = { + .orders = XARRAY_INIT(kho_out.track.orders, 0), }, + .sub_fdts = LIST_HEAD_INIT(kho_out.sub_fdts), + .fdts_lock = __MUTEX_INITIALIZER(kho_out.fdts_lock), .finalized = false, };
@@ -362,14 +366,14 @@ static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) } }
-static int kho_mem_serialize(struct kho_serialization *ser) +static int kho_mem_serialize(struct kho_out *kho_out) { struct khoser_mem_chunk *first_chunk = NULL; struct khoser_mem_chunk *chunk = NULL; struct kho_mem_phys *physxa; unsigned long order;
- xa_for_each(&ser->track.orders, order, physxa) { + xa_for_each(&kho_out->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys;
@@ -397,7 +401,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) } }
- ser->preserved_mem_map = first_chunk; + kho_out->preserved_mem_map = first_chunk;
return 0;
@@ -658,7 +662,6 @@ static void __init kho_reserve_scratch(void)
/** * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. - * @ser: serialization control object passed by KHO notifiers. * @name: name of the sub tree. * @fdt: the sub tree blob. * @@ -672,34 +675,45 @@ static void __init kho_reserve_scratch(void) * * Return: 0 on success, error code on failure */ -int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) +int kho_add_subtree(const char *name, void *fdt) { - int err = 0; - u64 phys = (u64)virt_to_phys(fdt); - void *root = page_to_virt(ser->fdt); + struct kho_sub_fdt *sub_fdt; + int err;
- err |= fdt_begin_node(root, name); - err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); - err |= fdt_end_node(root); + sub_fdt = kmalloc(sizeof(*sub_fdt), GFP_KERNEL); + if (!sub_fdt) + return -ENOMEM;
- if (err) - return err; + INIT_LIST_HEAD(&sub_fdt->l); + sub_fdt->name = name; + sub_fdt->fdt = fdt;
- return kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false); + mutex_lock(&kho_out.fdts_lock); + list_add_tail(&sub_fdt->l, &kho_out.sub_fdts); + err = kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false); + mutex_unlock(&kho_out.fdts_lock); + + return err; } EXPORT_SYMBOL_GPL(kho_add_subtree);
-int register_kho_notifier(struct notifier_block *nb) +void kho_remove_subtree(void *fdt) { - return blocking_notifier_chain_register(&kho_out.chain_head, nb); -} -EXPORT_SYMBOL_GPL(register_kho_notifier); + struct kho_sub_fdt *sub_fdt; + + mutex_lock(&kho_out.fdts_lock); + list_for_each_entry(sub_fdt, &kho_out.sub_fdts, l) { + if (sub_fdt->fdt == fdt) { + list_del(&sub_fdt->l); + kfree(sub_fdt); + kho_debugfs_fdt_remove(&kho_out.dbg, fdt); + break; + } + } + mutex_unlock(&kho_out.fdts_lock);
-int unregister_kho_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); } -EXPORT_SYMBOL_GPL(unregister_kho_notifier); +EXPORT_SYMBOL_GPL(kho_remove_subtree);
/** * kho_preserve_folio - preserve a folio across kexec. @@ -714,7 +728,7 @@ int kho_preserve_folio(struct folio *folio) { const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track;
return __kho_preserve_order(track, pfn, order); } @@ -732,7 +746,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio); */ int kho_preserve_pages(struct page *page, unsigned int nr_pages) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn = start_pfn; @@ -828,7 +842,7 @@ static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur
static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) { - struct kho_mem_track *track = &kho_out.ser.track; + struct kho_mem_track *track = &kho_out.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
__kho_unpreserve(track, pfn, pfn + 1); @@ -1010,11 +1024,11 @@ EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
static int __kho_abort(void) { - int err; + int err = 0; unsigned long order; struct kho_mem_phys *physxa;
- xa_for_each(&kho_out.ser.track.orders, order, physxa) { + xa_for_each(&kho_out.track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys;
@@ -1024,17 +1038,13 @@ static int __kho_abort(void) xa_destroy(&physxa->phys_bits); kfree(physxa); } - xa_destroy(&kho_out.ser.track.orders); + xa_destroy(&kho_out.track.orders);
- if (kho_out.ser.preserved_mem_map) { - kho_mem_ser_free(kho_out.ser.preserved_mem_map); - kho_out.ser.preserved_mem_map = NULL; + if (kho_out.preserved_mem_map) { + kho_mem_ser_free(kho_out.preserved_mem_map); + kho_out.preserved_mem_map = NULL; }
- err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, - NULL); - err = notifier_to_errno(err); - if (err) pr_err("Failed to abort KHO finalization: %d\n", err);
@@ -1057,7 +1067,8 @@ int kho_abort(void) return ret;
kho_out.finalized = false; - kho_debugfs_cleanup(&kho_out.dbg); + + kho_debugfs_fdt_remove(&kho_out.dbg, kho_out.fdt);
return 0; } @@ -1066,41 +1077,46 @@ static int __kho_finalize(void) { int err = 0; u64 *preserved_mem_map; - void *fdt = page_to_virt(kho_out.ser.fdt); + void *root = kho_out.fdt; + struct kho_sub_fdt *fdt;
- err |= fdt_create(fdt, PAGE_SIZE); - err |= fdt_finish_reservemap(fdt); - err |= fdt_begin_node(fdt, ""); - err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); + err |= fdt_create(root, PAGE_SIZE); + err |= fdt_finish_reservemap(root); + err |= fdt_begin_node(root, ""); + err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); /** * Reserve the preserved-memory-map property in the root FDT, so * that all property definitions will precede subnodes created by * KHO callers. */ - err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP, + err |= fdt_property_placeholder(root, PROP_PRESERVED_MEMORY_MAP, sizeof(*preserved_mem_map), (void **)&preserved_mem_map); if (err) goto abort;
- err = kho_preserve_folio(page_folio(kho_out.ser.fdt)); + err = kho_preserve_folio(virt_to_folio(kho_out.fdt)); if (err) goto abort;
- err = blocking_notifier_call_chain(&kho_out.chain_head, - KEXEC_KHO_FINALIZE, &kho_out.ser); - err = notifier_to_errno(err); + err = kho_mem_serialize(&kho_out); if (err) goto abort;
- err = kho_mem_serialize(&kho_out.ser); - if (err) - goto abort; + *preserved_mem_map = (u64)virt_to_phys(kho_out.preserved_mem_map); + + mutex_lock(&kho_out.fdts_lock); + list_for_each_entry(fdt, &kho_out.sub_fdts, l) { + phys_addr_t phys = virt_to_phys(fdt->fdt);
- *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map); + err |= fdt_begin_node(root, fdt->name); + err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); + err |= fdt_end_node(root); + } + mutex_unlock(&kho_out.fdts_lock);
- err |= fdt_end_node(fdt); - err |= fdt_finish(fdt); + err |= fdt_end_node(root); + err |= fdt_finish(root);
abort: if (err) { @@ -1129,7 +1145,7 @@ int kho_finalize(void) kho_out.finalized = true;
return kho_debugfs_fdt_add(&kho_out.dbg, "fdt", - page_to_virt(kho_out.ser.fdt), true); + kho_out.fdt, true); }
bool kho_finalized(void) @@ -1212,15 +1228,17 @@ static __init int kho_init(void) { int err = 0; const void *fdt = kho_get_fdt(); + struct page *fdt_page;
if (!kho_enable) return 0;
- kho_out.ser.fdt = alloc_page(GFP_KERNEL); - if (!kho_out.ser.fdt) { + fdt_page = alloc_page(GFP_KERNEL); + if (!fdt_page) { err = -ENOMEM; goto err_free_scratch; } + kho_out.fdt = page_to_virt(fdt_page);
err = kho_debugfs_init(); if (err) @@ -1248,8 +1266,8 @@ static __init int kho_init(void) return 0;
err_free_fdt: - put_page(kho_out.ser.fdt); - kho_out.ser.fdt = NULL; + put_page(fdt_page); + kho_out.fdt = NULL; err_free_scratch: for (int i = 0; i < kho_scratch_cnt; i++) { void *start = __va(kho_scratch[i].addr); @@ -1260,7 +1278,7 @@ static __init int kho_init(void) kho_enable = false; return err; } -late_initcall(kho_init); +fs_initcall(kho_init);
static void __init kho_release_scratch(void) { @@ -1396,7 +1414,7 @@ int kho_fill_kimage(struct kimage *image) if (!kho_out.finalized) return 0;
- image->kho.fdt = page_to_phys(kho_out.ser.fdt); + image->kho.fdt = virt_to_phys(kho_out.fdt);
scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; scratch = (struct kexec_buf){ diff --git a/kernel/kexec_handover_debugfs.c b/kernel/kexec_handover_debugfs.c index 96fb9afd8af6..6ddcd17fac3c 100644 --- a/kernel/kexec_handover_debugfs.c +++ b/kernel/kexec_handover_debugfs.c @@ -61,14 +61,17 @@ int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt); }
-void kho_debugfs_cleanup(struct kho_debugfs *dbg) +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) { - struct fdt_debugfs *ff, *tmp; - - list_for_each_entry_safe(ff, tmp, &dbg->fdt_list, list) { - debugfs_remove(ff->file); - list_del(&ff->list); - kfree(ff); + struct fdt_debugfs *ff; + + list_for_each_entry(ff, &dbg->fdt_list, list) { + if (ff->wrapper.data == fdt) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + break; + } } }
diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h index 042c189af768..de90a678274d 100644 --- a/kernel/kexec_handover_internal.h +++ b/kernel/kexec_handover_internal.h @@ -30,7 +30,7 @@ void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); int kho_out_debugfs_init(struct kho_debugfs *dbg); int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, const void *fdt, bool root); -void kho_debugfs_cleanup(struct kho_debugfs *dbg); +void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt); #else static inline int kho_debugfs_init(void) { return 0; } static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, @@ -38,7 +38,8 @@ static inline void kho_in_debugfs_init(struct kho_debugfs *dbg, static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; } static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name, const void *fdt, bool root) { return 0; } -static inline void kho_debugfs_cleanup(struct kho_debugfs *dbg) {} +static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, + void *fdt) { } #endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */
#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/lib/test_kho.c b/lib/test_kho.c index 60cd899ea745..8d57049e8c8c 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -39,33 +39,17 @@ struct kho_test_state {
static struct kho_test_state kho_test_state;
-static int kho_test_notifier(struct notifier_block *self, unsigned long cmd, - void *v) +static int kho_test(void) { struct kho_test_state *state = &kho_test_state; - struct kho_serialization *ser = v; int err = 0;
- switch (cmd) { - case KEXEC_KHO_ABORT: - return NOTIFY_DONE; - case KEXEC_KHO_FINALIZE: - /* Handled below */ - break; - default: - return NOTIFY_BAD; - } - err |= kho_preserve_folio(state->fdt); - err |= kho_add_subtree(ser, KHO_TEST_FDT, folio_address(state->fdt)); + err |= kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt));
return err ? NOTIFY_BAD : NOTIFY_DONE; }
-static struct notifier_block kho_test_nb = { - .notifier_call = kho_test_notifier, -}; - static int kho_test_save_data(struct kho_test_state *state, void *fdt) { phys_addr_t *folios_info __free(kvfree) = NULL; @@ -102,6 +86,9 @@ static int kho_test_save_data(struct kho_test_state *state, void *fdt) if (!err) state->folios_info = no_free_ptr(folios_info);
+ if (!err) + err = kho_test(); + return err; }
@@ -203,14 +190,8 @@ static int kho_test_save(void) if (err) goto err_free_folios;
- err = register_kho_notifier(&kho_test_nb); - if (err) - goto err_free_fdt; - return 0;
-err_free_fdt: - folio_put(state->fdt); err_free_folios: kvfree(folios); return err; @@ -326,7 +307,6 @@ static void kho_test_cleanup(void)
static void __exit kho_test_exit(void) { - unregister_kho_notifier(&kho_test_nb); kho_test_cleanup(); } module_exit(kho_test_exit); diff --git a/mm/memblock.c b/mm/memblock.c index e23e16618e9b..e3bef9b35d63 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2444,53 +2444,18 @@ int reserve_mem_release_by_name(const char *name) #define MEMBLOCK_KHO_FDT "memblock" #define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" #define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" -static struct page *kho_fdt; - -static int reserve_mem_kho_finalize(struct kho_serialization *ser) -{ - int err = 0, i; - - for (i = 0; i < reserved_mem_count; i++) { - struct reserve_mem_table *map = &reserved_mem_table[i]; - struct page *page = phys_to_page(map->start); - unsigned int nr_pages = map->size >> PAGE_SHIFT; - - err |= kho_preserve_pages(page, nr_pages); - } - - err |= kho_preserve_folio(page_folio(kho_fdt)); - err |= kho_add_subtree(ser, MEMBLOCK_KHO_FDT, page_to_virt(kho_fdt)); - - return notifier_from_errno(err); -} - -static int reserve_mem_kho_notifier(struct notifier_block *self, - unsigned long cmd, void *v) -{ - switch (cmd) { - case KEXEC_KHO_FINALIZE: - return reserve_mem_kho_finalize((struct kho_serialization *)v); - case KEXEC_KHO_ABORT: - return NOTIFY_DONE; - default: - return NOTIFY_BAD; - } -} - -static struct notifier_block reserve_mem_kho_nb = { - .notifier_call = reserve_mem_kho_notifier, -};
static int __init prepare_kho_fdt(void) { int err = 0, i; + struct page *fdt_page; void *fdt;
- kho_fdt = alloc_page(GFP_KERNEL); - if (!kho_fdt) + fdt_page = alloc_page(GFP_KERNEL); + if (!fdt_page) return -ENOMEM;
- fdt = page_to_virt(kho_fdt); + fdt = page_to_virt(fdt_page);
err |= fdt_create(fdt, PAGE_SIZE); err |= fdt_finish_reservemap(fdt); @@ -2499,7 +2464,10 @@ static int __init prepare_kho_fdt(void) err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); for (i = 0; i < reserved_mem_count; i++) { struct reserve_mem_table *map = &reserved_mem_table[i]; + struct page *page = phys_to_page(map->start); + unsigned int nr_pages = map->size >> PAGE_SHIFT;
+ err |= kho_preserve_pages(page, nr_pages); err |= fdt_begin_node(fdt, map->name); err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); @@ -2507,13 +2475,16 @@ static int __init prepare_kho_fdt(void) err |= fdt_end_node(fdt); } err |= fdt_end_node(fdt); - err |= fdt_finish(fdt);
+ err |= kho_preserve_folio(page_folio(fdt_page)); + + if (!err) + err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + if (err) { pr_err("failed to prepare memblock FDT for KHO: %d\n", err); - put_page(kho_fdt); - kho_fdt = NULL; + put_page(fdt_page); }
return err; @@ -2529,13 +2500,6 @@ static int __init reserve_mem_init(void) err = prepare_kho_fdt(); if (err) return err; - - err = register_kho_notifier(&reserve_mem_kho_nb); - if (err) { - put_page(kho_fdt); - kho_fdt = NULL; - } - return err; } late_initcall(reserve_mem_init);
Allow users of KHO to cancel a previous preservation by adding the necessary interfaces to unpreserve folios and pages.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- include/linux/kexec_handover.h | 12 +++++ kernel/kexec_handover.c | 85 ++++++++++++++++++++++++++++------ 2 files changed, 84 insertions(+), 13 deletions(-)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 2faf290803ce..4ba145713838 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -43,7 +43,9 @@ bool kho_is_enabled(void); bool is_kho_boot(void);
int kho_preserve_folio(struct folio *folio); +int kho_unpreserve_folio(struct folio *folio); int kho_preserve_pages(struct page *page, unsigned int nr_pages); +int kho_unpreserve_pages(struct page *page, unsigned int nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); @@ -76,11 +78,21 @@ static inline int kho_preserve_folio(struct folio *folio) return -EOPNOTSUPP; }
+static inline int kho_unpreserve_folio(struct folio *folio) +{ + return -EOPNOTSUPP; +} + static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) { return -EOPNOTSUPP; }
+static inline int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + return -EOPNOTSUPP; +} + static inline int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) { diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index e6890fe6a171..78acaa218c12 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -153,26 +153,33 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) return elm; }
-static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, - unsigned long end_pfn) +static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) { struct kho_mem_phys_bits *bits; struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order;
- while (pfn < end_pfn) { - const unsigned int order = - min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - const unsigned long pfn_high = pfn >> order; + physxa = xa_load(&track->orders, order); + if (!physxa) + return; + + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (!bits) + return;
- physxa = xa_load(&track->orders, order); - if (!physxa) - continue; + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); +} + +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + unsigned int order;
- bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (!bits) - continue; + while (pfn < end_pfn) { + order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
- clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + __kho_unpreserve_order(track, pfn, order);
pfn += 1 << order; } @@ -734,6 +741,30 @@ int kho_preserve_folio(struct folio *folio) } EXPORT_SYMBOL_GPL(kho_preserve_folio);
+/** + * kho_unpreserve_folio - unpreserve a folio. + * @folio: folio to unpreserve. + * + * Instructs KHO to unpreserve a folio that was preserved by + * kho_preserve_folio() before. The provided @folio (pfn and order) + * must exactly match a previously preserved folio. + * + * Return: 0 on success, error code on failure + */ +int kho_unpreserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.track; + + if (kho_out.finalized) + return -EBUSY; + + __kho_unpreserve_order(track, pfn, order); + return 0; +} +EXPORT_SYMBOL_GPL(kho_unpreserve_folio); + /** * kho_preserve_pages - preserve contiguous pages across kexec * @page: first page in the list. @@ -773,6 +804,34 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_preserve_pages);
+/** + * kho_unpreserve_pages - unpreserve contiguous pages. + * @page: first page in the list. + * @nr_pages: number of pages. + * + * Instructs KHO to unpreserve @nr_pages contigious pages starting from @page. + * This call must exactly match a granularity at which memory was originally + * preserved by kho_preserve_pages, call with the same @page and + * @nr_pages). Unpreserving arbitrary sub-ranges of larger preserved blocks is + * not supported. + * + * Return: 0 on success, error code on failure + */ +int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + struct kho_mem_track *track = &kho_out.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + + if (kho_out.finalized) + return -EBUSY; + + __kho_unpreserve(track, start_pfn, end_pfn); + + return 0; +} +EXPORT_SYMBOL_GPL(kho_unpreserve_pages); + struct kho_vmalloc_hdr { DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); };
KHO allows clients to preserve memory regions at any point before the KHO state is finalized. The finalization process itself involves KHO performing its own actions, such as serializing the overall preserved memory map.
If this finalization process is aborted, the current implementation destroys KHO's internal memory tracking structures (`kho_out.track.orders`). This behavior effectively unpreserves all memory from KHO's perspective, regardless of whether those preservations were made by clients before the finalization attempt or by KHO itself during finalization.
This premature unpreservation is incorrect. An abort of the finalization process should only undo actions taken by KHO as part of that specific finalization attempt. Individual memory regions preserved by clients prior to finalization should remain preserved, as their lifecycle is managed by the clients themselves. These clients might still need to call kho_unpreserve_folio() or kho_unpreserve_pages() based on their own logic, even after a KHO finalization attempt is aborted.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- kernel/kexec_handover.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-)
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 78acaa218c12..89a8f839346b 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1083,31 +1083,12 @@ EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
static int __kho_abort(void) { - int err = 0; - unsigned long order; - struct kho_mem_phys *physxa; - - xa_for_each(&kho_out.track.orders, order, physxa) { - struct kho_mem_phys_bits *bits; - unsigned long phys; - - xa_for_each(&physxa->phys_bits, phys, bits) - kfree(bits); - - xa_destroy(&physxa->phys_bits); - kfree(physxa); - } - xa_destroy(&kho_out.track.orders); - if (kho_out.preserved_mem_map) { kho_mem_ser_free(kho_out.preserved_mem_map); kho_out.preserved_mem_map = NULL; }
- if (err) - pr_err("Failed to abort KHO finalization: %d\n", err); - - return err; + return 0; }
int kho_abort(void)
Move KHO to kernel/liveupdate/ in preparation for placing all Live Update core kernel files in the same place.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Reviewed-by: Jason Gunthorpe jgg@nvidia.com Reviewed-by: Mike Rapoport (Microsoft) rppt@kernel.org --- Documentation/core-api/kho/concepts.rst | 2 +- MAINTAINERS | 2 +- init/Kconfig | 2 ++ kernel/Kconfig.kexec | 25 ---------------- kernel/Makefile | 3 +- kernel/liveupdate/Kconfig | 30 +++++++++++++++++++ kernel/liveupdate/Makefile | 4 +++ kernel/{ => liveupdate}/kexec_handover.c | 6 ++-- .../{ => liveupdate}/kexec_handover_debugfs.c | 0 .../kexec_handover_internal.h | 0 10 files changed, 42 insertions(+), 32 deletions(-) create mode 100644 kernel/liveupdate/Kconfig create mode 100644 kernel/liveupdate/Makefile rename kernel/{ => liveupdate}/kexec_handover.c (99%) rename kernel/{ => liveupdate}/kexec_handover_debugfs.c (100%) rename kernel/{ => liveupdate}/kexec_handover_internal.h (100%)
diff --git a/Documentation/core-api/kho/concepts.rst b/Documentation/core-api/kho/concepts.rst index 36d5c05cfb30..d626d1dbd678 100644 --- a/Documentation/core-api/kho/concepts.rst +++ b/Documentation/core-api/kho/concepts.rst @@ -70,5 +70,5 @@ in the FDT. That state is called the KHO finalization phase.
Public API ========== -.. kernel-doc:: kernel/kexec_handover.c +.. kernel-doc:: kernel/liveupdate/kexec_handover.c :export: diff --git a/MAINTAINERS b/MAINTAINERS index 54f627a639b1..0e7fa104422c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13782,7 +13782,7 @@ S: Maintained F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h -F: kernel/kexec_handover* +F: kernel/liveupdate/kexec_handover* F: tools/testing/selftests/kho/
KEYS-ENCRYPTED diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49..0605de5d96c0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2138,6 +2138,8 @@ config TRACEPOINTS
source "kernel/Kconfig.kexec"
+source "kernel/liveupdate/Kconfig" + endmenu # General setup
source "arch/Kconfig" diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 03c3aa6263d3..15632358bcf7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -94,31 +94,6 @@ config KEXEC_JUMP Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC
-config KEXEC_HANDOVER - bool "kexec handover" - depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE - depends on !DEFERRED_STRUCT_PAGE_INIT - select MEMBLOCK_KHO_SCRATCH - select KEXEC_FILE - select DEBUG_FS - select LIBFDT - select CMA - help - Allow kexec to hand over state across kernels by generating and - passing additional metadata to the target kernel. This is useful - to keep data or state alive across the kexec. For this to work, - both source and target kernels need to have this option enabled. - -config KEXEC_HANDOVER_DEBUGFS - bool "kexec handover debugfs interface" - depends on KEXEC_HANDOVER - depends on DEBUG_FS - help - Allow to control kexec handover device tree via debugfs - interface, i.e. finalize the state or aborting the finalization. - Also, enables inspecting the KHO fdt trees with the debugfs binary - blobs. - config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index 06bfe691439b..e83669841b8c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -52,6 +52,7 @@ obj-y += printk/ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ +obj-y += liveupdate/ obj-y += dma/ obj-y += entry/ obj-y += unwind/ @@ -82,8 +83,6 @@ obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o -obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o -obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig new file mode 100644 index 000000000000..cea287842475 --- /dev/null +++ b/kernel/liveupdate/Kconfig @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Live Update and Kexec HandOver" + +config KEXEC_HANDOVER + bool "kexec handover" + depends on 
ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + depends on !DEFERRED_STRUCT_PAGE_INIT + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select DEBUG_FS + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + +config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + depends on KEXEC_HANDOVER + depends on DEBUG_FS + help + Allow to control kexec handover device tree via debugfs + interface, i.e. finalize the state or aborting the finalization. + Also, enables inspecting the KHO fdt trees with the debugfs binary + blobs. + +endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile new file mode 100644 index 000000000000..fcdf163b4b0e --- /dev/null +++ b/kernel/liveupdate/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o diff --git a/kernel/kexec_handover.c b/kernel/liveupdate/kexec_handover.c similarity index 99% rename from kernel/kexec_handover.c rename to kernel/liveupdate/kexec_handover.c index 89a8f839346b..c87d00c40c82 100644 --- a/kernel/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -24,8 +24,8 @@ * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. */ -#include "../mm/internal.h" -#include "kexec_internal.h" +#include "../../mm/internal.h" +#include "../kexec_internal.h" #include "kexec_handover_internal.h"
#define KHO_FDT_COMPATIBLE "kho-v1" @@ -1124,7 +1124,7 @@ static int __kho_finalize(void) err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - /** + /* * Reserve the preserved-memory-map property in the root FDT, so * that all property definitions will precede subnodes created by * KHO callers. diff --git a/kernel/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c similarity index 100% rename from kernel/kexec_handover_debugfs.c rename to kernel/liveupdate/kexec_handover_debugfs.c diff --git a/kernel/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h similarity index 100% rename from kernel/kexec_handover_internal.h rename to kernel/liveupdate/kexec_handover_internal.h
Now that LUO and KHO both live under kernel/liveupdate, it makes sense to also move the KHO debugfs files to liveupdate/.
The old names: /sys/kernel/debug/kho/out/ /sys/kernel/debug/kho/in/
The new names: /sys/kernel/debug/liveupdate/kho_out/ /sys/kernel/debug/liveupdate/kho_in/
Also, export liveupdate_debugfs_root, so that future LUO selftests can use it as well.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- kernel/liveupdate/kexec_handover_debugfs.c | 10 +++++----- kernel/liveupdate/kexec_handover_internal.h | 2 ++ tools/testing/selftests/kho/init.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index 6ddcd17fac3c..63447564dbe8 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -15,7 +15,7 @@ #include <linux/mm.h> #include "kexec_handover_internal.h"
-static struct dentry *debugfs_root; +struct dentry *liveupdate_debugfs_root;
struct fdt_debugfs { struct list_head list; @@ -115,7 +115,7 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt)
INIT_LIST_HEAD(&dbg->fdt_list);
- dir = debugfs_create_dir("in", debugfs_root); + dir = debugfs_create_dir("kho_in", liveupdate_debugfs_root); if (IS_ERR(dir)) { err = PTR_ERR(dir); goto err_out; @@ -175,7 +175,7 @@ __init int kho_out_debugfs_init(struct kho_debugfs *dbg)
INIT_LIST_HEAD(&dbg->fdt_list);
- dir = debugfs_create_dir("out", debugfs_root); + dir = debugfs_create_dir("kho_out", liveupdate_debugfs_root); if (IS_ERR(dir)) return -ENOMEM;
@@ -209,8 +209,8 @@ __init int kho_out_debugfs_init(struct kho_debugfs *dbg)
__init int kho_debugfs_init(void) { - debugfs_root = debugfs_create_dir("kho", NULL); - if (IS_ERR(debugfs_root)) + liveupdate_debugfs_root = debugfs_create_dir("liveupdate", NULL); + if (IS_ERR(liveupdate_debugfs_root)) return -ENOENT; return 0; } diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index de90a678274d..b3fc1957affa 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -15,6 +15,8 @@ struct kho_debugfs { struct list_head fdt_list; };
+extern struct dentry *liveupdate_debugfs_root; + #else struct kho_debugfs {}; #endif diff --git a/tools/testing/selftests/kho/init.c b/tools/testing/selftests/kho/init.c index 6d9e91d55d68..f0136a30ce8b 100644 --- a/tools/testing/selftests/kho/init.c +++ b/tools/testing/selftests/kho/init.c @@ -11,7 +11,7 @@ /* from arch/x86/include/asm/setup.h */ #define COMMAND_LINE_SIZE 2048
-#define KHO_FINALIZE "/debugfs/kho/out/finalize" +#define KHO_FINALIZE "/debugfs/liveupdate/kho_out/finalize" #define KERNEL_IMAGE "/kernel"
static int mount_filesystems(void)
It is invalid for KHO metadata or preserved memory regions to be located within the KHO scratch area, as this area is overwritten when the next kernel is loaded, and used early in boot by the next kernel. This can lead to memory corruption.
Add checks to kho_preserve_*() and KHO's internal metadata allocators (xa_load_or_alloc(), new_chunk()) to verify that the physical address of the memory does not overlap with any defined scratch region. If an overlap is detected, the operation fails and a WARN_ON() is triggered. To avoid performance overhead in production kernels, these checks are enabled only when CONFIG_KEXEC_HANDOVER_DEBUG is selected.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- kernel/liveupdate/Kconfig | 8 ++++ kernel/liveupdate/Makefile | 1 + kernel/liveupdate/kexec_handover.c | 52 ++++++++++++++------- kernel/liveupdate/kexec_handover_debug.c | 25 ++++++++++ kernel/liveupdate/kexec_handover_internal.h | 9 ++++ 5 files changed, 78 insertions(+), 17 deletions(-) create mode 100644 kernel/liveupdate/kexec_handover_debug.c
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig index cea287842475..851d1a22b4c5 100644 --- a/kernel/liveupdate/Kconfig +++ b/kernel/liveupdate/Kconfig @@ -27,4 +27,12 @@ config KEXEC_HANDOVER_DEBUGFS Also, enables inspecting the KHO fdt trees with the debugfs binary blobs.
+config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER_DEBUGFS + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile index fcdf163b4b0e..f52ce1ebcf86 100644 --- a/kernel/liveupdate/Makefile +++ b/kernel/liveupdate/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index c87d00c40c82..ebfc31814d16 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -8,6 +8,7 @@
#define pr_fmt(fmt) "KHO: " fmt
+#include <linux/cleanup.h> #include <linux/cma.h> #include <linux/count_zeros.h> #include <linux/kexec.h> @@ -131,26 +132,26 @@ static struct kho_out kho_out = {
static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) { - void *elm, *res; + void *res = xa_load(xa, index);
- elm = xa_load(xa, index); - if (elm) - return elm; + if (res) + return res; + + void *elm __free(kfree) = kzalloc(sz, GFP_KERNEL);
- elm = kzalloc(sz, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM);
+ if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), sz))) + return ERR_PTR(-EINVAL); + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); if (xa_is_err(res)) - res = ERR_PTR(xa_err(res)); - - if (res) { - kfree(elm); + return ERR_PTR(xa_err(res)); + else if (res) return res; - }
- return elm; + return no_free_ptr(elm); }
static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, @@ -350,15 +351,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk; + struct khoser_mem_chunk *chunk __free(kfree) = NULL;
chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!chunk) - return NULL; + return ERR_PTR(-ENOMEM); + + if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + chunk->hdr.order = order; if (cur_chunk) KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return chunk; + return no_free_ptr(chunk); }
static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) @@ -379,14 +384,17 @@ static int kho_mem_serialize(struct kho_out *kho_out) struct khoser_mem_chunk *chunk = NULL; struct kho_mem_phys *physxa; unsigned long order; + int ret = -ENOMEM;
xa_for_each(&kho_out->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys;
chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + ret = PTR_ERR(chunk); goto err_free; + }
if (!first_chunk) first_chunk = chunk; @@ -396,8 +404,10 @@ static int kho_mem_serialize(struct kho_out *kho_out)
if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + ret = PTR_ERR(chunk); goto err_free; + } }
elm = &chunk->bitmaps[chunk->hdr.num_elms]; @@ -414,7 +424,7 @@ static int kho_mem_serialize(struct kho_out *kho_out)
err_free: kho_mem_ser_free(first_chunk); - return -ENOMEM; + return ret; }
static void __init deserialize_bitmap(unsigned int order, @@ -737,6 +747,9 @@ int kho_preserve_folio(struct folio *folio) const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.track;
+ if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) + return -EINVAL; + return __kho_preserve_order(track, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -784,6 +797,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) unsigned long failed_pfn = 0; int err = 0;
+ if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT))) { + return -EINVAL; + } + while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); diff --git a/kernel/liveupdate/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c new file mode 100644 index 000000000000..7986dcc63047 --- /dev/null +++ b/kernel/liveupdate/kexec_handover_debug.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debug.c - kexec handover optional debug functionality + * Copyright (C) 2025 Google LLC, Pasha Tatashin pasha.tatashin@soleen.com + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include "kexec_handover_internal.h" + +bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + phys_addr_t scratch_start, scratch_end; + unsigned int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + scratch_start = kho_scratch[i].addr; + scratch_end = kho_scratch[i].addr + kho_scratch[i].size - 1; + + if (phys <= scratch_end && (phys + size) > scratch_start) + return true; + } + + return false; +} diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index b3fc1957affa..92798346fa5a 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -44,4 +44,13 @@ static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) { } #endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + #endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
Metadata is preserved in 512-byte bitmaps, which requires allocating them from slab. Slab is not safe to use with KHO because of KFENCE, and because partial slabs may lead to leaks into the next kernel. Change the bitmap size to PAGE_SIZE.
While this change could potentially increase metadata overhead on systems with sparsely preserved memory, this is being mitigated by ongoing work to reduce sparseness during preservation via 1G guest pages. Furthermore, this change aligns with future work on a stateless KHO, which will also use page-sized bitmaps for its radix tree metadata.
Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com --- kernel/liveupdate/kexec_handover.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index ebfc31814d16..7c8e89a6b953 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -67,10 +67,10 @@ early_param("kho", kho_parse_enable); * Keep track of memory that is to be preserved across KHO. * * The serializing side uses two levels of xarrays to manage chunks of per-order - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most - * 512K of bitmap memory will be needed for order 0. + * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order + * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 + * allocations each bitmap will cover 128M of address space. Thus, for 16G of + * memory at most 512K of bitmap memory will be needed for order 0. * * This approach is fully incremental, as the serialization progresses folios * can continue be aggregated to the tracker. The final step, immediately prior @@ -78,12 +78,14 @@ early_param("kho", kho_parse_enable); * successor kernel to parse. */
-#define PRESERVE_BITS (512 * 8) +#define PRESERVE_BITS (PAGE_SIZE * 8)
struct kho_mem_phys_bits { DECLARE_BITMAP(preserve, PRESERVE_BITS); };
+static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); + struct kho_mem_phys { /* * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized @@ -130,19 +132,19 @@ static struct kho_out kho_out = { .finalized = false, };
-static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) { void *res = xa_load(xa, index);
if (res) return res;
- void *elm __free(kfree) = kzalloc(sz, GFP_KERNEL); + void *elm __free(kfree) = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!elm) return ERR_PTR(-ENOMEM);
- if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), sz))) + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) return ERR_PTR(-EINVAL);
res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); @@ -222,8 +224,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } }
- bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, - sizeof(*bits)); + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); if (IS_ERR(bits)) return PTR_ERR(bits);
KHO allocates metadata for its preserved memory map using the slab allocator via kzalloc(). This metadata is temporary and is used by the next kernel during early boot to find preserved memory.
A problem arises when KFENCE is enabled. kzalloc() calls can be randomly intercepted by kfence_alloc(), which services the allocation from a dedicated KFENCE memory pool. This pool is allocated early in boot via memblock.
When booting via KHO, the memblock allocator is restricted to a "scratch area", forcing the KFENCE pool to be allocated within it. This creates a conflict, as the scratch area is expected to be ephemeral and overwritable by a subsequent kexec. If KHO metadata is placed in this KFENCE pool, it leads to memory corruption when the next kernel is loaded.
To fix this, modify KHO to allocate its metadata directly from the buddy allocator instead of slab.
Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin pasha.tatashin@soleen.com Reviewed-by: Pratyush Yadav pratyush@kernel.org --- kernel/liveupdate/kexec_handover.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 7c8e89a6b953..92662739a3a2 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -132,6 +132,8 @@ static struct kho_out kho_out = { .finalized = false, };
+DEFINE_FREE(kho_free_page, void *, free_page((unsigned long)_T)) + static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) { void *res = xa_load(xa, index); @@ -139,7 +141,7 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) if (res) return res;
- void *elm __free(kfree) = kzalloc(PAGE_SIZE, GFP_KERNEL); + void *elm __free(kho_free_page) = (void *)get_zeroed_page(GFP_KERNEL);
if (!elm) return ERR_PTR(-ENOMEM); @@ -352,9 +354,9 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk __free(kfree) = NULL; + struct khoser_mem_chunk *chunk __free(kho_free_page) = NULL;
- chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + chunk = (void *)get_zeroed_page(GFP_KERNEL); if (!chunk) return ERR_PTR(-ENOMEM);
linux-kselftest-mirror@lists.linaro.org