Add an API to enable the PCI subsystem to track all devices that are preserved across a Live Update, including both incoming devices (passed from the previous kernel) and outgoing devices (passed to the next kernel).
Use PCI segment number and BDF to keep track of devices across Live Update. This means the kernel must keep both identifiers constant across a Live Update for any preserved device. VFs are not supported for now, since that requires preserving SR-IOV state on the device to ensure the same number of VFs appear after kexec and with the same BDFs.
Drivers that preserve devices across Live Update can now register their struct liveupdate_file_handler with the PCI subsystem so that the PCI subsystem can allocate and manage File-Lifecycle-Bound (FLB) global data to track the list of incoming and outgoing preserved devices.
  pci_liveupdate_register_fh(driver_fh)
  pci_liveupdate_unregister_fh(driver_fh)
Drivers can notify the PCI subsystem whenever a device is preserved or unpreserved with the following APIs:
  pci_liveupdate_outgoing_preserve(pci_dev)
  pci_liveupdate_outgoing_unpreserve(pci_dev)
After a Live Update, the PCI subsystem can fetch the FLB global data passed by the previous kernel from the Live Update Orchestrator (LUO) to determine which devices are preserved. The following API is also made available to drivers so they can check whether a device was preserved before userspace retrieves the file for it:
pci_liveupdate_incoming_is_preserved(pci_dev)
Once a driver has finished restoring an incoming preserved device, it can notify the PCI subsystem with the following call:
pci_liveupdate_incoming_finish(pci_dev)
This will be used in subsequent commits by the vfio-pci driver to preserve VFIO devices across Live Update.
Signed-off-by: David Matlack <dmatlack@google.com>
---
 drivers/pci/Makefile        |   1 +
 drivers/pci/liveupdate.c    | 344 ++++++++++++++++++++++++++++++++++++
 include/linux/kho/abi/pci.h |  54 ++++++
 include/linux/pci.h         |  39 ++++
 4 files changed, 438 insertions(+)
 create mode 100644 drivers/pci/liveupdate.c
 create mode 100644 include/linux/kho/abi/pci.h

diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 67647f1880fb..0cb43e10e71d 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_PROC_FS)		+= proc.o
 obj-$(CONFIG_SYSFS)		+= pci-sysfs.o slot.o
 obj-$(CONFIG_ACPI)		+= pci-acpi.o
 obj-$(CONFIG_GENERIC_PCI_IOMAP) += iomap.o
+obj-$(CONFIG_LIVEUPDATE)	+= liveupdate.o
 endif
 
 obj-$(CONFIG_OF)		+= of.o
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
new file mode 100644
index 000000000000..f9bb97f3bada
--- /dev/null
+++ b/drivers/pci/liveupdate.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * David Matlack <dmatlack@google.com>
+ */
+
+#include <linux/bsearch.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/pci.h>
+#include <linux/limits.h>
+#include <linux/liveupdate.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/sort.h>
+#include <linux/string.h>
+
+/* Serializes updates to the outgoing list of preserved devices. */
+static DEFINE_MUTEX(pci_flb_outgoing_lock);
+/* Serializes lookups and removals on the incoming list of preserved devices. */
+static DEFINE_MUTEX(pci_flb_incoming_lock);
+
+/*
+ * Allocate the zeroed folio backing the outgoing struct pci_ser and preserve
+ * it with KHO. On success, args->obj holds the folio's virtual address and
+ * args->data its physical address.
+ */
+static int pci_flb_preserve(struct liveupdate_flb_op_args *args)
+{
+	struct pci_dev *dev = NULL;
+	struct folio *folio;
+	unsigned int order;
+	int nr_devices = 0;
+	int ret;
+
+	/*
+	 * Calculate the maximum number of devices based on what's present
+	 * on the system currently (including VFs) to size the folio holding
+	 * struct pci_ser. This is not perfect given devices could be
+	 * hotplugged, but it's also unlikely that all devices in the system are
+	 * going to be preserved anyway.
+	 */
+	for_each_pci_dev(dev) {
+		if (dev->is_virtfn)
+			continue;
+
+		nr_devices += 1 + pci_sriov_get_totalvfs(dev);
+	}
+
+	order = get_order(offsetof(struct pci_ser, devices[nr_devices + 1]));
+
+	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
+	if (!folio)
+		return -ENOMEM;
+
+	ret = kho_preserve_folio(folio);
+	if (ret) {
+		folio_put(folio);
+		return ret;
+	}
+
+	args->obj = folio_address(folio);
+	args->data = virt_to_phys(args->obj);
+
+	return 0;
+}
+
+/* Undo pci_flb_preserve(): drop the KHO preservation and free the folio. */
+static void pci_flb_unpreserve(struct liveupdate_flb_op_args *args)
+{
+	struct pci_ser *ser = args->obj;
+	struct folio *folio = virt_to_folio(ser);
+
+	/* Every device should have been unpreserved by now. */
+	WARN_ON_ONCE(ser->nr_devices);
+	kho_unpreserve_folio(folio);
+	folio_put(folio);
+}
+
+/*
+ * Restore the folio at physical address args->data from KHO and expose it
+ * through args->obj. Panics if the folio cannot be restored.
+ */
+static int pci_flb_retrieve(struct liveupdate_flb_op_args *args)
+{
+	struct folio *folio;
+
+	folio = kho_restore_folio(args->data);
+	if (!folio)
+		panic("Unable to restore preserved FLB data from KHO (0x%llx)\n", args->data);
+
+	args->obj = folio_address(folio);
+	return 0;
+}
+
+/* Free the incoming struct pci_ser folio. */
+static void pci_flb_finish(struct liveupdate_flb_op_args *args)
+{
+	struct pci_ser *ser = args->obj;
+
+	/*
+	 * Sanity check that all devices have been finished via
+	 * pci_liveupdate_incoming_finish().
+	 */
+	WARN_ON_ONCE(ser->nr_devices);
+	folio_put(virt_to_folio(ser));
+}
+
+static struct liveupdate_flb_ops pci_liveupdate_flb_ops = {
+	.preserve = pci_flb_preserve,
+	.unpreserve = pci_flb_unpreserve,
+	.retrieve = pci_flb_retrieve,
+	.finish = pci_flb_finish,
+	.owner = THIS_MODULE,
+};
+
+static struct liveupdate_flb pci_liveupdate_flb = {
+	.ops = &pci_liveupdate_flb_ops,
+	.compatible = PCI_LUO_FLB_COMPATIBLE,
+};
+
+#define INIT_PCI_DEV_SER(_dev) {		\
+	.domain = pci_domain_nr((_dev)->bus),	\
+	.bdf = pci_dev_id(_dev),		\
+}
+
+/*
+ * struct pci_dev_ser stores the domain in 16 bits. A device in a larger
+ * domain cannot be represented; truncating its domain could make it alias a
+ * different device, so such devices are treated as never preserved.
+ */
+static bool pci_dev_ser_representable(struct pci_dev *dev)
+{
+	int domain = pci_domain_nr(dev->bus);
+
+	return domain >= 0 && domain <= U16_MAX;
+}
+
+/*
+ * Sort key for a struct pci_dev_ser: domain in the upper 16 bits, BDF in the
+ * lower 16 bits. The key is built as a u32 because a u16 domain promotes to
+ * int, and shifting a domain >= 0x8000 left by 16 would produce a negative
+ * value that sorts before domain 0.
+ */
+static u32 pci_dev_ser_key(const struct pci_dev_ser *dev_ser)
+{
+	return (u32)dev_ser->domain << 16 | dev_ser->bdf;
+}
+
+/* bsearch()-style comparator: ascending by domain, then by BDF. */
+static int pci_dev_ser_cmp(const void *__a, const void *__b)
+{
+	u32 a = pci_dev_ser_key(__a), b = pci_dev_ser_key(__b);
+
+	return cmp_int(a, b);
+}
+
+/* Return the entry for @dev in @ser, or NULL if @dev is not listed. */
+static struct pci_dev_ser *pci_ser_find(struct pci_ser *ser, struct pci_dev *dev)
+{
+	const struct pci_dev_ser key = INIT_PCI_DEV_SER(dev);
+
+	if (!pci_dev_ser_representable(dev))
+		return NULL;
+
+	return bsearch(&key, ser->devices, ser->nr_devices,
+		       sizeof(key), pci_dev_ser_cmp);
+}
+
+/*
+ * Remove @dev from @ser, keeping the remaining entries sorted. Returns 0 on
+ * success or -ENOENT if @dev is not listed.
+ */
+static int pci_ser_delete(struct pci_ser *ser, struct pci_dev *dev)
+{
+	struct pci_dev_ser *dev_ser;
+	u64 nr_after;
+
+	dev_ser = pci_ser_find(ser, dev);
+	if (!dev_ser)
+		return -ENOENT;
+
+	/* Close the gap by shifting the entries after @dev down by one. */
+	nr_after = ser->nr_devices - (dev_ser - ser->devices) - 1;
+	memmove(dev_ser, dev_ser + 1, nr_after * sizeof(*dev_ser));
+
+	ser->nr_devices--;
+	return 0;
+}
+
+/* Number of struct pci_dev_ser entries that fit in the folio backing @ser. */
+static u64 max_nr_devices(struct pci_ser *ser)
+{
+	u64 size;
+
+	size = folio_size(virt_to_folio(ser));
+	size -= offsetof(struct pci_ser, devices);
+
+	return size / sizeof(struct pci_dev_ser);
+}
+
+/**
+ * pci_liveupdate_outgoing_preserve - Preserve a device for the next kernel
+ * @dev: The device to preserve.
+ *
+ * Adds @dev to the sorted list of outgoing preserved devices. Preserving a
+ * device that is already on the list is a no-op.
+ *
+ * Return: 0 on success, -EINVAL if @dev is a VF or its domain does not fit
+ * the 16-bit ABI field, -E2BIG if the list is full, or the error returned by
+ * liveupdate_flb_get_outgoing().
+ */
+int pci_liveupdate_outgoing_preserve(struct pci_dev *dev)
+{
+	struct pci_dev_ser new = INIT_PCI_DEV_SER(dev);
+	struct pci_ser *ser;
+	u64 i;
+	int ret;
+
+	/* VFs are not supported yet due to BDF instability across kexec */
+	if (dev->is_virtfn)
+		return -EINVAL;
+
+	if (!pci_dev_ser_representable(dev))
+		return -EINVAL;
+
+	guard(mutex)(&pci_flb_outgoing_lock);
+
+	ret = liveupdate_flb_get_outgoing(&pci_liveupdate_flb, (void **)&ser);
+	if (ret)
+		return ret;
+
+	/* Check for a duplicate first so a full list does not fail a no-op. */
+	if (pci_ser_find(ser, dev))
+		return 0;
+
+	if (ser->nr_devices >= max_nr_devices(ser))
+		return -E2BIG;
+
+	/* Shift larger entries up by one to keep the list sorted. */
+	for (i = ser->nr_devices; i > 0; i--) {
+		struct pci_dev_ser *prev = &ser->devices[i - 1];
+
+		if (pci_dev_ser_cmp(&new, prev) > 0)
+			break;
+
+		ser->devices[i] = *prev;
+	}
+
+	ser->devices[i] = new;
+	ser->nr_devices++;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_outgoing_preserve);
+
+/**
+ * pci_liveupdate_outgoing_unpreserve - Undo pci_liveupdate_outgoing_preserve()
+ * @dev: The device to remove from the outgoing list.
+ *
+ * Warns if the outgoing data is unavailable or @dev is not on the list.
+ */
+void pci_liveupdate_outgoing_unpreserve(struct pci_dev *dev)
+{
+	struct pci_ser *ser;
+	int ret;
+
+	guard(mutex)(&pci_flb_outgoing_lock);
+
+	ret = liveupdate_flb_get_outgoing(&pci_liveupdate_flb, (void **)&ser);
+	if (WARN_ON_ONCE(ret))
+		return;
+
+	WARN_ON_ONCE(pci_ser_delete(ser, dev));
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_outgoing_unpreserve);
+
+/**
+ * pci_liveupdate_incoming_is_preserved - Check if a device is preserved
+ * @dev: The device to look up.
+ *
+ * Return: true if @dev is on the incoming list and has not been finished yet,
+ * false otherwise (including when no incoming data is available).
+ */
+bool pci_liveupdate_incoming_is_preserved(struct pci_dev *dev)
+{
+	struct pci_ser *ser;
+	int ret;
+
+	guard(mutex)(&pci_flb_incoming_lock);
+
+	ret = liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&ser);
+	if (ret)
+		return false;
+
+	return pci_ser_find(ser, dev) != NULL;
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_incoming_is_preserved);
+
+/**
+ * pci_liveupdate_incoming_finish - Finish restoring an incoming device
+ * @dev: The device to remove from the incoming list.
+ *
+ * Warns if the incoming data is unavailable or @dev is not on the list.
+ */
+void pci_liveupdate_incoming_finish(struct pci_dev *dev)
+{
+	struct pci_ser *ser;
+	int ret;
+
+	guard(mutex)(&pci_flb_incoming_lock);
+
+	ret = liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&ser);
+	if (WARN_ON_ONCE(ret))
+		return;
+
+	WARN_ON_ONCE(pci_ser_delete(ser, dev));
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_incoming_finish);
+
+/**
+ * pci_liveupdate_register_fh - Register the PCI FLB with a file handler
+ * @fh: The driver's Live Update file handler.
+ *
+ * Return: The result of liveupdate_register_flb().
+ */
+int pci_liveupdate_register_fh(struct liveupdate_file_handler *fh)
+{
+	return liveupdate_register_flb(fh, &pci_liveupdate_flb);
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_register_fh);
+
+/**
+ * pci_liveupdate_unregister_fh - Unregister the PCI FLB from a file handler
+ * @fh: The driver's Live Update file handler.
+ *
+ * Return: The result of liveupdate_unregister_flb().
+ */
+int pci_liveupdate_unregister_fh(struct liveupdate_file_handler *fh)
+{
+	return liveupdate_unregister_flb(fh, &pci_liveupdate_flb);
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_unregister_fh);
diff --git a/include/linux/kho/abi/pci.h b/include/linux/kho/abi/pci.h
new file mode 100644
index 000000000000..53744b6f191a
--- /dev/null
+++ b/include/linux/kho/abi/pci.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * David Matlack <dmatlack@google.com>
+ */
+
+#ifndef _LINUX_KHO_ABI_PCI_H
+#define _LINUX_KHO_ABI_PCI_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/**
+ * DOC: PCI File-Lifecycle Bound (FLB) Live Update ABI
+ *
+ * This header defines the ABI for preserving core PCI state across kexec using
+ * Live Update File-Lifecycle Bound (FLB) data.
+ *
+ * This interface is a contract. Any modification to any of the serialization
+ * structs defined here constitutes a breaking change. Such changes require
+ * incrementing the version number in the PCI_LUO_FLB_COMPATIBLE string.
+ */
+
+#define PCI_LUO_FLB_COMPATIBLE "pci-v1"
+
+/**
+ * struct pci_dev_ser - Serialized state about a single PCI device.
+ *
+ * @domain: The device's PCI domain number (segment). Devices whose domain
+ *          does not fit in 16 bits cannot be preserved.
+ * @bdf: The device's PCI bus, device, and function number.
+ */
+struct pci_dev_ser {
+	u16 domain;
+	u16 bdf;
+} __packed;
+
+/**
+ * struct pci_ser - PCI Subsystem Live Update State
+ *
+ * This struct tracks state about all devices that are being preserved across
+ * a Live Update for the next kernel.
+ *
+ * @nr_devices: The number of devices that were preserved.
+ * @devices: Flexible array of pci_dev_ser structs for each device. Guaranteed
+ *           to be sorted ascending by domain, then by bdf (both unsigned).
+ */
+struct pci_ser {
+	u64 nr_devices;
+	struct pci_dev_ser devices[];
+} __packed;
+
+#endif /* _LINUX_KHO_ABI_PCI_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..6a3c2d7e5b82 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -40,6 +40,7 @@
 #include <linux/resource_ext.h>
 #include <linux/msi_api.h>
 #include <uapi/linux/pci.h>
+#include <linux/liveupdate.h>
 
 #include <linux/pci_ids.h>
 
@@ -2795,4 +2796,42 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type);
 	WARN_ONCE(condition, "%s %s: " fmt, \
 		  dev_driver_string(&(pdev)->dev), pci_name(pdev), ##arg)
 
+/* PCI Live Update support, implemented in drivers/pci/liveupdate.c. */
+#ifdef CONFIG_LIVEUPDATE
+int pci_liveupdate_outgoing_preserve(struct pci_dev *dev);
+void pci_liveupdate_outgoing_unpreserve(struct pci_dev *dev);
+bool pci_liveupdate_incoming_is_preserved(struct pci_dev *dev);
+void pci_liveupdate_incoming_finish(struct pci_dev *dev);
+int pci_liveupdate_register_fh(struct liveupdate_file_handler *fh);
+int pci_liveupdate_unregister_fh(struct liveupdate_file_handler *fh);
+#else /* !CONFIG_LIVEUPDATE */
+static inline int pci_liveupdate_outgoing_preserve(struct pci_dev *dev)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void pci_liveupdate_outgoing_unpreserve(struct pci_dev *dev)
+{
+}
+
+static inline bool pci_liveupdate_incoming_is_preserved(struct pci_dev *dev)
+{
+	return false;
+}
+
+static inline void pci_liveupdate_incoming_finish(struct pci_dev *dev)
+{
+}
+
+static inline int pci_liveupdate_register_fh(struct liveupdate_file_handler *fh)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int pci_liveupdate_unregister_fh(struct liveupdate_file_handler *fh)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* !CONFIG_LIVEUPDATE */
+
 #endif /* LINUX_PCI_H */