From: Niklas Schnelle schnelle@linux.ibm.com
[ Upstream commit 0467cdde8c4320bbfdb31a8cff1277b202f677fc ]
Instead of relying on the observed but not architected firmware behavior that PCI functions from the same card are listed in ascending RID order in clp_list_pci() ensure this by sorting. To allow for sorting separate the initial clp_list_pci() and creation of the virtual PCI busses.
Note that fundamentally in our per-PCI function hotplug design non RID order of discovery is still possible. For example when the two PFs of a two port NIC are hotplugged after initial boot and in descending RID order. In this case the virtual PCI bus would be created by the second PF using that PF's UID as domain number instead of that of the first PF. Thus the domain number would then change from the UID of the second PF to that of the first PF on reboot but there is really nothing we can do about that since changing domain numbers at runtime seems even worse. This only impacts the domain number as the RIDs are consistent and thus even with just the second PF visible it will show up in the correct position on the virtual bus.
Reviewed-by: Gerd Bayer gbayer@linux.ibm.com Signed-off-by: Niklas Schnelle schnelle@linux.ibm.com Signed-off-by: Heiko Carstens hca@linux.ibm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/s390/include/asm/pci.h | 5 ++- arch/s390/pci/pci.c | 69 ++++++++++++++++++++++++++++++++----- arch/s390/pci/pci_clp.c | 12 ++++--- arch/s390/pci/pci_event.c | 13 ++++--- 4 files changed, 82 insertions(+), 17 deletions(-)
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 30820a649e6e7..fdec455892486 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -130,6 +130,7 @@ struct zpci_dev { u16 vfn; /* virtual function number */ u16 pchid; /* physical channel ID */ u16 maxstbl; /* Maximum store block size */ + u16 rid; /* RID as supplied by firmware */ u8 pfgid; /* function group ID */ u8 pft; /* pci function type */ u8 port; @@ -203,12 +204,14 @@ extern struct airq_iv *zpci_aif_sbv; ----------------------------------------------------------------------------- */ /* Base stuff */ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state); +int zpci_add_device(struct zpci_dev *zdev); int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); int zpci_deconfigure_device(struct zpci_dev *zdev); void zpci_device_reserved(struct zpci_dev *zdev); bool zpci_is_device_configured(struct zpci_dev *zdev); +int zpci_scan_devices(void);
int zpci_hot_reset_device(struct zpci_dev *zdev); int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64, u8 *); @@ -218,7 +221,7 @@ void zpci_update_fh(struct zpci_dev *zdev, u32 fh);
/* CLP */ int clp_setup_writeback_mio(void); -int clp_scan_pci_devices(void); +int clp_scan_pci_devices(struct list_head *scan_list); int clp_query_pci_fn(struct zpci_dev *zdev); int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as); int clp_disable_fh(struct zpci_dev *zdev, u32 *fh); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index cff4838fad216..e70318fba275a 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -29,6 +29,7 @@ #include <linux/pci.h> #include <linux/printk.h> #include <linux/lockdep.h> +#include <linux/list_sort.h>
#include <asm/isc.h> #include <asm/airq.h> @@ -786,7 +787,6 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) struct zpci_dev *zdev; int rc;
- zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state); zdev = kzalloc(sizeof(*zdev), GFP_KERNEL); if (!zdev) return ERR_PTR(-ENOMEM); @@ -806,6 +806,19 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) mutex_init(&zdev->fmb_lock); mutex_init(&zdev->kzdev_lock);
+ return zdev; + +error: + zpci_dbg(0, "crt fid:%x, rc:%d\n", fid, rc); + kfree(zdev); + return ERR_PTR(rc); +} + +int zpci_add_device(struct zpci_dev *zdev) +{ + int rc; + + zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", zdev->fid, zdev->fh, zdev->state); rc = zpci_init_iommu(zdev); if (rc) goto error; @@ -817,15 +830,13 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) spin_lock(&zpci_list_lock); list_add_tail(&zdev->entry, &zpci_list); spin_unlock(&zpci_list_lock); - - return zdev; + return 0;
error_destroy_iommu: zpci_destroy_iommu(zdev); error: - zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc); - kfree(zdev); - return ERR_PTR(rc); + zpci_dbg(0, "add fid:%x, rc:%d\n", zdev->fid, rc); + return rc; }
bool zpci_is_device_configured(struct zpci_dev *zdev) @@ -1083,6 +1094,49 @@ bool zpci_is_enabled(void) return s390_pci_initialized; }
+static int zpci_cmp_rid(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct zpci_dev *za = container_of(a, struct zpci_dev, entry); + struct zpci_dev *zb = container_of(b, struct zpci_dev, entry); + + /* + * PCI functions without RID available maintain original order + * between themselves but sort before those with RID. + */ + if (za->rid == zb->rid) + return za->rid_available > zb->rid_available; + /* + * PCI functions with RID sort by RID ascending. + */ + return za->rid > zb->rid; +} + +static void zpci_add_devices(struct list_head *scan_list) +{ + struct zpci_dev *zdev, *tmp; + + list_sort(NULL, scan_list, &zpci_cmp_rid); + list_for_each_entry_safe(zdev, tmp, scan_list, entry) { + list_del_init(&zdev->entry); + zpci_add_device(zdev); + } +} + +int zpci_scan_devices(void) +{ + LIST_HEAD(scan_list); + int rc; + + rc = clp_scan_pci_devices(&scan_list); + if (rc) + return rc; + + zpci_add_devices(&scan_list); + zpci_bus_scan_busses(); + return 0; +} + static int __init pci_base_init(void) { int rc; @@ -1112,10 +1166,9 @@ static int __init pci_base_init(void) if (rc) goto out_irq;
- rc = clp_scan_pci_devices(); + rc = zpci_scan_devices(); if (rc) goto out_find; - zpci_bus_scan_busses();
s390_pci_initialized = 1; return 0; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index ee90a91ed8881..3049e4eb01fe7 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -164,8 +164,10 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, zdev->port = response->port; zdev->uid = response->uid; zdev->fmb_length = sizeof(u32) * response->fmb_len; - zdev->rid_available = response->rid_avail; zdev->is_physfn = response->is_physfn; + zdev->rid_available = response->rid_avail; + if (zdev->rid_available) + zdev->rid = response->rid; if (!s390_pci_no_rid && zdev->rid_available) zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN;
@@ -407,6 +409,7 @@ static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid,
static void __clp_add(struct clp_fh_list_entry *entry, void *data) { + struct list_head *scan_list = data; struct zpci_dev *zdev;
if (!entry->vendor_id) @@ -417,10 +420,11 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data) zpci_zdev_put(zdev); return; } - zpci_create_device(entry->fid, entry->fh, entry->config_state); + zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state); + list_add_tail(&zdev->entry, scan_list); }
-int clp_scan_pci_devices(void) +int clp_scan_pci_devices(struct list_head *scan_list) { struct clp_req_rsp_list_pci *rrb; int rc; @@ -429,7 +433,7 @@ int clp_scan_pci_devices(void) if (!rrb) return -ENOMEM;
- rc = clp_list_pci(rrb, NULL, __clp_add); + rc = clp_list_pci(rrb, scan_list, __clp_add);
clp_free_block(rrb); return rc; diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index d4f19d33914cb..47f934f4e828e 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -340,6 +340,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED); if (IS_ERR(zdev)) break; + zpci_add_device(zdev); } else { /* the configuration request may be stale */ if (zdev->state != ZPCI_FN_STATE_STANDBY) @@ -349,10 +350,14 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) zpci_scan_configured_device(zdev, ccdf->fh); break; case 0x0302: /* Reserved -> Standby */ - if (!zdev) - zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY); - else + if (!zdev) { + zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY); + if (IS_ERR(zdev)) + break; + zpci_add_device(zdev); + } else { zpci_update_fh(zdev, ccdf->fh); + } break; case 0x0303: /* Deconfiguration requested */ if (zdev) { @@ -381,7 +386,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; case 0x0306: /* 0x308 or 0x302 for multiple devices */ zpci_remove_reserved_devices(); - clp_scan_pci_devices(); + zpci_scan_devices(); break; case 0x0308: /* Standby -> Reserved */ if (!zdev)
From: Niklas Schnelle schnelle@linux.ibm.com
[ Upstream commit 126034faaac5f356822c4a9bebfa75664da11056 ]
The newly introduced topology ID (TID) field in the CLP Query PCI Function explicitly identifies groups of PCI functions whose RIDs belong to the same (sub-)topology. When available use the TID instead of the PCHID to match zPCI busses/domains for multi-function devices. Note that currently only a single PCI bus per TID is supported. This change is required because in future machines the PCHID will not identify a PCI card but a specific port in the case of some multi-port NICs while from a PCI point of view the entire card is a subtopology.
Reviewed-by: Gerd Bayer gbayer@linux.ibm.com Signed-off-by: Niklas Schnelle schnelle@linux.ibm.com Signed-off-by: Heiko Carstens hca@linux.ibm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/s390/include/asm/pci.h | 9 ++++++--- arch/s390/include/asm/pci_clp.h | 8 +++++--- arch/s390/pci/pci_bus.c | 17 ++++++++++------- arch/s390/pci/pci_clp.c | 3 +++ 4 files changed, 24 insertions(+), 13 deletions(-)
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index fdec455892486..94b5ce797c63b 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -107,9 +107,10 @@ struct zpci_bus { struct list_head resources; struct list_head bus_next; struct resource bus_resource; - int pchid; + int topo; /* TID if topo_is_tid, PCHID otherwise */ int domain_nr; - bool multifunction; + u8 multifunction : 1; + u8 topo_is_tid : 1; enum pci_bus_speed max_bus_speed; };
@@ -131,6 +132,7 @@ struct zpci_dev { u16 pchid; /* physical channel ID */ u16 maxstbl; /* Maximum store block size */ u16 rid; /* RID as supplied by firmware */ + u16 tid; /* Topology for which RID is valid */ u8 pfgid; /* function group ID */ u8 pft; /* pci function type */ u8 port; @@ -141,7 +143,8 @@ struct zpci_dev { u8 is_physfn : 1; u8 util_str_avail : 1; u8 irqs_registered : 1; - u8 reserved : 2; + u8 tid_avail : 1; + u8 reserved : 1; unsigned int devfn; /* DEVFN part of the RID*/
u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index f0c677ddd2706..14afb9ce91f3b 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -110,7 +110,8 @@ struct clp_req_query_pci { struct clp_rsp_query_pci { struct clp_rsp_hdr hdr; u16 vfn; /* virtual fn number */ - u16 : 3; + u16 : 2; + u16 tid_avail : 1; u16 rid_avail : 1; u16 is_physfn : 1; u16 reserved1 : 1; @@ -130,8 +131,9 @@ struct clp_rsp_query_pci { u64 edma; /* end dma as */ #define ZPCI_RID_MASK_DEVFN 0x00ff u16 rid; /* BUS/DEVFN PCI address */ - u16 reserved0; - u32 reserved[10]; + u32 reserved0; + u16 tid; + u32 reserved[9]; u32 uid; /* user defined id */ u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ u32 reserved2[16]; diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index daa5d7450c7d3..54879e773e4a3 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -232,13 +232,13 @@ static void zpci_bus_put(struct zpci_bus *zbus) kref_put(&zbus->kref, zpci_bus_release); }
-static struct zpci_bus *zpci_bus_get(int pchid) +static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid) { struct zpci_bus *zbus;
mutex_lock(&zbus_list_lock); list_for_each_entry(zbus, &zbus_list, bus_next) { - if (pchid == zbus->pchid) { + if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) { kref_get(&zbus->kref); goto out_unlock; } @@ -249,7 +249,7 @@ static struct zpci_bus *zpci_bus_get(int pchid) return zbus; }
-static struct zpci_bus *zpci_bus_alloc(int pchid) +static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid) { struct zpci_bus *zbus;
@@ -257,7 +257,8 @@ static struct zpci_bus *zpci_bus_alloc(int pchid) if (!zbus) return NULL;
- zbus->pchid = pchid; + zbus->topo = topo; + zbus->topo_is_tid = topo_is_tid; INIT_LIST_HEAD(&zbus->bus_next); mutex_lock(&zbus_list_lock); list_add_tail(&zbus->bus_next, &zbus_list); @@ -321,8 +322,9 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev)
int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) { + bool topo_is_tid = zdev->tid_avail; struct zpci_bus *zbus = NULL; - int rc = -EBADF; + int topo, rc = -EBADF;
if (zpci_nb_devices == ZPCI_NR_DEVICES) { pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n", @@ -333,11 +335,12 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS) return -EINVAL;
+ topo = topo_is_tid ? zdev->tid : zdev->pchid; if (!s390_pci_no_rid && zdev->rid_available) - zbus = zpci_bus_get(zdev->pchid); + zbus = zpci_bus_get(topo, topo_is_tid);
if (!zbus) { - zbus = zpci_bus_alloc(zdev->pchid); + zbus = zpci_bus_alloc(topo, topo_is_tid); if (!zbus) return -ENOMEM; } diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 3049e4eb01fe7..658802b82f9eb 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -170,6 +170,9 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, zdev->rid = response->rid; if (!s390_pci_no_rid && zdev->rid_available) zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN; + zdev->tid_avail = response->tid_avail; + if (zdev->tid_avail) + zdev->tid = response->tid;
memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip)); if (response->util_str_avail) {
From: Niklas Schnelle schnelle@linux.ibm.com
[ Upstream commit 25f39d3dcb48bbc824a77d16b3d977f0f3713cfe ]
Ensure that VFs used in isolation, that is with their parent PF not visible to the configuration but with their RID exposed, are treated compatibly with existing isolated VF use cases without exposed RID including RoCE Express VFs. This allows creating configurations where one LPAR manages PFs while their child VFs are used by other LPARs. This gives the LPAR managing the PFs a role analogous to that of the hypervisor in a typical use case of passing child VFs to guests.
Instead of creating a multifunction struct zpci_bus whenever a PCI function with RID exposed is discovered only create such a bus for configured physical functions and only consider multifunction busses when searching for an existing bus. Additionally only set zdev->devfn to the devfn part of the RID once the function is added to a multifunction bus.
This also fixes probing of more than 7 such isolated VFs from the same physical bus. This is because common PCI code in pci_scan_slot() only looks for more functions when pdev->multifunction is set which somewhat counter intutively is not the case for VFs.
Note that PFs are looked at before their child VFs is guaranteed because we sort the zpci_list by RID ascending.
Reviewed-by: Gerd Bayer gbayer@linux.ibm.com Signed-off-by: Niklas Schnelle schnelle@linux.ibm.com Signed-off-by: Heiko Carstens hca@linux.ibm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/s390/pci/pci_bus.c | 33 ++++++++++++++++++++------------- arch/s390/pci/pci_clp.c | 2 -- 2 files changed, 20 insertions(+), 15 deletions(-)
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 54879e773e4a3..1b74a000ff645 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -168,9 +168,16 @@ void zpci_bus_scan_busses(void) mutex_unlock(&zbus_list_lock); }
+static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev) +{ + return !s390_pci_no_rid && zdev->rid_available && + zpci_is_device_configured(zdev) && + !zdev->vfn; +} + /* zpci_bus_create_pci_bus - Create the PCI bus associated with this zbus * @zbus: the zbus holding the zdevices - * @fr: PCI root function that will determine the bus's domain, and bus speeed + * @fr: PCI root function that will determine the bus's domain, and bus speed * @ops: the pci operations * * The PCI function @fr determines the domain (its UID), multifunction property @@ -188,7 +195,7 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, s return domain;
zbus->domain_nr = domain; - zbus->multifunction = fr->rid_available; + zbus->multifunction = zpci_bus_is_multifunction_root(fr); zbus->max_bus_speed = fr->max_bus_speed;
/* @@ -238,6 +245,8 @@ static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid)
mutex_lock(&zbus_list_lock); list_for_each_entry(zbus, &zbus_list, bus_next) { + if (!zbus->multifunction) + continue; if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) { kref_get(&zbus->kref); goto out_unlock; @@ -293,19 +302,22 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) { int rc = -EINVAL;
+ if (zbus->multifunction) { + if (!zdev->rid_available) { + WARN_ONCE(1, "rid_available not set for multifunction\n"); + return rc; + } + zdev->devfn = zdev->rid & ZPCI_RID_MASK_DEVFN; + } + if (zbus->function[zdev->devfn]) { pr_err("devfn %04x is already assigned\n", zdev->devfn); return rc; } - zdev->zbus = zbus; zbus->function[zdev->devfn] = zdev; zpci_nb_devices++;
- if (zbus->multifunction && !zdev->rid_available) { - WARN_ONCE(1, "rid_available not set for multifunction\n"); - goto error; - } rc = zpci_init_slot(zdev); if (rc) goto error; @@ -332,13 +344,8 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) return -ENOSPC; }
- if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS) - return -EINVAL; - topo = topo_is_tid ? zdev->tid : zdev->pchid; - if (!s390_pci_no_rid && zdev->rid_available) - zbus = zpci_bus_get(topo, topo_is_tid); - + zbus = zpci_bus_get(topo, topo_is_tid); if (!zbus) { zbus = zpci_bus_alloc(topo, topo_is_tid); if (!zbus) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 658802b82f9eb..421c95e1841ef 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -168,8 +168,6 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, zdev->rid_available = response->rid_avail; if (zdev->rid_available) zdev->rid = response->rid; - if (!s390_pci_no_rid && zdev->rid_available) - zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN; zdev->tid_avail = response->tid_avail; if (zdev->tid_avail) zdev->tid = response->tid;
From: Christian Brauner brauner@kernel.org
[ Upstream commit 6474353a5e3d0b2cf610153cea0c61f576a36d0a ]
Epoll relies on a racy fastpath check during __fput() in eventpoll_release() to avoid the hit of pointlessly acquiring a semaphore. Annotate that race by using WRITE_ONCE() and READ_ONCE().
Link: https://lore.kernel.org/r/66edfb3c.050a0220.3195df.001a.GAE@google.com Link: https://lore.kernel.org/r/20240925-fungieren-anbauen-79b334b00542@brauner Reviewed-by: Jan Kara jack@suse.cz Reported-by: syzbot+3b6b32dc50537a49bb4a@syzkaller.appspotmail.com Signed-off-by: Christian Brauner brauner@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/eventpoll.c | 6 ++++-- include/linux/eventpoll.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 6d0e2f547ae7d..7428450920525 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { - file->f_ep = NULL; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); @@ -1603,7 +1604,8 @@ static int attach_epitem(struct file *file, struct epitem *epi) spin_unlock(&file->f_lock); goto allocate; } - file->f_ep = head; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 3337745d81bd6..0c0d00fcd131f 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -42,7 +42,7 @@ static inline void eventpoll_release(struct file *file) * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ - if (likely(!file->f_ep)) + if (likely(!READ_ONCE(file->f_ep))) return;
/*
From: Mark Brown broonie@kernel.org
[ Upstream commit dca93d29845dfed60910ba13dbfb6ae6a0e19f6d ]
Currently if we encounter an error between fork() and exec() of a child process we log the error to stderr. This means that the errors don't get annotated with the child information which makes diagnostics harder and means that if we miss the exit signal from the child we can deadlock waiting for output from the child. Improve robustness and output quality by logging to stdout instead.
Signed-off-by: Mark Brown broonie@kernel.org Link: https://lore.kernel.org/r/20241023-arm64-fp-stress-exec-fail-v1-1-ee3c62932c... Signed-off-by: Catalin Marinas catalin.marinas@arm.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/testing/selftests/arm64/fp/fp-stress.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index faac24bdefeb9..80f22789504d6 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -79,7 +79,7 @@ static void child_start(struct child_data *child, const char *program) */ ret = dup2(pipefd[1], 1); if (ret == -1) { - fprintf(stderr, "dup2() %d\n", errno); + printf("dup2() %d\n", errno); exit(EXIT_FAILURE); }
@@ -89,7 +89,7 @@ static void child_start(struct child_data *child, const char *program) */ ret = dup2(startup_pipe[0], 3); if (ret == -1) { - fprintf(stderr, "dup2() %d\n", errno); + printf("dup2() %d\n", errno); exit(EXIT_FAILURE); }
@@ -107,16 +107,15 @@ static void child_start(struct child_data *child, const char *program) */ ret = read(3, &i, sizeof(i)); if (ret < 0) - fprintf(stderr, "read(startp pipe) failed: %s (%d)\n", - strerror(errno), errno); + printf("read(startp pipe) failed: %s (%d)\n", + strerror(errno), errno); if (ret > 0) - fprintf(stderr, "%d bytes of data on startup pipe\n", - ret); + printf("%d bytes of data on startup pipe\n", ret); close(3);
ret = execl(program, program, NULL); - fprintf(stderr, "execl(%s) failed: %d (%s)\n", - program, errno, strerror(errno)); + printf("execl(%s) failed: %d (%s)\n", + program, errno, strerror(errno));
exit(EXIT_FAILURE); } else {
From: Thomas Richter tmricht@linux.ibm.com
[ Upstream commit a0bd7dacbd51c632b8e2c0500b479af564afadf3 ]
CPU hotplug remove handling triggers the following function call sequence:
CPUHP_AP_PERF_S390_SF_ONLINE --> s390_pmu_sf_offline_cpu() ... CPUHP_AP_PERF_ONLINE --> perf_event_exit_cpu()
The s390 CPUMF sampling CPU hotplug handler invokes:
s390_pmu_sf_offline_cpu() +--> cpusf_pmu_setup() +--> setup_pmc_cpu() +--> deallocate_buffers()
This function de-allocates all sampling data buffers (SDBs) allocated for that CPU at event initialization. It also clears the PMU_F_RESERVED bit. The CPU is gone and can not be sampled.
With the event still being active on the removed CPU, the CPU event hotplug support in kernel performance subsystem triggers the following function calls on the removed CPU:
perf_event_exit_cpu() +--> perf_event_exit_cpu_context() +--> __perf_event_exit_context() +--> __perf_remove_from_context() +--> event_sched_out() +--> cpumsf_pmu_del() +--> cpumsf_pmu_stop() +--> hw_perf_event_update()
to stop and remove the event. During removal of the event, the sampling device driver tries to read out the remaining samples from the sample data buffers (SDBs). But they have already been freed (and may have been re-assigned). This may lead to a use after free situation in which case the samples are most likely invalid. In the best case the memory has not been reassigned and still contains valid data.
Remedy this situation and check if the CPU is still in reserved state (bit PMU_F_RESERVED set). In this case the SDBs have not been released an contain valid data. This is always the case when the event is removed (and no CPU hotplug off occured). If the PMU_F_RESERVED bit is not set, the SDB buffers are gone.
Signed-off-by: Thomas Richter tmricht@linux.ibm.com Reviewed-by: Hendrik Brueckner brueckner@linux.ibm.com Signed-off-by: Heiko Carstens hca@linux.ibm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/s390/kernel/perf_cpum_sf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 48fa9471660a8..cf9a65cdbedb6 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1922,7 +1922,9 @@ static void cpumsf_pmu_stop(struct perf_event *event, int flags) event->hw.state |= PERF_HES_STOPPED;
if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) { - hw_perf_event_update(event, 1); + /* CPU hotplug off removes SDBs. No samples to extract. */ + if (cpuhw->flags & PMU_F_RESERVED) + hw_perf_event_update(event, 1); event->hw.state |= PERF_HES_UPTODATE; } perf_pmu_enable(event->pmu);
From: Damien Le Moal dlemoal@kernel.org
[ Upstream commit d7cb6d7414ea1b33536fa6d11805cb8dceec1f97 ]
Ensure that a disk revalidation changing the conventional zones bitmap of a disk does not cause invalid memory references when using the disk_zone_is_conv() helper by RCU protecting the disk->conv_zones_bitmap pointer.
disk_zone_is_conv() is modified to operate under the RCU read lock and the function disk_set_conv_zones_bitmap() is added to update a disk conv_zones_bitmap pointer using rcu_replace_pointer() with the disk zone_wplugs_lock spinlock held.
disk_free_zone_resources() is modified to call disk_update_zone_resources() with a NULL bitmap pointer to free the disk conv_zones_bitmap. disk_set_conv_zones_bitmap() is also used in disk_update_zone_resources() to set the new (revalidated) bitmap and free the old one.
Signed-off-by: Damien Le Moal dlemoal@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Johannes Thumshirn johannes.thumshirn@wdc.com Link: https://lore.kernel.org/r/20241107064300.227731-2-dlemoal@kernel.org Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- block/blk-zoned.c | 43 ++++++++++++++++++++++++++++++------------ include/linux/blkdev.h | 2 +- 2 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c index af19296fa50df..74e39545562d5 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -350,9 +350,15 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) { - if (!disk->conv_zones_bitmap) - return false; - return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); + unsigned long *bitmap; + bool is_conv; + + rcu_read_lock(); + bitmap = rcu_dereference(disk->conv_zones_bitmap); + is_conv = bitmap && test_bit(disk_zone_no(disk, sector), bitmap); + rcu_read_unlock(); + + return is_conv; }
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) @@ -1455,6 +1461,24 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) disk->zone_wplugs_hash_bits = 0; }
+static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk, + unsigned long *bitmap) +{ + unsigned int nr_conv_zones = 0; + unsigned long flags; + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + if (bitmap) + nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); + bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, + lockdep_is_held(&disk->zone_wplugs_lock)); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + + kfree_rcu_mightsleep(bitmap); + + return nr_conv_zones; +} + void disk_free_zone_resources(struct gendisk *disk) { if (!disk->zone_wplugs_pool) @@ -1478,8 +1502,7 @@ void disk_free_zone_resources(struct gendisk *disk) mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL;
- bitmap_free(disk->conv_zones_bitmap); - disk->conv_zones_bitmap = NULL; + disk_set_conv_zones_bitmap(disk, NULL); disk->zone_capacity = 0; disk->last_zone_capacity = 0; disk->nr_zones = 0; @@ -1538,17 +1561,15 @@ static int disk_update_zone_resources(struct gendisk *disk, struct blk_revalidate_zone_args *args) { struct request_queue *q = disk->queue; - unsigned int nr_seq_zones, nr_conv_zones = 0; + unsigned int nr_seq_zones, nr_conv_zones; unsigned int pool_size; struct queue_limits lim;
disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; disk->last_zone_capacity = args->last_zone_capacity; - swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); - if (disk->conv_zones_bitmap) - nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, - disk->nr_zones); + nr_conv_zones = + disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); if (nr_conv_zones >= disk->nr_zones) { pr_warn("%s: Invalid number of conventional zones %u / %u\n", disk->disk_name, nr_conv_zones, disk->nr_zones); @@ -1823,8 +1844,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk) disk_free_zone_resources(disk); blk_mq_unfreeze_queue(q);
- kfree(args.conv_zones_bitmap); - return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b7664d593486a..44f084811de23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -194,7 +194,7 @@ struct gendisk { unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; - unsigned long *conv_zones_bitmap; + unsigned long __rcu *conv_zones_bitmap; unsigned int zone_wplugs_hash_bits; spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool;
From: Johannes Thumshirn johannes.thumshirn@wdc.com
[ Upstream commit 8cca35cb29f81eba3e96ec44dad8696c8a2f9138 ]
Running fstests btrfs/011 with MKFS_OPTIONS="-O rst" to force the usage of the RAID stripe-tree, we get the following splat from lockdep:
BTRFS info (device sdd): dev_replace from /dev/sdd (devid 1) to /dev/sdb started
============================================ WARNING: possible recursive locking detected 6.11.0-rc3-btrfs-for-next #599 Not tainted -------------------------------------------- btrfs/2326 is trying to acquire lock: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250
but task is already holding lock: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250
other info that might help us debug this: Possible unsafe locking scenario:
CPU0 ---- lock(&fs_info->dev_replace.rwsem); lock(&fs_info->dev_replace.rwsem);
*** DEADLOCK ***
May be due to missing lock nesting notation
1 lock held by btrfs/2326: #0: ffff88810f215c98 (&fs_info->dev_replace.rwsem){++++}-{3:3}, at: btrfs_map_block+0x39f/0x2250
stack backtrace: CPU: 1 UID: 0 PID: 2326 Comm: btrfs Not tainted 6.11.0-rc3-btrfs-for-next #599 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: <TASK> dump_stack_lvl+0x5b/0x80 __lock_acquire+0x2798/0x69d0 ? __pfx___lock_acquire+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 lock_acquire+0x19d/0x4a0 ? btrfs_map_block+0x39f/0x2250 ? __pfx_lock_acquire+0x10/0x10 ? find_held_lock+0x2d/0x110 ? lock_is_held_type+0x8f/0x100 down_read+0x8e/0x440 ? btrfs_map_block+0x39f/0x2250 ? __pfx_down_read+0x10/0x10 ? do_raw_read_unlock+0x44/0x70 ? _raw_read_unlock+0x23/0x40 btrfs_map_block+0x39f/0x2250 ? btrfs_dev_replace_by_ioctl+0xd69/0x1d00 ? btrfs_bio_counter_inc_blocked+0xd9/0x2e0 ? __kasan_slab_alloc+0x6e/0x70 ? __pfx_btrfs_map_block+0x10/0x10 ? __pfx_btrfs_bio_counter_inc_blocked+0x10/0x10 ? kmem_cache_alloc_noprof+0x1f2/0x300 ? mempool_alloc_noprof+0xed/0x2b0 btrfs_submit_chunk+0x28d/0x17e0 ? __pfx_btrfs_submit_chunk+0x10/0x10 ? bvec_alloc+0xd7/0x1b0 ? bio_add_folio+0x171/0x270 ? __pfx_bio_add_folio+0x10/0x10 ? __kasan_check_read+0x20/0x20 btrfs_submit_bio+0x37/0x80 read_extent_buffer_pages+0x3df/0x6c0 btrfs_read_extent_buffer+0x13e/0x5f0 read_tree_block+0x81/0xe0 read_block_for_search+0x4bd/0x7a0 ? __pfx_read_block_for_search+0x10/0x10 btrfs_search_slot+0x78d/0x2720 ? __pfx_btrfs_search_slot+0x10/0x10 ? lock_is_held_type+0x8f/0x100 ? kasan_save_track+0x14/0x30 ? __kasan_slab_alloc+0x6e/0x70 ? kmem_cache_alloc_noprof+0x1f2/0x300 btrfs_get_raid_extent_offset+0x181/0x820 ? __pfx_lock_acquire+0x10/0x10 ? __pfx_btrfs_get_raid_extent_offset+0x10/0x10 ? down_read+0x194/0x440 ? __pfx_down_read+0x10/0x10 ? do_raw_read_unlock+0x44/0x70 ? _raw_read_unlock+0x23/0x40 btrfs_map_block+0x5b5/0x2250 ? __pfx_btrfs_map_block+0x10/0x10 scrub_submit_initial_read+0x8fe/0x11b0 ? __pfx_scrub_submit_initial_read+0x10/0x10 submit_initial_group_read+0x161/0x3a0 ? lock_release+0x20e/0x710 ? __pfx_submit_initial_group_read+0x10/0x10 ? __pfx_lock_release+0x10/0x10 scrub_simple_mirror.isra.0+0x3eb/0x580 scrub_stripe+0xe4d/0x1440 ? lock_release+0x20e/0x710 ? __pfx_scrub_stripe+0x10/0x10 ? __pfx_lock_release+0x10/0x10 ? do_raw_read_unlock+0x44/0x70 ? _raw_read_unlock+0x23/0x40 scrub_chunk+0x257/0x4a0 scrub_enumerate_chunks+0x64c/0xf70 ? __mutex_unlock_slowpath+0x147/0x5f0 ? __pfx_scrub_enumerate_chunks+0x10/0x10 ? bit_wait_timeout+0xb0/0x170 ? __up_read+0x189/0x700 ? scrub_workers_get+0x231/0x300 ? up_write+0x490/0x4f0 btrfs_scrub_dev+0x52e/0xcd0 ? create_pending_snapshots+0x230/0x250 ? __pfx_btrfs_scrub_dev+0x10/0x10 btrfs_dev_replace_by_ioctl+0xd69/0x1d00 ? lock_acquire+0x19d/0x4a0 ? __pfx_btrfs_dev_replace_by_ioctl+0x10/0x10 ? lock_release+0x20e/0x710 ? btrfs_ioctl+0xa09/0x74f0 ? __pfx_lock_release+0x10/0x10 ? do_raw_spin_lock+0x11e/0x240 ? __pfx_do_raw_spin_lock+0x10/0x10 btrfs_ioctl+0xa14/0x74f0 ? lock_acquire+0x19d/0x4a0 ? find_held_lock+0x2d/0x110 ? __pfx_btrfs_ioctl+0x10/0x10 ? lock_release+0x20e/0x710 ? do_sigaction+0x3f0/0x860 ? __pfx_do_vfs_ioctl+0x10/0x10 ? do_raw_spin_lock+0x11e/0x240 ? lockdep_hardirqs_on_prepare+0x270/0x3e0 ? _raw_spin_unlock_irq+0x28/0x50 ? do_sigaction+0x3f0/0x860 ? __pfx_do_sigaction+0x10/0x10 ? __x64_sys_rt_sigaction+0x18e/0x1e0 ? __pfx___x64_sys_rt_sigaction+0x10/0x10 ? __x64_sys_close+0x7c/0xd0 __x64_sys_ioctl+0x137/0x190 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f0bd1114f9b Code: Unable to access opcode bytes at 0x7f0bd1114f71. RSP: 002b:00007ffc8a8c3130 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f0bd1114f9b RDX: 00007ffc8a8c35e0 RSI: 00000000ca289435 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000007 R10: 0000000000000008 R11: 0000000000000246 R12: 00007ffc8a8c6c85 R13: 00000000398e72a0 R14: 0000000000004361 R15: 0000000000000004 </TASK>
This happens because on RAID stripe-tree filesystems we recurse back into btrfs_map_block() on scrub to perform the logical to device physical mapping.
But as the device replace task is already holding the dev_replace::rwsem we deadlock.
So don't take the dev_replace::rwsem in case our task is the task performing the device replace.
Suggested-by: Filipe Manana fdmanana@suse.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com Reviewed-by: Filipe Manana fdmanana@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/dev-replace.c | 2 ++ fs/btrfs/fs.h | 2 ++ fs/btrfs/volumes.c | 8 +++++--- 3 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f638c458d2853..ac83f823bf4b9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret;
down_write(&dev_replace->rwsem); + dev_replace->replace_task = current; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -971,6 +972,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->rw_devices++;
+ dev_replace->replace_task = NULL; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 3d6d4b5032208..53824da92cc34 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -317,6 +317,8 @@ struct btrfs_dev_replace {
struct percpu_counter bio_counter; wait_queue_head_t replace_wait; + + struct task_struct *replace_task; };
/* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0485143cd75e0..fe7cac62f80ca 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6651,13 +6651,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len);
- down_read(&dev_replace->rwsem); + if (dev_replace->replace_task != current) + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole operation, write is * requested at commit time but must wait. */ - if (!dev_replace_is_ongoing) + if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem);
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -6797,7 +6799,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num;
out: - if (dev_replace_is_ongoing) { + if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem);
From: Qu Wenruo wqu@suse.com
[ Upstream commit 2e8b6bc0ab41ce41e6dfcc204b6cc01d5abbc952 ]
[PROBLEM] It is very common for udev to trigger device scan, and every time a mounted btrfs device got re-scan from different soft links, we will get some of unnecessary device path updates, this is especially common for LVM based storage:
# lvs scratch1 test -wi-ao---- 10.00g scratch2 test -wi-a----- 10.00g scratch3 test -wi-a----- 10.00g scratch4 test -wi-a----- 10.00g scratch5 test -wi-a----- 10.00g test test -wi-a----- 10.00g
# mkfs.btrfs -f /dev/test/scratch1 # mount /dev/test/scratch1 /mnt/btrfs # dmesg -c [ 205.705234] BTRFS: device fsid 7be2602f-9e35-4ecf-a6ff-9e91d2c182c9 devid 1 transid 6 /dev/mapper/test-scratch1 (253:4) scanned by mount (1154) [ 205.710864] BTRFS info (device dm-4): first mount of filesystem 7be2602f-9e35-4ecf-a6ff-9e91d2c182c9 [ 205.711923] BTRFS info (device dm-4): using crc32c (crc32c-intel) checksum algorithm [ 205.713856] BTRFS info (device dm-4): using free-space-tree [ 205.722324] BTRFS info (device dm-4): checking UUID tree
So far so good, but even if we just touched any soft link of "dm-4", we will get quite some unnecessary device path updates.
# touch /dev/mapper/test-scratch1 # dmesg -c [ 469.295796] BTRFS info: devid 1 device path /dev/mapper/test-scratch1 changed to /dev/dm-4 scanned by (udev-worker) (1221) [ 469.300494] BTRFS info: devid 1 device path /dev/dm-4 changed to /dev/mapper/test-scratch1 scanned by (udev-worker) (1221)
Such device path rename is unnecessary and can lead to random path change due to the udev race.
[CAUSE] Inside device_list_add(), we are using a very primitive way checking if the device has changed, strcmp().
Which can never handle links well, no matter if it's hard or soft links.
So every different link of the same device will be treated as a different device, causing the unnecessary device path update.
[FIX] Introduce a helper, is_same_device(), and use path_equal() to properly detect the same block device. So that the different soft links won't trigger the rename race.
Reviewed-by: Filipe Manana fdmanana@suse.com Link: https://bugzilla.suse.com/show_bug.cgi?id=1230641 Reported-by: Fabian Vogt fvogt@suse.com Signed-off-by: Qu Wenruo wqu@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/volumes.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fe7cac62f80ca..bcb7f288510f0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -730,6 +730,42 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; }
+static bool is_same_device(struct btrfs_device *device, const char *new_path) +{ + struct path old = { .mnt = NULL, .dentry = NULL }; + struct path new = { .mnt = NULL, .dentry = NULL }; + char *old_path = NULL; + bool is_same = false; + int ret; + + if (!device->name) + goto out; + + old_path = kzalloc(PATH_MAX, GFP_NOFS); + if (!old_path) + goto out; + + rcu_read_lock(); + ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + rcu_read_unlock(); + if (ret < 0) + goto out; + + ret = kern_path(old_path, LOOKUP_FOLLOW, &old); + if (ret) + goto out; + ret = kern_path(new_path, LOOKUP_FOLLOW, &new); + if (ret) + goto out; + if (path_equal(&old, &new)) + is_same = true; +out: + kfree(old_path); + path_put(&old); + path_put(&new); + return is_same; +} + /* * Add new device to list of registered devices * @@ -850,7 +886,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current));
- } else if (!device->name || strcmp(device->name->str, path)) { + } else if (!device->name || !is_same_device(device, path)) { /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that
From: Qu Wenruo wqu@suse.com
[ Upstream commit 7e06de7c83a746e58d4701e013182af133395188 ]
[PROBLEM] Currently btrfs accepts any file path for its device, resulting some weird situation:
# ./mount_by_fd /dev/test/scratch1 /mnt/btrfs/
The program has the following source code:
#include <fcntl.h> #include <stdio.h> #include <sys/mount.h>
int main(int argc, char *argv[]) { int fd = open(argv[1], O_RDWR); char path[256]; snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); return mount(path, argv[2], "btrfs", 0, NULL); }
Then we can have the following weird device path:
BTRFS: device fsid 2378be81-fe12-46d2-a9e8-68cf08dd98d5 devid 1 transid 7 /proc/self/fd/3 (253:2) scanned by mount_by_fd (18440)
Normally it's not a big deal, and later udev can trigger a device path rename. But if udev didn't trigger, the device path "/proc/self/fd/3" will show up in mtab.
[CAUSE] For filename "/proc/self/fd/3", it means the opened file descriptor 3. In above case, it's exactly the device we want to open, aka points to "/dev/test/scratch1" which is another symlink pointing to "/dev/dm-2".
Inside kernel we solve the mount source using LOOKUP_FOLLOW, which follows the symbolic link and grab the proper block device.
But inside btrfs we also save the filename into btrfs_device::name, and utilize that member to report our mount source, which leads to the above situation.
[FIX] Instead of unconditionally trust the path, check if the original file (not following the symbolic link) is inside "/dev/", if not, then manually lookup the path to its final destination, and use that as our device path.
This allows us to still use symbolic links, like "/dev/mapper/test-scratch" from LVM2, which is required for fstests runs with LVM2 setup.
And for really weird names, like the above case, we solve it to "/dev/dm-2" instead.
Reviewed-by: Filipe Manana fdmanana@suse.com Link: https://bugzilla.suse.com/show_bug.cgi?id=1230641 Reported-by: Fabian Vogt fvogt@suse.com Signed-off-by: Qu Wenruo wqu@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/volumes.c | 87 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bcb7f288510f0..a06d530835d4e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -730,6 +730,78 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; }
+/* + * We can have very weird soft links passed in. + * One example is "/proc/self/fd/<fd>", which can be a soft link to + * a block device. + * + * But it's never a good idea to use those weird names. + * Here we check if the path (not following symlinks) is a good one inside + * "/dev/". + */ +static bool is_good_dev_path(const char *dev_path) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + bool is_good = false; + int ret; + + if (!dev_path) + goto out; + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) + goto out; + + /* + * Do not follow soft link, just check if the original path is inside + * "/dev/". + */ + ret = kern_path(dev_path, 0, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) + goto out; + if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) + goto out; + is_good = true; +out: + kfree(path_buf); + path_put(&path); + return is_good; +} + +static int get_canonical_dev_path(const char *dev_path, char *canonical) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + int ret; + + if (!dev_path) { + ret = -EINVAL; + goto out; + } + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) { + ret = -ENOMEM; + goto out; + } + + ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + ret = strscpy(canonical, resolved_path, PATH_MAX); +out: + kfree(path_buf); + path_put(&path); + return ret; +} + static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; @@ -1417,12 +1489,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; + char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret;
lockdep_assert_held(&uuid_mutex);
+ if (!is_good_dev_path(path)) { + canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (canonical_path) { + ret = get_canonical_dev_path(path, canonical_path); + if (ret < 0) { + kfree(canonical_path); + canonical_path = NULL; + } + } + } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1467,7 +1550,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; }
- device = device_list_add(path, disk_super, &new_device_added); + device = device_list_add(canonical_path ? : path, disk_super, + &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device);
@@ -1476,6 +1560,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
error_bdev_put: fput(bdev_file); + kfree(canonical_path);
return device; }
From: Robbie Ko robbieko@synology.com
[ Upstream commit 99785998ed1cea142e20f4904ced26537a37bf74 ]
When crawling btree, if an eb cache miss occurs, we change to use the eb read lock and release all previous locks (including the parent lock) to reduce lock contention.
If an eb cache miss occurs in a leaf and needs to execute IO, before this change we released locks only from level 2 and up and we read a leaf's content from disk while holding a lock on its parent (level 1), causing the unnecessary lock contention on the parent, after this change we release locks from level 1 and up, but we lock level 0, and read leaf's content from disk.
Because we have prepared the check parameters and the read lock of eb we hold, we can ensure that no race will occur during the check and cause unexpected errors.
Reviewed-by: Filipe Manana fdmanana@suse.com Signed-off-by: Robbie Ko robbieko@synology.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/ctree.c | 101 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 31 deletions(-)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 451203055bbfb..dfb6d16aac1dd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1515,12 +1515,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; u64 gen; - struct extent_buffer *tmp; - int ret; + struct extent_buffer *tmp = NULL; + int ret = 0; int parent_level; - bool unlock_up; + int err; + bool read_tmp = false; + bool tmp_locked = false; + bool path_released = false;
- unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); @@ -1551,68 +1553,105 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, */ if (btrfs_verify_level_key(tmp, parent_level - 1, &check.first_key, gen)) { - free_extent_buffer(tmp); - return -EUCLEAN; + ret = -EUCLEAN; + goto out; } *eb_ret = tmp; - return 0; + tmp = NULL; + ret = 0; + goto out; }
if (p->nowait) { - free_extent_buffer(tmp); - return -EAGAIN; + ret = -EAGAIN; + goto out; }
- if (unlock_up) + if (!p->skip_locking) { btrfs_unlock_up_safe(p, level + 1); - - /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, &check); - if (ret) { - free_extent_buffer(tmp); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return ret; + ret = -EAGAIN; + path_released = true; }
- if (unlock_up) - ret = -EAGAIN; + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; + }
+ if (ret == 0) { + ASSERT(!tmp_locked); + *eb_ret = tmp; + tmp = NULL; + } goto out; } else if (p->nowait) { - return -EAGAIN; + ret = -EAGAIN; + goto out; }
- if (unlock_up) { + if (!p->skip_locking) { btrfs_unlock_up_safe(p, level + 1); ret = -EAGAIN; - } else { - ret = 0; }
if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid);
- tmp = read_tree_block(fs_info, blocknr, &check); + tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + tmp = NULL; + goto out; + } + read_tmp = true; + + if (!p->skip_locking) { + ASSERT(ret == -EAGAIN); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return PTR_ERR(tmp); + path_released = true; + } + + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; } + /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!extent_buffer_uptodate(tmp)) + if (!extent_buffer_uptodate(tmp)) { ret = -EIO; + goto out; + }
-out: if (ret == 0) { + ASSERT(!tmp_locked); *eb_ret = tmp; - } else { - free_extent_buffer(tmp); - btrfs_release_path(p); + tmp = NULL; + } +out: + if (tmp) { + if (tmp_locked) + btrfs_tree_read_unlock(tmp); + if (read_tmp && ret && ret != -EAGAIN) + free_extent_buffer_stale(tmp); + else + free_extent_buffer(tmp); } + if (ret && !path_released) + btrfs_release_path(p);
return ret; } @@ -2198,7 +2237,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, }
err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2325,7 +2364,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, }
err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err;
From: Boris Burkov boris@bur.io
[ Upstream commit 70958a949d852cbecc3d46127bf0b24786df0130 ]
If you follow the seed/sprout wiki, it suggests the following workflow:
btrfstune -S 1 seed_dev mount seed_dev mnt btrfs device add sprout_dev mount -o remount,rw mnt
The first mount mounts the FS readonly, which results in not setting BTRFS_FS_OPEN, and setting the readonly bit on the sb. The device add somewhat surprisingly clears the readonly bit on the sb (though the mount is still practically readonly, from the users perspective...). Finally, the remount checks the readonly bit on the sb against the flag and sees no change, so it does not run the code intended to run on ro->rw transitions, leaving BTRFS_FS_OPEN unset.
As a result, when the cleaner_kthread runs, it sees no BTRFS_FS_OPEN and does no work. This results in leaking deleted snapshots until we run out of space.
I propose fixing it at the first departure from what feels reasonable: when we clear the readonly bit on the sb during device add.
A new fstest I have written reproduces the bug and confirms the fix.
Reviewed-by: Qu Wenruo wqu@suse.com Signed-off-by: Boris Burkov boris@bur.io Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/volumes.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a06d530835d4e..28b79341e7e05 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2840,8 +2840,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) { - btrfs_clear_sb_rdonly(sb); - /* GFP_KERNEL allocation must not be under device_list_mutex */ seed_devices = btrfs_init_sprout(fs_info); if (IS_ERR(seed_devices)) { @@ -2984,8 +2982,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: - if (seeding_dev) - btrfs_set_sb_rdonly(sb); if (trans) btrfs_end_transaction(trans); error_free_zone:
From: Filipe Manana fdmanana@suse.com
[ Upstream commit 2342d6595b608eec94187a17dc112dd4c2a812fa ]
Smatch complains about calling PTR_ERR() against a NULL pointer:
fs/btrfs/super.c:2272 btrfs_control_ioctl() warn: passing zero to 'PTR_ERR'
Fix this by calling PTR_ERR() against the device pointer only if it contains an error.
Reviewed-by: Qu Wenruo wqu@suse.com Signed-off-by: Filipe Manana fdmanana@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c64d071341223..4505995eec342 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2256,7 +2256,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - ret = PTR_ERR(device); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + ret = 0; break; } ret = !(device->fs_devices->num_devices ==
From: Mark Brown broonie@kernel.org
[ Upstream commit 3e360ef0c0a1fb6ce9a302e40b8057c41ba8a9d2 ]
When building for streaming SVE the irritator for SVE skips updates of both P0 and FFR. While FFR is skipped since it might not be present there is no reason to skip corrupting P0 so switch to an instruction valid in streaming mode and move the ifdef.
Signed-off-by: Mark Brown broonie@kernel.org Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-3-c4b9622e36... Signed-off-by: Catalin Marinas catalin.marinas@arm.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/testing/selftests/arm64/fp/sve-test.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index fff60e2a25add..4fcb492aee1fb 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -304,9 +304,9 @@ function irritator_handler movi v0.8b, #1 movi v9.16b, #2 movi v31.8b, #3 -#ifndef SSVE // And P0 - rdffr p0.b + ptrue p0.d +#ifndef SSVE // And FFR wrffr p15.b #endif
From: Mark Brown broonie@kernel.org
[ Upstream commit 27141b690547da5650a420f26ec369ba142a9ebb ]
The PAC exec_sign_all() test spawns some child processes, creating pipes to be stdin and stdout for the child. It cleans up most of the file descriptors that are created as part of this but neglects to clean up the parent end of the child stdin and stdout. Add the missing close() calls.
Signed-off-by: Mark Brown broonie@kernel.org Link: https://lore.kernel.org/r/20241111-arm64-pac-test-collisions-v1-1-171875f37e... Signed-off-by: Catalin Marinas catalin.marinas@arm.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/testing/selftests/arm64/pauth/pac.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index b743daa772f55..5a07b3958fbf2 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -182,6 +182,9 @@ int exec_sign_all(struct signatures *signed_vals, size_t val) return -1; }
+ close(new_stdin[1]); + close(new_stdout[0]); + return 0; }
From: Brian Foster bfoster@redhat.com
[ Upstream commit c7fc0366c65628fd69bfc310affec4918199aae2 ]
Using mapped writes, it's technically possible to expose stale post-eof data on a truncate up operation. Consider the following example:
$ xfs_io -fc "pwrite 0 2k" -c "mmap 0 4k" -c "mwrite 2k 2k" \ -c "truncate 8k" -c "pread -v 2k 16" <file> ... 00000800: 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 XXXXXXXXXXXXXXXX ...
This shows that the post-eof data written via mwrite lands within EOF after a truncate up. While this is deliberate of the test case, behavior is somewhat unpredictable because writeback does post-eof zeroing, and writeback can occur at any time in the background. For example, an fsync inserted between the mwrite and truncate causes the subsequent read to instead return zeroes. This basically means that there is a race window in this situation between any subsequent extending operation and writeback that dictates whether post-eof data is exposed to the file or zeroed.
To prevent this problem, perform partial block zeroing as part of the various inode size extending operations that are susceptible to it. For truncate extension, zero around the original eof similar to how truncate down does partial zeroing of the new eof. For extension via writes and fallocate related operations, zero the newly exposed range of the file to cover any partial zeroing that must occur at the original and new eof blocks.
Signed-off-by: Brian Foster bfoster@redhat.com Link: https://patch.msgid.link/20240919160741.208162-2-bfoster@redhat.com Signed-off-by: Theodore Ts'o tytso@mit.edu Signed-off-by: Sasha Levin sashal@kernel.org --- fs/ext4/extents.c | 7 ++++++- fs/ext4/inode.c | 51 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c64f7c1b1d908..b03081ab8beee 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4478,7 +4478,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int depth = 0; struct ext4_map_blocks map; unsigned int credits; - loff_t epos; + loff_t epos, old_size = i_size_read(inode);
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; @@ -4537,6 +4537,11 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (ext4_update_inode_size(inode, epos) & 0x1) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); + if (epos > old_size) { + pagecache_isize_extended(inode, old_size, epos); + ext4_zero_partial_blocks(handle, inode, + old_size, epos - old_size); + } } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a0fa5192db8ed..4a273d83187e8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1328,8 +1328,10 @@ static int ext4_write_end(struct file *file, folio_unlock(folio); folio_put(folio);
- if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock @@ -1445,8 +1447,10 @@ static int ext4_journalled_write_end(struct file *file, folio_unlock(folio); folio_put(folio);
- if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + }
if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -3017,7 +3021,8 @@ static int ext4_da_do_write_end(struct address_space *mapping, struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; - loff_t new_i_size; + loff_t new_i_size, zero_len = 0; + handle_t *handle;
if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); @@ -3061,18 +3066,21 @@ static int ext4_da_do_write_end(struct address_space *mapping, folio_unlock(folio); folio_put(folio);
- if (old_size < pos) + if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); + zero_len = pos - old_size; + }
- if (disksize_changed) { - handle_t *handle; + if (!disksize_changed && !zero_len) + return copied;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (zero_len) + ext4_zero_partial_blocks(handle, inode, old_size, zero_len); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle);
return copied; } @@ -5459,6 +5467,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, }
if (attr->ia_size != inode->i_size) { + /* attach jbd2 jinode for EOF folio tail zeroing */ + if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || + oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_inode_attach_jinode(inode); + if (error) + goto err_out; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -5469,12 +5485,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, orphan = 1; } /* - * Update c/mtime on truncate up, ext4_truncate() will - * update c/mtime in shrink case below + * Update c/mtime and tail zero the EOF folio on + * truncate up. ext4_truncate() handles the shrink case + * below. */ - if (!shrink) + if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + if (oldsize & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, + inode->i_mapping, oldsize); + }
if (shrink) ext4_fc_track_range(handle, inode,
linux-stable-mirror@lists.linaro.org