Antgroup is using 5.10.y in its production environment, and we found that several patches we need are missing from the 5.10.y tree, so we backported them to 5.10.y. They are also backported to 5.15.y and 6.1.y to prevent regressions.
Connor Kuehl (1):
  virtiofs: split requests that exceed virtqueue size

Jiachen Zhang (1):
  fuse: always revalidate rename target dentry

Miklos Szeredi (4):
  virtiofs: clean up error handling in virtio_fs_get_tree()
  fuse: check s_root when destroying sb
  fuse: fix attr version comparison in fuse_read_update_size()
  fuse: fix deadlock between atomic O_TRUNC and page invalidation
fs/fuse/dir.c | 7 ++++++- fs/fuse/file.c | 31 +++++++++++++++++------------- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 5 +++-- fs/fuse/virtio_fs.c | 46 +++++++++++++++++++++++++++++---------------- 5 files changed, 60 insertions(+), 32 deletions(-)
From: Miklos Szeredi mszeredi@redhat.com
commit 833c5a42e28beeefa1f9bd476a63fe8050c1e8ca upstream.
[backport for 5.10.y]
Avoid duplicating error cleanup.
Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Yang Bo yb203166@antfin.com --- fs/fuse/virtio_fs.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index b9cfb1165ff4..22d2145ce08d 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1440,22 +1440,14 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -EINVAL; }
+ err = -ENOMEM; fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); - if (!fc) { - mutex_lock(&virtio_fs_mutex); - virtio_fs_put(fs); - mutex_unlock(&virtio_fs_mutex); - return -ENOMEM; - } + if (!fc) + goto out_err;
fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); - if (!fm) { - mutex_lock(&virtio_fs_mutex); - virtio_fs_put(fs); - mutex_unlock(&virtio_fs_mutex); - kfree(fc); - return -ENOMEM; - } + if (!fm) + goto out_err;
fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs); fc->release = fuse_free_conn; @@ -1483,6 +1475,13 @@ static int virtio_fs_get_tree(struct fs_context *fsc) WARN_ON(fsc->root); fsc->root = dget(sb->s_root); return 0; + +out_err: + kfree(fc); + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + return err; }
static const struct fs_context_operations virtio_fs_context_ops = {
From: Connor Kuehl ckuehl@redhat.com
commit a7f0d7aab0b4f3f0780b1f77356e2fe7202ac0cb upstream.
[backport for 5.10.y]
If an incoming FUSE request can't fit on the virtqueue, the request is placed onto a workqueue so a worker can try to resubmit it later where there will (hopefully) be space for it next time.
This is fine for requests that aren't larger than a virtqueue's maximum capacity. However, if a request's size exceeds the maximum capacity of the virtqueue (even if the virtqueue is empty), it will be doomed to a life of being placed on the workqueue, removed, discovered it won't fit, and placed on the workqueue yet again.
Furthermore, from section 2.6.5.3.1 (Driver Requirements: Indirect Descriptors) of the virtio spec:
"A driver MUST NOT create a descriptor chain longer than the Queue Size of the device."
To fix this, limit the number of pages FUSE will use for an overall request. This way, each request can realistically fit on the virtqueue when it is decomposed into a scattergather list and avoid violating section 2.6.5.3.1 of the virtio spec.
Signed-off-by: Connor Kuehl ckuehl@redhat.com Reviewed-by: Vivek Goyal vgoyal@redhat.com Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Yang Bo yb203166@antfin.com --- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 3 ++- fs/fuse/virtio_fs.c | 19 +++++++++++++++++-- 3 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b10cddd72355..ceaa6868386e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -556,6 +556,9 @@ struct fuse_conn { /** Maxmum number of pages that can be used in a single request */ unsigned int max_pages;
+ /** Constrain ->max_pages to this value during feature negotiation */ + unsigned int max_pages_limit; + /** Input queue */ struct fuse_iqueue iq;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2ede05df7d06..058bb82dee40 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -710,6 +710,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); @@ -1056,7 +1057,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->abort_err = 1; if (arg->flags & FUSE_MAX_PAGES) { fc->max_pages = - min_t(unsigned int, FUSE_MAX_MAX_PAGES, + min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } if (IS_ENABLED(CONFIG_FUSE_DAX) && diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 22d2145ce08d..6aaaa74438f3 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -18,6 +18,12 @@ #include <linux/uio.h> #include "fuse_i.h"
+/* Used to help calculate the FUSE connection's max_pages limit for a request's + * size. Parts of the struct fuse_req are sliced into scattergather lists in + * addition to the pages used, so this can help account for that overhead. + */ +#define FUSE_HEADER_OVERHEAD 4 + /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ @@ -1426,9 +1432,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) { struct virtio_fs *fs; struct super_block *sb; - struct fuse_conn *fc; + struct fuse_conn *fc = NULL; struct fuse_mount *fm; - int err; + unsigned int virtqueue_size; + int err = -EIO;
/* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() @@ -1440,6 +1447,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -EINVAL; }
+ virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq); + if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD)) + goto out_err; + err = -ENOMEM; fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); if (!fc) @@ -1454,6 +1465,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true;
+ /* Tell FUSE to split requests that exceed the virtqueue's size */ + fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, + virtqueue_size - FUSE_HEADER_OVERHEAD); + fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super); fuse_mount_put(fm);
From: Miklos Szeredi mszeredi@redhat.com
commit d534d31d6a45d71de61db22090b4820afb68fddc upstream.
[backport for 5.10.y]
Checking "fm" works because currently sb->s_fs_info is cleared on error paths; however, sb->s_root is what generic_shutdown_super() checks to determine whether the sb was fully initialized or not.
This change will allow cleanup of sb setup error paths.
Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Yang Bo yb203166@antfin.com --- fs/fuse/inode.c | 2 +- fs/fuse/virtio_fs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 058bb82dee40..7a86db768117 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1572,7 +1572,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) struct fuse_mount *fm = get_fuse_mount_super(sb); bool last;
- if (fm) { + if (sb->s_root) { last = fuse_mount_remove(fm); if (last) fuse_conn_destroy(fm); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6aaaa74438f3..faadc80485e7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1399,7 +1399,7 @@ static void virtio_kill_sb(struct super_block *sb) bool last;
/* If mount failed, we can still be called without any fc */ - if (fm) { + if (sb->s_root) { last = fuse_mount_remove(fm); if (last) virtio_fs_conn_destroy(fm);
From: Miklos Szeredi mszeredi@redhat.com
commit 484ce65715b06aead8c4901f01ca32c5a240bc71 upstream.
[backport for 5.10.y]
A READ request returning a short count is taken as indication of EOF, and the cached file size is modified accordingly.
Fix the attribute version checking to allow for changes to fc->attr_version on other inodes.
Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Yang Bo yb203166@antfin.com --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 504389568dac..94fe2c690676 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -782,7 +782,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock); - if (attr_ver == fi->attr_version && size < inode->i_size && + if (attr_ver >= fi->attr_version && size < inode->i_size && !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, size);
From: Jiachen Zhang zhangjiachen.jaycee@bytedance.com
commit ccc031e26afe60d2a5a3d93dabd9c978210825fb upstream.
[backport for 5.10.y]
The previous commit df8629af2934 ("fuse: always revalidate if exclusive create") ensures that the dentries are revalidated on O_EXCL creates. This commit complements it by also performing revalidation for rename target dentries. Otherwise, a rename target file that only exists in the kernel dentry cache but not in the filesystem will result in EEXIST if the RENAME_NOREPLACE flag is used.
Signed-off-by: Jiachen Zhang zhangjiachen.jaycee@bytedance.com Signed-off-by: Zhang Tianci zhangtianci.1997@bytedance.com Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Yang Bo yb203166@antfin.com --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 80a9e50392a0..bdb04bea0da9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -205,7 +205,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || - (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { + (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget;
From: Miklos Szeredi mszeredi@redhat.com
commit 2fdbb8dd01556e1501132b5ad3826e8f71e24a8b upstream.
[backport for 5.10.y]
fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache() in such a case to avoid the A-A deadlock. However, we found another A-B-B-A deadlock related to the case above, which causes the xfstests generic/464 test case to hang in our virtio-fs test environment.
For example, consider two processes concurrently opening the same file, one with O_TRUNC and the other without O_TRUNC. The deadlock case is described below: if open(O_TRUNC) has already called fuse_set_nowrite() (acquired A) and is trying to lock a page (acquiring B), open() could already hold the page lock (acquired B) and be waiting on the page writeback (acquiring A). This would lead to a deadlock.
open(O_TRUNC) ---------------------------------------------------------------- fuse_open_common inode_lock [C acquire] fuse_set_nowrite [A acquire]
fuse_finish_open truncate_pagecache lock_page [B acquire] truncate_inode_page unlock_page [B release]
fuse_release_nowrite [A release] inode_unlock [C release] ----------------------------------------------------------------
open() ---------------------------------------------------------------- fuse_open_common fuse_finish_open invalidate_inode_pages2 lock_page [B acquire] fuse_launder_page fuse_wait_on_page_writeback [A acquire & release] unlock_page [B release] ----------------------------------------------------------------
Besides this case, all calls of invalidate_inode_pages2() and invalidate_inode_pages2_range() in fuse code also can deadlock with open(O_TRUNC).
Fix by moving the truncate_pagecache() call outside the nowrite protected region. The nowrite protection is only for delayed writeback (writeback_cache) case, where inode lock does not protect against truncation racing with writes on the server. Write syscalls racing with page cache truncation still get the inode lock protection.
This patch also changes the order of filemap_invalidate_lock() vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the order found in fuse_file_fallocate() and fuse_do_setattr().
Reported-by: Jiachen Zhang zhangjiachen.jaycee@bytedance.com Tested-by: Jiachen Zhang zhangjiachen.jaycee@bytedance.com Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") Cc: stable@vger.kernel.org Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Yang Bo yb203166@antfin.com --- fs/fuse/dir.c | 5 +++++ fs/fuse/file.c | 29 +++++++++++++++++------------ 2 files changed, 22 insertions(+), 12 deletions(-)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bdb04bea0da9..e3b9b7d188e6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -537,6 +537,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); @@ -604,6 +605,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, } else { file->private_data = ff; fuse_finish_open(inode, file); + if (fm->fc->atomic_o_trunc && trunc) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 94fe2c690676..13d97547eaf6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -206,14 +206,10 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); - truncate_pagecache(inode, 0); fuse_invalidate_attr(inode); if (fc->writeback_cache) file_update_time(file); - } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { - invalidate_inode_pages2(inode->i_mapping); } - if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } @@ -236,30 +232,39 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err;
- if (is_wb_truncate || dax_truncate) { + if (is_wb_truncate || dax_truncate) inode_lock(inode); - fuse_set_nowrite(inode); - }
if (dax_truncate) { down_write(&get_fuse_inode(inode)->i_mmap_sem); err = fuse_dax_break_layouts(inode, 0, 0); if (err) - goto out; + goto out_inode_unlock; }
+ if (is_wb_truncate || dax_truncate) + fuse_set_nowrite(inode); + err = fuse_do_open(fm, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file);
-out: + if (is_wb_truncate || dax_truncate) + fuse_release_nowrite(inode); + if (!err) { + struct fuse_file *ff = file->private_data; + + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + } if (dax_truncate) up_write(&get_fuse_inode(inode)->i_mmap_sem);
- if (is_wb_truncate | dax_truncate) { - fuse_release_nowrite(inode); +out_inode_unlock: + if (is_wb_truncate || dax_truncate) inode_unlock(inode); - }
return err; }
linux-stable-mirror@lists.linaro.org