Am 16.05.22 um 19:13 schrieb T.J. Mercier:
Recently, we noticed an issue where a process went into direct reclaim while holding the kernfs rw semaphore for sysfs in write (exclusive) mode. This caused processes who were doing DMA-BUF exports and releases to go into uninterruptible sleep since they needed to acquire the same semaphore for the DMA-BUF sysfs entry creation/deletion. In order to avoid blocking DMA-BUF export for an indeterminate amount of time while another process is holding the sysfs rw semaphore in exclusive mode, this patch moves the per-buffer sysfs file creation to the default work queue. Note that this can lead to a short-term inaccuracy in the dmabuf sysfs statistics, but this is a tradeoff to prevent the hot path from being blocked. A work_struct is added to dma_buf to achieve this, but as it is unioned with the kobject in the sysfs_entry, dma_buf does not increase in size.
I'm still not very keen on this approach as it strongly feels like we are working around a shortcoming somewhere else.
Fixes: bdb8d06dfefd ("dmabuf: Add the capability to expose DMA-BUF stats in sysfs")
Originally-by: Hridya Valsaraju <hridya@google.com>
Signed-off-by: T.J. Mercier <tjmercier@google.com>
See the originally submitted patch by Hridya Valsaraju here: https://lkml.org/… (link was wrapped in an Outlook safelinks redirect and is truncated; original points at lkml.org)
v2 changes:
- Defer only sysfs creation instead of creation and teardown per
Christian König
- Use a work queue instead of a kthread for deferred work per
Christian König
drivers/dma-buf/dma-buf-sysfs-stats.c | 56 ++++++++++++++++++++------- include/linux/dma-buf.h | 14 ++++++- 2 files changed, 54 insertions(+), 16 deletions(-)
diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 2bba0babcb62..67b0a298291c 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -11,6 +11,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/sysfs.h> +#include <linux/workqueue.h> #include "dma-buf-sysfs-stats.h" @@ -168,10 +169,46 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } +static void sysfs_add_workfn(struct work_struct *work) +{
- struct dma_buf_sysfs_entry *sysfs_entry =
container_of(work, struct dma_buf_sysfs_entry, sysfs_add_work);
- struct dma_buf *dmabuf = sysfs_entry->dmabuf;
- /*
* A dmabuf is ref-counted via its file member. If this handler holds the only
* reference to the dmabuf, there is no need for sysfs kobject creation. This is an
* optimization and a race; when the reference count drops to 1 immediately after
* this check it is not harmful as the sysfs entry will still get cleaned up in
* dma_buf_stats_teardown, which won't get called until the final dmabuf reference
* is released, and that can't happen until the end of this function.
*/
- if (file_count(dmabuf->file) > 1) {
Please completely drop that. I see absolutely no justification for this additional complexity.
/*
* kobject_init_and_add expects kobject to be zero-filled, but we have populated it
* (the sysfs_add_work union member) to trigger this work function.
*/
memset(&dmabuf->sysfs_entry->kobj, 0, sizeof(dmabuf->sysfs_entry->kobj));
dmabuf->sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset;
if (kobject_init_and_add(&dmabuf->sysfs_entry->kobj, &dma_buf_ktype, NULL,
"%lu", file_inode(dmabuf->file)->i_ino)) {
kobject_put(&dmabuf->sysfs_entry->kobj);
dmabuf->sysfs_entry = NULL;
}
- } else {
/*
* Free the sysfs_entry and reset the pointer so dma_buf_stats_teardown doesn't
* attempt to operate on it.
*/
kfree(dmabuf->sysfs_entry);
dmabuf->sysfs_entry = NULL;
- }
- dma_buf_put(dmabuf);
+}
- int dma_buf_stats_setup(struct dma_buf *dmabuf) { struct dma_buf_sysfs_entry *sysfs_entry;
- int ret;
if (!dmabuf || !dmabuf->file) return -EINVAL; @@ -181,25 +218,16 @@ int dma_buf_stats_setup(struct dma_buf *dmabuf) return -EINVAL; }
- sysfs_entry = kzalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL);
- sysfs_entry = kmalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); if (!sysfs_entry) return -ENOMEM;
- sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; sysfs_entry->dmabuf = dmabuf;
- dmabuf->sysfs_entry = sysfs_entry;
- /* create the directory for buffer stats */
- ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL,
"%lu", file_inode(dmabuf->file)->i_ino);
- if (ret)
goto err_sysfs_dmabuf;
- INIT_WORK(&dmabuf->sysfs_entry->sysfs_add_work, sysfs_add_workfn);
- get_dma_buf(dmabuf); /* This reference will be dropped in sysfs_add_workfn. */
- schedule_work(&dmabuf->sysfs_entry->sysfs_add_work);
return 0;
-err_sysfs_dmabuf:
- kobject_put(&sysfs_entry->kobj);
- dmabuf->sysfs_entry = NULL;
- return ret; }
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 2097760e8e95..0200caa3c515 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/dma-fence.h> #include <linux/wait.h> +#include <linux/workqueue.h> struct device; struct dma_buf; @@ -365,7 +366,7 @@ struct dma_buf { */ const char *name;
- /** @name_lock: Spinlock to protect name acces for read access. */
- /** @name_lock: Spinlock to protect name access for read access. */ spinlock_t name_lock;
/** @@ -441,6 +442,7 @@ struct dma_buf { __poll_t active; } cb_in, cb_out;
Those changes are unrelated.
Regards, Christian.
#ifdef CONFIG_DMABUF_SYSFS_STATS /** * @sysfs_entry: @@ -449,7 +451,15 @@ struct dma_buf { * `DMA-BUF statistics`_ for the uapi this enables. */ struct dma_buf_sysfs_entry {
struct kobject kobj;
union {
struct kobject kobj;
/** @sysfs_add_work:
*
* For deferred sysfs kobject creation using a workqueue.
*/
struct work_struct sysfs_add_work;
struct dma_buf *dmabuf; } *sysfs_entry; #endif};