From: Eric Biggers <ebiggers(a)google.com>
'igrab(d_inode(dentry->d_parent))' without holding dentry->d_lock is
broken because without d_lock, ->d_parent can be concurrently changed by
a rename(). Then if the old parent directory is immediately deleted, the
old parent's ->d_inode can be NULL, which causes a NULL pointer
dereference in igrab().
To fix this, use dget_parent() to safely grab a reference to the parent
dentry, which pins the inode. This also eliminates the need to use
d_find_any_alias() other than for the initial inode, as we no longer
throw away the dentry at each step.
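For illustration, a minimal sketch (not part of the patch; the helper
names below are made up) contrasting the racy lookup with the
dget_parent()-based one used in the diff:

#include <linux/dcache.h>
#include <linux/fs.h>

/* Racy: ->d_parent can change under a concurrent rename(), and the old
 * parent's ->d_inode may already be NULL, which igrab() then dereferences.
 */
static struct inode *grab_parent_inode_racy(struct dentry *dentry)
{
        return igrab(d_inode(dentry->d_parent));
}

/* Safe: dget_parent() pins the parent dentry (and therefore its inode);
 * the caller drops the reference with dput() when done.
 */
static struct dentry *grab_parent_dentry_safe(struct dentry *dentry)
{
        return dget_parent(dentry);
}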
This is an extremely hard race to hit, but it is possible. Adding a
udelay() in between the reads of ->d_parent and its ->d_inode makes it
reproducible on a no-journal filesystem using the following program:
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main()
{
        if (fork()) {
                for (;;) {
                        mkdir("dir1", 0700);
                        int fd = open("dir1/file", O_RDWR|O_CREAT|O_SYNC, 0600);
                        write(fd, "X", 1);
                        close(fd);
                }
        } else {
                mkdir("dir2", 0700);
                for (;;) {
                        rename("dir1/file", "dir2/file");
                        rmdir("dir1");
                }
        }
}
Fixes: d59729f4e794 ("ext4: fix races in ext4_sync_parent()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
fs/ext4/fsync.c | 28 +++++++++++++---------------
1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e10206e7f4bbe7..093c359952cdba 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,30 +44,28 @@
*/
static int ext4_sync_parent(struct inode *inode)
{
- struct dentry *dentry = NULL;
- struct inode *next;
+ struct dentry *dentry, *next;
int ret = 0;
if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
return 0;
- inode = igrab(inode);
+ dentry = d_find_any_alias(inode);
+ if (!dentry)
+ return 0;
while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
- dentry = d_find_any_alias(inode);
- if (!dentry)
- break;
- next = igrab(d_inode(dentry->d_parent));
+
+ next = dget_parent(dentry);
dput(dentry);
- if (!next)
- break;
- iput(inode);
- inode = next;
+ dentry = next;
+ inode = dentry->d_inode;
+
/*
* The directory inode may have gone through rmdir by now. But
* the inode itself and its blocks are still allocated (we hold
- * a reference to the inode so it didn't go through
- * ext4_evict_inode()) and so we are safe to flush metadata
- * blocks and the inode.
+ * a reference to the inode via its dentry, so it didn't go
+ * through ext4_evict_inode()) and so we are safe to flush
+ * metadata blocks and the inode.
*/
ret = sync_mapping_buffers(inode->i_mapping);
if (ret)
@@ -76,7 +74,7 @@ static int ext4_sync_parent(struct inode *inode)
if (ret)
break;
}
- iput(inode);
+ dput(dentry);
return ret;
}
--
2.26.2
From: Mathias Krause <minipli(a)googlemail.com>
[ Upstream commit 1bd845bcb41d5b7f83745e0cb99273eb376f2ec5 ]
The parallel queue per-cpu data structure gets initialized only for CPUs
in the 'pcpu' CPU mask set. This is not sufficient, as the reorder timer
may run on a different CPU and might wrongly decide it's the target CPU
for the next reorder item: per-cpu memory gets memset(0), so the
uninitialized cpu_index of an unused queue is 0, which is exactly what we
compare against while waiting for the first CPU in cpumask.pcpu, i.e.
cpu_index 0.
Make the '__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index'
compare in padata_get_next() fail in this case by initializing the
cpu_index member of all per-cpu parallel queues. Use -1 for unused ones.
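For illustration only (helper name made up; the comparison is the one
quoted above from padata_get_next()), the ambiguity looks roughly like
this -- with unused queues left memset(0), cpu_index 0 on a CPU outside
the mask is indistinguishable from the first CPU in cpumask.pcpu, while
-1 can never match:

#include <linux/padata.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* Sketch: with per-cpu memory zeroed, an unused CPU's pqueue->cpu_index
 * is 0 and can spuriously equal next_queue->cpu_index; initializing it
 * to -1 makes this test reliably fail on CPUs outside cpumask.pcpu.
 */
static bool this_cpu_owns_next_queue(struct parallel_data *pd,
                                     struct padata_parallel_queue *next_queue)
{
        return __this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index;
}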
Signed-off-by: Mathias Krause <minipli(a)googlemail.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
Signed-off-by: Daniel Jordan <daniel.m.jordan(a)oracle.com>
---
kernel/padata.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/kernel/padata.c b/kernel/padata.c
index 8aef48c3267b..4f860043a8e5 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -461,8 +461,14 @@ static void padata_init_pqueues(struct parallel_data *pd)
struct padata_parallel_queue *pqueue;
cpu_index = 0;
- for_each_cpu(cpu, pd->cpumask.pcpu) {
+ for_each_possible_cpu(cpu) {
pqueue = per_cpu_ptr(pd->pqueue, cpu);
+
+ if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
+ pqueue->cpu_index = -1;
+ continue;
+ }
+
pqueue->pd = pd;
pqueue->cpu_index = cpu_index;
cpu_index++;
--
2.26.2
Changes since v2 [1]:
- Replace the non-descriptive copy_safe() with copy_mc_to_user() and
copy_mc_to_kernel(). Several code organization cleanups resulted from
this rename, which further proves the point that the name deeply
matters for maintainability in this case. (Linus)
- Do not use copy_user_generic() as the generic x86 backend for
copy_mc_to_user() since the #MC handler explicitly wants the exception
to be trapped by a _ASM_EXTABLE_FAULT handler. (Vivek).
- Rename copy_safe_slow() to copy_mc_fragile() to better indicate what
the implementation is handling. copy_safe_fast() is replaced with
copy_mc_generic().
---
The primary motivation to go touch memcpy_mcsafe() is that the existing
benefit of doing slow "handle with care" copies is obviated on newer
CPUs. With that concern lifted it also obviates the need to continue to
update the MCA-recovery capability detection code currently gated by
"mcsafe_key". Now the old "mcsafe_key" opt-in to perform the copy with
concerns for recovery fragility can instead be made an opt-out from the
default fast copy implementation (enable_copy_mc_fragile()).
The discussion with Linus on the first iteration of this patch
identified that memcpy_mcsafe() was misnamed relative to its usage. The
new names copy_mc_to_user() and copy_mc_to_kernel() clearly indicate the
intended use case and let the architecture organize the implementation
accordingly.
For both powerpc and x86 a copy_mc_generic() implementation is added as
the backend for these interfaces.
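As a rough usage sketch (the helper below is made up, and it assumes
copy_mc_to_kernel() keeps memcpy_mcsafe()'s convention of returning the
number of bytes not copied):

#include <linux/errno.h>
#include <linux/uaccess.h>

/* Illustrative caller only, not from this series: read a buffer that may
 * sit in poisoned memory.  Assumes a return value of 0 means the whole
 * range was consumed without a machine check; a non-zero remainder is
 * treated as an I/O error.
 */
static int read_with_mc_recovery(void *dst, const void *src, size_t len)
{
        unsigned long rem = copy_mc_to_kernel(dst, src, len);

        return rem ? -EIO : 0;
}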
Patches are relative to tip/master.
---
Dan Williams (2):
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user,kernel}()
x86/copy_mc: Introduce copy_mc_generic()
arch/powerpc/Kconfig | 2
arch/powerpc/include/asm/string.h | 2
arch/powerpc/include/asm/uaccess.h | 40 +++--
arch/powerpc/lib/Makefile | 2
arch/powerpc/lib/copy_mc_64.S | 4
arch/x86/Kconfig | 2
arch/x86/Kconfig.debug | 2
arch/x86/include/asm/copy_mc_test.h | 75 +++++++++
arch/x86/include/asm/mcsafe_test.h | 75 ---------
arch/x86/include/asm/string_64.h | 32 ----
arch/x86/include/asm/uaccess_64.h | 35 ++--
arch/x86/kernel/cpu/mce/core.c | 8 -
arch/x86/kernel/quirks.c | 9 -
arch/x86/lib/Makefile | 1
arch/x86/lib/copy_mc.c | 64 ++++++++
arch/x86/lib/copy_mc_64.S | 165 ++++++++++++++++++++
arch/x86/lib/memcpy_64.S | 115 --------------
arch/x86/lib/usercopy_64.c | 21 ---
drivers/md/dm-writecache.c | 15 +-
drivers/nvdimm/claim.c | 2
drivers/nvdimm/pmem.c | 6 -
include/linux/string.h | 9 -
include/linux/uaccess.h | 9 +
include/linux/uio.h | 10 +
lib/Kconfig | 7 +
lib/iov_iter.c | 43 +++--
tools/arch/x86/include/asm/copy_safe_test.h | 13 ++
tools/arch/x86/include/asm/mcsafe_test.h | 13 --
tools/arch/x86/lib/memcpy_64.S | 115 --------------
tools/objtool/check.c | 5 -
tools/perf/bench/Build | 1
tools/perf/bench/mem-memcpy-x86-64-lib.c | 24 ---
tools/testing/nvdimm/test/nfit.c | 48 +++---
.../testing/selftests/powerpc/copyloops/.gitignore | 2
tools/testing/selftests/powerpc/copyloops/Makefile | 6 -
.../testing/selftests/powerpc/copyloops/copy_mc.S | 0
36 files changed, 460 insertions(+), 522 deletions(-)
rename arch/powerpc/lib/{memcpy_mcsafe_64.S => copy_mc_64.S} (98%)
create mode 100644 arch/x86/include/asm/copy_mc_test.h
delete mode 100644 arch/x86/include/asm/mcsafe_test.h
create mode 100644 arch/x86/lib/copy_mc.c
create mode 100644 arch/x86/lib/copy_mc_64.S
create mode 100644 tools/arch/x86/include/asm/copy_safe_test.h
delete mode 100644 tools/arch/x86/include/asm/mcsafe_test.h
delete mode 100644 tools/perf/bench/mem-memcpy-x86-64-lib.c
rename tools/testing/selftests/powerpc/copyloops/{memcpy_mcsafe_64.S => copy_mc.S} (100%)
base-commit: bba413deb1065f1291cb1f366247513f11215520
When reserved transaction handle is unused, we subtract its reserved
credits in __jbd2_journal_unreserve_handle() called from
jbd2_journal_stop(). However this function forgets to remove reserved
credits from transaction->t_outstanding_credits and thus the transaction
space that was reserved remains effectively leaked. The leaked
transaction space can be quite significant in some cases and leads to
unnecessarily small transactions, thus reducing the throughput of the
journalling machinery. E.g. an fsmark workload creating lots of 4k files
was observed to have about 20% lower throughput due to this when ext4 is
mounted with the dioread_nolock mount option.
Subtract reserved credits from t_outstanding_credits as well.
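Schematically (a sketch of the rule only, with a made-up helper name that
reuses the file-local sub_reserved_credits() from fs/jbd2/transaction.c;
the actual change is in the diff below), unreserving must return the
credits to both counters while j_state_lock pins the running transaction:

#include <linux/jbd2.h>

/* Sketch of the accounting this patch enforces: give the reserved
 * handle's credits back to the journal-wide reserved pool and, if a
 * transaction is running, to its t_outstanding_credits as well.
 */
static void unreserve_handle_sketch(journal_t *journal, handle_t *handle)
{
        read_lock(&journal->j_state_lock);
        sub_reserved_credits(journal, handle->h_total_credits);
        if (journal->j_running_transaction)
                atomic_sub(handle->h_total_credits,
                           &journal->j_running_transaction->t_outstanding_credits);
        read_unlock(&journal->j_state_lock);
}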
CC: stable(a)vger.kernel.org
Fixes: 8f7d89f36829 ("jbd2: transaction reservation support")
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/jbd2/transaction.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 3dccc23cf010..b49a103cff1f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -541,17 +541,24 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
}
EXPORT_SYMBOL(jbd2_journal_start);
-static void __jbd2_journal_unreserve_handle(handle_t *handle)
+static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
{
journal_t *journal = handle->h_journal;
WARN_ON(!handle->h_reserved);
sub_reserved_credits(journal, handle->h_total_credits);
+ if (t)
+ atomic_sub(handle->h_total_credits, &t->t_outstanding_credits);
}
void jbd2_journal_free_reserved(handle_t *handle)
{
- __jbd2_journal_unreserve_handle(handle);
+ journal_t *journal = handle->h_journal;
+
+ /* Get j_state_lock to pin running transaction if it exists */
+ read_lock(&journal->j_state_lock);
+ __jbd2_journal_unreserve_handle(handle, journal->j_running_transaction);
+ read_unlock(&journal->j_state_lock);
jbd2_free_handle(handle);
}
EXPORT_SYMBOL(jbd2_journal_free_reserved);
@@ -721,8 +728,10 @@ static void stop_this_handle(handle_t *handle)
}
atomic_sub(handle->h_total_credits,
&transaction->t_outstanding_credits);
- if (handle->h_rsv_handle)
- __jbd2_journal_unreserve_handle(handle->h_rsv_handle);
+ if (handle->h_rsv_handle) {
+ __jbd2_journal_unreserve_handle(handle->h_rsv_handle,
+ transaction);
+ }
if (atomic_dec_and_test(&transaction->t_updates))
wake_up(&journal->j_wait_updates);
--
2.16.4
When using mmap() on a prime-imported buffer allocated by a
different driver (such as imx-drm), the later munmap() does
not correctly decrement the refcount of the original etnaviv_gem_object,
leading to a memory leak.
Signed-off-by: Martin Fuzzey <martin.fuzzey(a)flowbird.group>
Cc: stable(a)vger.kernel.org
---
drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
index f24dd21..28a01b8 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
@@ -93,7 +93,25 @@ static void *etnaviv_gem_prime_vmap_impl(struct etnaviv_gem_object *etnaviv_obj)
static int etnaviv_gem_prime_mmap_obj(struct etnaviv_gem_object *etnaviv_obj,
struct vm_area_struct *vma)
{
- return dma_buf_mmap(etnaviv_obj->base.dma_buf, vma, 0);
+ int ret;
+
+ ret = dma_buf_mmap(etnaviv_obj->base.dma_buf, vma, 0);
+
+ /* drm_gem_mmap_obj() has already been called before this function
+ * and has incremented our refcount, expecting it to be decremented
+ * on unmap() via drm_gem_vm_close().
+ * However dma_buf_mmap() invokes drm_gem_cma_prime_mmap(),
+ * which ends up updating vma->vm_private_data to point to the
+ * dma_buf's GEM object.
+ * Hence our gem object here will not have its refcount decremented
+ * when userspace does unmap().
+ * So decrement the refcount here to avoid a memory leak if the dma
+ * buf mapping was successful.
+ */
+ if (!ret)
+ drm_gem_object_put_unlocked(&etnaviv_obj->base);
+
+ return ret;
}
static const struct etnaviv_gem_ops etnaviv_gem_prime_ops = {
--
1.9.1
Hi
[This is an automated email]
This commit has been processed because it contains a -stable tag.
The stable tag indicates that it's relevant for the following trees: 4.4+
The bot has tested the following trees: v5.6.13, v5.4.41, v4.19.123, v4.14.180, v4.9.223, v4.4.223.
v5.6.13: Failed to apply! Possible dependencies:
0024652895e3 ("btrfs: rename btrfs_put_fs_root and btrfs_grab_fs_root")
02162a0265eb ("btrfs: hold a ref on the root in __btrfs_run_defrag_inode")
04734e844894 ("btrfs: hold a ref on the root in btrfs_ioctl_get_subvol_info")
0b530bc5e11f ("btrfs: hold a ref on the root in build_backref_tree")
0d4b0463011d ("btrfs: export and rename free_fs_info")
2a2b5d620266 ("btrfs: hold ref on root in btrfs_ioctl_default_subvol")
3ca35e839e94 ("btrfs: hold a ref on the root in search_ioctl")
3d7babdcf2cc ("btrfs: hold a ref on the root in find_data_references")
41a2ee75aab0 ("btrfs: introduce per-inode file extent tree")
442b1ac5244e ("btrfs: hold a ref on the root in record_reloc_root_in_trans")
4c78e9f59632 ("btrfs: hold a ref on the root in open_ctree")
76deacf02387 ("btrfs: hold a ref on the root in create_reloc_inode")
81f096edf047 ("btrfs: use btrfs_put_fs_root to free roots always")
8727002f7909 ("btrfs: hold a ref on the root in fixup_tree_root_location")
88234012beaa ("btrfs: hold a ref on the root in btrfs_search_path_in_tree")
9326f76f4bc4 ("btrfs: hold a ref on the root in resolve_indirect_ref")
9f583209f20a ("btrfs: push grab_fs_root into read_fs_root")
ab9737bd7597 ("btrfs: hold a ref on the root in merge_reloc_roots")
b8a49ae1913f ("btrfs: hold a ref on the root in btrfs_search_path_in_tree_user")
bc44d7c4b2b1 ("btrfs: push btrfs_grab_fs_root into btrfs_get_fs_root")
bdf70b9e75f5 ("btrfs: hold a root ref in btrfs_get_dentry")
db2c2ca2db44 ("btrfs: hold a ref on the root in prepare_to_merge")
fc92f79856aa ("btrfs: hold a ref on the root in create_subvol")
v5.4.41: Failed to apply! Possible dependencies:
0024652895e3 ("btrfs: rename btrfs_put_fs_root and btrfs_grab_fs_root")
0d4b0463011d ("btrfs: export and rename free_fs_info")
33ca832fefa5 ("btrfs: separate out the extent leak code")
41a2ee75aab0 ("btrfs: introduce per-inode file extent tree")
6f0d04f8e72e ("btrfs: separate out the extent io init function")
81f096edf047 ("btrfs: use btrfs_put_fs_root to free roots always")
9326f76f4bc4 ("btrfs: hold a ref on the root in resolve_indirect_ref")
9c7d3a548331 ("btrfs: move extent_io_tree defs to their own header")
v4.19.123: Failed to apply! Possible dependencies:
370a11b8114b ("btrfs: qgroup: Introduce per-root swapped blocks infrastructure")
43eb5f297584 ("btrfs: Introduce extent_io_tree::owner to distinguish different io_trees")
57ec5fb478a3 ("btrfs: tests: move testing members of struct btrfs_root to the end")
7b4397386fbd ("btrfs: switch extent_io_tree::track_uptodate to bool")
c258d6e36442 ("btrfs: Introduce fs_info to extent_io_tree")
e06a1fc99cc7 ("btrfs: Remove extent_io_ops::set_bit_hook extent_io callback")
eede2bf34f4f ("Btrfs: prevent ioctls from interfering with a swap file")
v4.14.180: Failed to apply! Possible dependencies:
370a11b8114b ("btrfs: qgroup: Introduce per-root swapped blocks infrastructure")
429d6275d501 ("btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification")
57ec5fb478a3 ("btrfs: tests: move testing members of struct btrfs_root to the end")
64cfaef6362f ("btrfs: qgroup: Introduce function to convert META_PREALLOC into META_PERTRANS")
64ee4e751a1c ("btrfs: qgroup: Update trace events to use new separate rsv types")
733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans")
8287475a2055 ("btrfs: qgroup: Use root::qgroup_meta_rsv_* to record qgroup meta reserved space")
d4e5c92055d8 ("btrfs: qgroup: Skeleton to support separate qgroup reservation type")
dba213242fbc ("btrfs: qgroup: Make qgroup_reserve and its callers to use separate reservation type")
e1211d0e896b ("btrfs: qgroup: Don't use root->qgroup_meta_rsv for qgroup")
eede2bf34f4f ("Btrfs: prevent ioctls from interfering with a swap file")
f59c0347d4be ("btrfs: qgroup: Introduce helpers to update and access new qgroup rsv")
fd708b81d972 ("Btrfs: add a extent ref verify tool")
v4.9.223: Failed to apply! Possible dependencies:
0c476a5d7f63 ("btrfs: Ensure proper sector alignment for btrfs_free_reserved_data_space")
370a11b8114b ("btrfs: qgroup: Introduce per-root swapped blocks infrastructure")
4989d277eb4b ("btrfs: refactor __btrfs_lookup_bio_sums to use bio_for_each_segment_all")
62d1f9fe97dd ("btrfs: remove trivial helper btrfs_find_tree_block")
da17066c4047 ("btrfs: pull node/sector/stripe sizes out of root and into fs_info")
eede2bf34f4f ("Btrfs: prevent ioctls from interfering with a swap file")
fd708b81d972 ("Btrfs: add a extent ref verify tool")
v4.4.223: Failed to apply! Possible dependencies:
2f3165ecf103 ("btrfs: don't force mounts to wait for cleaner_kthread to delete one or more subvolumes")
370a11b8114b ("btrfs: qgroup: Introduce per-root swapped blocks infrastructure")
511711af91f2 ("btrfs: don't run delayed references while we are creating the free space tree")
70f6d82ec73c ("Btrfs: add free space tree mount option")
87241c2e6845 ("Btrfs: use root when checking need_async_flush")
90c711ab380d ("btrfs: avoid blocking open_ctree from cleaner_kthread")
9e7cc91a6d18 ("btrfs: fix fsfreeze hang caused by delayed iputs deal")
a5ed91828518 ("Btrfs: implement the free space B-tree")
afcdd129e05a ("Btrfs: add a flags field to btrfs_fs_info")
d38b349c39a9 ("Btrfs: don't bother kicking async if there's nothing to reclaim")
eede2bf34f4f ("Btrfs: prevent ioctls from interfering with a swap file")
f376df2b7da3 ("Btrfs: add tracepoints for flush events")
NOTE: The patch will not be queued to stable trees until it is upstream.
How should we proceed with this patch?
--
Thanks
Sasha
From: Herbert Xu <herbert(a)gondor.apana.org.au>
[ Upstream commit 6fc4dbcf0276279d488c5fbbfabe94734134f4fa ]
The function padata_reorder will use a timer when it cannot progress
while completed jobs are outstanding (pd->reorder_objects > 0). This is
suboptimal: if we do end up using the timer, it introduces a gratuitous
delay of one second.
In fact we can easily distinguish between whether completed jobs
are outstanding and whether we can make progress. All we have to
do is look at the next pqueue list.
This patch does that by replacing pd->processed with pd->cpu so
that the next pqueue is more accessible.
A work queue is used instead of the original try_again to avoid
hogging the CPU.
Note that we don't bother removing the work queue in
padata_flush_queues because the whole premise is broken. You
cannot flush async crypto requests so it makes no sense to even
try. A subsequent patch will fix it by replacing it with a ref
counting scheme.
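In sketch form (illustrative only; the helper name is made up and it
assumes the post-patch layout of struct parallel_data -- pd->cpu and
pd->reorder_work -- shown in the diff below), the progress check becomes
a simple look at the next expected per-cpu reorder list:

#include <linux/padata.h>
#include <linux/percpu.h>
#include <linux/workqueue.h>

/* Sketch: with pd->cpu tracking the next CPU to dequeue from, "completed
 * jobs are outstanding but we could not take the lock" reduces to "is
 * that CPU's reorder list non-empty?" -- if so, re-kick the reorder work
 * instead of arming a timer.
 */
static void kick_reorder_if_pending(struct parallel_data *pd)
{
        struct padata_parallel_queue *next_queue =
                per_cpu_ptr(pd->pqueue, pd->cpu);

        if (!list_empty(&next_queue->reorder.list))
                queue_work(pd->pinst->wq, &pd->reorder_work);
}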
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
[dj: - adjust context
- corrected setup_timer -> timer_setup to delete hunk
- skip padata_flush_queues() hunk, function already removed
in 4.19]
Signed-off-by: Daniel Jordan <daniel.m.jordan(a)oracle.com>
---
include/linux/padata.h | 13 ++----
kernel/padata.c | 95 ++++++++----------------------------------
2 files changed, 22 insertions(+), 86 deletions(-)
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 5d13d25da2c8..d803397a28f7 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -24,7 +24,6 @@
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/list.h>
-#include <linux/timer.h>
#include <linux/notifier.h>
#include <linux/kobject.h>
@@ -85,18 +84,14 @@ struct padata_serial_queue {
* @serial: List to wait for serialization after reordering.
* @pwork: work struct for parallelization.
* @swork: work struct for serialization.
- * @pd: Backpointer to the internal control structure.
* @work: work struct for parallelization.
- * @reorder_work: work struct for reordering.
* @num_obj: Number of objects that are processed by this cpu.
* @cpu_index: Index of the cpu.
*/
struct padata_parallel_queue {
struct padata_list parallel;
struct padata_list reorder;
- struct parallel_data *pd;
struct work_struct work;
- struct work_struct reorder_work;
atomic_t num_obj;
int cpu_index;
};
@@ -122,10 +117,10 @@ struct padata_cpumask {
* @reorder_objects: Number of objects waiting in the reorder queues.
* @refcnt: Number of objects holding a reference on this parallel_data.
* @max_seq_nr: Maximal used sequence number.
+ * @cpu: Next CPU to be processed.
* @cpumask: The cpumasks in use for parallel and serial workers.
+ * @reorder_work: work struct for reordering.
* @lock: Reorder lock.
- * @processed: Number of already processed objects.
- * @timer: Reorder timer.
*/
struct parallel_data {
struct padata_instance *pinst;
@@ -134,10 +129,10 @@ struct parallel_data {
atomic_t reorder_objects;
atomic_t refcnt;
atomic_t seq_nr;
+ int cpu;
struct padata_cpumask cpumask;
+ struct work_struct reorder_work;
spinlock_t lock ____cacheline_aligned;
- unsigned int processed;
- struct timer_list timer;
};
/**
diff --git a/kernel/padata.c b/kernel/padata.c
index c280cb153915..47dc31ce15ac 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -167,23 +167,12 @@ EXPORT_SYMBOL(padata_do_parallel);
*/
static struct padata_priv *padata_get_next(struct parallel_data *pd)
{
- int cpu, num_cpus;
- unsigned int next_nr, next_index;
struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
+ int cpu = pd->cpu;
- num_cpus = cpumask_weight(pd->cpumask.pcpu);
-
- /*
- * Calculate the percpu reorder queue and the sequence
- * number of the next object.
- */
- next_nr = pd->processed;
- next_index = next_nr % num_cpus;
- cpu = padata_index_to_cpu(pd, next_index);
next_queue = per_cpu_ptr(pd->pqueue, cpu);
-
reorder = &next_queue->reorder;
spin_lock(&reorder->lock);
@@ -194,7 +183,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
- pd->processed++;
+ pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1,
+ false);
spin_unlock(&reorder->lock);
goto out;
@@ -217,6 +207,7 @@ static void padata_reorder(struct parallel_data *pd)
struct padata_priv *padata;
struct padata_serial_queue *squeue;
struct padata_instance *pinst = pd->pinst;
+ struct padata_parallel_queue *next_queue;
/*
* We need to ensure that only one cpu can work on dequeueing of
@@ -248,7 +239,6 @@ static void padata_reorder(struct parallel_data *pd)
* so exit immediately.
*/
if (PTR_ERR(padata) == -ENODATA) {
- del_timer(&pd->timer);
spin_unlock_bh(&pd->lock);
return;
}
@@ -267,70 +257,29 @@ static void padata_reorder(struct parallel_data *pd)
/*
* The next object that needs serialization might have arrived to
- * the reorder queues in the meantime, we will be called again
- * from the timer function if no one else cares for it.
+ * the reorder queues in the meantime.
*
- * Ensure reorder_objects is read after pd->lock is dropped so we see
- * an increment from another task in padata_do_serial. Pairs with
+ * Ensure reorder queue is read after pd->lock is dropped so we see
+ * new objects from another task in padata_do_serial. Pairs with
* smp_mb__after_atomic in padata_do_serial.
*/
smp_mb();
- if (atomic_read(&pd->reorder_objects)
- && !(pinst->flags & PADATA_RESET))
- mod_timer(&pd->timer, jiffies + HZ);
- else
- del_timer(&pd->timer);
- return;
+ next_queue = per_cpu_ptr(pd->pqueue, pd->cpu);
+ if (!list_empty(&next_queue->reorder.list))
+ queue_work(pinst->wq, &pd->reorder_work);
}
static void invoke_padata_reorder(struct work_struct *work)
{
- struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
local_bh_disable();
- pqueue = container_of(work, struct padata_parallel_queue, reorder_work);
- pd = pqueue->pd;
+ pd = container_of(work, struct parallel_data, reorder_work);
padata_reorder(pd);
local_bh_enable();
}
-static void padata_reorder_timer(struct timer_list *t)
-{
- struct parallel_data *pd = from_timer(pd, t, timer);
- unsigned int weight;
- int target_cpu, cpu;
-
- cpu = get_cpu();
-
- /* We don't lock pd here to not interfere with parallel processing
- * padata_reorder() calls on other CPUs. We just need any CPU out of
- * the cpumask.pcpu set. It would be nice if it's the right one but
- * it doesn't matter if we're off to the next one by using an outdated
- * pd->processed value.
- */
- weight = cpumask_weight(pd->cpumask.pcpu);
- target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
-
- /* ensure to call the reorder callback on the correct CPU */
- if (cpu != target_cpu) {
- struct padata_parallel_queue *pqueue;
- struct padata_instance *pinst;
-
- /* The timer function is serialized wrt itself -- no locking
- * needed.
- */
- pinst = pd->pinst;
- pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
- queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
- } else {
- padata_reorder(pd);
- }
-
- put_cpu();
-}
-
static void padata_serial_worker(struct work_struct *serial_work)
{
struct padata_serial_queue *squeue;
@@ -384,9 +333,8 @@ void padata_do_serial(struct padata_priv *padata)
cpu = get_cpu();
- /* We need to run on the same CPU padata_do_parallel(.., padata, ..)
- * was called on -- or, at least, enqueue the padata object into the
- * correct per-cpu queue.
+ /* We need to enqueue the padata object into the correct
+ * per-cpu queue.
*/
if (cpu != padata->cpu) {
reorder_via_wq = 1;
@@ -396,12 +344,12 @@ void padata_do_serial(struct padata_priv *padata)
pqueue = per_cpu_ptr(pd->pqueue, cpu);
spin_lock(&pqueue->reorder.lock);
- atomic_inc(&pd->reorder_objects);
list_add_tail(&padata->list, &pqueue->reorder.list);
+ atomic_inc(&pd->reorder_objects);
spin_unlock(&pqueue->reorder.lock);
/*
- * Ensure the atomic_inc of reorder_objects above is ordered correctly
+ * Ensure the addition to the reorder list is ordered correctly
* with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
* in padata_reorder.
*/
@@ -409,13 +357,7 @@ void padata_do_serial(struct padata_priv *padata)
put_cpu();
- /* If we're running on the wrong CPU, call padata_reorder() via a
- * kernel worker.
- */
- if (reorder_via_wq)
- queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work);
- else
- padata_reorder(pd);
+ padata_reorder(pd);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -471,14 +413,12 @@ static void padata_init_pqueues(struct parallel_data *pd)
continue;
}
- pqueue->pd = pd;
pqueue->cpu_index = cpu_index;
cpu_index++;
__padata_list_init(&pqueue->reorder);
__padata_list_init(&pqueue->parallel);
INIT_WORK(&pqueue->work, padata_parallel_worker);
- INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
atomic_set(&pqueue->num_obj, 0);
}
}
@@ -506,12 +446,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
padata_init_pqueues(pd);
padata_init_squeues(pd);
- timer_setup(&pd->timer, padata_reorder_timer, 0);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 1);
pd->pinst = pinst;
spin_lock_init(&pd->lock);
+ pd->cpu = cpumask_first(pcpumask);
+ INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
--
2.26.2
I've backported fixes for I²C and media controller devices, dealing
with the lifetime of related cdev and struct device instances and some
similar race conditions. Fixing the lifetime issue for watchdog
devices looks impractical for 4.4, as it depends on a big refactoring
in 4.5.
All but one of these are already included in or queued for the later
stable branches. You dropped the I²C lifetime fix for 4.9, but I hope
my previous replies persuaded you that it is valid.
Ben.
--
Ben Hutchings, Software Developer Codethink Ltd
https://www.codethink.co.uk/ Dale House, 35 Dale Street
Manchester, M1 2HF, United Kingdom